diff --git a/README.md b/README.md index 7eadb934..183f77db 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,80 @@ const expanded = await store.expandQuery("auth flow", { intent: "user login" }) const results4 = await store.search({ queries: expanded }) ``` +#### Custom Lexical Backends + +QMD uses SQLite FTS5 by default, but the lexical search layer is pluggable. +Custom backends do not replace QMD's store: they return references to documents +that already exist in QMD's SQLite database, and QMD hydrates those hits before +snippets, context lookup, RRF fusion, and reranking. + +```typescript +import { createStore, type LexicalSearchBackend } from '@tobilu/qmd' + +const lexicalBackend: LexicalSearchBackend = { + name: "my-lexical-backend", + async search(request) { + // request: { query, limit, collectionName, dbPath } + return [ + // score is normalized 0..1, higher is better + { filepath: "qmd://docs/auth.md", score: 0.93 }, + ] + }, +} + +const store = await createStore({ + dbPath: "./index.sqlite", + config: { + collections: { + docs: { path: "/path/to/docs", pattern: "**/*.md" }, + }, + }, + lexicalBackend, +}) + +const results = await store.searchLex("auth middleware") +``` + +For CLI deployments, configure a command backend in `index.yml`. 
QMD writes a +JSON request to the command's stdin and expects either a JSON array of hits or an +object with a `hits` array on stdout: + +```yaml +search: + lexicalBackend: + type: command + name: tantivy + command: qmd-lexical-backend-tantivy + args: ["search", "--index-dir", "/path/to/tantivy-index"] + timeoutMs: 5000 +``` + +Command request: + +```json +{ + "query": "auth middleware", + "limit": 20, + "collectionName": "docs", + "dbPath": "/home/user/.cache/qmd/index.sqlite" +} +``` + +Command response: + +```json +{ + "hits": [ + { "documentId": 123, "score": 0.96, "rawScore": 42.1 }, + { "filepath": "qmd://docs/auth.md", "score": 0.93 } + ] +} +``` + +Supported hit references are `documentId`, `hash`, `docid`, `filepath`, or +`collectionName` + `path`. QMD keeps `SearchResult.source === "fts"` for all +lexical backends and adds `SearchResult.lexicalBackend` for observability. + #### Retrieval ```typescript @@ -436,11 +510,12 @@ The SDK requires explicit `dbPath` — no defaults are assumed. This makes it sa ## Score Normalization & Fusion -### Search Backends +### Lexical Backends | Backend | Raw Score | Conversion | Range | |---------|-----------|------------|-------| | **FTS (BM25)** | SQLite FTS5 BM25 | `Math.abs(score)` | 0 to ~25+ | +| **Custom lexical** | Backend-defined | Must return normalized `score` | 0.0 to 1.0 | | **Vector** | Cosine distance | `1 / (1 + distance)` | 0.0 to 1.0 | | **Reranker** | LLM 0-10 rating | `score / 10` | 0.0 to 1.0 | diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index 105506d3..c5a8de3d 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -13,7 +13,6 @@ import { homedir, resolve, enableProductionMode, - searchFTS, extractSnippet, getContextForFile, getContextForPath, @@ -110,6 +109,7 @@ import { type CollectionConfig, type ModelsConfig, } from "../collections.js"; +import { createLexicalSearchBackendFromConfig } from "../lexical-backends.js"; // NOTE: enableProductionMode() is intentionally NOT called at module scope here. 
// Importing this module for its exports (e.g. buildEditorUri, termLink from @@ -128,19 +128,25 @@ let currentIndexName = "index"; function getStore(): ReturnType { if (!store) { - store = createStore(storeDbPathOverride); - // Sync YAML config into SQLite store_collections so store.ts reads from DB + let activeModels: ReturnType | undefined; + let config: CollectionConfig | undefined; try { - const activeModels = ensureModelsConfiguredForCli(); - const config = loadConfig(); + activeModels = ensureModelsConfiguredForCli(); + config = loadConfig(); + } catch { + // Config may not exist yet — that's fine, DB works without it + } + store = createStore(storeDbPathOverride, { + lexicalBackend: createLexicalSearchBackendFromConfig(config?.search?.lexicalBackend), + }); + if (config && activeModels) { + // Sync YAML config into SQLite store_collections so store.ts reads from DB syncConfigToDb(store.db, config); setDefaultLlamaCpp(new LlamaCpp({ embedModel: activeModels.embed, generateModel: activeModels.generate, rerankModel: activeModels.rerank, })); - } catch { - // Config may not exist yet — that's fine, DB works without it } } return store; @@ -2585,8 +2591,9 @@ function parseStructuredQuery(query: string): ParsedStructuredQuery | null { return typed.length > 0 ? { searches: typed, intent } : null; } -function search(query: string, opts: OutputOptions): void { - const db = getDb(); +async function search(query: string, opts: OutputOptions): Promise { + const storeInstance = getStore(); + const db = storeInstance.db; // Validate collection filter (supports multiple -c flags) // Use default collections if none specified @@ -2596,7 +2603,7 @@ function search(query: string, opts: OutputOptions): void { // Use large limit for --all, otherwise fetch more than needed and let outputResults filter const fetchLimit = opts.all ? 
100000 : Math.max(50, opts.limit * 2); const results = filterByCollections( - searchFTS(db, query, fetchLimit, singleCollection), + await storeInstance.searchLexical(query, fetchLimit, singleCollection), collectionNames ); @@ -4450,7 +4457,7 @@ if (isMain) { console.error("Usage: qmd search [options] "); process.exit(1); } - search(cli.query, cli.opts); + await search(cli.query, cli.opts); break; case "vsearch": diff --git a/src/collections.ts b/src/collections.ts index 6950493d..0758257c 100644 --- a/src/collections.ts +++ b/src/collections.ts @@ -9,6 +9,7 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs"; import { join, dirname, resolve } from "path"; import { qmdHomedir } from "./paths.js"; import YAML from "yaml"; +import type { LexicalBackendConfig } from "./lexical-backends.js"; // ============================================================================ // Types @@ -51,6 +52,10 @@ export interface CollectionConfig { editor_uri_template?: string; // Alias for editor_uri collections: Record; // Collection name -> config models?: ModelsConfig; + search?: { + /** Lexical search backend. Defaults to SQLite FTS5. 
*/ + lexicalBackend?: LexicalBackendConfig; + }; } /** diff --git a/src/index.ts b/src/index.ts index f853a974..edb8d115 100644 --- a/src/index.ts +++ b/src/index.ts @@ -80,6 +80,15 @@ import { type NamedCollection, type ContextMap, } from "./collections.js"; +import { + createLexicalSearchBackendFromConfig, + type LexicalBackendConfig, + type LexicalDocumentRef, + type LexicalSearchBackend, + type LexicalSearchBackendContext, + type LexicalSearchHit, + type LexicalSearchRequest, +} from "./lexical-backends.js"; // Re-export types for SDK consumers export type { @@ -103,6 +112,12 @@ export type { CollectionConfig, NamedCollection, ContextMap, + LexicalBackendConfig, + LexicalDocumentRef, + LexicalSearchBackend, + LexicalSearchBackendContext, + LexicalSearchHit, + LexicalSearchRequest, }; // Re-export the internal Store type for advanced consumers @@ -110,6 +125,11 @@ export type { InternalStore }; // Re-export utility functions and types used by frontends export { extractSnippet, addLineNumbers, DEFAULT_MULTI_GET_MAX_BYTES }; +export { + createExternalCommandLexicalBackend, + createLexicalSearchBackendFromConfig, + isSqliteFts5LexicalBackendConfig, +} from "./lexical-backends.js"; export type { ChunkStrategy } from "./store.js"; // Re-export getDefaultDbPath for CLI/MCP that need the default database location @@ -205,6 +225,11 @@ export interface StoreOptions { configPath?: string; /** Inline collection config (mutually exclusive with `configPath`) */ config?: CollectionConfig; + /** + * Custom lexical backend for searchLex() and lex parts of search(). + * Omit to use the built-in SQLite FTS5 backend. 
+ */ + lexicalBackend?: LexicalSearchBackend; } /** @@ -346,27 +371,32 @@ export async function createStore(options: StoreOptions): Promise { throw new Error("Provide either configPath or config, not both"); } - // Create the internal store (opens DB, creates tables) - const internal = createStoreInternal(options.dbPath); - const db = internal.db; - // Track whether we have a YAML config path for write-through const hasYamlConfig = !!options.configPath; - // Sync config into SQLite store_collections + // Load config before creating the internal store so lexical backend selection + // can come from either SDK options or YAML config. let config: CollectionConfig | undefined; if (options.configPath) { - // YAML mode: inject config source for write-through, sync to DB setConfigSource({ configPath: options.configPath }); config = loadConfig(); - syncConfigToDb(db, config); } else if (options.config) { - // Inline config mode: inject config source for mutations, sync to DB setConfigSource({ config: options.config }); config = options.config; + } + + const lexicalBackend = options.lexicalBackend + ?? createLexicalSearchBackendFromConfig(config?.search?.lexicalBackend); + + // Create the internal store (opens DB, creates tables) + const internal = createStoreInternal(options.dbPath, { lexicalBackend }); + const db = internal.db; + + // Sync config into SQLite store_collections + if (config) { syncConfigToDb(db, config); } - // else: DB-only mode — no external config, use existing store_collections + // else: DB-only mode - no external config, use existing store_collections // Create a per-store LlamaCpp instance — lazy-loads models on first use, // auto-unloads after 5 min inactivity to free VRAM. 
@@ -421,7 +451,7 @@ export async function createStore(options: StoreOptions): Promise { chunkStrategy: opts.chunkStrategy, }); }, - searchLex: async (q, opts) => internal.searchFTS(q, opts?.limit, opts?.collection), + searchLex: async (q, opts) => internal.searchLexical(q, opts?.limit, opts?.collection), searchVector: async (q, opts) => internal.searchVec(q, llm.embedModelName, opts?.limit, opts?.collection), expandQuery: async (q, opts) => internal.expandQuery(q, undefined, opts?.intent), get: async (pathOrDocid, opts) => internal.findDocument(pathOrDocid, opts), diff --git a/src/lexical-backends.ts b/src/lexical-backends.ts new file mode 100644 index 00000000..05738117 --- /dev/null +++ b/src/lexical-backends.ts @@ -0,0 +1,174 @@ +import { spawn } from "node:child_process"; + +type MaybePromise = T | Promise; + +/** + * Stable reference to a document already known to QMD. + * + * Custom lexical backends should return at least one of these identifiers so + * QMD can hydrate the hit from its SQLite store before snippets, contexts, RRF, + * and reranking run. + */ +export type LexicalDocumentRef = { + /** `documents.id` in QMD's SQLite database. */ + documentId?: number; + /** QMD content hash. */ + hash?: string; + /** Short content hash prefix accepted by QMD, with or without leading `#`. */ + docid?: string; + /** Full virtual path, e.g. `qmd://docs/readme.md`. */ + filepath?: string; + /** Collection name plus collection-relative path. */ + collectionName?: string; + path?: string; +}; + +export type LexicalSearchHit = LexicalDocumentRef & { + /** Relevance normalized to 0..1; higher is better. QMD clamps out-of-range values and preserves the ordering returned by the backend. */ + score: number; + /** Optional unnormalized backend-native score for diagnostics. */ + rawScore?: number; + /** Backend-specific metadata. QMD does not interpret this object. */ + metadata?: Record; +}; + +export type LexicalSearchRequest = { + query: string; + limit: number; + collectionName?: string; + /** Absolute path to QMD's SQLite database. 
*/ + dbPath: string; +}; + +export type LexicalSearchBackendContext = { + /** Absolute path to QMD's SQLite database. */ + dbPath: string; +}; + +export interface LexicalSearchBackend { + /** + * Stable backend identifier. It is surfaced in SearchResult.lexicalBackend + * for observability, while SearchResult.source remains "fts" for pipeline + * compatibility. + */ + name: string; + search( + request: LexicalSearchRequest, + context: LexicalSearchBackendContext + ): MaybePromise; +} + +export type ExternalCommandLexicalBackendOptions = { + name?: string; + command: string; + args?: string[]; + env?: Record; + timeoutMs?: number; +}; + +export type LexicalBackendConfig = + | { type?: "sqlite-fts5" } + | ({ type: "command" } & ExternalCommandLexicalBackendOptions); + +export function isSqliteFts5LexicalBackendConfig(config: LexicalBackendConfig | undefined): boolean { + return !config || !config.type || config.type === "sqlite-fts5"; +} + +export function createExternalCommandLexicalBackend( + options: ExternalCommandLexicalBackendOptions +): LexicalSearchBackend { + return { + name: options.name ?? "external-command", + search: async (request) => { + const stdout = await runBackendCommand(options, request); + const parsed = JSON.parse(stdout) as unknown; + const hits = Array.isArray(parsed) + ? parsed + : isRecord(parsed) && Array.isArray(parsed.hits) + ? 
parsed.hits + : null; + if (!hits) { + throw new Error("Lexical backend command must return a JSON array or an object with a hits array"); + } + return hits.map(parseHit); + }, + }; +} + +export function createLexicalSearchBackendFromConfig( + config: LexicalBackendConfig | undefined +): LexicalSearchBackend | undefined { + if (!config || !config.type || config.type === "sqlite-fts5") return undefined; + switch (config.type) { + case "command": + return createExternalCommandLexicalBackend(config); + default: + throw new Error(`Unsupported lexical backend config: ${JSON.stringify(config)}`); + } +} + +function runBackendCommand( + options: ExternalCommandLexicalBackendOptions, + request: LexicalSearchRequest +): Promise { + return new Promise((resolve, reject) => { + const child = spawn(options.command, options.args ?? [], { + env: { ...process.env, ...options.env }, + stdio: ["pipe", "pipe", "pipe"], + }); + let stdout = ""; + let stderr = ""; + let settled = false; + const timer = options.timeoutMs + ? 
setTimeout(() => { + settled = true; + child.kill("SIGTERM"); + reject(new Error(`Lexical backend command timed out after ${options.timeoutMs}ms`)); + }, options.timeoutMs) + : undefined; + + child.stdout.setEncoding("utf8"); + child.stderr.setEncoding("utf8"); + child.stdout.on("data", chunk => { stdout += chunk; }); + child.stderr.on("data", chunk => { stderr += chunk; }); + child.on("error", error => { + if (timer) clearTimeout(timer); + if (!settled) reject(error); + }); + child.on("close", code => { + if (timer) clearTimeout(timer); + if (settled) return; + if (code !== 0) { + reject(new Error(`Lexical backend command exited with ${code}: ${stderr.trim()}`)); + return; + } + resolve(stdout); + }); + child.stdin.end(JSON.stringify(request)); + }); +} + +function parseHit(hit: unknown): LexicalSearchHit { + if (!isRecord(hit)) { + throw new Error("Lexical backend hit must be an object"); + } + const score = Number(hit.score); + if (!Number.isFinite(score)) { + throw new Error("Lexical backend hit must include a finite numeric score"); + } + return { + documentId: typeof hit.documentId === "number" ? hit.documentId : undefined, + hash: typeof hit.hash === "string" ? hit.hash : undefined, + docid: typeof hit.docid === "string" ? hit.docid : undefined, + filepath: typeof hit.filepath === "string" ? hit.filepath : undefined, + collectionName: typeof hit.collectionName === "string" ? hit.collectionName : undefined, + path: typeof hit.path === "string" ? hit.path : undefined, + score, + rawScore: typeof hit.rawScore === "number" ? hit.rawScore : undefined, + metadata: isRecord(hit.metadata) ? 
hit.metadata : undefined, + }; +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null && !Array.isArray(value); +} diff --git a/src/store.ts b/src/store.ts index 99e36b86..3d2a0a2f 100644 --- a/src/store.ts +++ b/src/store.ts @@ -37,6 +37,7 @@ import type { CollectionConfig, ContextMap, } from "./collections.js"; +import type { LexicalSearchBackend, LexicalSearchHit } from "./lexical-backends.js"; // ============================================================================= // Configuration @@ -1178,6 +1179,8 @@ export type Store = { dbPath: string; /** Optional LlamaCpp instance for this store (overrides the global singleton) */ llm?: LlamaCpp; + /** Optional lexical backend. Omitted means built-in SQLite FTS5. */ + lexicalBackend?: LexicalSearchBackend; close: () => void; ensureVecTable: (dimensions: number) => void; @@ -1215,6 +1218,7 @@ export type Store = { // Search searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[]; + searchLexical: (query: string, limit?: number, collectionName?: string) => Promise; searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise; // Query expansion & reranking @@ -1247,6 +1251,10 @@ export type Store = { insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number, fingerprint?: string) => void; }; +export interface StoreCreateOptions { + lexicalBackend?: LexicalSearchBackend; +} + // ============================================================================= // Reindex & Embed — pure-logic functions for SDK and CLI // ============================================================================= @@ -1845,7 +1853,7 @@ export async function generateEmbeddings( * @param dbPath - Path to the SQLite database file * @returns Store instance with all methods bound to the 
database */ -export function createStore(dbPath?: string): Store { +export function createStore(dbPath?: string, options: StoreCreateOptions = {}): Store { const resolvedPath = dbPath || getDefaultDbPath(); const db = openDatabase(resolvedPath); initializeDatabase(db); @@ -1853,6 +1861,7 @@ export function createStore(dbPath?: string): Store { const store: Store = { db, dbPath: resolvedPath, + lexicalBackend: options.lexicalBackend, close: () => db.close(), ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions), @@ -1890,6 +1899,7 @@ export function createStore(dbPath?: string): Store { // Search searchFTS: (query: string, limit?: number, collectionName?: string) => searchFTS(db, query, limit, collectionName), + searchLexical: (query: string, limit?: number, collectionName?: string) => searchLexical(db, resolvedPath, query, limit, collectionName, store.lexicalBackend), searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding), // Query expansion & reranking @@ -2030,6 +2040,7 @@ export type SearchResult = DocumentResult & { score: number; // Relevance score (0-1) source: "fts" | "vec"; // Search source (full-text or vector) chunkPos?: number; // Character position of matching chunk (for vector search) + lexicalBackend?: string; // Concrete lexical backend name when source is "fts" }; /** @@ -3520,10 +3531,158 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle context: getContextForFile(db, row.filepath), score, source: "fts" as const, + lexicalBackend: "sqlite-fts5", }; }); } +export async function searchLexical( + db: Database, + dbPath: string, + query: string, + limit: number = 20, + collectionName?: string, + backend?: LexicalSearchBackend +): Promise { + if (!backend) { + return searchFTS(db, query, limit, collectionName); + } + + const hits 
= await backend.search( + { query, limit, collectionName, dbPath }, + { dbPath } + ); + return hydrateLexicalSearchHits(db, hits, backend.name, collectionName).slice(0, limit); +} + +type LexicalHitRow = { + id: number; + virtual_path: string; + display_path: string; + title: string; + hash: string; + collection: string; + modified_at: string; + body_length: number; + body: string; +}; + +function hydrateLexicalSearchHits( + db: Database, + hits: LexicalSearchHit[], + backendName: string, + collectionName?: string +): SearchResult[] { + const results: SearchResult[] = []; + const seen = new Set(); + + for (const hit of hits) { + const row = findLexicalHitRow(db, hit); + if (!row) continue; + if (collectionName && row.collection !== collectionName) continue; + if (seen.has(row.virtual_path)) continue; + seen.add(row.virtual_path); + + results.push({ + filepath: row.virtual_path, + displayPath: row.display_path, + title: row.title, + context: getContextForFile(db, row.virtual_path), + hash: row.hash, + docid: getDocid(row.hash), + collectionName: row.collection, + modifiedAt: row.modified_at, + bodyLength: row.body_length, + body: row.body, + score: clampSearchScore(hit.score), + source: "fts", + lexicalBackend: backendName, + }); + } + + return results; +} + +function findLexicalHitRow(db: Database, hit: LexicalSearchHit): LexicalHitRow | null { + const selectCols = ` + d.id, + 'qmd://' || d.collection || '/' || d.path as virtual_path, + d.collection || '/' || d.path as display_path, + d.title, + d.hash, + d.collection, + d.modified_at, + LENGTH(content.doc) as body_length, + content.doc as body + `; + + if (hit.documentId !== undefined) { + const row = db.prepare(` + SELECT ${selectCols} + FROM documents d + JOIN content ON content.hash = d.hash + WHERE d.id = ? 
AND d.active = 1 + `).get(hit.documentId) as LexicalHitRow | null; + if (row) return row; + } + + if (hit.hash) { + const row = db.prepare(` + SELECT ${selectCols} + FROM documents d + JOIN content ON content.hash = d.hash + WHERE d.hash = ? AND d.active = 1 + LIMIT 1 + `).get(hit.hash) as LexicalHitRow | null; + if (row) return row; + } + + if (hit.docid) { + const docid = hit.docid.startsWith("#") ? hit.docid.slice(1) : hit.docid; + const row = db.prepare(` + SELECT ${selectCols} + FROM documents d + JOIN content ON content.hash = d.hash + WHERE d.hash LIKE ? AND d.active = 1 + LIMIT 1 + `).get(`${docid}%`) as LexicalHitRow | null; + if (row) return row; + } + + if (hit.collectionName && hit.path) { + const row = db.prepare(` + SELECT ${selectCols} + FROM documents d + JOIN content ON content.hash = d.hash + WHERE d.collection = ? AND d.path = ? AND d.active = 1 + LIMIT 1 + `).get(hit.collectionName, hit.path) as LexicalHitRow | null; + if (row) return row; + } + + const filepath = hit.filepath ?? (hit.path && hit.collectionName ? `qmd://${hit.collectionName}/${hit.path}` : undefined); + if (!filepath) return null; + const doc = findDocument(db, filepath, { includeBody: true }); + if ("error" in doc) return null; + + return { + id: 0, + virtual_path: doc.filepath, + display_path: doc.displayPath, + title: doc.title, + hash: doc.hash, + collection: doc.collectionName, + modified_at: doc.modifiedAt, + body_length: doc.bodyLength, + body: doc.body ?? 
"", + }; +} + +function clampSearchScore(score: number): number { + if (!Number.isFinite(score)) return 0; + return Math.max(0, Math.min(1, score)); +} + // ============================================================================= // Vector Search // ============================================================================= @@ -4578,12 +4737,12 @@ export async function hybridQuery( `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'` ).get(); - // Step 1: BM25 probe — strong signal skips expensive LLM expansion + // Step 1: Lexical probe — strong signal skips expensive LLM expansion // When intent is provided, disable strong-signal bypass — the obvious BM25 // match may not be what the caller wants (e.g. "performance" with intent // "web page load times" should NOT shortcut to a sports-performance doc). // Pass collection directly into FTS query (filter at SQL level, not post-hoc) - const initialFts = store.searchFTS(query, 20, collection); + const initialFts = await store.searchLexical(query, 20, collection); const topScore = initialFts[0]?.score ?? 0; const secondScore = initialFts[1]?.score ?? 0; const hasStrongSignal = !intent && initialFts.length > 0 @@ -4613,14 +4772,14 @@ export async function hybridQuery( // Step 3: Route searches by query type // - // Strategy: run all FTS queries immediately (they're sync/instant), then + // Strategy: run all lexical queries before vector search, then // batch-embed all vector queries in one embedBatch() call, then run // sqlite-vec lookups with pre-computed embeddings. 
- // 3a: Run FTS for all lex expansions right away (no LLM needed) + // 3a: Run lexical backend for all lex expansions right away (no LLM needed) for (const q of expanded) { if (q.type === 'lex') { - const ftsResults = store.searchFTS(q.query, 20, collection); + const ftsResults = await store.searchLexical(q.query, 20, collection); if (ftsResults.length > 0) { for (const r of ftsResults) docidMap.set(r.filepath, r.docid); rankedLists.push(ftsResults.map(r => ({ @@ -5002,11 +5161,11 @@ export async function structuredSearch( // Helper to run search across collections (or all if undefined) const collectionList = collections ?? [undefined]; // undefined = all collections - // Step 1: Run FTS for all lex searches (sync, instant) + // Step 1: Run lexical backend for all lex searches for (const search of searches) { if (search.type === 'lex') { for (const coll of collectionList) { - const ftsResults = store.searchFTS(search.query, 20, coll); + const ftsResults = await store.searchLexical(search.query, 20, coll); if (ftsResults.length > 0) { for (const r of ftsResults) docidMap.set(r.filepath, r.docid); rankedLists.push(ftsResults.map(r => ({ diff --git a/test/sdk.test.ts b/test/sdk.test.ts index 53764c56..9f1c524c 100644 --- a/test/sdk.test.ts +++ b/test/sdk.test.ts @@ -21,6 +21,7 @@ import { type LexSearchOptions, type VectorSearchOptions, type ExpandQueryOptions, + type LexicalSearchBackend, } from "../src/index.js"; import { setDefaultLlamaCpp } from "../src/llm.js"; @@ -575,6 +576,88 @@ describe("searchLex (BM25)", () => { }); }); +describe("custom lexical backend", () => { + test("searchLex hydrates backend document refs from QMD SQLite", async () => { + const dbPath = freshDbPath(); + const requests: unknown[] = []; + let documentId = 0; + const lexicalBackend: LexicalSearchBackend = { + name: "test-lexical", + search: (request) => { + requests.push(request); + return [{ documentId, score: 0.92, rawScore: 42 }]; + }, + }; + const store = await createStore({ + 
dbPath, + config: { + collections: { + docs: { path: docsDir, pattern: "**/*.md" }, + }, + }, + lexicalBackend, + }); + + const body = "# Custom Backend\n\nExternal lexical backends return QMD document refs."; + const hash = require("crypto").createHash("sha256").update(body).digest("hex"); + const now = new Date().toISOString(); + store.internal.insertContent(hash, body, now); + store.internal.insertDocument("docs", "custom-backend.md", "Custom Backend", hash, now, now); + const row = store.internal.db.prepare( + `SELECT id FROM documents WHERE collection = ? AND path = ?` + ).get("docs", "custom-backend.md") as { id: number }; + documentId = row.id; + + const results = await store.searchLex("anything", { limit: 5, collection: "docs" }); + + expect(requests).toHaveLength(1); + expect(results).toHaveLength(1); + expect(results[0]!.displayPath).toBe("docs/custom-backend.md"); + expect(results[0]!.body).toContain("External lexical backends"); + expect(results[0]!.source).toBe("fts"); + expect(results[0]!.lexicalBackend).toBe("test-lexical"); + await store.close(); + }); + + test("search() uses custom lexical backend for hybrid retrieval", async () => { + const dbPath = freshDbPath(); + let documentId = 0; + const lexicalBackend: LexicalSearchBackend = { + name: "test-lexical", + search: () => [{ documentId, score: 0.96 }], + }; + const store = await createStore({ + dbPath, + config: { + collections: { + docs: { path: docsDir, pattern: "**/*.md" }, + }, + }, + lexicalBackend, + }); + + const body = "# Hybrid Backend\n\nA strong lexical backend hit can seed hybrid search."; + const hash = require("crypto").createHash("sha256").update(body).digest("hex"); + const now = new Date().toISOString(); + store.internal.insertContent(hash, body, now); + store.internal.insertDocument("docs", "hybrid-backend.md", "Hybrid Backend", hash, now, now); + const row = store.internal.db.prepare( + `SELECT id FROM documents WHERE collection = ? 
AND path = ?` + ).get("docs", "hybrid-backend.md") as { id: number }; + documentId = row.id; + + const results = await store.search({ + query: "hybrid backend", + rerank: false, + limit: 1, + }); + + expect(results).toHaveLength(1); + expect(results[0]!.displayPath).toBe("docs/hybrid-backend.md"); + await store.close(); + }); +}); + // ============================================================================= // Unified search() API Tests // =============================================================================