diff --git a/db/migrations/014-rebuild-vector-partition-keys.sql b/db/migrations/014-rebuild-vector-partition-keys.sql
new file mode 100644
index 00000000..538468cc
--- /dev/null
+++ b/db/migrations/014-rebuild-vector-partition-keys.sql
@@ -0,0 +1,53 @@
+-- Migration: rebuild documents_vec with sqlite-vec partition keys
+-- This enables selective KNN queries by library_id and version_id.
+
+-- Preserve compatible vectors from the existing vec table. This uses a
+-- disk-backed staging table because large vector indexes can exceed memory.
+-- Partition keys are re-derived through documents -> pages -> versions instead of
+-- being copied from the old vec rows; vectors without a matching document, or
+-- whose length is not 1536, are not carried over.
+DROP TABLE IF EXISTS _documents_vec_partition_migration;
+
+CREATE TABLE _documents_vec_partition_migration AS
+SELECT
+  d.id AS rowid,
+  v.library_id,
+  v.id AS version_id,
+  dv.embedding
+FROM documents_vec dv
+JOIN documents d ON dv.rowid = d.id
+JOIN pages p ON d.page_id = p.id
+JOIN versions v ON p.version_id = v.id
+WHERE vec_length(dv.embedding) = 1536;
+
+DROP TABLE documents_vec;
+
+CREATE VIRTUAL TABLE documents_vec USING vec0(
+  library_id INTEGER partition key,
+  version_id INTEGER partition key,
+  embedding FLOAT[1536]
+);
+
+-- documents_vec was recreated just above and every staged row comes from a
+-- distinct vec rowid, so a plain INSERT is sufficient (no conflict clause).
+INSERT INTO documents_vec (rowid, library_id, version_id, embedding)
+SELECT rowid, library_id, version_id, embedding
+FROM _documents_vec_partition_migration;
+
+-- Backfill any vectors stored on documents but missing from the vec table.
+INSERT INTO documents_vec (rowid, library_id, version_id, embedding)
+SELECT
+  d.id,
+  v.library_id,
+  v.id AS version_id,
+  json_extract(d.embedding, '$') AS embedding
+FROM documents d
+JOIN pages p ON d.page_id = p.id
+JOIN versions v ON p.version_id = v.id
+WHERE d.embedding IS NOT NULL
+  AND vec_length(json_extract(d.embedding, '$')) = 1536
+  AND NOT EXISTS ( -- rows already in documents_vec are skipped, so the INSERT cannot conflict
+    SELECT 1 FROM documents_vec existing WHERE existing.rowid = d.id
+  );
+-- All staged vectors have been copied back; remove the scratch table.
+DROP TABLE _documents_vec_partition_migration;
diff --git a/src/store/DocumentStore.test.ts b/src/store/DocumentStore.test.ts
index 4492be58..6aad2289 100644
--- a/src/store/DocumentStore.test.ts
+++ b/src/store/DocumentStore.test.ts
@@ -458,6 +458,92 @@ describe("DocumentStore - With Embeddings", () => {
         expect(result.score).toBeGreaterThan(0);
       }
     });
+
+    it("should use partition-filtered vector search for hybrid results", async () => {
+      const originalApiKey = process.env.OPENAI_API_KEY;
+      try {
+        process.env.OPENAI_API_KEY = "test-key-for-partition-search";
+        await store.shutdown();
+
+        const cfg = loadConfig();
+        const embeddingConfig = EmbeddingConfig.parseEmbeddingConfig(
+          "openai:text-embedding-3-small",
+        );
+        cfg.app.embeddingModel = embeddingConfig.modelSpec;
+        store = new DocumentStore(":memory:", cfg);
+        await store.initialize();
+
+        await store.addDocuments(
+          "searchtest",
+          "1.0.0",
+          1,
+          createScrapeResult(
+            "JavaScript Programming Guide",
+            "https://example.com/js-guide",
+            "JavaScript programming tutorial with code examples and functions",
+            ["programming", "javascript"],
+          ),
+        );
+        await store.addDocuments(
+          "searchtest",
+          "1.0.0",
+          1,
+          createScrapeResult(
+            "JavaScript Frameworks",
+            "https://example.com/js-frameworks",
+            "Advanced JavaScript frameworks like React and Vue for building applications",
+            ["programming", "javascript", "frameworks"],
+          ),
+        );
+
+        // @ts-expect-error Accessing private property for testing
+        const db = store.db;
+        const ddl = db
+          .prepare(
+            "SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'documents_vec'",
+          )
+          .get() as { sql: string };
+        expect(ddl.sql).toContain("library_id INTEGER partition key");
+        expect(ddl.sql).toContain("version_id INTEGER partition key");
+
+        const results = await store.findByContent(
+          "searchtest",
+          "1.0.0",
+          "application building",
+          10,
+        );
+        // @ts-expect-error Accessing private property for testing
+        const vector = await store.embeddings.embedQuery("application building");
+
+        expect(results.length).toBeGreaterThan(0);
+
+        const vectorResults = db
+          .prepare(`
+            SELECT dv.rowid, dv.distance
+            FROM documents_vec dv
+            WHERE dv.library_id = (
+              SELECT id FROM libraries WHERE name = ?
+            )
+              AND dv.version_id = (
+                SELECT v.id
+                FROM versions v
+                JOIN libraries l ON v.library_id = l.id
+                WHERE l.name = ? AND v.name = ?
+              )
+              AND dv.embedding MATCH ?
+              AND dv.k = ?
+            ORDER BY dv.distance
+          `)
+          .all("searchtest", "searchtest", "1.0.0", JSON.stringify(vector), 10);
+        expect(vectorResults.length).toBeGreaterThan(0);
+      } finally {
+        if (originalApiKey === undefined) {
+          delete process.env.OPENAI_API_KEY;
+        } else {
+          process.env.OPENAI_API_KEY = originalApiKey;
+        }
+      }
+    });
   });
 
   describe("Embedding Batch Processing", () => {
@@ -1777,6 +1863,8 @@ describe("DocumentStore - Embedding Model Change Safety", () => {
         )
         .get() as { sql: string };
       expect(ddl.sql).toContain("768");
+      expect(ddl.sql).toContain("library_id INTEGER partition key");
+      expect(ddl.sql).toContain("version_id INTEGER partition key");
     });
 
     it("should update metadata with new model and dimension", async () => {
@@ -1804,6 +1892,8 @@ describe("DocumentStore - Embedding Model Change Safety", () => {
         .get() as { sql: string };
       expect(ddl).toBeDefined();
       expect(ddl.sql).toContain("1536");
+      expect(ddl.sql).toContain("library_id INTEGER partition key");
+      expect(ddl.sql).toContain("version_id INTEGER partition key");
 
       // Table should be empty (no backfill)
       // @ts-expect-error Accessing private property for testing
@@ -1844,5 +1934,98 @@ describe("DocumentStore - Embedding Model Change Safety", () => {
         .get() as { cnt: number };
       expect(vecAfter.cnt).toBe(vecBefore.cnt);
     });
+
+    it("should rebuild old metadata-column vec table with current partition keys", async () => {
+      store = await createStore("");
+
+      // @ts-expect-error Accessing private property for testing
+      const db = store.db;
+      db.prepare("INSERT INTO libraries (name) VALUES (?)").run("legacyvec");
+      const { id: libraryId } = db
+        .prepare("SELECT id FROM libraries WHERE name = ?")
+        .get("legacyvec") as { id: number };
+      db.prepare("INSERT INTO versions (library_id, name) VALUES (?, ?)").run(
+        libraryId,
+        "1.0.0",
+      );
+      const { id: versionId } = db
+        .prepare("SELECT id FROM versions WHERE library_id = ? AND name = ?")
+        .get(libraryId, "1.0.0") as { id: number };
+      db.prepare("INSERT INTO versions (library_id, name) VALUES (?, ?)").run(
+        libraryId,
+        "2.0.0",
+      );
+      const { id: staleVersionId } = db
+        .prepare("SELECT id FROM versions WHERE library_id = ? AND name = ?")
+        .get(libraryId, "2.0.0") as { id: number };
+      const pageId = db
+        .prepare("INSERT INTO pages (version_id, url, title) VALUES (?, ?, ?)")
+        .run(versionId, "https://example.com/legacy", "Legacy Vec").lastInsertRowid;
+      const vector = new Array(1536).fill(0).map((_, index) => (index === 0 ? 1 : 0));
+      const docId = db
+        .prepare(
+          "INSERT INTO documents (page_id, content, metadata, sort_order) VALUES (?, ?, ?, ?)",
+        )
+        .run(
+          pageId,
+          "legacy vector content",
+          JSON.stringify({ path: ["legacy"] }),
+          0,
+        ).lastInsertRowid;
+
+      db.exec(`
+        DROP TABLE documents_vec;
+        CREATE VIRTUAL TABLE documents_vec USING vec0(
+          library_id INTEGER NOT NULL,
+          version_id INTEGER NOT NULL,
+          embedding FLOAT[1536]
+        );
+      `);
+      db.prepare(
+        "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)",
+      ).run(
+        BigInt(docId),
+        BigInt(libraryId),
+        BigInt(staleVersionId),
+        JSON.stringify(vector),
+      );
+
+      // @ts-expect-error Accessing private method for testing
+      store.ensureVectorTable();
+
+      const ddl = db
+        .prepare(
+          "SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'documents_vec'",
+        )
+        .get() as { sql: string };
+      expect(ddl.sql).toContain("library_id INTEGER partition key");
+      expect(ddl.sql).toContain("version_id INTEGER partition key");
+
+      const vectorRows = db
+        .prepare("SELECT COUNT(*) as cnt FROM documents_vec WHERE rowid = ?")
+        .get(docId) as { cnt: number };
+      expect(vectorRows.cnt).toBe(1);
+      const partitionKeys = db
+        .prepare("SELECT library_id, version_id FROM documents_vec WHERE rowid = ?")
+        .get(docId) as { library_id: number; version_id: number };
+      expect(partitionKeys.library_id).toBe(libraryId);
+      expect(partitionKeys.version_id).toBe(versionId);
+
+      const result = db
+        .prepare(`
+          SELECT rowid, distance
+          FROM documents_vec
+          WHERE library_id = ?
+            AND version_id = ?
+            AND embedding MATCH ?
+            AND k = 1
+        `)
+        .get(libraryId, versionId, JSON.stringify(vector)) as
+        | { rowid: number; distance: number }
+        | undefined;
+
+      expect(result?.rowid).toBe(Number(docId));
+      expect(result?.distance).toBeCloseTo(0, 6);
+    });
   });
 });
diff --git a/src/store/DocumentStore.ts b/src/store/DocumentStore.ts
index f53fd78f..f53da335 100644
--- a/src/store/DocumentStore.ts
+++ b/src/store/DocumentStore.ts
@@ -574,13 +574,7 @@ export class DocumentStore {
 
     // Drop and recreate vec table as empty with the new dimension
     this.db.exec("DROP TABLE IF EXISTS documents_vec");
-    this.db.exec(`
-      CREATE VIRTUAL TABLE documents_vec USING vec0(
-        library_id INTEGER NOT NULL,
-        version_id INTEGER NOT NULL,
-        embedding FLOAT[${newDimension}]
-      );
-    `);
+    this.createVectorTable(newDimension);
 
     // Update metadata to reflect the new configuration
     this.setEmbeddingMetadata(newModel, newDimension);
@@ -901,14 +895,15 @@ export class DocumentStore {
   /**
    * Creates or reconciles the documents_vec virtual table with configurable dimension.
    * Called after migrations and model change detection. The table is initially created
-   * by migration 003 with a fixed 1536 dimension; this method reconciles it at runtime
-   * if the configured dimension differs.
-   * Idempotent: if the table already exists with the same dimension, no-op; if dimension
-   * changed in config, drops and recreates so any embedding provider (e.g. 1536 or 3584) works.
+   * by migrations with a fixed 1536 dimension; this method reconciles it at runtime
+   * if the configured dimension or partition-key schema differs.
+   * Idempotent: if the table already has the expected dimension and partition keys,
+   * no-op; otherwise, drops and recreates so any embedding provider works and KNN
+   * queries can use selective partition filters.
    *
-   * Note: No backfill of existing embeddings is performed. Vectors are populated during
-   * scraping, not at startup. Old vectors from a different dimension or model are incompatible
-   * and are handled by the model change detection system (checkEmbeddingModelChange).
+   * Compatible existing vectors are preserved, and missing rows are backfilled from
+   * documents.embedding when available. Old vectors from a different dimension or model
+   * are handled by the model change detection system (checkEmbeddingModelChange).
    */
   private ensureVectorTable(): void {
     const dim = this.config.embeddings.vectorDimension;
@@ -927,21 +922,103 @@ export class DocumentStore {
     if (existingSql) {
       const match = existingSql.sql.match(/embedding\s+FLOAT\s*\[\s*(\d+)\s*]/i);
       const existingDim = match ? Number(match[1]) : null;
-      if (existingDim === dim) {
+      if (existingDim === dim && this.hasVectorPartitionKeys(existingSql.sql)) {
         return;
       }
-      this.db.exec("DROP TABLE documents_vec;");
     }
 
+    logger.info(
+      existingSql
+        ? "🔄 Rebuilding vector index with partition-key schema"
+        : "🔄 Creating vector index with partition-key schema",
+    );
+    this.rebuildVectorTable(dim, Boolean(existingSql));
+  }
+
+  /** True when the DDL declares both library_id and version_id as partition keys. */
+  private hasVectorPartitionKeys(sql: string): boolean {
+    return (
+      /library_id\s+INTEGER\s+partition\s+key/i.test(sql) &&
+      /version_id\s+INTEGER\s+partition\s+key/i.test(sql)
+    );
+  }
+
+  /** Creates an empty documents_vec table with partition keys and the given dimension. */
+  private createVectorTable(dimension: number): void {
     this.db.exec(`
       CREATE VIRTUAL TABLE documents_vec USING vec0(
-        library_id INTEGER NOT NULL,
-        version_id INTEGER NOT NULL,
-        embedding FLOAT[${dim}]
+        library_id INTEGER partition key,
+        version_id INTEGER partition key,
+        embedding FLOAT[${dimension}]
       );
     `);
   }
 
+  /**
+   * Copies documents.embedding values of the given dimension into documents_vec.
+   * Rows already present are skipped by the NOT EXISTS guard, so the INSERT cannot conflict.
+   */
+  private backfillVectorTable(dimension: number): void {
+    this.db
+      .prepare<[number]>(`
+        INSERT INTO documents_vec (rowid, library_id, version_id, embedding)
+        SELECT
+          d.id,
+          v.library_id,
+          v.id,
+          json_extract(d.embedding, '$')
+        FROM documents d
+        JOIN pages p ON d.page_id = p.id
+        JOIN versions v ON p.version_id = v.id
+        WHERE d.embedding IS NOT NULL
+          AND vec_length(json_extract(d.embedding, '$')) = ?
+          AND NOT EXISTS (
+            SELECT 1 FROM documents_vec existing WHERE existing.rowid = d.id
+          )
+      `)
+      .run(dimension);
+  }
+
+  /**
+   * Recreates documents_vec with partition keys in one transaction, then backfills it.
+   * With preserveExisting, vectors of the requested dimension are staged and re-keyed
+   * from versions before being copied back; otherwise no staging table is created.
+   */
+  private rebuildVectorTable(dimension: number, preserveExisting: boolean): void {
+    this.db.transaction(() => {
+      this.db.exec("DROP TABLE IF EXISTS _documents_vec_migration");
+      if (preserveExisting) {
+        this.db
+          .prepare<[number]>(`
+            CREATE TABLE _documents_vec_migration AS
+            SELECT
+              d.id AS rowid,
+              v.library_id,
+              v.id AS version_id,
+              dv.embedding
+            FROM documents_vec dv
+            JOIN documents d ON dv.rowid = d.id
+            JOIN pages p ON d.page_id = p.id
+            JOIN versions v ON p.version_id = v.id
+            WHERE vec_length(dv.embedding) = ?
+          `)
+          .run(dimension);
+        this.db.exec("DROP TABLE documents_vec");
+      }
+      this.createVectorTable(dimension);
+      if (preserveExisting) {
+        // The table is freshly created and staged rowids are distinct: plain INSERT suffices.
+        this.db.exec(`
+          INSERT INTO documents_vec (rowid, library_id, version_id, embedding)
+          SELECT rowid, library_id, version_id, embedding
+          FROM _documents_vec_migration
+        `);
+      }
+      this.backfillVectorTable(dimension);
+      this.db.exec("DROP TABLE IF EXISTS _documents_vec_migration");
+    })();
+  }
+
   /**
    * Resolves a library name and version string to version_id.
    * Creates library and version records if they don't exist.
@@ -1798,7 +1875,7 @@ export class DocumentStore {
         return [];
       }
 
-      const { id: versionId } = versionRow;
+      const { id: versionId, library_id: libraryId } = versionRow;
 
       if (this.isVectorSearchEnabled) {
         // Hybrid search: vector + full-text search with RRF ranking
@@ -1812,17 +1889,13 @@ export class DocumentStore {
         const vectorSearchK = overfetchLimit * this.vectorSearchMultiplier;
 
         const stmt = this.db.prepare(`
-          WITH vec_distances AS (
+          WITH vec_distances AS NOT MATERIALIZED (
             SELECT
               dv.rowid as id,
               dv.distance as vec_distance
             FROM documents_vec dv
-            JOIN documents d ON dv.rowid = d.id
-            JOIN pages p ON d.page_id = p.id
-            JOIN versions v ON p.version_id = v.id
-            JOIN libraries l ON v.library_id = l.id
-            WHERE l.name = ?
-              AND COALESCE(v.name, '') = COALESCE(?, '')
+            WHERE dv.library_id = ?
+              AND dv.version_id = ?
               AND dv.embedding MATCH ?
               AND dv.k = ?
             ORDER BY dv.distance
@@ -1861,8 +1934,8 @@ export class DocumentStore {
         `);
 
         const rawResults = stmt.all(
-          library.toLowerCase(),
-          normalizedVersion,
+          libraryId,
+          versionId,
           JSON.stringify(embedding),
           vectorSearchK,
           versionId,
diff --git a/src/store/applyMigrations.test.ts b/src/store/applyMigrations.test.ts
index e333f858..7261d72c 100644
--- a/src/store/applyMigrations.test.ts
+++ b/src/store/applyMigrations.test.ts
@@ -30,7 +30,7 @@ describe("Database Migrations", () => {
     const tableNames = (tables as TableRow[]).map((t) => t.name);
     expect(tableNames).toContain("documents");
     expect(tableNames).toContain("documents_fts");
-    // documents_vec is created by migration 003 (with fixed 1536 dimension);
+    // documents_vec is created by migrations with a fixed 1536 dimension;
     // DocumentStore.ensureVectorTable() reconciles it at runtime if the configured dimension differs
     expect(tableNames).toContain("documents_vec");
     expect(tableNames).toContain("libraries");
@@ -103,14 +103,16 @@ describe("Database Migrations", () => {
       .get() as { sql: string } | undefined;
     expect(ftsTableInfo?.sql).toContain("VIRTUAL TABLE documents_fts USING fts5");
 
-    // documents_vec is created by migration 003 (with fixed 1536 dimension) and survives through all
-    // subsequent migrations. DocumentStore.ensureVectorTable() reconciles it at runtime if needed.
+    // documents_vec is created by migrations with fixed 1536 dimension and partition keys.
+    // DocumentStore.ensureVectorTable() reconciles it at runtime if needed.
     const vecTableInfo = db
       .prepare(
         "SELECT sql FROM sqlite_master WHERE type='table' AND name='documents_vec';",
       )
       .get() as { sql: string } | undefined;
     expect(vecTableInfo).toBeDefined();
+    expect(vecTableInfo?.sql).toContain("library_id INTEGER partition key");
+    expect(vecTableInfo?.sql).toContain("version_id INTEGER partition key");
   });
 
   it("should handle vector search with empty results gracefully", () => {
@@ -162,6 +164,117 @@ describe("Database Migrations", () => {
     expect(searchResults).toEqual([]);
   });
 
+  it("should preserve and backfill vectors when migrating to partition keys", async () => {
+    await expect(applyMigrations(db)).resolves.toBeUndefined();
+
+    const ddl = db
+      .prepare(
+        "SELECT sql FROM sqlite_master WHERE type='table' AND name='documents_vec';",
+      )
+      .get() as { sql: string };
+    expect(ddl.sql).toContain("library_id INTEGER partition key");
+    expect(ddl.sql).toContain("version_id INTEGER partition key");
+
+    db.prepare("INSERT INTO libraries (name) VALUES (?)").run("partition-lib");
+    const { id: libraryId } = db
+      .prepare("SELECT id FROM libraries WHERE name = ?")
+      .get("partition-lib") as { id: number };
+    db.prepare("INSERT INTO versions (library_id, name) VALUES (?, ?)").run(
+      libraryId,
+      "1.0.0",
+    );
+    const { id: versionId } = db
+      .prepare("SELECT id FROM versions WHERE library_id = ? AND name = ?")
+      .get(libraryId, "1.0.0") as { id: number };
+
+    const pageId = db
+      .prepare("INSERT INTO pages (version_id, url, title) VALUES (?, ?, ?)")
+      .run(versionId, "https://example.com/partition", "Partitioned").lastInsertRowid as
+      | number
+      | bigint;
+    const preservedVector = new Array(1536)
+      .fill(0)
+      .map((_, index) => (index === 0 ? 1 : 0));
+    const backfillVector = new Array(1536)
+      .fill(0)
+      .map((_, index) => (index === 1 ? 1 : 0));
+    const preservedDocId = db
+      .prepare(
+        "INSERT INTO documents (page_id, content, metadata, sort_order) VALUES (?, ?, ?, ?)",
+      )
+      .run(pageId, "Preserved vector content", JSON.stringify({ path: "/partition" }), 0)
+      .lastInsertRowid as number | bigint;
+    const backfilledDocId = db
+      .prepare(
+        "INSERT INTO documents (page_id, content, metadata, sort_order, embedding) VALUES (?, ?, ?, ?, ?)",
+      )
+      .run(
+        pageId,
+        "Backfilled vector content",
+        JSON.stringify({ path: "/partition" }),
+        1,
+        JSON.stringify(backfillVector),
+      ).lastInsertRowid as number | bigint;
+
+    // Swap in an empty legacy (metadata-column) vec table so migration 014 has a schema to rebuild.
+    db.exec(`
+      DROP TABLE documents_vec;
+      CREATE VIRTUAL TABLE documents_vec USING vec0(
+        library_id INTEGER NOT NULL,
+        version_id INTEGER NOT NULL,
+        embedding FLOAT[1536]
+      );
+    `);
+    db.prepare(
+      "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)",
+    ).run(
+      BigInt(preservedDocId),
+      BigInt(libraryId),
+      BigInt(versionId),
+      JSON.stringify(preservedVector),
+    );
+
+    db.prepare("DELETE FROM _schema_migrations WHERE id = ?").run(
+      "014-rebuild-vector-partition-keys.sql",
+    );
+
+    await expect(applyMigrations(db)).resolves.toBeUndefined();
+
+    const migratedDdl = db
+      .prepare(
+        "SELECT sql FROM sqlite_master WHERE type='table' AND name='documents_vec';",
+      )
+      .get() as { sql: string };
+    expect(migratedDdl.sql).toContain("library_id INTEGER partition key");
+    expect(migratedDdl.sql).toContain("version_id INTEGER partition key");
+
+    const preservedCount = db
+      .prepare("SELECT COUNT(*) as cnt FROM documents_vec WHERE rowid = ?")
+      .get(preservedDocId) as { cnt: number };
+    expect(preservedCount.cnt).toBe(1);
+
+    const backfilledCount = db
+      .prepare("SELECT COUNT(*) as cnt FROM documents_vec WHERE rowid = ?")
+      .get(backfilledDocId) as { cnt: number };
+    expect(backfilledCount.cnt).toBe(1);
+
+    const nearest = db
+      .prepare(`
+        SELECT rowid, distance
+        FROM documents_vec
+        WHERE library_id = ?
+          AND version_id = ?
+          AND embedding MATCH ?
+          AND k = 1
+      `)
+      .get(libraryId, versionId, JSON.stringify(preservedVector)) as
+      | { rowid: number; distance: number }
+      | undefined;
+
+    expect(nearest?.rowid).toBe(Number(preservedDocId));
+    expect(nearest?.distance).toBeCloseTo(0, 6);
+  });
+
   it("should perform vector search and return similar vectors correctly", () => {
     // Apply all migrations (documents_vec exists from migration 003 with 1536d)
     expect(() => applyMigrations(db)).not.toThrow();