From 2b4e886c4dc4e1ccab7ef4f50843fb4557015207 Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Mon, 18 May 2026 06:26:03 -0700 Subject: [PATCH 1/2] fix(store): migrate vectors to partition keys --- .../014-rebuild-vector-partition-keys.sql | 40 +++++ src/store/DocumentStore.test.ts | 167 ++++++++++++++++++ src/store/DocumentStore.ts | 124 ++++++++++--- src/store/applyMigrations.test.ts | 100 ++++++++++- 4 files changed, 399 insertions(+), 32 deletions(-) create mode 100644 db/migrations/014-rebuild-vector-partition-keys.sql diff --git a/db/migrations/014-rebuild-vector-partition-keys.sql b/db/migrations/014-rebuild-vector-partition-keys.sql new file mode 100644 index 00000000..eeb6dc10 --- /dev/null +++ b/db/migrations/014-rebuild-vector-partition-keys.sql @@ -0,0 +1,40 @@ +-- Migration: rebuild documents_vec with sqlite-vec partition keys +-- This enables selective KNN queries by library_id and version_id. + +-- Preserve compatible vectors from the existing vec table. +DROP TABLE IF EXISTS temp_documents_vec_partition_migration; + +CREATE TEMPORARY TABLE temp_documents_vec_partition_migration AS +SELECT rowid, library_id, version_id, embedding +FROM documents_vec +WHERE vec_length(embedding) = 1536; + +DROP TABLE documents_vec; + +CREATE VIRTUAL TABLE documents_vec USING vec0( + library_id INTEGER partition key, + version_id INTEGER partition key, + embedding FLOAT[1536] +); + +INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding) +SELECT rowid, library_id, version_id, embedding +FROM temp_documents_vec_partition_migration; + +-- Backfill any vectors stored on documents but missing from the vec table. +INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding) +SELECT + d.id, + v.library_id, + v.id AS version_id, + json_extract(d.embedding, '$') AS embedding +FROM documents d +JOIN pages p ON d.page_id = p.id +JOIN versions v ON p.version_id = v.id +WHERE d.embedding IS NOT NULL + AND vec_length(json_extract(d.embedding, '$')) = 1536 + AND NOT EXISTS ( + SELECT 1 FROM documents_vec existing WHERE existing.rowid = d.id + ); + +DROP TABLE temp_documents_vec_partition_migration; diff --git a/src/store/DocumentStore.test.ts b/src/store/DocumentStore.test.ts index 4492be58..c7500be5 100644 --- a/src/store/DocumentStore.test.ts +++ b/src/store/DocumentStore.test.ts @@ -458,6 +458,92 @@ describe("DocumentStore - With Embeddings", () => { expect(result.score).toBeGreaterThan(0); } }); + + it("should use partition-filtered vector search for hybrid results", async () => { + const originalApiKey = process.env.OPENAI_API_KEY; + try { + process.env.OPENAI_API_KEY = "test-key-for-partition-search"; + await store.shutdown(); + + const cfg = loadConfig(); + const embeddingConfig = EmbeddingConfig.parseEmbeddingConfig( + "openai:text-embedding-3-small", + ); + cfg.app.embeddingModel = embeddingConfig.modelSpec; + store = new DocumentStore(":memory:", cfg); + await store.initialize(); + + await store.addDocuments( + "searchtest", + "1.0.0", + 1, + createScrapeResult( + "JavaScript Programming Guide", + "https://example.com/js-guide", + "JavaScript programming tutorial with code examples and functions", + ["programming", "javascript"], + ), + ); + await store.addDocuments( + "searchtest", + "1.0.0", + 1, + createScrapeResult( + "JavaScript Frameworks", + "https://example.com/js-frameworks", + "Advanced JavaScript frameworks like React and Vue for building applications", + ["programming", "javascript", "frameworks"], + ), + ); + + // @ts-expect-error Accessing private property for testing + const db = store.db; + const ddl = db + .prepare( + "SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'documents_vec'", + ) + .get() as { sql: string }; + expect(ddl.sql).toContain("library_id INTEGER partition key"); + expect(ddl.sql).toContain("version_id INTEGER partition key"); + + const results = await store.findByContent( + "searchtest", + "1.0.0", + "application building", + 10, + ); + // @ts-expect-error Accessing private property for testing + const vector = await store.embeddings.embedQuery("application building"); + + expect(results.length).toBeGreaterThan(0); + + const vectorResults = db + .prepare(` + SELECT dv.rowid, dv.distance + FROM documents_vec dv + WHERE dv.library_id = ( + SELECT id FROM libraries WHERE name = ? + ) + AND dv.version_id = ( + SELECT v.id + FROM versions v + JOIN libraries l ON v.library_id = l.id + WHERE l.name = ? AND v.name = ? + ) + AND dv.embedding MATCH ? + AND dv.k = ? + ORDER BY dv.distance + `) + .all("searchtest", "searchtest", "1.0.0", JSON.stringify(vector), 10); + expect(vectorResults.length).toBeGreaterThan(0); + } finally { + if (originalApiKey === undefined) { + delete process.env.OPENAI_API_KEY; + } else { + process.env.OPENAI_API_KEY = originalApiKey; + } + } + }); }); describe("Embedding Batch Processing", () => { @@ -1777,6 +1863,8 @@ describe("DocumentStore - Embedding Model Change Safety", () => { ) .get() as { sql: string }; expect(ddl.sql).toContain("768"); + expect(ddl.sql).toContain("library_id INTEGER partition key"); + expect(ddl.sql).toContain("version_id INTEGER partition key"); }); it("should update metadata with new model and dimension", async () => { @@ -1804,6 +1892,8 @@ describe("DocumentStore - Embedding Model Change Safety", () => { .get() as { sql: string }; expect(ddl).toBeDefined(); expect(ddl.sql).toContain("1536"); + expect(ddl.sql).toContain("library_id INTEGER partition key"); + expect(ddl.sql).toContain("version_id INTEGER partition key"); // Table should be empty (no backfill) // @ts-expect-error Accessing private property for testing @@ -1844,5 +1934,82 @@ describe("DocumentStore - Embedding Model Change Safety", () => { .get() as { cnt: number }; expect(vecAfter.cnt).toBe(vecBefore.cnt); }); + + it("should rebuild old metadata-column vec table and backfill stored embeddings", async () => { + store = await createStore(""); + + // @ts-expect-error Accessing private property for testing + const db = store.db; + db.prepare("INSERT INTO libraries (name) VALUES (?)").run("legacyvec"); + const { id: libraryId } = db + .prepare("SELECT id FROM libraries WHERE name = ?") + .get("legacyvec") as { id: number }; + db.prepare("INSERT INTO versions (library_id, name) VALUES (?, ?)").run( + libraryId, + "1.0.0", + ); + const { id: versionId } = db + .prepare("SELECT id FROM versions WHERE library_id = ? AND name = ?") + .get(libraryId, "1.0.0") as { id: number }; + const pageId = db + .prepare("INSERT INTO pages (version_id, url, title) VALUES (?, ?, ?)") + .run(versionId, "https://example.com/legacy", "Legacy Vec").lastInsertRowid; + const vector = new Array(1536).fill(0).map((_, index) => (index === 0 ? 1 : 0)); + const docId = db + .prepare( + "INSERT INTO documents (page_id, content, metadata, sort_order, embedding) VALUES (?, ?, ?, ?, ?)", + ) + .run( + pageId, + "legacy vector content", + JSON.stringify({ path: ["legacy"] }), + 0, + JSON.stringify(vector), + ).lastInsertRowid; + + db.exec(` + DROP TABLE documents_vec; + CREATE VIRTUAL TABLE documents_vec USING vec0( + library_id INTEGER NOT NULL, + version_id INTEGER NOT NULL, + embedding FLOAT[1536] + ); + `); + db.prepare( + "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)", + ).run(BigInt(docId), BigInt(libraryId), BigInt(versionId), JSON.stringify(vector)); + + // @ts-expect-error Accessing private method for testing + store.ensureVectorTable(); + + const ddl = db + .prepare( + "SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'documents_vec'", + ) + .get() as { sql: string }; + expect(ddl.sql).toContain("library_id INTEGER partition key"); + expect(ddl.sql).toContain("version_id INTEGER partition key"); + + const vectorRows = db + .prepare("SELECT COUNT(*) as cnt FROM documents_vec WHERE rowid = ?") + .get(docId) as { cnt: number }; + expect(vectorRows.cnt).toBe(1); + + const result = db + .prepare(` + SELECT rowid, distance + FROM documents_vec + WHERE library_id = ? + AND version_id = ? + AND embedding MATCH ? + AND k = 1 + `) + .get(libraryId, versionId, JSON.stringify(vector)) as + | { rowid: number; distance: number } + | undefined; + + expect(result?.rowid).toBe(Number(docId)); + expect(result?.distance).toBeCloseTo(0, 6); + }); }); }); diff --git a/src/store/DocumentStore.ts b/src/store/DocumentStore.ts index f53fd78f..5f334e5b 100644 --- a/src/store/DocumentStore.ts +++ b/src/store/DocumentStore.ts @@ -574,13 +574,7 @@ export class DocumentStore { // Drop and recreate vec table as empty with the new dimension this.db.exec("DROP TABLE IF EXISTS documents_vec"); - this.db.exec(` - CREATE VIRTUAL TABLE documents_vec USING vec0( - library_id INTEGER NOT NULL, - version_id INTEGER NOT NULL, - embedding FLOAT[${newDimension}] - ); - `); + this.createVectorTable(newDimension); // Update metadata to reflect the new configuration this.setEmbeddingMetadata(newModel, newDimension); @@ -901,14 +895,15 @@ export class DocumentStore { /** * Creates or reconciles the documents_vec virtual table with configurable dimension. * Called after migrations and model change detection. The table is initially created - * by migration 003 with a fixed 1536 dimension; this method reconciles it at runtime - * if the configured dimension differs. - * Idempotent: if the table already exists with the same dimension, no-op; if dimension - * changed in config, drops and recreates so any embedding provider (e.g. 1536 or 3584) works. + * by migrations with a fixed 1536 dimension; this method reconciles it at runtime + * if the configured dimension or partition-key schema differs. + * Idempotent: if the table already has the expected dimension and partition keys, + * no-op; otherwise, drops and recreates so any embedding provider works and KNN + * queries can use selective partition filters. * - * Note: No backfill of existing embeddings is performed. Vectors are populated during - * scraping, not at startup. Old vectors from a different dimension or model are incompatible - * and are handled by the model change detection system (checkEmbeddingModelChange). + * Compatible existing vectors are preserved, and missing rows are backfilled from + * documents.embedding when available. Old vectors from a different dimension or model + * are handled by the model change detection system (checkEmbeddingModelChange). */ private ensureVectorTable(): void { const dim = this.config.embeddings.vectorDimension; @@ -927,21 +922,96 @@ export class DocumentStore { if (existingSql) { const match = existingSql.sql.match(/embedding\s+FLOAT\s*\[\s*(\d+)\s*]/i); const existingDim = match ? Number(match[1]) : null; - if (existingDim === dim) { + if (existingDim === dim && this.hasVectorPartitionKeys(existingSql.sql)) { return; } - this.db.exec("DROP TABLE documents_vec;"); } + logger.info( + existingSql + ? "🔄 Rebuilding vector index with partition-key schema" + : "🔄 Creating vector index with partition-key schema", + ); + this.rebuildVectorTable(dim, Boolean(existingSql)); + } + + private hasVectorPartitionKeys(sql: string): boolean { + return ( + /library_id\s+INTEGER\s+partition\s+key/i.test(sql) && + /version_id\s+INTEGER\s+partition\s+key/i.test(sql) + ); + } + + private createVectorTable(dimension: number): void { this.db.exec(` CREATE VIRTUAL TABLE documents_vec USING vec0( - library_id INTEGER NOT NULL, - version_id INTEGER NOT NULL, - embedding FLOAT[${dim}] + library_id INTEGER partition key, + version_id INTEGER partition key, + embedding FLOAT[${dimension}] ); `); } + private backfillVectorTable(dimension: number): void { + this.db + .prepare<[number]>(` + INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding) + SELECT + d.id, + v.library_id, + v.id, + json_extract(d.embedding, '$') + FROM documents d + JOIN pages p ON d.page_id = p.id + JOIN versions v ON p.version_id = v.id + WHERE d.embedding IS NOT NULL + AND vec_length(json_extract(d.embedding, '$')) = ? + AND NOT EXISTS ( + SELECT 1 FROM documents_vec existing WHERE existing.rowid = d.id + ) + `) + .run(dimension); + } + + private rebuildVectorTable(dimension: number, preserveExisting: boolean): void { + const transaction = this.db.transaction(() => { + this.db.exec("DROP TABLE IF EXISTS temp_documents_vec_migration"); + + if (preserveExisting) { + this.db + .prepare<[number]>(` + CREATE TEMPORARY TABLE temp_documents_vec_migration AS + SELECT rowid, library_id, version_id, embedding + FROM documents_vec + WHERE vec_length(embedding) = ? + `) + .run(dimension); + this.db.exec("DROP TABLE documents_vec"); + } else { + this.db.exec(` + CREATE TEMPORARY TABLE temp_documents_vec_migration( + rowid INTEGER PRIMARY KEY, + library_id INTEGER NOT NULL, + version_id INTEGER NOT NULL, + embedding BLOB NOT NULL + ) + `); + } + + this.createVectorTable(dimension); + + this.db.exec(` + INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding) + SELECT rowid, library_id, version_id, embedding + FROM temp_documents_vec_migration + `); + this.backfillVectorTable(dimension); + this.db.exec("DROP TABLE temp_documents_vec_migration"); + }); + + transaction(); + } + /** * Resolves a library name and version string to version_id. * Creates library and version records if they don't exist. @@ -1798,7 +1868,7 @@ export class DocumentStore { return []; } - const { id: versionId } = versionRow; + const { id: versionId, library_id: libraryId } = versionRow; if (this.isVectorSearchEnabled) { // Hybrid search: vector + full-text search with RRF ranking @@ -1812,17 +1882,13 @@ export class DocumentStore { const vectorSearchK = overfetchLimit * this.vectorSearchMultiplier; const stmt = this.db.prepare(` - WITH vec_distances AS ( + WITH vec_distances AS NOT MATERIALIZED ( SELECT dv.rowid as id, dv.distance as vec_distance FROM documents_vec dv - JOIN documents d ON dv.rowid = d.id - JOIN pages p ON d.page_id = p.id - JOIN versions v ON p.version_id = v.id - JOIN libraries l ON v.library_id = l.id - WHERE l.name = ? - AND COALESCE(v.name, '') = COALESCE(?, '') + WHERE dv.library_id = ? + AND dv.version_id = ? AND dv.embedding MATCH ? AND dv.k = ? ORDER BY dv.distance @@ -1861,8 +1927,8 @@ export class DocumentStore { `); const rawResults = stmt.all( - library.toLowerCase(), - normalizedVersion, + libraryId, + versionId, JSON.stringify(embedding), vectorSearchK, versionId, diff --git a/src/store/applyMigrations.test.ts b/src/store/applyMigrations.test.ts index e333f858..8e465830 100644 --- a/src/store/applyMigrations.test.ts +++ b/src/store/applyMigrations.test.ts @@ -30,7 +30,7 @@ describe("Database Migrations", () => { const tableNames = (tables as TableRow[]).map((t) => t.name); expect(tableNames).toContain("documents"); expect(tableNames).toContain("documents_fts"); - // documents_vec is created by migration 003 (with fixed 1536 dimension); + // documents_vec is created by migrations with a fixed 1536 dimension; // DocumentStore.ensureVectorTable() reconciles it at runtime if the configured dimension differs expect(tableNames).toContain("documents_vec"); expect(tableNames).toContain("libraries"); @@ -103,14 +103,16 @@ describe("Database Migrations", () => { .get() as { sql: string } | undefined; expect(ftsTableInfo?.sql).toContain("VIRTUAL TABLE documents_fts USING fts5"); - // documents_vec is created by migration 003 (with fixed 1536 dimension) and survives through all - // subsequent migrations. DocumentStore.ensureVectorTable() reconciles it at runtime if needed. + // documents_vec is created by migrations with fixed 1536 dimension and partition keys. + // DocumentStore.ensureVectorTable() reconciles it at runtime if needed. const vecTableInfo = db .prepare( "SELECT sql FROM sqlite_master WHERE type='table' AND name='documents_vec';", ) .get() as { sql: string } | undefined; expect(vecTableInfo).toBeDefined(); + expect(vecTableInfo?.sql).toContain("library_id INTEGER partition key"); + expect(vecTableInfo?.sql).toContain("version_id INTEGER partition key"); }); it("should handle vector search with empty results gracefully", () => { @@ -162,6 +164,98 @@ describe("Database Migrations", () => { expect(searchResults).toEqual([]); }); + it("should preserve and backfill vectors when migrating to partition keys", () => { + expect(() => applyMigrations(db)).not.toThrow(); + + const ddl = db + .prepare( + "SELECT sql FROM sqlite_master WHERE type='table' AND name='documents_vec';", + ) + .get() as { sql: string }; + expect(ddl.sql).toContain("library_id INTEGER partition key"); + expect(ddl.sql).toContain("version_id INTEGER partition key"); + + db.prepare("INSERT INTO libraries (name) VALUES (?)").run("partition-lib"); + const { id: libraryId } = db + .prepare("SELECT id FROM libraries WHERE name = ?") + .get("partition-lib") as { id: number }; + db.prepare("INSERT INTO versions (library_id, name) VALUES (?, ?)").run( + libraryId, + "1.0.0", + ); + const { id: versionId } = db + .prepare("SELECT id FROM versions WHERE library_id = ? AND name = ?") + .get(libraryId, "1.0.0") as { id: number }; + + const pageId = db + .prepare("INSERT INTO pages (version_id, url, title) VALUES (?, ?, ?)") + .run(versionId, "https://example.com/partition", "Partitioned").lastInsertRowid as + | number + | bigint; + const vector = new Array(1536).fill(0).map((_, index) => (index === 0 ? 1 : 0)); + const docId = db + .prepare( + "INSERT INTO documents (page_id, content, metadata, sort_order, embedding) VALUES (?, ?, ?, ?, ?)", + ) + .run( + pageId, + "Partitioned vector content", + JSON.stringify({ path: "/partition" }), + 0, + JSON.stringify(vector), + ).lastInsertRowid as number | bigint; + + db.exec(` + CREATE TEMPORARY TABLE temp_existing_vectors AS + SELECT rowid, library_id, version_id, embedding FROM documents_vec; + DROP TABLE documents_vec; + CREATE VIRTUAL TABLE documents_vec USING vec0( + library_id INTEGER NOT NULL, + version_id INTEGER NOT NULL, + embedding FLOAT[1536] + ); + INSERT INTO documents_vec (rowid, library_id, version_id, embedding) + SELECT rowid, library_id, version_id, embedding FROM temp_existing_vectors; + DROP TABLE temp_existing_vectors; + DELETE FROM documents_vec; + `); + + db.prepare("DELETE FROM _schema_migrations WHERE id = ?").run( + "014-rebuild-vector-partition-keys.sql", + ); + + expect(() => applyMigrations(db)).not.toThrow(); + + const migratedDdl = db + .prepare( + "SELECT sql FROM sqlite_master WHERE type='table' AND name='documents_vec';", + ) + .get() as { sql: string }; + expect(migratedDdl.sql).toContain("library_id INTEGER partition key"); + expect(migratedDdl.sql).toContain("version_id INTEGER partition key"); + + const vectorRows = db + .prepare("SELECT COUNT(*) as cnt FROM documents_vec WHERE rowid = ?") + .get(docId) as { cnt: number }; + expect(vectorRows.cnt).toBe(1); + + const result = db + .prepare(` + SELECT rowid, distance + FROM documents_vec + WHERE library_id = ? + AND version_id = ? + AND embedding MATCH ? + AND k = 1 + `) + .get(libraryId, versionId, JSON.stringify(vector)) as + | { rowid: number; distance: number } + | undefined; + + expect(result?.rowid).toBe(Number(docId)); + expect(result?.distance).toBeCloseTo(0, 6); + }); + it("should perform vector search and return similar vectors correctly", () => { // Apply all migrations (documents_vec exists from migration 003 with 1536d) expect(() => applyMigrations(db)).not.toThrow(); From 4b010dee0d6f567d41382019071a7772a993f4fb Mon Sep 17 00:00:00 2001 From: Andre Rabold Date: Mon, 18 May 2026 06:56:06 -0700 Subject: [PATCH 2/2] fix(store): derive vector partition keys during rebuild --- .../014-rebuild-vector-partition-keys.sql | 24 ++++++--- src/store/DocumentStore.test.ts | 24 +++++++-- src/store/DocumentStore.ts | 23 +++++--- src/store/applyMigrations.test.ts | 52 ++++++++++++++----- 4 files changed, 89 insertions(+), 34 deletions(-) diff --git a/db/migrations/014-rebuild-vector-partition-keys.sql b/db/migrations/014-rebuild-vector-partition-keys.sql index eeb6dc10..538468cc 100644 --- a/db/migrations/014-rebuild-vector-partition-keys.sql +++ b/db/migrations/014-rebuild-vector-partition-keys.sql @@ -1,13 +1,21 @@ -- Migration: rebuild documents_vec with sqlite-vec partition keys -- This enables selective KNN queries by library_id and version_id. --- Preserve compatible vectors from the existing vec table. -DROP TABLE IF EXISTS temp_documents_vec_partition_migration; +-- Preserve compatible vectors from the existing vec table. This uses a +-- disk-backed staging table because large vector indexes can exceed memory. +DROP TABLE IF EXISTS _documents_vec_partition_migration; -CREATE TEMPORARY TABLE temp_documents_vec_partition_migration AS -SELECT rowid, library_id, version_id, embedding -FROM documents_vec -WHERE vec_length(embedding) = 1536; +CREATE TABLE _documents_vec_partition_migration AS +SELECT + d.id AS rowid, + v.library_id, + v.id AS version_id, + dv.embedding +FROM documents_vec dv +JOIN documents d ON dv.rowid = d.id +JOIN pages p ON d.page_id = p.id +JOIN versions v ON p.version_id = v.id +WHERE vec_length(dv.embedding) = 1536; DROP TABLE documents_vec; @@ -19,7 +27,7 @@ CREATE VIRTUAL TABLE documents_vec USING vec0( INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding) SELECT rowid, library_id, version_id, embedding -FROM temp_documents_vec_partition_migration; +FROM _documents_vec_partition_migration; -- Backfill any vectors stored on documents but missing from the vec table. INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding) @@ -37,4 +45,4 @@ WHERE d.embedding IS NOT NULL SELECT 1 FROM documents_vec existing WHERE existing.rowid = d.id ); -DROP TABLE temp_documents_vec_partition_migration; +DROP TABLE _documents_vec_partition_migration; diff --git a/src/store/DocumentStore.test.ts b/src/store/DocumentStore.test.ts index c7500be5..6aad2289 100644 --- a/src/store/DocumentStore.test.ts +++ b/src/store/DocumentStore.test.ts @@ -1935,7 +1935,7 @@ describe("DocumentStore - Embedding Model Change Safety", () => { expect(vecAfter.cnt).toBe(vecBefore.cnt); }); - it("should rebuild old metadata-column vec table and backfill stored embeddings", async () => { + it("should rebuild old metadata-column vec table with current partition keys", async () => { store = await createStore(""); // @ts-expect-error Accessing private property for testing @@ -1951,20 +1951,26 @@ describe("DocumentStore - Embedding Model Change Safety", () => { const { id: versionId } = db .prepare("SELECT id FROM versions WHERE library_id = ? AND name = ?") .get(libraryId, "1.0.0") as { id: number }; + db.prepare("INSERT INTO versions (library_id, name) VALUES (?, ?)").run( + libraryId, + "2.0.0", + ); + const { id: staleVersionId } = db + .prepare("SELECT id FROM versions WHERE library_id = ? AND name = ?") + .get(libraryId, "2.0.0") as { id: number }; const pageId = db .prepare("INSERT INTO pages (version_id, url, title) VALUES (?, ?, ?)") .run(versionId, "https://example.com/legacy", "Legacy Vec").lastInsertRowid; const vector = new Array(1536).fill(0).map((_, index) => (index === 0 ? 1 : 0)); const docId = db .prepare( - "INSERT INTO documents (page_id, content, metadata, sort_order, embedding) VALUES (?, ?, ?, ?, ?)", + "INSERT INTO documents (page_id, content, metadata, sort_order) VALUES (?, ?, ?, ?)", ) .run( pageId, "legacy vector content", JSON.stringify({ path: ["legacy"] }), 0, - JSON.stringify(vector), ).lastInsertRowid; db.exec(` @@ -1977,7 +1983,12 @@ describe("DocumentStore - Embedding Model Change Safety", () => { `); db.prepare( "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)", - ).run(BigInt(docId), BigInt(libraryId), BigInt(versionId), JSON.stringify(vector)); + ).run( + BigInt(docId), + BigInt(libraryId), + BigInt(staleVersionId), + JSON.stringify(vector), + ); // @ts-expect-error Accessing private method for testing store.ensureVectorTable(); @@ -1994,6 +2005,11 @@ describe("DocumentStore - Embedding Model Change Safety", () => { .prepare("SELECT COUNT(*) as cnt FROM documents_vec WHERE rowid = ?") .get(docId) as { cnt: number }; expect(vectorRows.cnt).toBe(1); + const partitionKeys = db + .prepare("SELECT library_id, version_id FROM documents_vec WHERE rowid = ?") + .get(docId) as { library_id: number; version_id: number }; + expect(partitionKeys.library_id).toBe(libraryId); + expect(partitionKeys.version_id).toBe(versionId); const result = db .prepare(` diff --git a/src/store/DocumentStore.ts b/src/store/DocumentStore.ts index 5f334e5b..f53da335 100644 --- a/src/store/DocumentStore.ts +++ b/src/store/DocumentStore.ts @@ -975,21 +975,28 @@ export class DocumentStore { private rebuildVectorTable(dimension: number, preserveExisting: boolean): void { const transaction = this.db.transaction(() => { - this.db.exec("DROP TABLE IF EXISTS temp_documents_vec_migration"); + this.db.exec("DROP TABLE IF EXISTS _documents_vec_migration"); if (preserveExisting) { this.db .prepare<[number]>(` - CREATE TEMPORARY TABLE temp_documents_vec_migration AS - SELECT rowid, library_id, version_id, embedding - FROM documents_vec - WHERE vec_length(embedding) = ? + CREATE TABLE _documents_vec_migration AS + SELECT + d.id AS rowid, + v.library_id, + v.id AS version_id, + dv.embedding + FROM documents_vec dv + JOIN documents d ON dv.rowid = d.id + JOIN pages p ON d.page_id = p.id + JOIN versions v ON p.version_id = v.id + WHERE vec_length(dv.embedding) = ? `) .run(dimension); this.db.exec("DROP TABLE documents_vec"); } else { this.db.exec(` - CREATE TEMPORARY TABLE temp_documents_vec_migration( + CREATE TABLE _documents_vec_migration( rowid INTEGER PRIMARY KEY, library_id INTEGER NOT NULL, version_id INTEGER NOT NULL, @@ -1003,10 +1010,10 @@ export class DocumentStore { this.db.exec(` INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding) SELECT rowid, library_id, version_id, embedding - FROM temp_documents_vec_migration + FROM _documents_vec_migration `); this.backfillVectorTable(dimension); - this.db.exec("DROP TABLE temp_documents_vec_migration"); + this.db.exec("DROP TABLE _documents_vec_migration"); }); transaction(); diff --git a/src/store/applyMigrations.test.ts b/src/store/applyMigrations.test.ts index 8e465830..7261d72c 100644 --- a/src/store/applyMigrations.test.ts +++ b/src/store/applyMigrations.test.ts @@ -164,8 +164,8 @@ describe("Database Migrations", () => { expect(searchResults).toEqual([]); }); - it("should preserve and backfill vectors when migrating to partition keys", () => { - expect(() => applyMigrations(db)).not.toThrow(); + it("should preserve and backfill vectors when migrating to partition keys", async () => { + await expect(applyMigrations(db)).resolves.toBeUndefined(); const ddl = db .prepare( @@ -192,21 +192,32 @@ describe("Database Migrations", () => { .run(versionId, "https://example.com/partition", "Partitioned").lastInsertRowid as | number | bigint; - const vector = new Array(1536).fill(0).map((_, index) => (index === 0 ? 1 : 0)); - const docId = db + const preservedVector = new Array(1536) + .fill(0) + .map((_, index) => (index === 0 ? 1 : 0)); + const backfillVector = new Array(1536) + .fill(0) + .map((_, index) => (index === 1 ? 1 : 0)); + const preservedDocId = db + .prepare( + "INSERT INTO documents (page_id, content, metadata, sort_order) VALUES (?, ?, ?, ?)", + ) + .run(pageId, "Preserved vector content", JSON.stringify({ path: "/partition" }), 0) + .lastInsertRowid as number | bigint; + const backfilledDocId = db .prepare( "INSERT INTO documents (page_id, content, metadata, sort_order, embedding) VALUES (?, ?, ?, ?, ?)", ) .run( pageId, - "Partitioned vector content", + "Backfilled vector content", JSON.stringify({ path: "/partition" }), - 0, - JSON.stringify(vector), + 1, + JSON.stringify(backfillVector), ).lastInsertRowid as number | bigint; db.exec(` - CREATE TEMPORARY TABLE temp_existing_vectors AS + CREATE TABLE _test_existing_vectors AS SELECT rowid, library_id, version_id, embedding FROM documents_vec; DROP TABLE documents_vec; CREATE VIRTUAL TABLE documents_vec USING vec0( @@ -215,16 +226,24 @@ describe("Database Migrations", () => { embedding FLOAT[1536] ); INSERT INTO documents_vec (rowid, library_id, version_id, embedding) - SELECT rowid, library_id, version_id, embedding FROM temp_existing_vectors; - DROP TABLE temp_existing_vectors; + SELECT rowid, library_id, version_id, embedding FROM _test_existing_vectors; + DROP TABLE _test_existing_vectors; DELETE FROM documents_vec; `); + db.prepare( + "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)", + ).run( + BigInt(preservedDocId), + BigInt(libraryId), + BigInt(versionId), + JSON.stringify(preservedVector), + ); db.prepare("DELETE FROM _schema_migrations WHERE id = ?").run( "014-rebuild-vector-partition-keys.sql", ); - expect(() => applyMigrations(db)).not.toThrow(); + await expect(applyMigrations(db)).resolves.toBeUndefined(); const migratedDdl = db .prepare( @@ -236,9 +255,14 @@ describe("Database Migrations", () => { const vectorRows = db .prepare("SELECT COUNT(*) as cnt FROM documents_vec WHERE rowid = ?") - .get(docId) as { cnt: number }; + .get(preservedDocId) as { cnt: number }; expect(vectorRows.cnt).toBe(1); + const backfilledVectorRows = db + .prepare("SELECT COUNT(*) as cnt FROM documents_vec WHERE rowid = ?") + .get(backfilledDocId) as { cnt: number }; + expect(backfilledVectorRows.cnt).toBe(1); + const result = db .prepare(` SELECT rowid, distance @@ -248,11 +272,11 @@ describe("Database Migrations", () => { AND embedding MATCH ? AND k = 1 `) - .get(libraryId, versionId, JSON.stringify(vector)) as + .get(libraryId, versionId, JSON.stringify(preservedVector)) as | { rowid: number; distance: number } | undefined; - expect(result?.rowid).toBe(Number(docId)); + expect(result?.rowid).toBe(Number(preservedDocId)); expect(result?.distance).toBeCloseTo(0, 6); });