Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions db/migrations/014-rebuild-vector-partition-keys.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
-- Migration: rebuild documents_vec with sqlite-vec partition keys
-- This enables selective KNN queries by library_id and version_id.
--
-- Outline:
--   1. Stage the existing 1536-dimension vectors, together with freshly
--      derived library/version ids, in an ordinary table.
--   2. Drop documents_vec and recreate it as a vec0 table whose library_id
--      and version_id columns are declared as partition keys.
--   3. Copy the staged rows back into the new table.
--   4. Backfill rows from documents.embedding that are still missing.
--   5. Drop the staging table.

-- Preserve compatible vectors from the existing vec table. This uses a
-- disk-backed staging table because large vector indexes can exceed memory.
-- Remove the staging table first if it already exists (e.g. left over from
-- an interrupted run) so the CREATE TABLE below cannot collide with it.
DROP TABLE IF EXISTS _documents_vec_partition_migration;

-- library_id / version_id are taken from the documents -> pages -> versions
-- join, not from the old vec table. Inner joins mean a vector without a
-- matching document, page and version is not staged; the WHERE clause also
-- discards any embedding whose length is not 1536.
-- The document id is aliased to "rowid" so the restore step below can select
-- it under that name.
CREATE TABLE _documents_vec_partition_migration AS
SELECT
d.id AS rowid,
v.library_id,
v.id AS version_id,
dv.embedding
FROM documents_vec dv
JOIN documents d ON dv.rowid = d.id
JOIN pages p ON d.page_id = p.id
JOIN versions v ON p.version_id = v.id
WHERE vec_length(dv.embedding) = 1536;

-- The old table can go now that its usable rows are staged.
DROP TABLE documents_vec;

-- New layout: both id columns are partition keys; the embedding stays a
-- 1536-dimension float vector.
CREATE VIRTUAL TABLE documents_vec USING vec0(
library_id INTEGER partition key,
version_id INTEGER partition key,
embedding FLOAT[1536]
);

-- Restore the staged vectors under their original rowids (document ids).
-- NOTE(review): the target table was just created empty, so OR REPLACE is
-- not expected to resolve any conflict here - confirm it is intentional.
INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding)
SELECT rowid, library_id, version_id, embedding
FROM _documents_vec_partition_migration;

-- Backfill any vectors stored on documents but missing from the vec table.
-- Only non-NULL embeddings of length 1536 are considered, and the NOT EXISTS
-- guard skips every document that already has a row in documents_vec.
-- NOTE(review): json_extract(d.embedding, '$') suggests documents.embedding
-- holds JSON text - verify against the schema.
INSERT OR REPLACE INTO documents_vec (rowid, library_id, version_id, embedding)
SELECT
d.id,
v.library_id,
v.id AS version_id,
json_extract(d.embedding, '$') AS embedding
FROM documents d
JOIN pages p ON d.page_id = p.id
JOIN versions v ON p.version_id = v.id
WHERE d.embedding IS NOT NULL
AND vec_length(json_extract(d.embedding, '$')) = 1536
AND NOT EXISTS (
SELECT 1 FROM documents_vec existing WHERE existing.rowid = d.id
);

-- The staging table is no longer needed once both inserts have run.
DROP TABLE _documents_vec_partition_migration;
183 changes: 183 additions & 0 deletions src/store/DocumentStore.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,92 @@ describe("DocumentStore - With Embeddings", () => {
expect(result.score).toBeGreaterThan(0);
}
});

// Checks that documents_vec is declared with library_id / version_id
// partition keys, that findByContent returns hits for an indexed library
// version, and that a raw KNN query restricted to that library and version
// also returns rows.
it("should use partition-filtered vector search for hybrid results", async () => {
// Remember the caller's key so the finally block can restore it exactly.
const originalApiKey = process.env.OPENAI_API_KEY;
try {
// NOTE(review): a placeholder key is set before the store is rebuilt,
// presumably so the OpenAI embedding model can be configured without real
// credentials - confirm how embeddings are stubbed in this suite.
process.env.OPENAI_API_KEY = "test-key-for-partition-search";
// Replace the shared store with a fresh in-memory one that uses the
// openai:text-embedding-3-small model spec.
await store.shutdown();

const cfg = loadConfig();
const embeddingConfig = EmbeddingConfig.parseEmbeddingConfig(
"openai:text-embedding-3-small",
);
cfg.app.embeddingModel = embeddingConfig.modelSpec;
store = new DocumentStore(":memory:", cfg);
await store.initialize();

// Index two pages under the same library ("searchtest") and version ("1.0.0").
await store.addDocuments(
"searchtest",
"1.0.0",
1,
createScrapeResult(
"JavaScript Programming Guide",
"https://example.com/js-guide",
"JavaScript programming tutorial with code examples and functions",
["programming", "javascript"],
),
);
await store.addDocuments(
"searchtest",
"1.0.0",
1,
createScrapeResult(
"JavaScript Frameworks",
"https://example.com/js-frameworks",
"Advanced JavaScript frameworks like React and Vue for building applications",
["programming", "javascript", "frameworks"],
),
);

// Schema check: the stored DDL of documents_vec must declare both partition keys.
// @ts-expect-error Accessing private property for testing
const db = store.db;
const ddl = db
.prepare(
"SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'documents_vec'",
)
.get() as { sql: string };
expect(ddl.sql).toContain("library_id INTEGER partition key");
expect(ddl.sql).toContain("version_id INTEGER partition key");

// Public search path for the indexed library/version.
const results = await store.findByContent(
"searchtest",
"1.0.0",
"application building",
10,
);
// Embed the same query text so it can be used in the raw KNN query below.
// @ts-expect-error Accessing private property for testing
const vector = await store.embeddings.embedQuery("application building");

expect(results.length).toBeGreaterThan(0);

// Raw KNN query against documents_vec, constrained to the library and
// version ids looked up by name, with k bound to 10.
const vectorResults = db
.prepare(`
SELECT dv.rowid, dv.distance
FROM documents_vec dv
WHERE dv.library_id = (
SELECT id FROM libraries WHERE name = ?
)
AND dv.version_id = (
SELECT v.id
FROM versions v
JOIN libraries l ON v.library_id = l.id
WHERE l.name = ? AND v.name = ?
)
AND dv.embedding MATCH ?
AND dv.k = ?
ORDER BY dv.distance
`)
.all("searchtest", "searchtest", "1.0.0", JSON.stringify(vector), 10);
expect(vectorResults.length).toBeGreaterThan(0);
} finally {
// Restore the environment: remove the variable if it was originally
// unset, otherwise put the original value back.
if (originalApiKey === undefined) {
delete process.env.OPENAI_API_KEY;
} else {
process.env.OPENAI_API_KEY = originalApiKey;
}
}
});
});

describe("Embedding Batch Processing", () => {
Expand Down Expand Up @@ -1777,6 +1863,8 @@ describe("DocumentStore - Embedding Model Change Safety", () => {
)
.get() as { sql: string };
expect(ddl.sql).toContain("768");
expect(ddl.sql).toContain("library_id INTEGER partition key");
expect(ddl.sql).toContain("version_id INTEGER partition key");
});

it("should update metadata with new model and dimension", async () => {
Expand Down Expand Up @@ -1804,6 +1892,8 @@ describe("DocumentStore - Embedding Model Change Safety", () => {
.get() as { sql: string };
expect(ddl).toBeDefined();
expect(ddl.sql).toContain("1536");
expect(ddl.sql).toContain("library_id INTEGER partition key");
expect(ddl.sql).toContain("version_id INTEGER partition key");

// Table should be empty (no backfill)
// @ts-expect-error Accessing private property for testing
Expand Down Expand Up @@ -1844,5 +1934,98 @@ describe("DocumentStore - Embedding Model Change Safety", () => {
.get() as { cnt: number };
expect(vecAfter.cnt).toBe(vecBefore.cnt);
});

// Simulates a documents_vec table created with plain INTEGER NOT NULL id
// columns (no partition keys) that holds a row tagged with the wrong version,
// then checks that ensureVectorTable() rebuilds the table with partition
// keys, keeps the vector, and stores the version id of the document's page.
it("should rebuild old metadata-column vec table with current partition keys", async () => {
store = await createStore("");

// Seed one library with two versions; only "1.0.0" gets a page and document.
// @ts-expect-error Accessing private property for testing
const db = store.db;
db.prepare("INSERT INTO libraries (name) VALUES (?)").run("legacyvec");
const { id: libraryId } = db
.prepare("SELECT id FROM libraries WHERE name = ?")
.get("legacyvec") as { id: number };
db.prepare("INSERT INTO versions (library_id, name) VALUES (?, ?)").run(
libraryId,
"1.0.0",
);
const { id: versionId } = db
.prepare("SELECT id FROM versions WHERE library_id = ? AND name = ?")
.get(libraryId, "1.0.0") as { id: number };
// "2.0.0" exists only to supply a version id that does not match the
// document's page; it is written into the legacy vec row below.
db.prepare("INSERT INTO versions (library_id, name) VALUES (?, ?)").run(
libraryId,
"2.0.0",
);
const { id: staleVersionId } = db
.prepare("SELECT id FROM versions WHERE library_id = ? AND name = ?")
.get(libraryId, "2.0.0") as { id: number };
const pageId = db
.prepare("INSERT INTO pages (version_id, url, title) VALUES (?, ?, ?)")
.run(versionId, "https://example.com/legacy", "Legacy Vec").lastInsertRowid;
// 1536-element vector with 1 at index 0 and 0 everywhere else.
const vector = new Array(1536).fill(0).map((_, index) => (index === 0 ? 1 : 0));
const docId = db
.prepare(
"INSERT INTO documents (page_id, content, metadata, sort_order) VALUES (?, ?, ?, ?)",
)
.run(
pageId,
"legacy vector content",
JSON.stringify({ path: ["legacy"] }),
0,
).lastInsertRowid;

// Swap in the legacy layout: same column names, but declared as
// INTEGER NOT NULL instead of "partition key".
db.exec(`
DROP TABLE documents_vec;
CREATE VIRTUAL TABLE documents_vec USING vec0(
library_id INTEGER NOT NULL,
version_id INTEGER NOT NULL,
embedding FLOAT[1536]
);
`);
// Store the document's vector under the stale ("2.0.0") version id.
db.prepare(
"INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)",
).run(
BigInt(docId),
BigInt(libraryId),
BigInt(staleVersionId),
JSON.stringify(vector),
);

// Trigger the rebuild under test.
// @ts-expect-error Accessing private method for testing
store.ensureVectorTable();

// The rebuilt table must declare both partition keys in its DDL.
const ddl = db
.prepare(
"SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'documents_vec'",
)
.get() as { sql: string };
expect(ddl.sql).toContain("library_id INTEGER partition key");
expect(ddl.sql).toContain("version_id INTEGER partition key");

// Exactly one vector row must remain for the document.
const vectorRows = db
.prepare("SELECT COUNT(*) as cnt FROM documents_vec WHERE rowid = ?")
.get(docId) as { cnt: number };
expect(vectorRows.cnt).toBe(1);
// The stored keys must match the page's version ("1.0.0"), not the stale id
// that was written into the legacy row.
const partitionKeys = db
.prepare("SELECT library_id, version_id FROM documents_vec WHERE rowid = ?")
.get(docId) as { library_id: number; version_id: number };
expect(partitionKeys.library_id).toBe(libraryId);
expect(partitionKeys.version_id).toBe(versionId);

// A k = 1 KNN lookup within that partition, using the identical vector,
// must return the document at a distance of approximately zero.
const result = db
.prepare(`
SELECT rowid, distance
FROM documents_vec
WHERE library_id = ?
AND version_id = ?
AND embedding MATCH ?
AND k = 1
`)
.get(libraryId, versionId, JSON.stringify(vector)) as
| { rowid: number; distance: number }
| undefined;

expect(result?.rowid).toBe(Number(docId));
expect(result?.distance).toBeCloseTo(0, 6);
});
});
});
Loading
Loading