From b681cda00e188e0026f6cce7d7d8264396267e90 Mon Sep 17 00:00:00 2001 From: qxxaa Date: Fri, 26 Jun 2026 16:24:20 +0100 Subject: [PATCH 1/5] fix: populate search_vector on observation INSERT/UPDATE in consolidator The consolidator creates and updates observations without populating the search_vector tsvector column. Under the native text search extension, this means observations are invisible to BM25 full-text retrieval - the BM25 arm returns 0 candidates regardless of query content. Four code paths write observation text to memory_units: 1. _dedup_reconcile_create (merge into existing twin) 2. _dedup_reconcile_update (drift-merge into different twin) 3. _execute_update_action (LLM rewrite of existing observation) 4. _create_observation_directly (new observation INSERT) None populated search_vector. This patch adds conditional tsvector generation gated on config.text_search_extension == 'native', matching the existing pattern in ops_postgresql.insert_facts_batch. Non-native backends (pg_textsearch, pgroonga, pg_search) continue to leave search_vector NULL as they index base text columns directly. The INSERT path (Site 4) splits the existing else branch into an explicit elif/else to avoid applying native tsvector logic to backends that don't use it. Fixes: observations invisible to BM25 retrieval arm. --- .../engine/consolidation/consolidator.py | 46 +++++++++++++------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/hindsight-api-slim/hindsight_api/engine/consolidation/consolidator.py b/hindsight-api-slim/hindsight_api/engine/consolidation/consolidator.py index 14e0c3f70e..acc26c4cc3 100644 --- a/hindsight-api-slim/hindsight_api/engine/consolidation/consolidator.py +++ b/hindsight-api-slim/hindsight_api/engine/consolidation/consolidator.py @@ -224,13 +224,18 @@ async def _dedup_reconcile_create( # Fold the new source facts into the twin and persist the merged text. We keep the twin's # existing embedding: the merged text is >= threshold similar, so the stored vector stays # representative and we avoid a re-embed + a dialect-specific vector UPDATE. + search_vector_clause = ( + f",\n search_vector = to_tsvector('{config.text_search_extension_native_language}'::regconfig, COALESCE($1, ''))" + if config.text_search_extension == "native" + else "" + ) await conn.execute( f""" UPDATE {fq_table("memory_units")} SET text = $1, source_memory_ids = (SELECT array_agg(DISTINCT e) FROM unnest(source_memory_ids || $2::uuid[]) e), proof_count = (SELECT count(DISTINCT e) FROM unnest(source_memory_ids || $2::uuid[]) e), - updated_at = now() + updated_at = now(){search_vector_clause} WHERE id = $3::uuid """, outcome.merged_text, @@ -279,6 +284,11 @@ async def _dedup_reconcile_update( # the create path) then delete the now-redundant updated row. The all_strict/any tag match # guarantees twin and updated share scope, so dropping the updated row's tags loses no # visibility. Temporal fields follow the surviving twin (minimal scope; matches create). + search_vector_clause = ( + f",\n search_vector = to_tsvector('{config.text_search_extension_native_language}'::regconfig, COALESCE($1, ''))" + if config.text_search_extension == "native" + else "" + ) await conn.execute( f""" UPDATE {fq_table("memory_units")} t @@ -289,7 +299,7 @@ async def _dedup_reconcile_update( proof_count = ( SELECT count(DISTINCT e) FROM unnest(t.source_memory_ids || u.source_memory_ids) e ), - updated_at = now() + updated_at = now(){search_vector_clause} FROM {fq_table("memory_units")} u WHERE t.id = $2::uuid AND u.id = $3::uuid """, @@ -1845,6 +1855,12 @@ async def _execute_update_action( config = get_config() + search_vector_clause = ( + f",\n search_vector = to_tsvector('{config.text_search_extension_native_language}'::regconfig, COALESCE($1, ''))" + if config.text_search_extension == "native" + else "" + ) + t0 = time.time() await conn.execute( f""" @@ -1857,7 +1873,7 @@ async def _execute_update_action( updated_at = now(), occurred_start = LEAST(occurred_start, COALESCE($6, occurred_start)), occurred_end = GREATEST(occurred_end, COALESCE($7, occurred_end)), - mentioned_at = GREATEST(mentioned_at, COALESCE($8, mentioned_at)) + mentioned_at = GREATEST(mentioned_at, COALESCE($8, mentioned_at)){search_vector_clause} WHERE id = $5 """, new_text, @@ -2333,16 +2349,20 @@ async def _create_observation_directly( tokenize($3, 'llmlingua2')::bm25_catalog.bm25vector) RETURNING id """ - else: # native, pg_textsearch, pgroonga, or pg_search - # pg_textsearch / pgroonga / pg_search: indexes operate on base text - # columns directly, so the dummy search_vector column is left NULL. - # Native: the migration p4q5r6s7t8u9 dropped the GENERATED expression on - # search_vector to allow per-deployment language configuration; the - # batch insert path in ops_postgresql.insert_facts_batch now populates - # it via to_tsvector($lang, ...). This single-observation INSERT does - # not, so observations under the native backend currently land with - # NULL search_vector and are not BM25-searchable until reflected/ - # re-ingested. Tracking a separate fix for that gap. + elif config.text_search_extension == "native": + # Native: search_vector is populated with to_tsvector() using the + # configured native language dictionary, matching the batch insert + # path in ops_postgresql.insert_facts_batch. + query = f""" + INSERT INTO {fq_table("memory_units")} ( + id, bank_id, text, fact_type, embedding, proof_count, source_memory_ids, + tags, event_date, occurred_start, occurred_end, mentioned_at, search_vector + ) + VALUES ($1, $2, $3, 'observation', $4::vector, 1, $5, $6, $7, $8, $9, $10, + to_tsvector('{config.text_search_extension_native_language}'::regconfig, COALESCE($3, ''))) + RETURNING id + """ + else: # pg_textsearch, pgroonga, pg_search: indexes operate on base text columns directly query = f""" INSERT INTO {fq_table("memory_units")} ( id, bank_id, text, fact_type, embedding, proof_count, source_memory_ids, From 8a891cc91b1a52b069b1377a6d8bcaa02912b27b Mon Sep 17 00:00:00 2001 From: qxxaa Date: Fri, 26 Jun 2026 16:47:21 +0100 Subject: [PATCH 2/5] Implement test for search vector population in observations Add test for observation creation with native search vector --- .../tests/test_consolidation.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/hindsight-api-slim/tests/test_consolidation.py b/hindsight-api-slim/tests/test_consolidation.py index cd8e6fe2d2..3b981805f2 100644 --- a/hindsight-api-slim/tests/test_consolidation.py +++ b/hindsight-api-slim/tests/test_consolidation.py @@ -3619,3 +3619,42 @@ def test_consolidation_prompt_split_is_cacheable_and_complete(): ) assert "OBSERVATION LIMIT REACHED" in capped assert "OBSERVATION LIMIT REACHED" not in sys_prompt + + +@pytest.mark.asyncio +async def test_create_observation_populates_search_vector_native(memory, request_context): + """Observations created via consolidation must have search_vector populated + when text_search_extension == 'native', so BM25 retrieval finds them.""" + from hindsight_api.config import get_config + + config = get_config() + if config.text_search_extension != "native": + pytest.skip("Only applies to native text search backend") + + bank_id = f"test-search-vector-{uuid.uuid4().hex[:8]}" + await memory.get_bank_profile(bank_id=bank_id, request_context=request_context) + + await memory.retain_async( + bank_id=bank_id, + content="Django uses middleware for request processing.", + request_context=request_context, + ) + + async with memory._pool.acquire() as conn: + row = await conn.fetchrow( + """ + SELECT search_vector + FROM memory_units + WHERE bank_id = $1 AND fact_type = 'observation' + LIMIT 1 + """, + bank_id, + ) + + assert row is not None, "Consolidation should have created an observation" + assert row["search_vector"] is not None, ( + "search_vector must be populated for BM25 retrieval under native backend" + ) + + await memory.delete_bank(bank_id, request_context=request_context) + From b9603075c9d1945de189551f9d042dfc5c833e1b Mon Sep 17 00:00:00 2001 From: qxxaa Date: Fri, 26 Jun 2026 16:46:33 +0000 Subject: [PATCH 3/5] style: run lint --- .githooks/post-checkout | 3 ++ .githooks/post-commit | 3 ++ .githooks/post-merge | 3 ++ .githooks/pre-push | 3 ++ hindsight-api-slim/tests/test_chunking.py | 1 - .../tests/test_consolidation.py | 5 +-- .../tests/test_retain_append_mode.py | 31 +++++++++++-------- 7 files changed, 31 insertions(+), 18 deletions(-) create mode 100755 .githooks/post-checkout create mode 100755 .githooks/post-commit create mode 100755 .githooks/post-merge create mode 100755 .githooks/pre-push diff --git a/.githooks/post-checkout b/.githooks/post-checkout new file mode 100755 index 0000000000..5abf8ed93f --- /dev/null +++ b/.githooks/post-checkout @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-checkout "$@" diff --git a/.githooks/post-commit b/.githooks/post-commit new file mode 100755 index 0000000000..b8b76c2c42 --- /dev/null +++ b/.githooks/post-commit @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-commit "$@" diff --git a/.githooks/post-merge b/.githooks/post-merge new file mode 100755 index 0000000000..726f909891 --- /dev/null +++ b/.githooks/post-merge @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-merge "$@" diff --git a/.githooks/pre-push b/.githooks/pre-push new file mode 100755 index 0000000000..5f26dc4552 --- /dev/null +++ b/.githooks/pre-push @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs pre-push "$@" diff --git a/hindsight-api-slim/tests/test_chunking.py b/hindsight-api-slim/tests/test_chunking.py index ea69a46567..e6e40f610a 100644 --- a/hindsight-api-slim/tests/test_chunking.py +++ b/hindsight-api-slim/tests/test_chunking.py @@ -455,4 +455,3 @@ def test_merged_json_array_routes_to_conversation_chunking(): assert isinstance(parsed, list), f"Chunk must be a JSON array: {chunk[:60]}" assert all(isinstance(e, dict) for e in parsed), f"Every element must be a dict: {chunk[:60]}" assert all("role" in e for e in parsed), f"Every element must have a role key: {chunk[:60]}" - diff --git a/hindsight-api-slim/tests/test_consolidation.py b/hindsight-api-slim/tests/test_consolidation.py index 3b981805f2..932d41e531 100644 --- a/hindsight-api-slim/tests/test_consolidation.py +++ b/hindsight-api-slim/tests/test_consolidation.py @@ -3652,9 +3652,6 @@ async def test_create_observation_populates_search_vector_native(memory, request ) assert row is not None, "Consolidation should have created an observation" - assert row["search_vector"] is not None, ( - "search_vector must be populated for BM25 retrieval under native backend" - ) + assert row["search_vector"] is not None, "search_vector must be populated for BM25 retrieval under native backend" await memory.delete_bank(bank_id, request_context=request_context) - diff --git a/hindsight-api-slim/tests/test_retain_append_mode.py b/hindsight-api-slim/tests/test_retain_append_mode.py index d02b65b353..44060a2eed 100644 --- a/hindsight-api-slim/tests/test_retain_append_mode.py +++ b/hindsight-api-slim/tests/test_retain_append_mode.py @@ -254,10 +254,12 @@ async def test_append_mode_conversation_arrays_produce_valid_json(memory, reques try: # First retain - JSON conversation array - turn1 = json.dumps([ - {"role": "user", "content": "Hello"}, - {"role": "assistant", "content": "Hi there"}, - ]) + turn1 = json.dumps( + [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there"}, + ] + ) await memory.retain_batch_async( bank_id=bank_id, contents=[ @@ -271,10 +273,12 @@ async def test_append_mode_conversation_arrays_produce_valid_json(memory, reques ) # Second retain - append more turns - turn2 = json.dumps([ - {"role": "user", "content": "How are you"}, - {"role": "assistant", "content": "Doing well"}, - ]) + turn2 = json.dumps( + [ + {"role": "user", "content": "How are you"}, + {"role": "assistant", "content": "Doing well"}, + ] + ) await memory.retain_batch_async( bank_id=bank_id, contents=[ @@ -300,10 +304,12 @@ async def test_append_mode_conversation_arrays_produce_valid_json(memory, reques assert len(parsed) == 4, "Should contain all 4 messages from both retains" # Third retain - append again, verify no degradation - turn3 = json.dumps([ - {"role": "user", "content": "What is new"}, - {"role": "assistant", "content": "Not much"}, - ]) + turn3 = json.dumps( + [ + {"role": "user", "content": "What is new"}, + {"role": "assistant", "content": "Not much"}, + ] + ) await memory.retain_batch_async( bank_id=bank_id, contents=[ @@ -329,4 +335,3 @@ async def test_append_mode_conversation_arrays_produce_valid_json(memory, reques finally: await memory.delete_bank(bank_id, request_context=request_context) - From cd8ad9889f55982c43144a33a9eb8384571591f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Boschi?= Date: Mon, 29 Jun 2026 11:43:09 +0200 Subject: [PATCH 4/5] fix(consolidation): backfill search_vector for existing native observations The writer fix only populates search_vector for observations created or updated after deploy. Observations already written under the native backend keep a NULL search_vector and stay invisible to BM25 until re-consolidated. Add migration c3f7a1b9d2e4 to backfill them, gated on the native tsvector column type and scoped to fact_type='observation' with a NULL search_vector (idempotent). Matches the writer's text-only tsvector and the configured native language. --- ...d2e4_backfill_observation_search_vector.py | 144 ++++++++++++++++ ...tion_observation_search_vector_backfill.py | 155 ++++++++++++++++++ 2 files changed, 299 insertions(+) create mode 100644 hindsight-api-slim/hindsight_api/alembic/versions/c3f7a1b9d2e4_backfill_observation_search_vector.py create mode 100644 hindsight-api-slim/tests/test_migration_observation_search_vector_backfill.py diff --git a/hindsight-api-slim/hindsight_api/alembic/versions/c3f7a1b9d2e4_backfill_observation_search_vector.py b/hindsight-api-slim/hindsight_api/alembic/versions/c3f7a1b9d2e4_backfill_observation_search_vector.py new file mode 100644 index 0000000000..1a9aad5391 --- /dev/null +++ b/hindsight-api-slim/hindsight_api/alembic/versions/c3f7a1b9d2e4_backfill_observation_search_vector.py @@ -0,0 +1,144 @@ +"""Backfill search_vector for native-backend observations. + +Observations created or updated by the consolidator landed with a NULL +``search_vector`` under the ``native`` text-search backend: the +single-row INSERT/UPDATE paths in ``consolidator.py`` never populated the +tsvector (only the batch raw-fact path in ``ops_postgresql.insert_facts_batch`` +did). Those observations were therefore invisible to the BM25 retrieval arm +until they were re-written by a later consolidation pass. The writer is fixed +in the same change set (all four consolidator sites now call +``to_tsvector($lang, COALESCE(text, ''))``); this migration repairs the +historical residue so existing observations become BM25-searchable without a +re-ingest. + +Scope mirrors the writer fix exactly: + * Only the ``native`` backend is touched. The gate is the column *type*: + under ``native`` ``search_vector`` is a regular (non-generated) tsvector + column; under ``vchord`` it is a ``bm25vector`` and under + ``pg_textsearch`` / ``pgroonga`` / ``pg_search`` it is a dummy ``text`` + column. ``_is_regular_tsvector`` is true only for ``native``, so every + other backend is a no-op. + * The tsvector is built from the observation's own ``text`` only — matching + the consolidator INSERT/UPDATE paths (entity / source / temporal signals + are intentionally excluded; the other retrieval arms cover those). + * Only ``fact_type = 'observation'`` rows with a NULL ``search_vector`` are + rewritten. Raw facts already carry a populated tsvector, and the + ``IS NULL`` predicate makes the migration idempotent and re-runnable. + +The configured ``HINDSIGHT_API_TEXT_SEARCH_EXTENSION_NATIVE_LANGUAGE`` is used +so backfilled rows are lexically identical to newly-created observations. The +value is validated as a PG identifier (mirroring +``HindsightConfig.validate``) before being embedded as a SQL literal. + +This is a single UPDATE per schema: it locks the targeted observation rows for +its duration. It is one-time and only touches unpopulated rows, so subsequent +online writes (which now carry the tsvector via the writer fix) are unaffected. + +Oracle slot is intentionally absent: the consolidator INSERT/UPDATE paths that +this repairs are PostgreSQL-specific (``ops_postgresql``), and the native +tsvector ``search_vector`` column only exists on PostgreSQL. There is no Oracle +residue to repair. + +Revision ID: c3f7a1b9d2e4 +Revises: f4d1c2b3a5e6 +Create Date: 2026-06-29 +""" + +import os +import re +from collections.abc import Sequence + +from alembic import context, op +from sqlalchemy import Connection, text + +from hindsight_api.alembic._dialect import run_for_dialect +from hindsight_api.config import ( + DEFAULT_TEXT_SEARCH_EXTENSION_NATIVE_LANGUAGE, + ENV_TEXT_SEARCH_EXTENSION_NATIVE_LANGUAGE, +) + +revision: str = "c3f7a1b9d2e4" +down_revision: str | Sequence[str] | None = "f4d1c2b3a5e6" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +# Matches HindsightConfig.validate(): a tsvector regconfig name embedded as a +# SQL literal must be a bare PG identifier. +_PG_IDENTIFIER = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*") + + +def _schema_prefix() -> str: + schema = context.config.get_main_option("target_schema") + return f'"{schema}".' if schema else "" + + +def _schema_name() -> str: + return (context.config.get_main_option("target_schema") or "public").strip('"') + + +def _native_language() -> str: + """Configured native tsvector language, validated as a PG identifier.""" + lang = os.getenv( + ENV_TEXT_SEARCH_EXTENSION_NATIVE_LANGUAGE, + DEFAULT_TEXT_SEARCH_EXTENSION_NATIVE_LANGUAGE, + ) + if not _PG_IDENTIFIER.fullmatch(lang): + return DEFAULT_TEXT_SEARCH_EXTENSION_NATIVE_LANGUAGE + return lang + + +def _is_regular_tsvector(conn: Connection, schema: str, table: str) -> bool: + """True iff ``schema.table.search_vector`` is a non-generated tsvector column. + + This is the ``native`` backend signature. ``vchord`` (bm25vector) and + ``pg_textsearch`` / ``pgroonga`` / ``pg_search`` (dummy text column) all + fail this check, so the backfill is a no-op for them. + """ + row = conn.execute( + text( + """ + SELECT is_generated, udt_name + FROM information_schema.columns + WHERE table_schema = :schema + AND table_name = :table + AND column_name = 'search_vector' + """ + ), + {"schema": schema, "table": table}, + ).fetchone() + if not row: + return False + is_generated, udt_name = row[0], row[1] + return udt_name == "tsvector" and is_generated != "ALWAYS" + + +def _pg_upgrade() -> None: + conn = op.get_bind() + schema_name = _schema_name() + if not _is_regular_tsvector(conn, schema_name, "memory_units"): + # Non-native backend (or column absent) — nothing to backfill. + return + schema_prefix = _schema_prefix() + lang = _native_language() + op.execute( + f""" + UPDATE {schema_prefix}memory_units + SET search_vector = to_tsvector('{lang}'::regconfig, COALESCE(text, '')) + WHERE fact_type = 'observation' AND search_vector IS NULL + """ + ) + + +def _pg_downgrade() -> None: + # No-op: backfilled rows are indistinguishable from observations that were + # populated by the post-fix writer, and reverting either to NULL would + # re-break BM25 retrieval. The column simply stays populated. + pass + + +def upgrade() -> None: + run_for_dialect(pg=_pg_upgrade) + + +def downgrade() -> None: + run_for_dialect(pg=_pg_downgrade) diff --git a/hindsight-api-slim/tests/test_migration_observation_search_vector_backfill.py b/hindsight-api-slim/tests/test_migration_observation_search_vector_backfill.py new file mode 100644 index 0000000000..77c9a7c10f --- /dev/null +++ b/hindsight-api-slim/tests/test_migration_observation_search_vector_backfill.py @@ -0,0 +1,155 @@ +"""Regression for the consolidator search_vector gap (PR #2425). + +Observations written by the consolidator under the ``native`` text-search +backend landed with a NULL ``search_vector`` and were invisible to BM25. The +writer is fixed to populate the tsvector; migration +``c3f7a1b9d2e4`` backfills the historical NULL observations. + +This test seeds an observation with a NULL ``search_vector`` at the revision +just before the backfill, runs the migration to head, and asserts the row is +populated with a valid tsvector — and that already-populated rows are left +untouched. Uses a dedicated pg0 instance (mirrors test_migration_backsweep) so +it controls exactly which migrations have run and never stamps the shared test +instance. +""" + +import asyncio +import uuid +from pathlib import Path + +import pytest +from alembic import command +from alembic.config import Config +from sqlalchemy import create_engine, text + +_SCRIPT_LOCATION = str(Path(__file__).parent.parent / "hindsight_api" / "alembic") + +# Revision immediately before the backfill migration. +_PRE_BACKFILL_REVISION = "f4d1c2b3a5e6" +_BACKFILL_REVISION = "c3f7a1b9d2e4" + + +def _alembic_cfg(db_url: str) -> Config: + cfg = Config() + cfg.set_main_option("script_location", _SCRIPT_LOCATION) + cfg.set_main_option("sqlalchemy.url", db_url) + cfg.set_main_option("prepend_sys_path", ".") + cfg.set_main_option("path_separator", "os") + return cfg + + +@pytest.fixture(scope="module") +def pre_backfill_db_url(): + """pg0 instance brought to the revision just before the backfill so the + migration's UPDATE runs against seeded NULL-search_vector observations.""" + from hindsight_api.pg0 import EmbeddedPostgres + + pg0 = EmbeddedPostgres(name="hindsight-obs-sv-backfill-test", port=5568) + loop = asyncio.new_event_loop() + try: + url = loop.run_until_complete(pg0.ensure_running()) + finally: + loop.close() + + # pg0 data dirs persist across runs, so normalise: go to head, then down to + # just before the backfill. + command.upgrade(_alembic_cfg(url), "heads") + command.downgrade(_alembic_cfg(url), _PRE_BACKFILL_REVISION) + return url + + +def test_backfill_populates_null_observation_search_vector(pre_backfill_db_url): + db_url = pre_backfill_db_url + bank_id = f"obs-sv-{uuid.uuid4().hex[:12]}" + + null_obs_id = uuid.uuid4() + populated_obs_id = uuid.uuid4() + world_id = uuid.uuid4() + + engine = create_engine(db_url) + with engine.connect() as conn: + # Sanity: under the default native backend search_vector is a regular + # tsvector column; otherwise this test wouldn't exercise the gate. + udt = conn.execute( + text( + """ + SELECT udt_name FROM information_schema.columns + WHERE table_name = 'memory_units' AND column_name = 'search_vector' + """ + ) + ).scalar() + assert udt == "tsvector", f"expected native tsvector backend, got {udt!r}" + + conn.execute(text("INSERT INTO banks (bank_id) VALUES (:b)"), {"b": bank_id}) + + # The bug shape: an observation with NULL search_vector. + conn.execute( + text( + """ + INSERT INTO memory_units (id, bank_id, text, fact_type, search_vector) + VALUES (:id, :b, 'Django uses middleware for request processing', 'observation', NULL) + """ + ), + {"id": null_obs_id, "b": bank_id}, + ) + # An observation already populated — must be left byte-for-byte intact. + conn.execute( + text( + """ + INSERT INTO memory_units (id, bank_id, text, fact_type, search_vector) + VALUES (:id, :b, 'Already indexed observation', 'observation', + to_tsvector('english', 'Already indexed observation')) + """ + ), + {"id": populated_obs_id, "b": bank_id}, + ) + # A non-observation row with NULL search_vector — must NOT be touched + # (the migration is scoped to fact_type = 'observation'). + conn.execute( + text( + """ + INSERT INTO memory_units (id, bank_id, text, fact_type, search_vector) + VALUES (:id, :b, 'A world fact', 'world', NULL) + """ + ), + {"id": world_id, "b": bank_id}, + ) + conn.commit() + + # Run the backfill. + command.upgrade(_alembic_cfg(db_url), _BACKFILL_REVISION) + + with engine.connect() as conn: + null_obs_sv, null_obs_match = conn.execute( + text( + """ + SELECT search_vector IS NOT NULL, + search_vector @@ plainto_tsquery('english', 'middleware') + FROM memory_units WHERE id = :id + """ + ), + {"id": null_obs_id}, + ).fetchone() + assert null_obs_sv, "backfill must populate the NULL observation's search_vector" + assert null_obs_match, "backfilled tsvector must be BM25-searchable on its own text" + + populated_match = conn.execute( + text("SELECT search_vector @@ plainto_tsquery('english', 'indexed') FROM memory_units WHERE id = :id"), + {"id": populated_obs_id}, + ).scalar() + assert populated_match, "pre-populated observation must remain searchable" + + world_null = conn.execute( + text("SELECT search_vector IS NULL FROM memory_units WHERE id = :id"), + {"id": world_id}, + ).scalar() + assert world_null, "non-observation rows must be left untouched by the backfill" + + # Idempotency: re-running touches nothing and stays at head. + command.upgrade(_alembic_cfg(db_url), "heads") + with engine.connect() as conn: + still_populated = conn.execute( + text("SELECT search_vector IS NOT NULL FROM memory_units WHERE id = :id"), + {"id": null_obs_id}, + ).scalar() + assert still_populated From 986fd5cbe469ed731f71a393a7657744a2ed2d90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Boschi?= Date: Tue, 30 Jun 2026 09:57:10 +0200 Subject: [PATCH 5/5] chore: remove accidentally committed git-lfs hooks post-checkout/post-commit/post-merge/pre-push were git-lfs stubs picked up from the contributor's local hookspath and committed by mistake. They are unrelated to this change; the project's real .githooks/pre-commit is left intact. --- .githooks/post-checkout | 3 --- .githooks/post-commit | 3 --- .githooks/post-merge | 3 --- .githooks/pre-push | 3 --- 4 files changed, 12 deletions(-) delete mode 100755 .githooks/post-checkout delete mode 100755 .githooks/post-commit delete mode 100755 .githooks/post-merge delete mode 100755 .githooks/pre-push diff --git a/.githooks/post-checkout b/.githooks/post-checkout deleted file mode 100755 index 5abf8ed93f..0000000000 --- a/.githooks/post-checkout +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } -git lfs post-checkout "$@" diff --git a/.githooks/post-commit b/.githooks/post-commit deleted file mode 100755 index b8b76c2c42..0000000000 --- a/.githooks/post-commit +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } -git lfs post-commit "$@" diff --git a/.githooks/post-merge b/.githooks/post-merge deleted file mode 100755 index 726f909891..0000000000 --- a/.githooks/post-merge +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } -git lfs post-merge "$@" diff --git a/.githooks/pre-push b/.githooks/pre-push deleted file mode 100755 index 5f26dc4552..0000000000 --- a/.githooks/pre-push +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } -git lfs pre-push "$@"