Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ The background daemon starts automatically on first use.

| Command | Description |
|---------|-------------|
| `ccc init` | Initialize a project — creates settings files, adds `.cocoindex_code/` to `.gitignore` |
| `ccc init` | Initialize a project — creates settings files, adds `.cocoindex_code/` to `.gitignore`. Use `--backend turbo-quant` (with `--tq-bits`) to pick the compressed backend; see [Vector Backends](#vector-backends) |
| `ccc index` | Build or update the index (auto-inits if needed). Shows streaming progress. |
| `ccc search <query>` | Semantic search across the codebase |
| `ccc status` | Show index stats (chunk count, file count, language breakdown) |
Expand All @@ -189,6 +189,34 @@ ccc search --refresh database schema # update index first, then

By default, `ccc search` scopes results to your current working directory (relative to the project root). Use `--path` to override.

## Vector Backends

`ccc` supports two vector-search backends, chosen at `ccc init` and baked into the index:

| Backend | Index size | Search | Best for |
|---------|-----------|--------|----------|
| `sqlite-vec` (default) | full `float32` | exact KNN ([sqlite-vec](https://github.com/asg017/sqlite-vec)) | most projects — fastest, exact results |
| `turbo-quant` | ~4–8× smaller | approximate, unbiased inner-product | large codebases where index size matters |

**TurboQuant** is a data-oblivious vector quantizer ([Zandieh et al., 2025](https://arxiv.org/abs/2504.19874)): it applies a random rotation to each embedding, quantizes each coordinate with an optimal scalar codebook, and adds a 1-bit QJL residual so that the inner-product estimate is unbiased. At 4 bits (the default) it compresses the index ~8× on disk with recall@10 ≈ 0.9, and it requires no training or calibration.

```bash
ccc init # interactive — prompts for backend
ccc init --backend turbo-quant # 4-bit (default bit-width)
ccc init --backend turbo-quant --tq-bits 2 # 2-bit — ~16× smaller, lower recall
ccc init --backend sqlite-vec # explicit default
```

Switching backends requires re-initializing and re-indexing:

```bash
ccc reset --all -f
ccc init --backend turbo-quant
ccc index
```

> A higher `--tq-bits` value (1–4, default 4) means better recall and a larger index. `sqlite-vec` stays the default backend for exact, low-latency search.

## Docker

A Docker image is available for teams who want a reproducible, dependency-free
Expand Down Expand Up @@ -438,6 +466,9 @@ OpenAI embeddings (`text-embedding-3-*`, `text-embedding-ada-002`) are intention
Per-project. Controls which files to index.

```yaml
backend: sqlite-vec # or "turbo-quant" — see Vector Backends
tq_bits: 4 # TurboQuant bit-width (1–4); only used when backend is turbo-quant

include_patterns:
- "**/*.py"
- "**/*.js"
Expand All @@ -462,6 +493,8 @@ chunkers:
module: example_toml_chunker:toml_chunker
```

> `backend` is set at `ccc init` and baked into the index — changing it requires re-initializing the project and re-indexing (see [Vector Backends](#vector-backends)).

> `.cocoindex_code/` is automatically added to `.gitignore` during init.

Use `chunkers` when you want to control how a file type is split into chunks before indexing.
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,11 @@ files = ["src"]

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_files = ["test_*.py", "benchmark_*.py"]
python_functions = ["test_*"]
addopts = "-v --tb=short -m 'not docker_e2e'"
addopts = "-v --tb=short -m 'not docker_e2e and not benchmark'"
asyncio_mode = "auto"
markers = [
"docker_e2e: requires Docker; builds the image and runs containerized E2E tests. Run with: pytest -m docker_e2e",
"benchmark: TurboQuant vs sqlite-vec benchmark; prints a metrics table. Run with: pytest -m benchmark -s",
]
88 changes: 87 additions & 1 deletion src/cocoindex_code/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@

from .settings import (
DEFAULT_ST_MODEL,
DEFAULT_TQ_BITS,
SUPPORTED_TQ_BITS,
Backend,
EmbeddingSettings,
cocoindex_db_path,
default_project_settings,
Expand All @@ -34,6 +37,8 @@
save_project_settings,
target_sqlite_db_path,
user_settings_path,
validate_backend,
validate_tq_bits,
)

app = _typer.Typer(
Expand Down Expand Up @@ -383,6 +388,53 @@ def _resolve_embedding_choice(
return EmbeddingSettings(provider=provider, model=model.strip())


def _resolve_backend(backend_flag: str | None, tq_bits_flag: int | None) -> tuple[Backend, int]:
    """Decide which vector backend and TurboQuant bit-width ``init`` should use.

    Precedence:

    1. An explicit ``--backend`` value is validated and returned as-is.
    2. Without the flag, a non-interactive stdin (not a TTY) yields the
       default backend, ``sqlite-vec``.
    3. Otherwise the user is prompted for the backend and, if they pick
       ``turbo-quant`` without having passed ``--tq-bits``, for the bit-width.

    The bit-width is ``--tq-bits`` (validated) when given, else
    ``DEFAULT_TQ_BITS``; it is returned even when the backend is sqlite-vec.

    Returns:
        A ``(backend, tq_bits)`` pair.

    Raises:
        typer.Exit: with code 1 when a prompt is cancelled (``ask()`` returns
            ``None``).

    Errors raised by ``validate_backend`` / ``validate_tq_bits`` propagate to
    the caller unchanged.
    """
    # Validate the bit-width first so a bad --tq-bits fails before anything else.
    if tq_bits_flag is None:
        bits = DEFAULT_TQ_BITS
    else:
        bits = validate_tq_bits(tq_bits_flag)

    # Explicit flag: no prompting at all.
    if backend_flag is not None:
        return validate_backend(backend_flag), bits

    # No flag and nobody to ask: use the default backend.
    if not sys.stdin.isatty():
        return "sqlite-vec", bits

    # Imported lazily so non-interactive runs never load the prompt library.
    import questionary

    backend_choices = [
        questionary.Choice(
            title="sqlite-vec (default, exact nearest-neighbor)",
            value="sqlite-vec",
        ),
        questionary.Choice(
            title="turbo-quant (compressed, ~4-8x smaller index)",
            value="turbo-quant",
        ),
    ]
    chosen = questionary.select("Vector backend", choices=backend_choices).ask()
    if chosen is None:  # cancelled
        raise _typer.Exit(code=1)

    # Only ask for a bit-width when it is relevant and was not already given.
    needs_bits_prompt = chosen == "turbo-quant" and tq_bits_flag is None
    if needs_bits_prompt:
        # Choice titles are strings while values are ints; the default has to
        # match a choice *value* (int), not its title (str).
        bit_choices = []
        for width in SUPPORTED_TQ_BITS:
            bit_choices.append(questionary.Choice(title=str(width), value=width))
        picked_bits = questionary.select(
            "TurboQuant bit-width (higher = better recall, larger index)",
            choices=bit_choices,
            default=DEFAULT_TQ_BITS,  # type: ignore[arg-type]
        ).ask()
        if picked_bits is None:
            raise _typer.Exit(code=1)
        bits = validate_tq_bits(picked_bits)

    return validate_backend(chosen), bits


def _ok_fail_tag(ok: bool) -> str:
"""Return a colored `[OK]` or `[FAIL]` tag string."""
import click as _click
Expand Down Expand Up @@ -484,9 +536,33 @@ def init(
"--litellm-model",
help="Use the given LiteLLM model and skip provider/model prompts.",
),
backend: str | None = _typer.Option(
None,
"--backend",
help="Vector backend: 'sqlite-vec' (default, exact) or 'turbo-quant' (compressed).",
),
tq_bits: int | None = _typer.Option(
None,
"--tq-bits",
help=f"TurboQuant bit-width {list(SUPPORTED_TQ_BITS)} (only for --backend turbo-quant).",
),
force: bool = _typer.Option(False, "-f", "--force", help="Skip parent directory warning"),
) -> None:
"""Initialize a project for cocoindex-code."""
# Validate backend flags early so bad input fails before any side effects.
if backend is not None:
try:
validate_backend(backend)
except ValueError as e:
_typer.echo(f"Error: {e}", err=True)
raise _typer.Exit(code=1) from e
if tq_bits is not None:
try:
validate_tq_bits(tq_bits)
except ValueError as e:
_typer.echo(f"Error: {e}", err=True)
raise _typer.Exit(code=1) from e

cwd = Path.cwd().resolve()
settings_file = project_settings_path(cwd)

Expand Down Expand Up @@ -520,9 +596,19 @@ def init(
)
raise _typer.Exit(code=1)

# Resolve the vector backend: explicit flag wins; otherwise prompt when
# interactive; otherwise fall back to the default (sqlite-vec).
resolved_backend, resolved_bits = _resolve_backend(backend, tq_bits)

# Create project settings
save_project_settings(cwd, default_project_settings())
project_settings = default_project_settings()
project_settings.backend = resolved_backend
project_settings.tq_bits = resolved_bits
save_project_settings(cwd, project_settings)
_typer.echo(f"Created project settings: {format_path_for_display(settings_file)}")
_typer.echo(f"Vector backend: {resolved_backend}")
if resolved_backend == "turbo-quant":
_typer.echo(f"TurboQuant bit-width: {resolved_bits}")

# Add to .gitignore
add_to_gitignore(cwd)
Expand Down
14 changes: 11 additions & 3 deletions src/cocoindex_code/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,14 +437,22 @@ async def _check_index_status(project_root_str: str) -> DoctorCheckResult:
return DoctorCheckResult(name="Index Status", ok=True, details=details, errors=[])

try:
from .tq_store import index_table_name

conn = coco_sqlite.connect(str(db_path), load_vec=True)
try:
with conn.readonly() as db:
total_chunks = db.execute("SELECT COUNT(*) FROM code_chunks_vec").fetchone()[0]
file_rows = db.execute("SELECT DISTINCT file_path FROM code_chunks_vec").fetchall()
table = index_table_name(db)
if table is None:
details.append("Index not created yet.")
return DoctorCheckResult(
name="Index Status", ok=True, details=details, errors=[]
)
total_chunks = db.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
file_rows = db.execute(f"SELECT DISTINCT file_path FROM {table}").fetchall()
total_files = len(file_rows)
lang_rows = db.execute(
"SELECT language, COUNT(*) FROM code_chunks_vec GROUP BY language"
f"SELECT language, COUNT(*) FROM {table} GROUP BY language"
).fetchall()
languages = {row[0]: row[1] for row in lang_rows}
finally:
Expand Down
97 changes: 73 additions & 24 deletions src/cocoindex_code/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from collections.abc import Iterable
from pathlib import Path, PurePath
from typing import Any

import cocoindex as coco
from cocoindex.connectors import localfs, sqlite
Expand All @@ -15,14 +16,17 @@
from pathspec import GitIgnoreSpec

from .chunking import CHUNKER_REGISTRY
from .schema import TqChunkRow
from .settings import load_gitignore_spec, load_project_settings
from .shared import (
CODEBASE_DIR,
EMBEDDER,
INDEXING_EMBED_PARAMS,
SQLITE_DB,
TURBO_QUANT,
CodeChunk,
)
from .tq_store import TQ_TABLE, quantize_row

# Chunking configuration
CHUNK_SIZE = 1000
Expand Down Expand Up @@ -137,9 +141,14 @@ def is_file_included(self, path: PurePath) -> bool:
@coco.fn(memo=True)
async def process_file(
file: localfs.File,
table: sqlite.TableTarget[CodeChunk],
table: sqlite.TableTarget[Any],
) -> None:
"""Process a single file: chunk, embed, and store."""
"""Process a single file: chunk, embed, and store.

The stored row type depends on the project backend: ``CodeChunk`` (raw
float32 in vec0) for sqlite-vec, or ``TqChunkRow`` (quantized) for
turbo-quant. ``table`` is the matching target built by ``indexer_main``.
"""
embedder = coco.use_context(EMBEDDER)
indexing_params = coco.use_context(INDEXING_EMBED_PARAMS)

Expand Down Expand Up @@ -177,19 +186,37 @@ async def process_file(
)

id_gen = IdGenerator()
backend = ps.backend
tq = coco.use_context(TURBO_QUANT) if backend == "turbo-quant" else None

async def process(chunk: Chunk) -> None:
table.declare_row(
row=CodeChunk(
id=await id_gen.next_id(chunk.text),
file_path=file.file_path.path.as_posix(),
language=language,
content=chunk.text,
start_line=chunk.start.line,
end_line=chunk.end.line,
embedding=await embedder.embed(chunk.text, **indexing_params),
chunk_id = await id_gen.next_id(chunk.text)
embedding = await embedder.embed(chunk.text, **indexing_params)
if tq is not None:
table.declare_row(
row=quantize_row(
tq,
chunk_id=chunk_id,
file_path=file.file_path.path.as_posix(),
language=language,
content=chunk.text,
start_line=chunk.start.line,
end_line=chunk.end.line,
embedding=embedding,
)
)
else:
table.declare_row(
row=CodeChunk(
id=chunk_id,
file_path=file.file_path.path.as_posix(),
language=language,
content=chunk.text,
start_line=chunk.start.line,
end_line=chunk.end.line,
embedding=embedding,
)
)
)

await coco.map(process, chunks)

Expand All @@ -201,18 +228,40 @@ async def indexer_main() -> None:
ps = load_project_settings(project_root)
gitignore_spec = load_gitignore_spec(project_root)

table = await sqlite.mount_table_target(
db=SQLITE_DB,
table_name="code_chunks_vec",
table_schema=await sqlite.TableSchema.from_class(
CodeChunk,
primary_key=["id"],
),
virtual_table_def=Vec0TableDef(
partition_key_columns=["language"],
auxiliary_columns=["file_path", "content", "start_line", "end_line"],
),
)
table: sqlite.TableTarget[Any]
if ps.backend == "turbo-quant":
tq = coco.use_context(TURBO_QUANT)
# Persist index metadata (bits/dim/seed) so the store can regenerate the
# rotation/QJL matrices at query time.
db = coco.use_context(SQLITE_DB)
from .tq_store import create_metadata_table, write_metadata

# The chunk table itself is created by mount_table_target below; here we
# only own the side metadata table.
with db.transaction() as conn:
create_metadata_table(conn)
write_metadata(conn, bits=tq.bits, dim=tq.dim, seed=tq.seed)
table = await sqlite.mount_table_target(
db=SQLITE_DB,
table_name=TQ_TABLE,
table_schema=await sqlite.TableSchema.from_class(
TqChunkRow,
primary_key=["id"],
),
)
else:
table = await sqlite.mount_table_target(
db=SQLITE_DB,
table_name="code_chunks_vec",
table_schema=await sqlite.TableSchema.from_class(
CodeChunk,
primary_key=["id"],
),
virtual_table_def=Vec0TableDef(
partition_key_columns=["language"],
auxiliary_columns=["file_path", "content", "start_line", "end_line"],
),
)

base_matcher = PatternFilePathMatcher(
included_patterns=ps.include_patterns,
Expand Down
Loading