From 1b44516a030e85f164c917af3e37359721059ca4 Mon Sep 17 00:00:00 2001 From: Omar Shaikh Date: Fri, 3 Apr 2026 23:06:48 -0700 Subject: [PATCH 1/2] dense captions on napsack --- pyproject.toml | 2 +- src/napsack/label/__main__.py | 4 +++ src/napsack/label/clients/__init__.py | 4 ++- src/napsack/label/clients/client.py | 17 ++++++++++ src/napsack/label/models.py | 14 ++++++-- src/napsack/label/processor.py | 31 +++++++++++++++--- src/napsack/label/prompts/default.txt | 14 +------- src/napsack/label/prompts/image_mode.txt | 14 +------- src/napsack/label/prompts/output/dense.txt | 32 +++++++++++++++++++ .../label/prompts/output/dense_image.txt | 32 +++++++++++++++++++ src/napsack/label/prompts/output/standard.txt | 13 ++++++++ .../label/prompts/output/standard_image.txt | 13 ++++++++ .../label/prompts/screenshots_only.txt | 14 +------- 13 files changed, 155 insertions(+), 49 deletions(-) create mode 100644 src/napsack/label/prompts/output/dense.txt create mode 100644 src/napsack/label/prompts/output/dense_image.txt create mode 100644 src/napsack/label/prompts/output/standard.txt create mode 100644 src/napsack/label/prompts/output/standard_image.txt diff --git a/pyproject.toml b/pyproject.toml index c2da310..9249bcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "napsack" -version = "0.1.1" +version = "0.1.2" readme = "README.md" description = "NAPsack records and aggregates your computer use — screenshots plus input events (click, keypress, scroll, cursor move). It groups activity into event bursts and uses a VLM pipeline to generate human-readable captions describing what happened." requires-python = ">=3.11,<=3.13" diff --git a/src/napsack/label/__main__.py b/src/napsack/label/__main__.py index 2588e5b..208bad8 100644 --- a/src/napsack/label/__main__.py +++ b/src/napsack/label/__main__.py @@ -28,6 +28,7 @@ def parse_args(): p.add_argument("--dedupe-threshold", type=int, default=1, help="Hamming distance threshold for deduplication (drop if <= threshold, default: 1)") p.add_argument("--annotate", action="store_true", help="Annotate videos with cursor positions and clicks (only for standard processing)") p.add_argument("--image-mode", action="store_true", help="Send frames as individual images instead of video (for models that don't support video input)") + p.add_argument("--dense-caption", action="store_true", help="Include a dense text caption per chunk describing important text the user focused on, for retrieval") p.add_argument("--skip-existing", action="store_true", help="Skip sessions that have already been processed") p.add_argument("--visualize", action="store_true", help="Create annotated video visualizations after processing") p.add_argument("--encode-only", action="store_true", help="Only encode videos (create chunks), skip labeling. Useful for pre-processing before running the full pipeline.") @@ -121,6 +122,7 @@ def process_with_litellm(args, configs): hash_cache_path=args.hash_cache, dedupe_threshold=args.dedupe_threshold, image_mode=args.image_mode, + dense_caption=args.dense_caption, ) return processor.process_sessions( @@ -151,6 +153,7 @@ def process_with_bigquery(args, configs): hash_cache_path=args.hash_cache, dedupe_threshold=args.dedupe_threshold, image_mode=args.image_mode, + dense_caption=args.dense_caption, ) return processor.process_sessions( @@ -179,6 +182,7 @@ def process_with_tinfoil(args, configs): hash_cache_path=args.hash_cache, dedupe_threshold=args.dedupe_threshold, image_mode=args.image_mode, + dense_caption=args.dense_caption, ) return processor.process_sessions( diff --git a/src/napsack/label/clients/__init__.py b/src/napsack/label/clients/__init__.py index 6dc8fd4..b6f85f0 100644 --- a/src/napsack/label/clients/__init__.py +++ b/src/napsack/label/clients/__init__.py @@ -1,4 +1,4 @@ -from napsack.label.clients.client import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA +from napsack.label.clients.client import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA, DENSE_CAPTION_SCHEMA, DENSE_IMAGE_CAPTION_SCHEMA from napsack.label.clients.litellm import LiteLLMClient from napsack.label.clients.bigquery import BigQueryClient, BigQueryResponse from napsack.label.clients.tinfoil import TinfoilClient @@ -23,5 +23,7 @@ def create_client(client_type: str, **kwargs) -> VLMClient: "TinfoilClient", "CAPTION_SCHEMA", "IMAGE_CAPTION_SCHEMA", + "DENSE_CAPTION_SCHEMA", + "DENSE_IMAGE_CAPTION_SCHEMA", "create_client", ] diff --git a/src/napsack/label/clients/client.py b/src/napsack/label/clients/client.py index 1522a64..69e91af 100644 --- a/src/napsack/label/clients/client.py +++ b/src/napsack/label/clients/client.py @@ -28,6 +28,23 @@ } } +DENSE_CAPTION_SCHEMA = { + "type": "object", + "properties": { + "actions": CAPTION_SCHEMA, + "dense_caption": {"type": "string"} + }, + "required": ["actions", "dense_caption"] +} + +DENSE_IMAGE_CAPTION_SCHEMA = { + "type": "object", + "properties": { + "actions": IMAGE_CAPTION_SCHEMA, + "dense_caption": {"type": "string"} + }, + "required": ["actions", "dense_caption"] +} class VLMClient(ABC): @abstractmethod diff --git a/src/napsack/label/models.py b/src/napsack/label/models.py index fd91a78..6d924bd 100644 --- a/src/napsack/label/models.py +++ b/src/napsack/label/models.py @@ -366,6 +366,7 @@ class Caption: end_seconds: float text: str chunk_index: int = 0 + dense_caption: Optional[str] = None @property def start_formatted(self) -> str: @@ -381,11 +382,12 @@ def from_dict(cls, data: Dict) -> Caption: start_seconds=data['start_seconds'], end_seconds=data['end_seconds'], text=data['caption'], - chunk_index=data.get('chunk_index', 0) + chunk_index=data.get('chunk_index', 0), + dense_caption=data.get('dense_caption'), ) def to_dict(self) -> Dict: - return { + d = { 'start': self.start_formatted, 'end': self.end_formatted, 'start_seconds': self.start_seconds, @@ -393,6 +395,9 @@ def to_dict(self) -> Dict: 'caption': self.text, 'chunk_index': self.chunk_index } + if self.dense_caption is not None: + d['dense_caption'] = self.dense_caption + return d @dataclass @@ -417,7 +422,7 @@ def all_events(self) -> List[Event]: return events def to_dict(self) -> Dict: - return { + d = { 'start_time': self.aggregations[0].timestamp if self.aggregations else 0, 'end_time': self.aggregations[-1].timestamp if self.aggregations else 0, 'start_index': self.start_index, @@ -430,6 +435,9 @@ def to_dict(self) -> Dict: 'end_formatted': self.caption.end_formatted, 'scale_factor': self.screenshot_scale_factor } + if self.caption.dense_caption is not None: + d['dense_caption'] = self.caption.dense_caption + return d @dataclass diff --git a/src/napsack/label/processor.py b/src/napsack/label/processor.py index 31a4693..d97b25e 100644 --- a/src/napsack/label/processor.py +++ b/src/napsack/label/processor.py @@ -9,7 +9,7 @@ from napsack.label.models import SessionConfig, ChunkTask, Caption, Aggregation, VideoPath, MatchedCaption from napsack.label.video import create_video, split_video, compute_max_size -from napsack.label.clients import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA +from napsack.label.clients import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA, DENSE_CAPTION_SCHEMA, DENSE_IMAGE_CAPTION_SCHEMA # ============================================================================ @@ -144,16 +144,21 @@ def __init__( hash_cache_path: Optional[str] = None, dedupe_threshold: int = 1, image_mode: bool = False, + dense_caption: bool = False, ): self.client = client self.encode_workers = encode_workers self.label_workers = label_workers self.screenshots_only = screenshots_only - self.prompt = self._load_prompt(prompt_file) self.max_time_gap = max_time_gap self.dedupe_threshold = dedupe_threshold self.hash_map = load_hash_cache(hash_cache_path) if hash_cache_path else None self.image_mode = image_mode + self.dense_caption = dense_caption + + base_prompt = self._load_prompt(prompt_file) + output_format = self._load_output_format(image_mode, dense_caption) + self.prompt = base_prompt.replace("{{OUTPUT_FORMAT}}", output_format) def _load_prompt(self, path: str) -> str: p = Path(path) @@ -161,6 +166,13 @@ def _load_prompt(self, path: str) -> str: p = Path(__file__).parent / path return p.read_text() + def _load_output_format(self, image_mode: bool, dense_caption: bool) -> str: + if dense_caption: + filename = "prompts/output/dense_image.txt" if image_mode else "prompts/output/dense.txt" + else: + filename = "prompts/output/standard_image.txt" if image_mode else "prompts/output/standard.txt" + return self._load_prompt(filename) + def process_sessions( self, configs: List[SessionConfig], @@ -535,6 +547,7 @@ def _process_tasks(self, tasks: List[ChunkTask], config_map: dict) -> List[Tuple def _process_single_task(self, task: ChunkTask) -> any: """Process single task with schema.""" if self.image_mode: + schema = DENSE_IMAGE_CAPTION_SCHEMA if self.dense_caption else IMAGE_CAPTION_SCHEMA per_frame_text = None if task.aggregations: per_frame_text = [agg.to_prompt(f"Frame {j + 1}") for j, agg in enumerate(task.aggregations)] @@ -542,10 +555,11 @@ def _process_single_task(self, task: ChunkTask) -> any: [str(p) for p in task.image_paths], session_id=task.session_id, per_frame_text=per_frame_text, ) - response = self.client.generate(task.prompt, file_desc, schema=IMAGE_CAPTION_SCHEMA) + response = self.client.generate(task.prompt, file_desc, schema=schema) else: + schema = DENSE_CAPTION_SCHEMA if self.dense_caption else CAPTION_SCHEMA file_desc = self.client.upload_file(str(task.video_path.resolve()), session_id=task.session_id) - response = self.client.generate(task.prompt, file_desc, schema=CAPTION_SCHEMA) + response = self.client.generate(task.prompt, file_desc, schema=schema) return response @@ -597,6 +611,12 @@ def _save_results( def _extract_captions(self, result: any, task: ChunkTask, fps: int = 1) -> List[Caption]: captions = [] + dense_caption_text = None + + # Unwrap dense caption response format + if isinstance(result, dict) and "actions" in result: + dense_caption_text = result.get("dense_caption") if self.dense_caption else None + result = result.get("actions", []) if isinstance(result, str) or not isinstance(result, list): return captions @@ -636,7 +656,8 @@ def _extract_captions(self, result: any, task: ChunkTask, fps: int = 1) -> List[ start_seconds=abs_start, end_seconds=abs_end, text=item.get("caption", item.get("description", "")), - chunk_index=task.chunk_index + chunk_index=task.chunk_index, + dense_caption=dense_caption_text, )) return captions diff --git a/src/napsack/label/prompts/default.txt b/src/napsack/label/prompts/default.txt index 666ae21..c54f771 100644 --- a/src/napsack/label/prompts/default.txt +++ b/src/napsack/label/prompts/default.txt @@ -83,16 +83,4 @@ Generated captions must be in past tense, and at the level of detail as the exam You MUST quote specific things from the screen so it's easy to reproduce your steps. -## Output - -A JSON array of objects: - -```json -[ - { - "start": "MM:SS", - "end": "MM:SS", - "caption": "..." - } -] -``` +{{OUTPUT_FORMAT}} diff --git a/src/napsack/label/prompts/image_mode.txt b/src/napsack/label/prompts/image_mode.txt index 0759d29..ea62814 100644 --- a/src/napsack/label/prompts/image_mode.txt +++ b/src/napsack/label/prompts/image_mode.txt @@ -77,16 +77,4 @@ Generated captions must be in past tense, and at the level of detail as the exam You MUST quote specific things from the screen so it's easy to reproduce your steps. -## Output - -A JSON array of objects. Use **frame numbers** (integers) for start and end: - -```json -[ - { - "start": 1, - "end": 3, - "caption": "..." - } -] -``` +{{OUTPUT_FORMAT}} diff --git a/src/napsack/label/prompts/output/dense.txt b/src/napsack/label/prompts/output/dense.txt new file mode 100644 index 0000000..368380d --- /dev/null +++ b/src/napsack/label/prompts/output/dense.txt @@ -0,0 +1,32 @@ +## Dense Caption + +In addition to the actions array, you MUST also produce a `dense_caption` field: a single string that captures the **important text content the user was focused on or interacting with** during this chunk. This includes: +- Text being read, written, or edited +- URLs, file paths, code snippets, search queries +- Names, labels, or data values visible and relevant to the user's activity + +The dense caption should be a concise but information-rich summary optimized for text search and retrieval. Do NOT describe actions — instead, capture the **textual content** itself. + +### Examples + +- "VS Code editor open to src/utils/auth.py. Function `verify_jwt_token(token: str, secret: str) -> dict` on lines 42-58. Import statements: `from jose import jwt`, `from datetime import datetime, timedelta`. Error highlighted on line 51: `jwt.ExpiredSignatureError`." +- "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py." +- "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12." +- "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'." + +## Output + +Return a JSON object with two fields: + +```json +{ + "actions": [ + { + "start": "MM:SS", + "end": "MM:SS", + "caption": "..." + } + ], + "dense_caption": "A single string capturing important text content..." +} +``` \ No newline at end of file diff --git a/src/napsack/label/prompts/output/dense_image.txt b/src/napsack/label/prompts/output/dense_image.txt new file mode 100644 index 0000000..f55ada0 --- /dev/null +++ b/src/napsack/label/prompts/output/dense_image.txt @@ -0,0 +1,32 @@ +## Dense Caption + +In addition to the actions array, you MUST also produce a `dense_caption` field: a single string that captures the **important text content the user was focused on or interacting with** during this chunk. This includes: +- Text being read, written, or edited +- URLs, file paths, code snippets, search queries +- Names, labels, or data values visible and relevant to the user's activity + +The dense caption should be a concise but information-rich summary optimized for text search and retrieval. Do NOT describe actions — instead, capture the **textual content** itself. + +### Examples + +- "VS Code editor open to src/utils/auth.py. Function `verify_jwt_token(token: str, secret: str) -> dict` on lines 42-58. Import statements: `from jose import jwt`, `from datetime import datetime, timedelta`. Error highlighted on line 51: `jwt.ExpiredSignatureError`." +- "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py." +- "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12." +- "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'." + +## Output + +Return a JSON object with two fields. Use **frame numbers** (integers) for start and end: + +```json +{ + "actions": [ + { + "start": 1, + "end": 3, + "caption": "..." + } + ], + "dense_caption": "A single string capturing important text content..." +} +``` \ No newline at end of file diff --git a/src/napsack/label/prompts/output/standard.txt b/src/napsack/label/prompts/output/standard.txt new file mode 100644 index 0000000..4bfe979 --- /dev/null +++ b/src/napsack/label/prompts/output/standard.txt @@ -0,0 +1,13 @@ +## Output + +A JSON array of objects: + +```json +[ + { + "start": "MM:SS", + "end": "MM:SS", + "caption": "..." + } +] +``` \ No newline at end of file diff --git a/src/napsack/label/prompts/output/standard_image.txt b/src/napsack/label/prompts/output/standard_image.txt new file mode 100644 index 0000000..bf77933 --- /dev/null +++ b/src/napsack/label/prompts/output/standard_image.txt @@ -0,0 +1,13 @@ +## Output + +A JSON array of objects. Use **frame numbers** (integers) for start and end: + +```json +[ + { + "start": 1, + "end": 3, + "caption": "..." + } +] +``` \ No newline at end of file diff --git a/src/napsack/label/prompts/screenshots_only.txt b/src/napsack/label/prompts/screenshots_only.txt index d3dc8e8..701ac2a 100644 --- a/src/napsack/label/prompts/screenshots_only.txt +++ b/src/napsack/label/prompts/screenshots_only.txt @@ -72,16 +72,4 @@ Generated captions must be in past tense, and at the level of detail as the exam You MUST quote specific things from the screen so it's easy to reproduce your steps. -## Output - -A JSON array of objects: - -```json -[ - { - "start": "MM:SS", - "end": "MM:SS", - "caption": "..." - } -] -``` \ No newline at end of file +{{OUTPUT_FORMAT}} \ No newline at end of file From 0ab6bf9a40949c2dfad67088d1fa779a784a06a1 Mon Sep 17 00:00:00 2001 From: Omar Shaikh Date: Fri, 3 Apr 2026 23:17:36 -0700 Subject: [PATCH 2/2] add dense --- src/napsack/label/prompts/default.txt | 2 +- src/napsack/label/prompts/output/dense.txt | 1 + src/napsack/label/prompts/output/dense_image.txt | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/napsack/label/prompts/default.txt b/src/napsack/label/prompts/default.txt index c54f771..58e6557 100644 --- a/src/napsack/label/prompts/default.txt +++ b/src/napsack/label/prompts/default.txt @@ -83,4 +83,4 @@ Generated captions must be in past tense, and at the level of detail as the exam You MUST quote specific things from the screen so it's easy to reproduce your steps. -{{OUTPUT_FORMAT}} +{{OUTPUT_FORMAT}} \ No newline at end of file diff --git a/src/napsack/label/prompts/output/dense.txt b/src/napsack/label/prompts/output/dense.txt index 368380d..f3ceb36 100644 --- a/src/napsack/label/prompts/output/dense.txt +++ b/src/napsack/label/prompts/output/dense.txt @@ -13,6 +13,7 @@ The dense caption should be a concise but information-rich summary optimized for - "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py." - "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12." - "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'." +- "Slack conversation in #eng-incidents channel. User typed: 'looks like the redis cluster in us-east-1 is throwing CLUSTERDOWN errors — can someone check if the failover completed? cc @oncall-infra'. Replying to message from @mkhan: 'We're seeing elevated 5xx rates on the payments service since 2:14pm PT.' Channel topic: 'Production incident triage'." ## Output diff --git a/src/napsack/label/prompts/output/dense_image.txt b/src/napsack/label/prompts/output/dense_image.txt index f55ada0..12e508d 100644 --- a/src/napsack/label/prompts/output/dense_image.txt +++ b/src/napsack/label/prompts/output/dense_image.txt @@ -13,6 +13,7 @@ The dense caption should be a concise but information-rich summary optimized for - "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py." - "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12." - "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'." +- "Slack conversation in #eng-incidents channel. User typed: 'looks like the redis cluster in us-east-1 is throwing CLUSTERDOWN errors — can someone check if the failover completed? cc @oncall-infra'. Replying to message from @mkhan: 'We're seeing elevated 5xx rates on the payments service since 2:14pm PT.' Channel topic: 'Production incident triage'." ## Output