From 1b44516a030e85f164c917af3e37359721059ca4 Mon Sep 17 00:00:00 2001
From: Omar Shaikh <oshaikh13@gmail.com>
Date: Fri, 3 Apr 2026 23:06:48 -0700
Subject: [PATCH 1/2] dense captions on napsack

---
 pyproject.toml                                |  2 +-
 src/napsack/label/__main__.py                 |  4 +++
 src/napsack/label/clients/__init__.py         |  4 ++-
 src/napsack/label/clients/client.py           | 17 ++++++++++
 src/napsack/label/models.py                   | 14 ++++++--
 src/napsack/label/processor.py                | 31 +++++++++++++++---
 src/napsack/label/prompts/default.txt         | 14 +-------
 src/napsack/label/prompts/image_mode.txt      | 14 +-------
 src/napsack/label/prompts/output/dense.txt    | 32 +++++++++++++++++++
 .../label/prompts/output/dense_image.txt      | 32 +++++++++++++++++++
 src/napsack/label/prompts/output/standard.txt | 13 ++++++++
 .../label/prompts/output/standard_image.txt   | 13 ++++++++
 .../label/prompts/screenshots_only.txt        | 14 +-------
 13 files changed, 155 insertions(+), 49 deletions(-)
 create mode 100644 src/napsack/label/prompts/output/dense.txt
 create mode 100644 src/napsack/label/prompts/output/dense_image.txt
 create mode 100644 src/napsack/label/prompts/output/standard.txt
 create mode 100644 src/napsack/label/prompts/output/standard_image.txt

diff --git a/pyproject.toml b/pyproject.toml
index c2da310..9249bcb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "napsack"
-version = "0.1.1"
+version = "0.1.2"
 readme = "README.md"
 description = "NAPsack records and aggregates your computer use — screenshots plus input events (click, keypress, scroll, cursor move). It groups activity into event bursts and uses a VLM pipeline to generate human-readable captions describing what happened."
 requires-python = ">=3.11,<=3.13"
diff --git a/src/napsack/label/__main__.py b/src/napsack/label/__main__.py
index 2588e5b..208bad8 100644
--- a/src/napsack/label/__main__.py
+++ b/src/napsack/label/__main__.py
@@ -28,6 +28,7 @@ def parse_args():
     p.add_argument("--dedupe-threshold", type=int, default=1, help="Hamming distance threshold for deduplication (drop if <= threshold, default: 1)")
     p.add_argument("--annotate", action="store_true", help="Annotate videos with cursor positions and clicks (only for standard processing)")
     p.add_argument("--image-mode", action="store_true", help="Send frames as individual images instead of video (for models that don't support video input)")
+    p.add_argument("--dense-caption", action="store_true", help="Include a dense text caption per chunk describing important text the user focused on, for retrieval")
     p.add_argument("--skip-existing", action="store_true", help="Skip sessions that have already been processed")
     p.add_argument("--visualize", action="store_true", help="Create annotated video visualizations after processing")
     p.add_argument("--encode-only", action="store_true", help="Only encode videos (create chunks), skip labeling. Useful for pre-processing before running the full pipeline.")
@@ -121,6 +122,7 @@ def process_with_litellm(args, configs):
         hash_cache_path=args.hash_cache,
         dedupe_threshold=args.dedupe_threshold,
         image_mode=args.image_mode,
+        dense_caption=args.dense_caption,
     )
 
     return processor.process_sessions(
@@ -151,6 +153,7 @@ def process_with_bigquery(args, configs):
         hash_cache_path=args.hash_cache,
         dedupe_threshold=args.dedupe_threshold,
         image_mode=args.image_mode,
+        dense_caption=args.dense_caption,
     )
 
     return processor.process_sessions(
@@ -179,6 +182,7 @@ def process_with_tinfoil(args, configs):
         hash_cache_path=args.hash_cache,
         dedupe_threshold=args.dedupe_threshold,
         image_mode=args.image_mode,
+        dense_caption=args.dense_caption,
     )
 
     return processor.process_sessions(
diff --git a/src/napsack/label/clients/__init__.py b/src/napsack/label/clients/__init__.py
index 6dc8fd4..b6f85f0 100644
--- a/src/napsack/label/clients/__init__.py
+++ b/src/napsack/label/clients/__init__.py
@@ -1,4 +1,4 @@
-from napsack.label.clients.client import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA
+from napsack.label.clients.client import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA, DENSE_CAPTION_SCHEMA, DENSE_IMAGE_CAPTION_SCHEMA
 from napsack.label.clients.litellm import LiteLLMClient
 from napsack.label.clients.bigquery import BigQueryClient, BigQueryResponse
 from napsack.label.clients.tinfoil import TinfoilClient
@@ -23,5 +23,7 @@ def create_client(client_type: str, **kwargs) -> VLMClient:
     "TinfoilClient",
     "CAPTION_SCHEMA",
     "IMAGE_CAPTION_SCHEMA",
+    "DENSE_CAPTION_SCHEMA",
+    "DENSE_IMAGE_CAPTION_SCHEMA",
     "create_client",
 ]
diff --git a/src/napsack/label/clients/client.py b/src/napsack/label/clients/client.py
index 1522a64..69e91af 100644
--- a/src/napsack/label/clients/client.py
+++ b/src/napsack/label/clients/client.py
@@ -28,6 +28,23 @@
     }
 }
 
+DENSE_CAPTION_SCHEMA = {  # dense-mode response: CAPTION_SCHEMA nested under "actions" plus a "dense_caption" string
+    "type": "object",
+    "properties": {
+        "actions": CAPTION_SCHEMA,  # the existing caption schema, reused unchanged
+        "dense_caption": {"type": "string"}  # a single string for the whole response object
+    },
+    "required": ["actions", "dense_caption"]  # both fields must be present
+}
+
+DENSE_IMAGE_CAPTION_SCHEMA = {  # dense-mode response: IMAGE_CAPTION_SCHEMA nested under "actions" plus a "dense_caption" string
+    "type": "object",
+    "properties": {
+        "actions": IMAGE_CAPTION_SCHEMA,  # the existing image-mode caption schema, reused unchanged
+        "dense_caption": {"type": "string"}  # a single string for the whole response object
+    },
+    "required": ["actions", "dense_caption"]  # both fields must be present
+}
 
 class VLMClient(ABC):
     @abstractmethod
diff --git a/src/napsack/label/models.py b/src/napsack/label/models.py
index fd91a78..6d924bd 100644
--- a/src/napsack/label/models.py
+++ b/src/napsack/label/models.py
@@ -366,6 +366,7 @@ class Caption:
     end_seconds: float
     text: str
     chunk_index: int = 0
+    dense_caption: Optional[str] = None
 
     @property
     def start_formatted(self) -> str:
@@ -381,11 +382,12 @@ def from_dict(cls, data: Dict) -> Caption:
             start_seconds=data['start_seconds'],
             end_seconds=data['end_seconds'],
             text=data['caption'],
-            chunk_index=data.get('chunk_index', 0)
+            chunk_index=data.get('chunk_index', 0),
+            dense_caption=data.get('dense_caption'),
         )
 
     def to_dict(self) -> Dict:
-        return {
+        d = {
             'start': self.start_formatted,
             'end': self.end_formatted,
             'start_seconds': self.start_seconds,
@@ -393,6 +395,9 @@ def to_dict(self) -> Dict:
             'caption': self.text,
             'chunk_index': self.chunk_index
         }
+        if self.dense_caption is not None:
+            d['dense_caption'] = self.dense_caption
+        return d
 
 
 @dataclass
@@ -417,7 +422,7 @@ def all_events(self) -> List[Event]:
         return events
 
     def to_dict(self) -> Dict:
-        return {
+        d = {
             'start_time': self.aggregations[0].timestamp if self.aggregations else 0,
             'end_time': self.aggregations[-1].timestamp if self.aggregations else 0,
             'start_index': self.start_index,
@@ -430,6 +435,9 @@ def to_dict(self) -> Dict:
             'end_formatted': self.caption.end_formatted,
             'scale_factor': self.screenshot_scale_factor
         }
+        if self.caption.dense_caption is not None:
+            d['dense_caption'] = self.caption.dense_caption
+        return d
 
 
 @dataclass
diff --git a/src/napsack/label/processor.py b/src/napsack/label/processor.py
index 31a4693..d97b25e 100644
--- a/src/napsack/label/processor.py
+++ b/src/napsack/label/processor.py
@@ -9,7 +9,7 @@
 
 from napsack.label.models import SessionConfig, ChunkTask, Caption, Aggregation, VideoPath, MatchedCaption
 from napsack.label.video import create_video, split_video, compute_max_size
-from napsack.label.clients import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA
+from napsack.label.clients import VLMClient, CAPTION_SCHEMA, IMAGE_CAPTION_SCHEMA, DENSE_CAPTION_SCHEMA, DENSE_IMAGE_CAPTION_SCHEMA
 
 
 # ============================================================================
@@ -144,16 +144,21 @@ def __init__(
         hash_cache_path: Optional[str] = None,
         dedupe_threshold: int = 1,
         image_mode: bool = False,
+        dense_caption: bool = False,
     ):
         self.client = client
         self.encode_workers = encode_workers
         self.label_workers = label_workers
         self.screenshots_only = screenshots_only
-        self.prompt = self._load_prompt(prompt_file)
         self.max_time_gap = max_time_gap
         self.dedupe_threshold = dedupe_threshold
         self.hash_map = load_hash_cache(hash_cache_path) if hash_cache_path else None
         self.image_mode = image_mode
+        self.dense_caption = dense_caption
+
+        base_prompt = self._load_prompt(prompt_file)
+        output_format = self._load_output_format(image_mode, dense_caption)
+        self.prompt = base_prompt.replace("{{OUTPUT_FORMAT}}", output_format)
 
     def _load_prompt(self, path: str) -> str:
         p = Path(path)
@@ -161,6 +166,13 @@ def _load_prompt(self, path: str) -> str:
             p = Path(__file__).parent / path
         return p.read_text()
 
+    def _load_output_format(self, image_mode: bool, dense_caption: bool) -> str:
+        """Load the output-format prompt fragment selected by the two mode flags."""
+        plain = "prompts/output/standard_image.txt" if image_mode else "prompts/output/standard.txt"
+        dense = "prompts/output/dense_image.txt" if image_mode else "prompts/output/dense.txt"
+        chosen = dense if dense_caption else plain
+        return self._load_prompt(chosen)
+
     def process_sessions(
         self,
         configs: List[SessionConfig],
@@ -535,6 +547,7 @@ def _process_tasks(self, tasks: List[ChunkTask], config_map: dict) -> List[Tuple
     def _process_single_task(self, task: ChunkTask) -> any:
         """Process single task with schema."""
         if self.image_mode:
+            schema = DENSE_IMAGE_CAPTION_SCHEMA if self.dense_caption else IMAGE_CAPTION_SCHEMA
             per_frame_text = None
             if task.aggregations:
                 per_frame_text = [agg.to_prompt(f"Frame {j + 1}") for j, agg in enumerate(task.aggregations)]
@@ -542,10 +555,11 @@ def _process_single_task(self, task: ChunkTask) -> any:
                 [str(p) for p in task.image_paths], session_id=task.session_id,
                 per_frame_text=per_frame_text,
             )
-            response = self.client.generate(task.prompt, file_desc, schema=IMAGE_CAPTION_SCHEMA)
+            response = self.client.generate(task.prompt, file_desc, schema=schema)
         else:
+            schema = DENSE_CAPTION_SCHEMA if self.dense_caption else CAPTION_SCHEMA
             file_desc = self.client.upload_file(str(task.video_path.resolve()), session_id=task.session_id)
-            response = self.client.generate(task.prompt, file_desc, schema=CAPTION_SCHEMA)
+            response = self.client.generate(task.prompt, file_desc, schema=schema)
 
         return response
 
@@ -597,6 +611,12 @@ def _save_results(
 
     def _extract_captions(self, result: any, task: ChunkTask, fps: int = 1) -> List[Caption]:
         captions = []
+        dense_caption_text = None
+
+        # Unwrap dense caption response format
+        if isinstance(result, dict) and "actions" in result:
+            dense_caption_text = result.get("dense_caption") if self.dense_caption else None
+            result = result.get("actions", [])
 
         if isinstance(result, str) or not isinstance(result, list):
             return captions
@@ -636,7 +656,8 @@ def _extract_captions(self, result: any, task: ChunkTask, fps: int = 1) -> List[
                 start_seconds=abs_start,
                 end_seconds=abs_end,
                 text=item.get("caption", item.get("description", "")),
-                chunk_index=task.chunk_index
+                chunk_index=task.chunk_index,
+                dense_caption=dense_caption_text,
             ))
 
         return captions
diff --git a/src/napsack/label/prompts/default.txt b/src/napsack/label/prompts/default.txt
index 666ae21..c54f771 100644
--- a/src/napsack/label/prompts/default.txt
+++ b/src/napsack/label/prompts/default.txt
@@ -83,16 +83,4 @@ Generated captions must be in past tense, and at the level of detail as the exam
 
 You MUST quote specific things from the screen so it's easy to reproduce your steps.
 
-## Output
-
-A JSON array of objects:
-
-```json
-[
-  {
-    "start": "MM:SS",
-    "end":   "MM:SS",
-    "caption": "..."
-  }
-]
-```
+{{OUTPUT_FORMAT}}
diff --git a/src/napsack/label/prompts/image_mode.txt b/src/napsack/label/prompts/image_mode.txt
index 0759d29..ea62814 100644
--- a/src/napsack/label/prompts/image_mode.txt
+++ b/src/napsack/label/prompts/image_mode.txt
@@ -77,16 +77,4 @@ Generated captions must be in past tense, and at the level of detail as the exam
 
 You MUST quote specific things from the screen so it's easy to reproduce your steps.
 
-## Output
-
-A JSON array of objects. Use **frame numbers** (integers) for start and end:
-
-```json
-[
-  {
-    "start": 1,
-    "end":   3,
-    "caption": "..."
-  }
-]
-```
+{{OUTPUT_FORMAT}}
diff --git a/src/napsack/label/prompts/output/dense.txt b/src/napsack/label/prompts/output/dense.txt
new file mode 100644
index 0000000..368380d
--- /dev/null
+++ b/src/napsack/label/prompts/output/dense.txt
@@ -0,0 +1,32 @@
+## Dense Caption
+
+In addition to the actions array, you MUST also produce a `dense_caption` field: a single string that captures the **important text content the user was focused on or interacting with** during this chunk. This includes:
+- Text being read, written, or edited
+- URLs, file paths, code snippets, search queries
+- Names, labels, or data values visible and relevant to the user's activity
+
+The dense caption should be a concise but information-rich summary optimized for text search and retrieval. Do NOT describe actions — instead, capture the **textual content** itself.
+
+### Examples
+
+- "VS Code editor open to src/utils/auth.py. Function `verify_jwt_token(token: str, secret: str) -> dict` on lines 42-58. Import statements: `from jose import jwt`, `from datetime import datetime, timedelta`. Error highlighted on line 51: `jwt.ExpiredSignatureError`."
+- "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py."
+- "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12."
+- "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'."
+
+## Output
+
+Return a JSON object with two fields:
+
+```json
+{
+  "actions": [
+    {
+      "start": "MM:SS",
+      "end":   "MM:SS",
+      "caption": "..."
+    }
+  ],
+  "dense_caption": "A single string capturing important text content..."
+}
+```
\ No newline at end of file
diff --git a/src/napsack/label/prompts/output/dense_image.txt b/src/napsack/label/prompts/output/dense_image.txt
new file mode 100644
index 0000000..f55ada0
--- /dev/null
+++ b/src/napsack/label/prompts/output/dense_image.txt
@@ -0,0 +1,32 @@
+## Dense Caption
+
+In addition to the actions array, you MUST also produce a `dense_caption` field: a single string that captures the **important text content the user was focused on or interacting with** during this chunk. This includes:
+- Text being read, written, or edited
+- URLs, file paths, code snippets, search queries
+- Names, labels, or data values visible and relevant to the user's activity
+
+The dense caption should be a concise but information-rich summary optimized for text search and retrieval. Do NOT describe actions — instead, capture the **textual content** itself.
+
+### Examples
+
+- "VS Code editor open to src/utils/auth.py. Function `verify_jwt_token(token: str, secret: str) -> dict` on lines 42-58. Import statements: `from jose import jwt`, `from datetime import datetime, timedelta`. Error highlighted on line 51: `jwt.ExpiredSignatureError`."
+- "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py."
+- "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12."
+- "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'."
+
+## Output
+
+Return a JSON object with two fields. Use **frame numbers** (integers) for start and end:
+
+```json
+{
+  "actions": [
+    {
+      "start": 1,
+      "end":   3,
+      "caption": "..."
+    }
+  ],
+  "dense_caption": "A single string capturing important text content..."
+}
+```
\ No newline at end of file
diff --git a/src/napsack/label/prompts/output/standard.txt b/src/napsack/label/prompts/output/standard.txt
new file mode 100644
index 0000000..4bfe979
--- /dev/null
+++ b/src/napsack/label/prompts/output/standard.txt
@@ -0,0 +1,13 @@
+## Output
+
+A JSON array of objects:
+
+```json
+[
+  {
+    "start": "MM:SS",
+    "end":   "MM:SS",
+    "caption": "..."
+  }
+]
+```
\ No newline at end of file
diff --git a/src/napsack/label/prompts/output/standard_image.txt b/src/napsack/label/prompts/output/standard_image.txt
new file mode 100644
index 0000000..bf77933
--- /dev/null
+++ b/src/napsack/label/prompts/output/standard_image.txt
@@ -0,0 +1,13 @@
+## Output
+
+A JSON array of objects. Use **frame numbers** (integers) for start and end:
+
+```json
+[
+  {
+    "start": 1,
+    "end":   3,
+    "caption": "..."
+  }
+]
+```
\ No newline at end of file
diff --git a/src/napsack/label/prompts/screenshots_only.txt b/src/napsack/label/prompts/screenshots_only.txt
index d3dc8e8..701ac2a 100644
--- a/src/napsack/label/prompts/screenshots_only.txt
+++ b/src/napsack/label/prompts/screenshots_only.txt
@@ -72,16 +72,4 @@ Generated captions must be in past tense, and at the level of detail as the exam
 
 You MUST quote specific things from the screen so it's easy to reproduce your steps.
 
-## Output
-
-A JSON array of objects:
-
-```json
-[
-  {
-    "start": "MM:SS",
-    "end":   "MM:SS",
-    "caption": "..."
-  }
-]
-```
\ No newline at end of file
+{{OUTPUT_FORMAT}}
\ No newline at end of file

From 0ab6bf9a40949c2dfad67088d1fa779a784a06a1 Mon Sep 17 00:00:00 2001
From: Omar Shaikh <oshaikh13@gmail.com>
Date: Fri, 3 Apr 2026 23:17:36 -0700
Subject: [PATCH 2/2] add Slack example to dense caption prompts

---
 src/napsack/label/prompts/default.txt            | 2 +-
 src/napsack/label/prompts/output/dense.txt       | 1 +
 src/napsack/label/prompts/output/dense_image.txt | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/napsack/label/prompts/default.txt b/src/napsack/label/prompts/default.txt
index c54f771..58e6557 100644
--- a/src/napsack/label/prompts/default.txt
+++ b/src/napsack/label/prompts/default.txt
@@ -83,4 +83,4 @@ Generated captions must be in past tense, and at the level of detail as the exam
 
 You MUST quote specific things from the screen so it's easy to reproduce your steps.
 
-{{OUTPUT_FORMAT}}
+{{OUTPUT_FORMAT}}
\ No newline at end of file
diff --git a/src/napsack/label/prompts/output/dense.txt b/src/napsack/label/prompts/output/dense.txt
index 368380d..f3ceb36 100644
--- a/src/napsack/label/prompts/output/dense.txt
+++ b/src/napsack/label/prompts/output/dense.txt
@@ -13,6 +13,7 @@ The dense caption should be a concise but information-rich summary optimized for
 - "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py."
 - "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12."
 - "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'."
+- "Slack conversation in #eng-incidents channel. User typed: 'looks like the redis cluster in us-east-1 is throwing CLUSTERDOWN errors — can someone check if the failover completed? cc @oncall-infra'. Replying to message from @mkhan: 'We're seeing elevated 5xx rates on the payments service since 2:14pm PT.' Channel topic: 'Production incident triage'."
 
 ## Output
 
diff --git a/src/napsack/label/prompts/output/dense_image.txt b/src/napsack/label/prompts/output/dense_image.txt
index f55ada0..12e508d 100644
--- a/src/napsack/label/prompts/output/dense_image.txt
+++ b/src/napsack/label/prompts/output/dense_image.txt
@@ -13,6 +13,7 @@ The dense caption should be a concise but information-rich summary optimized for
 - "Chrome browser on GitHub pull request #347 'Fix race condition in connection pool' in repo acme/backend. Files changed: pool.py (+23 -8), test_pool.py (+45). Review comment from @danl: 'Should we add a timeout here?' on line 89 of pool.py."
 - "Terminal running `kubectl get pods -n staging`. Output shows 4 pods: api-server-7f8b (Running), worker-3a2c (CrashLoopBackOff), redis-cache-1d4e (Running), nginx-proxy-9b1f (Running). Error on worker pod: OOMKilled, restarts: 12."
 - "Google Sheets spreadsheet 'Q2 Marketing Budget'. Editing cell D14 with formula `=SUM(D2:D13)`. Column D header: 'Spend ($)'. Visible values: D2=1200, D3=3400, D5=890. Sheet tab: 'Paid Ads'."
+- "Slack conversation in #eng-incidents channel. User typed: 'looks like the redis cluster in us-east-1 is throwing CLUSTERDOWN errors — can someone check if the failover completed? cc @oncall-infra'. Replying to message from @mkhan: 'We're seeing elevated 5xx rates on the payments service since 2:14pm PT.' Channel topic: 'Production incident triage'."
 
 ## Output