Skip to content

Commit 1b915be

#70 simplify and document support for reading streams with mixed JSON and binary data
1 parent dfb4b70 commit 1b915be

4 files changed

Lines changed: 83 additions & 10 deletions


README.md

Lines changed: 71 additions & 2 deletions
@@ -11,7 +11,8 @@ Simple streaming JSON parser and encoder.
 When [reading](#reading) JSON data, `json-stream` can decode JSON data in
 a streaming manner, providing a pythonic dict/list-like interface, or a
 [visitor-based interface](#visitor). It can stream from files, [URLs](#urls)
-or [iterators](#iterators). It can process [multiple JSON documents](#multiple) in a single stream.
+or [iterators](#iterators). It can process [multiple JSON documents](#multiple)
+in a single stream, and can read JSON [mixed with other non-JSON data](#reading-mixed-data).
 
 When [writing](#writing) JSON data, `json-stream` can stream JSON objects
 as you generate them.
@@ -495,13 +496,81 @@ significant parsing speedup compared to pure python implementation.
 `json-stream` will fallback to its pure python tokenizer implementation
 if `json-stream-rs-tokenizer` is not available.
 
+#### <a id="reading-mixed-data"></a> Reading mixed data
+
+When using the Rust tokenizer, you can also use `json-stream` to parse mixed
+data, for example a file containing a JSON document followed by binary data.
+
+To do this, you should pass `correct_cursor=True` to `load()`. This ensures the
+Rust tokenizer keeps track of the exact stream position it has read up to, which
+comes with a **significant performance cost** for un-seekable streams.
+
+After reading the JSON data, call `read_all()` on the top-level object returned
+by `load()` to ensure you are at the end of the JSON data, and then call
+`.tokenizer.park_cursor()` to "park" the underlying file cursor at the correct
+position.
+
+```python
+import json_stream
+
+with open('test.bin', 'rb') as f:
+    # read JSON header
+    header = json_stream.load(f, correct_cursor=True)
+    # ... process JSON header ...
+    header.read_all()
+
+    # ensure the tokenizer has "parked" the file
+    # cursor at the end of the JSON data
+    header.tokenizer.park_cursor()
+
+    # now we can read binary data from the same file
+    binary_start = f.tell()
+    data = f.read()
+```
+
+#### <a id="mixed-scenarios"></a> Other mixed data scenarios
+
+`json-stream` can also handle streams that start with binary data, or have binary
+data between multiple JSON documents.
+
+##### Binary then JSON
+
+You can simply read the binary data from the file before calling `load()`.
+
+```python
+with open('test.bin', 'rb') as f:
+    binary_data = f.read(1024)
+    data = json_stream.load(f)
+    # ... process JSON ...
+```
+
+##### JSON then binary then JSON
+
+You must use `correct_cursor=True` for any JSON document that is followed by
+binary data.
+
+```python
+with open('test.bin', 'rb') as f:
+    # 1. Read first JSON
+    data1 = json_stream.load(f, correct_cursor=True)
+    # ... process data1 ...
+    data1.read_all()
+    data1.tokenizer.park_cursor()
+
+    # 2. Read binary data
+    binary_data = f.read(1024)
+
+    # 3. Read second JSON
+    data2 = json_stream.load(f)
+    # ... process data2 ...
+```
+
 ### Custom tokenizer
 
 You can supply an alternative JSON tokenizer implementation. Simply pass
 a tokenizer to the `load()` or `visit()` methods.
 
 ```python
-json_stream.load(f, tokenizer=some_tokenizer)
+json_stream.load(f, tokenizer=some_tokenizer, tokenizer_kwargs=...)
 ```
 
 The requests methods also accept a custom tokenizer parameter.
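For readers writing their own tokenizer: judging from how `load_many()` consumes the token stream in the `src/json_stream/loader.py` diff of this commit, a tokenizer is a callable that takes the file object (plus any forwarded `tokenizer_kwargs`) and yields `(token_type, token)` pairs. A minimal standalone sketch, using hypothetical token-type constants and a made-up `lowercase_keys` keyword (neither is json-stream's real API):

```python
import io

# Hypothetical token-type constants; json-stream defines its own TokenType.
OPERATOR, STRING, NUMBER = range(3)

def toy_tokenizer(fp, lowercase_keys=False):
    # `lowercase_keys` stands in for any keyword forwarded via tokenizer_kwargs.
    # A real tokenizer also lexes strings and numbers; this toy only emits
    # structural operator tokens, one character at a time.
    for ch in iter(lambda: fp.read(1), ''):
        if ch in '{}[]:,':
            yield OPERATOR, ch

tokens = list(toy_tokenizer(io.StringIO('{}')))
```

A callable of this shape could then be supplied as `json_stream.load(f, tokenizer=toy_tokenizer)`.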

src/json_stream/base.py

Lines changed: 4 additions & 0 deletions
@@ -36,6 +36,10 @@ def __init__(self, token_stream):
         self._stream = token_stream
         self._child: Optional[StreamingJSONBase] = None
 
+    @property
+    def tokenizer(self):
+        return self._stream
+
     def _clear_child(self):
         if self._child is not None:
             self._child.read_all()
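The four added lines can be exercised in isolation. A simplified stand-in class (not the real `StreamingJSONBase`) shows the contract the README relies on when it calls `header.tokenizer.park_cursor()`: the returned object exposes its underlying token stream through a read-only property.

```python
class StreamingJSONBaseSketch:
    """Simplified stand-in for StreamingJSONBase, for illustration only."""

    def __init__(self, token_stream):
        self._stream = token_stream

    @property
    def tokenizer(self):
        # Read-only access to the underlying tokenizer / token stream.
        return self._stream

stream = object()
obj = StreamingJSONBaseSketch(stream)
result = obj.tokenizer
```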

src/json_stream/loader.py

Lines changed: 4 additions & 4 deletions
@@ -3,13 +3,13 @@
 from json_stream.select_tokenizer import default_tokenizer
 
 
-def load(fp_or_iterable, persistent=False, tokenizer=default_tokenizer):
-    return next(load_many(fp_or_iterable, persistent, tokenizer))
+def load(fp_or_iterable, persistent=False, tokenizer=default_tokenizer, **tokenizer_kwargs):
+    return next(load_many(fp_or_iterable, persistent, tokenizer, **tokenizer_kwargs))
 
 
-def load_many(fp_or_iterable, persistent=False, tokenizer=default_tokenizer):
+def load_many(fp_or_iterable, persistent=False, tokenizer=default_tokenizer, **tokenizer_kwargs):
     fp = ensure_file(fp_or_iterable)
-    token_stream = tokenizer(fp)
+    token_stream = tokenizer(fp, **tokenizer_kwargs)
     for token_type, token in token_stream:
         if token_type == TokenType.OPERATOR:
             data = StreamingJSONBase.factory(token, token_stream, persistent)
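The kwargs plumbing above can be sketched with a fake tokenizer (hypothetical names throughout): keyword arguments given to `load()` reach the tokenizer factory unchanged, which is how `correct_cursor=True` in the README examples gets through to the Rust tokenizer.

```python
received = {}

def fake_tokenizer(fp, **kwargs):
    # Record whatever keyword arguments the loader forwards to us.
    received.update(kwargs)
    return iter(())  # no tokens; a real tokenizer yields (token_type, token)

def load_many_sketch(fp, tokenizer, **tokenizer_kwargs):
    # Simplified stand-in for load_many(): forward **tokenizer_kwargs verbatim.
    token_stream = tokenizer(fp, **tokenizer_kwargs)
    for token_type, token in token_stream:
        yield token

# Driving the generator calls the tokenizer with the forwarded kwargs.
list(load_many_sketch(None, fake_tokenizer, correct_cursor=True))
```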

src/json_stream/visitor.py

Lines changed: 4 additions & 4 deletions
@@ -20,9 +20,9 @@ def _visit(obj, visitor, path):
         visitor(obj, path)
 
 
-def visit_many(fp_or_iterator, visitor, tokenizer=default_tokenizer):
+def visit_many(fp_or_iterator, visitor, tokenizer=default_tokenizer, **tokenizer_kwargs):
     fp = ensure_file(fp_or_iterator)
-    token_stream = tokenizer(fp)
+    token_stream = tokenizer(fp, **tokenizer_kwargs)
     for token_type, token in token_stream:
         if token_type == TokenType.OPERATOR:
             obj = StreamingJSONBase.factory(token, token_stream, persistent=False)
@@ -33,5 +33,5 @@ def visit_many(fp_or_iterator, visitor, tokenizer=default_tokenizer):
         yield
 
 
-def visit(fp_or_iterator, visitor, tokenizer=default_tokenizer):
-    next(visit_many(fp_or_iterator, visitor, tokenizer))
+def visit(fp_or_iterator, visitor, tokenizer=default_tokenizer, **tokenizer_kwargs):
+    next(visit_many(fp_or_iterator, visitor, tokenizer, **tokenizer_kwargs))
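As a toy illustration of the visitor contract, a standalone sketch over plain Python objects (not json-stream's streaming implementation): each terminal value is reported together with a tuple path, mirroring the `visitor(obj, path)` call in the hunk above.

```python
def toy_visit(obj, visitor, path=()):
    # Recurse into containers, extending the path; call the visitor only
    # on terminal (non-container) values.
    if isinstance(obj, dict):
        for key, value in obj.items():
            toy_visit(value, visitor, path + (key,))
    elif isinstance(obj, list):
        for index, value in enumerate(obj):
            toy_visit(value, visitor, path + (index,))
    else:
        visitor(obj, path)

seen = []
toy_visit({"a": [1, 2]}, lambda value, path: seen.append((value, path)))
```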
