diff --git a/edgar/documents/utils/streaming.py b/edgar/documents/utils/streaming.py index 3f7a5a60..d9da9a94 100644 --- a/edgar/documents/utils/streaming.py +++ b/edgar/documents/utils/streaming.py @@ -58,6 +58,10 @@ def _reset_state(self): self.text_buffer = [] self.in_table = False self._table_depth = 0 + # Depth counter for structural elements (p, h1-h6, section) that read + # their full subtree at end-tag time. Used to defer elem.clear() until + # the enclosing structural element has finished reading its children. + self._content_depth = 0 self.table_buffer = [] def parse(self, html: str) -> "Document": @@ -108,10 +112,20 @@ def parse(self, html: str) -> "Document": if len(self.node_buffer) >= self.MAX_NODE_BUFFER: self._flush_buffer() - # Clean up processed elements to save memory. - # Skip clearing while inside a table — _end_table needs - # the full element tree (tr/td children) to extract data. - if self._table_depth == 0: + # Clean up processed elements to save memory. Two gates: + # - Only on `end` events: at `start`, lxml html-mode has + # pre-populated children/text via lookahead and structural + # handlers (e.g., _start_heading) read them; clearing at + # start destroys that content. + # - Skip while inside a table or other structural content + # element (p/h1-h6/section). _end_paragraph/_end_heading/ + # _end_section read the full child subtree via + # _get_text_content; clearing children first (depth-first + # end events fire for descendants before ancestors) wipes + # their .text and .tail and produces empty text silently. 
+ if (event == 'end' + and self._table_depth == 0 + and self._content_depth == 0): elem.clear() while elem.getprevious() is not None: parent = elem.getparent() @@ -176,12 +190,20 @@ def _handle_start_tag(self, elem: HtmlElement): body_container = ContainerNode(tag_name='body') self.root.add_child(body_container) self.current_parent = body_container + elif tag == 'table': + self._start_table(elem) + elif self._table_depth > 0: + # Suppress structural-content handlers while inside a table. + # _end_table runs processor.process(elem) over the full table + # subtree and emits each cell's text as part of the TableNode; + # also emitting
/ 's end
+ event ran _get_text_content(p). SEC filings wrap virtually every
+ word in <span> tags, so streaming-mode paragraphs
+ produced empty text — silently, with no warning.
+
+ Symptom in production: filings in the ~30MB–110MB band (which
+ cross the default 10MB streaming_threshold) returned text() output
+ 20%+ shorter than the non-streaming path; for some filings,
+ nearly empty. No exception was raised.
+
+ Fix: edgar/documents/utils/streaming.py — clear only on end
+ events, and gate clearing on a content-depth counter that tracks
+ open p/h1-h6/section elements (matching the existing _table_depth
+ gate). This defers child cleanup until the enclosing structural
+ element has read its subtree.
+
+ Expected: Streaming-mode text() returns the full paragraph
+ content, including text inside nested wrappers.
+ """
+ # Mimics SEC filing structure: every word inside its own <span>.
+ html = (
+ ""
+ " Alpha beta gamma second paragraph nested spans everywhere ...
+ start/end handlers fired unconditionally) and once as TableNode
+ cell text (because _end_table walks the full subtree via
+ processor.process(elem)). Same applies to handlers
+ produced empty paragraphs anyway. Once paragraph text was
+ recovered, the duplication showed up as 10-36% content overshoot
+ vs non-streaming on table-heavy filings — visible as the same
+ financial-statement labels ('Total', interest-income line items,
+ etc.) repeating dozens of times more in streaming output than
+ non-streaming output.
+
+ Fix: _handle_start_tag / _handle_end_tag gate / Cell paragraph one Cell paragraph two Row two A Row two BRisk Factors
"
+ "
+ twice — once as a free-standing ParagraphNode (because the .
+
+ This was masked before the span-bug fix because "
+ "
"
+ ""
+ )
+
+ streaming_cfg = ParserConfig(
+ streaming_threshold=1,
+ max_document_size=10 * 1024 * 1024,
+ )
+ text = parse_html(html, config=streaming_cfg).text()
+
+ # Each cell must appear exactly once. Before this PR's fix, each of
+ # these would appear twice (once as standalone paragraph, once
+ # as a table cell), and `text.count(...) == 2`.
+ for cell in ("Cell paragraph one", "Cell paragraph two",
+ "Row two A", "Row two B"):
+ assert text.count(cell) == 1, (
+ f"{cell!r} appears {text.count(cell)} times in streaming "
+ f"output (expected 1) — table cell content is being "
+ f"double-emitted as both ParagraphNode and TableNode cell"
+ )
+
+ # And the non-streaming baseline must show the same single-emission
+ # behaviour, so the assertion isn't accidentally locking in a
+ # streaming-specific quirk.
+ normal_cfg = ParserConfig(streaming_threshold=10 * 1024 * 1024)
+ normal_text = parse_html(html, config=normal_cfg).text()
+ for cell in ("Cell paragraph one", "Cell paragraph two",
+ "Row two A", "Row two B"):
+ assert normal_text.count(cell) == 1, (
+ f"baseline emits {cell!r} {normal_text.count(cell)} times"
+ )
+
class TestSectionDetectionRegressions:
"""Regression tests for section detection bugs."""
"
+ " "
+ " "
+ " "
+ "