diff --git a/edgar/documents/utils/streaming.py b/edgar/documents/utils/streaming.py index 3f7a5a60..d9da9a94 100644 --- a/edgar/documents/utils/streaming.py +++ b/edgar/documents/utils/streaming.py @@ -58,6 +58,10 @@ def _reset_state(self): self.text_buffer = [] self.in_table = False self._table_depth = 0 + # Depth counter for structural elements (p, h1-h6, section) that read + # their full subtree at end-tag time. Used to defer elem.clear() until + # the enclosing structural element has finished reading its children. + self._content_depth = 0 self.table_buffer = [] def parse(self, html: str) -> "Document": @@ -108,10 +112,20 @@ def parse(self, html: str) -> "Document": if len(self.node_buffer) >= self.MAX_NODE_BUFFER: self._flush_buffer() - # Clean up processed elements to save memory. - # Skip clearing while inside a table — _end_table needs - # the full element tree (tr/td children) to extract data. - if self._table_depth == 0: + # Clean up processed elements to save memory. Two gates: + # - Only on `end` events: at `start`, lxml html-mode has + # pre-populated children/text via lookahead and structural + # handlers (e.g., _start_heading) read them; clearing at + # start destroys that content. + # - Skip while inside a table or other structural content + # element (p/h1-h6/section). _end_paragraph/_end_heading/ + # _end_section read the full child subtree via + # _get_text_content; clearing children first (depth-first + # end events fire for descendants before ancestors) wipes + # their .text and .tail and produces empty text silently. 
+ if (event == 'end' + and self._table_depth == 0 + and self._content_depth == 0): elem.clear() while elem.getprevious() is not None: parent = elem.getparent() @@ -176,12 +190,20 @@ def _handle_start_tag(self, elem: HtmlElement): body_container = ContainerNode(tag_name='body') self.root.add_child(body_container) self.current_parent = body_container + elif tag == 'table': + self._start_table(elem) + elif self._table_depth > 0: + # Suppress structural-content handlers while inside a table. + # _end_table runs processor.process(elem) over the full table + # subtree and emits each cell's text as part of the TableNode; + # also emitting

<p>/<h1-h6>/<section>

nodes here would produce the + # same text twice in document.text(). Symmetrical to the + # existing _table_depth gate on elem.clear() above. + pass elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: self._start_heading(elem) elif tag == 'p': self._start_paragraph(elem) - elif tag == 'table': - self._start_table(elem) elif tag == 'section': self._start_section(elem) @@ -194,12 +216,14 @@ def _handle_end_tag(self, elem: HtmlElement): self.tag_stack.pop() # Handle specific tags - if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + if tag == 'table': + self._end_table(elem) + elif self._table_depth > 0: + pass # see _handle_start_tag: structural handlers are suppressed inside tables + elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: self._end_heading(elem) elif tag == 'p': self._end_paragraph(elem) - elif tag == 'table': - self._end_table(elem) elif tag == 'section': self._end_section(elem) elif tag == 'body': @@ -217,6 +241,7 @@ def _start_heading(self, elem: HtmlElement): # Import node types at runtime to avoid circular imports from edgar.documents.nodes import HeadingNode + self._content_depth += 1 level = int(elem.tag[1]) text = self._get_text_content(elem) @@ -246,12 +271,15 @@ def _end_heading(self, elem: HtmlElement): # Clear any accumulated text buffer self.text_buffer.clear() + if self._content_depth > 0: + self._content_depth -= 1 def _start_paragraph(self, elem: HtmlElement): """Start processing a paragraph.""" # Import node types at runtime to avoid circular imports from edgar.documents.nodes import ParagraphNode + self._content_depth += 1 para = ParagraphNode() # Get style if present @@ -275,6 +303,8 @@ def _end_paragraph(self, elem: HtmlElement): # Clear any accumulated text buffer self.text_buffer.clear() + if self._content_depth > 0: + self._content_depth -= 1 def _start_table(self, elem: HtmlElement): """Start processing a table.""" @@ -311,6 +341,7 @@ def _start_section(self, elem: HtmlElement): # Import node types at runtime to avoid circular imports 
from edgar.documents.nodes import SectionNode + self._content_depth += 1 section = SectionNode() # Get section attributes @@ -328,6 +359,8 @@ def _start_section(self, elem: HtmlElement): def _end_section(self, elem: HtmlElement): """End processing a section.""" self.current_section = None + if self._content_depth > 0: + self._content_depth -= 1 def _flush_buffer(self): """Flush node buffer to document tree.""" diff --git a/tests/test_html_parser_regressions.py b/tests/test_html_parser_regressions.py index f66c3a69..34179f8e 100644 --- a/tests/test_html_parser_regressions.py +++ b/tests/test_html_parser_regressions.py @@ -158,6 +158,127 @@ def test_large_document_streaming_trigger(self): doc = parse_html(html, config=config) assert doc is not None + def test_streaming_preserves_span_wrapped_paragraph_text(self): + """ + Regression: Streaming parser dropped text from -wrapped paragraphs. + + Bug: The iterparse loop called elem.clear() on every event (both + start and end), and on every element regardless of whether an + enclosing structural element (p/h1-h6/section) had finished reading + its children. Because iterparse fires end events depth-first, the + inner 's end event cleared its .text/.tail before

's end + event ran _get_text_content(p). SEC filings wrap virtually every + word in tags, so streaming-mode paragraphs + produced empty text — silently, with no warning. + + Symptom in production: filings in the ~30MB–110MB band (which + cross the default 10MB streaming_threshold) returned text() output + 20%+ shorter than the non-streaming path; for some filings, + nearly empty. No exception was raised. + + Fix: edgar/documents/utils/streaming.py — clear only on end + events, and gate clearing on a content-depth counter that tracks + open p/h1-h6/section elements (matching the existing _table_depth + gate). This defers child cleanup until the enclosing structural + element has read its subtree. + + Expected: Streaming-mode text() returns the full paragraph + content, including text inside nested wrappers. + """ + # Mimics SEC filing structure: every word inside its own . + html = ( + "" + "

Alpha beta gamma

" + "

second paragraph

" + "

Risk Factors

" + "

nested spans everywhere

" + "" + ) + + # Force streaming mode regardless of size. + streaming_cfg = ParserConfig( + streaming_threshold=1, + max_document_size=10 * 1024 * 1024, + ) + text = parse_html(html, config=streaming_cfg).text() + + # All paragraph and heading content must survive the streaming path. + assert "Alpha" in text and "beta" in text and "gamma" in text + assert "second paragraph" in text + assert "Risk Factors" in text + assert "nested spans everywhere" in text + + # Non-streaming baseline must agree on the same content. + normal_cfg = ParserConfig(streaming_threshold=10 * 1024 * 1024) + normal_text = parse_html(html, config=normal_cfg).text() + for needle in ("Alpha", "beta", "gamma", "second paragraph", + "Risk Factors", "nested spans everywhere"): + assert needle in normal_text, f"baseline missing {needle!r}" + + def test_streaming_does_not_double_emit_table_cell_paragraphs(self): + """ + Regression: Streaming parser emitted text inside

<td><p>...</p></td>

+ twice — once as a free-standing ParagraphNode (because the <p>
+ start/end handlers fired unconditionally) and once as TableNode + cell text (because _end_table walks the full subtree via + processor.process(elem)). Same applies to <h1-h6> and <section>
+ inside <table>. + + This was masked before the span-bug fix because <p>
handlers + produced empty paragraphs anyway. Once paragraph text was + recovered, the duplication showed up as 10-36% content overshoot + vs non-streaming on table-heavy filings — visible as the same + financial-statement labels ('Total', interest-income line items, + etc.) repeating dozens of times more in streaming output than + non-streaming output. + + Fix: _handle_start_tag / _handle_end_tag gate

<p>/<h1-h6>/<section>

+ on _table_depth == 0, symmetrical to the existing _table_depth + gate on elem.clear(). The table processor remains the single + source of cell text. + + Expected: each cell's text appears exactly once in streaming + output, matching non-streaming behaviour. + """ + html = ( + "" + "" + "" + " " + "" + " " + "

Cell paragraph one

Cell paragraph two

Row two A

Row two B

" + "" + ) + + streaming_cfg = ParserConfig( + streaming_threshold=1, + max_document_size=10 * 1024 * 1024, + ) + text = parse_html(html, config=streaming_cfg).text() + + # Each cell must appear exactly once. Pre-fix this PR, each of + # these would appear twice (once as standalone paragraph, once + # as a table cell), and `text.count(...) == 2`. + for cell in ("Cell paragraph one", "Cell paragraph two", + "Row two A", "Row two B"): + assert text.count(cell) == 1, ( + f"{cell!r} appears {text.count(cell)} times in streaming " + f"output (expected 1) — table cell content is being " + f"double-emitted as both ParagraphNode and TableNode cell" + ) + + # And the non-streaming baseline must show the same single-emission + # behaviour, so the assertion isn't accidentally locking in a + # streaming-specific quirk. + normal_cfg = ParserConfig(streaming_threshold=10 * 1024 * 1024) + normal_text = parse_html(html, config=normal_cfg).text() + for cell in ("Cell paragraph one", "Cell paragraph two", + "Row two A", "Row two B"): + assert normal_text.count(cell) == 1, ( + f"baseline emits {cell!r} {normal_text.count(cell)} times" + ) + class TestSectionDetectionRegressions: """Regression tests for section detection bugs."""