Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 42 additions & 9 deletions edgar/documents/utils/streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ def _reset_state(self):
self.text_buffer = []
self.in_table = False
self._table_depth = 0
# Depth counter for structural elements (p, h1-h6, section) that read
# their full subtree at end-tag time. Used to defer elem.clear() until
# the enclosing structural element has finished reading its children.
self._content_depth = 0
self.table_buffer = []

def parse(self, html: str) -> "Document":
Expand Down Expand Up @@ -108,10 +112,20 @@ def parse(self, html: str) -> "Document":
if len(self.node_buffer) >= self.MAX_NODE_BUFFER:
self._flush_buffer()

# Clean up processed elements to save memory.
# Skip clearing while inside a table — _end_table needs
# the full element tree (tr/td children) to extract data.
if self._table_depth == 0:
# Clean up processed elements to save memory. Two gates:
# - Only on `end` events: at `start`, lxml html-mode has
# pre-populated children/text via lookahead and structural
# handlers (e.g., _start_heading) read them; clearing at
# start destroys that content.
# - Skip while inside a table or other structural content
# element (p/h1-h6/section). _end_paragraph/_end_heading/
# _end_section read the full child subtree via
# _get_text_content; clearing children first (depth-first
# end events fire for descendants before ancestors) wipes
# their .text and .tail and produces empty text silently.
if (event == 'end'
and self._table_depth == 0
and self._content_depth == 0):
elem.clear()
while elem.getprevious() is not None:
parent = elem.getparent()
Expand Down Expand Up @@ -176,12 +190,20 @@ def _handle_start_tag(self, elem: HtmlElement):
body_container = ContainerNode(tag_name='body')
self.root.add_child(body_container)
self.current_parent = body_container
elif tag == 'table':
self._start_table(elem)
elif self._table_depth > 0:
# Suppress structural-content handlers while inside a table.
# _end_table runs processor.process(elem) over the full table
# subtree and emits each cell's text as part of the TableNode;
# also emitting <p>/<h*>/<section> nodes here would produce the
# same text twice in document.text(). Symmetrical to the
# existing _table_depth gate on elem.clear() above.
pass
elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
self._start_heading(elem)
elif tag == 'p':
self._start_paragraph(elem)
elif tag == 'table':
self._start_table(elem)
elif tag == 'section':
self._start_section(elem)

Expand All @@ -194,12 +216,14 @@ def _handle_end_tag(self, elem: HtmlElement):
self.tag_stack.pop()

# Handle specific tags
if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
if tag == 'table':
self._end_table(elem)
elif self._table_depth > 0:
pass # see _handle_start_tag: structural handlers are suppressed inside tables
elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
self._end_heading(elem)
elif tag == 'p':
self._end_paragraph(elem)
elif tag == 'table':
self._end_table(elem)
elif tag == 'section':
self._end_section(elem)
elif tag == 'body':
Expand All @@ -217,6 +241,7 @@ def _start_heading(self, elem: HtmlElement):
# Import node types at runtime to avoid circular imports
from edgar.documents.nodes import HeadingNode

self._content_depth += 1
level = int(elem.tag[1])
text = self._get_text_content(elem)

Expand Down Expand Up @@ -246,12 +271,15 @@ def _end_heading(self, elem: HtmlElement):

# Clear any accumulated text buffer
self.text_buffer.clear()
if self._content_depth > 0:
self._content_depth -= 1

def _start_paragraph(self, elem: HtmlElement):
"""Start processing a paragraph."""
# Import node types at runtime to avoid circular imports
from edgar.documents.nodes import ParagraphNode

self._content_depth += 1
para = ParagraphNode()

# Get style if present
Expand All @@ -275,6 +303,8 @@ def _end_paragraph(self, elem: HtmlElement):

# Clear any accumulated text buffer
self.text_buffer.clear()
if self._content_depth > 0:
self._content_depth -= 1

def _start_table(self, elem: HtmlElement):
"""Start processing a table."""
Expand Down Expand Up @@ -311,6 +341,7 @@ def _start_section(self, elem: HtmlElement):
# Import node types at runtime to avoid circular imports
from edgar.documents.nodes import SectionNode

self._content_depth += 1
section = SectionNode()

# Get section attributes
Expand All @@ -328,6 +359,8 @@ def _start_section(self, elem: HtmlElement):
def _end_section(self, elem: HtmlElement):
    """Finish a <section>: drop the active section and release one level
    of structural-content depth.

    The depth counter is raised by _start_section; it is lowered here, but
    never below zero, so an unmatched end tag cannot drive it negative.
    """
    self.current_section = None

    # Nothing to release if no structural element is currently open.
    if self._content_depth <= 0:
        return
    self._content_depth -= 1

def _flush_buffer(self):
"""Flush node buffer to document tree."""
Expand Down
121 changes: 121 additions & 0 deletions tests/test_html_parser_regressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,127 @@ def test_large_document_streaming_trigger(self):
doc = parse_html(html, config=config)
assert doc is not None

def test_streaming_preserves_span_wrapped_paragraph_text(self):
    """
    Regression: the streaming parser lost text from <span>-wrapped paragraphs.

    Bug: the iterparse loop ran elem.clear() on every event (start and
    end alike) and on every element, even when an enclosing structural
    element (p/h1-h6/section) had not yet read its children. iterparse
    delivers end events depth-first, so an inner <span> had its
    .text/.tail cleared before the <p> end event called
    _get_text_content(p). SEC filings wrap nearly every word in
    <span style="..."> tags, so paragraphs parsed in streaming mode came
    out empty, with no warning.

    Symptom in production: filings in the ~30MB–110MB band (above the
    default 10MB streaming_threshold) produced text() output 20%+
    shorter than the non-streaming path, and some were nearly empty.
    No exception was raised.

    Fix: edgar/documents/utils/streaming.py clears only on end events
    and gates clearing on a content-depth counter that tracks open
    p/h1-h6/section elements (mirroring the existing _table_depth gate),
    so child cleanup waits until the enclosing structural element has
    read its subtree.

    Expected: streaming-mode text() contains the full paragraph content,
    including text nested inside <span> wrappers.
    """
    ten_mib = 10 * 1024 * 1024
    expected_fragments = (
        "Alpha", "beta", "gamma", "second paragraph",
        "Risk Factors", "nested spans everywhere",
    )

    # Same shape as an SEC filing: every word sits in its own <span>.
    html = "".join([
        "<html><body>",
        "<p><span>Alpha </span><span>beta </span><span>gamma</span></p>",
        "<p><span>second </span><span>paragraph</span></p>",
        "<h2><span>Risk Factors</span></h2>",
        "<p><span>nested </span><span>spans </span><span>everywhere</span></p>",
        "</body></html>",
    ])

    # A threshold of 1 forces the streaming path whatever the input size.
    streaming_cfg = ParserConfig(
        streaming_threshold=1,
        max_document_size=ten_mib,
    )
    streamed = parse_html(html, config=streaming_cfg).text()

    # Every paragraph and heading fragment must survive streaming.
    for fragment in expected_fragments:
        assert fragment in streamed

    # The non-streaming baseline has to contain the same fragments.
    normal_cfg = ParserConfig(streaming_threshold=ten_mib)
    baseline = parse_html(html, config=normal_cfg).text()
    for needle in expected_fragments:
        assert needle in baseline, f"baseline missing {needle!r}"

def test_streaming_does_not_double_emit_table_cell_paragraphs(self):
    """
    Regression: the streaming parser emitted <td><p>...</p></td> text twice.

    The text appeared once as a free-standing ParagraphNode (the <p>
    start/end handlers fired unconditionally) and once as TableNode cell
    text (_end_table walks the full subtree via processor.process(elem)).
    <h*> and <section> inside <td> were affected the same way.

    The span-text bug masked this, because the <p> handlers were
    producing empty paragraphs anyway. Once paragraph text was
    recovered, the duplication appeared as a 10-36% content overshoot
    versus non-streaming on table-heavy filings: financial-statement
    labels ('Total', interest-income line items, etc.) repeated dozens
    of times more in streaming output than in non-streaming output.

    Fix: _handle_start_tag / _handle_end_tag only dispatch
    <p>/<h1-6>/<section> when _table_depth == 0, symmetrical to the
    existing _table_depth gate on elem.clear(). The table processor
    stays the single source of cell text.

    Expected: each cell's text occurs exactly once in streaming output,
    the same as in non-streaming output.
    """
    ten_mib = 10 * 1024 * 1024
    cells = (
        "Cell paragraph one", "Cell paragraph two",
        "Row two A", "Row two B",
    )
    html = "".join([
        "<html><body>",
        "<table>",
        "<tr><td><p>Cell paragraph one</p></td>",
        " <td><p>Cell paragraph two</p></td></tr>",
        "<tr><td><p>Row two A</p></td>",
        " <td><p>Row two B</p></td></tr>",
        "</table>",
        "</body></html>",
    ])

    # A threshold of 1 forces the streaming path whatever the input size.
    streaming_cfg = ParserConfig(
        streaming_threshold=1,
        max_document_size=ten_mib,
    )
    text = parse_html(html, config=streaming_cfg).text()

    # Exactly one occurrence per cell. Before this PR's fix each cell
    # showed up twice (standalone paragraph plus table cell), so the
    # count was 2.
    for cell in cells:
        occurrences = text.count(cell)
        assert occurrences == 1, (
            f"{cell!r} appears {occurrences} times in streaming "
            f"output (expected 1) — table cell content is being "
            f"double-emitted as both ParagraphNode and TableNode cell"
        )

    # The non-streaming baseline must show the same single emission, so
    # this test does not lock in a streaming-only quirk.
    normal_cfg = ParserConfig(streaming_threshold=ten_mib)
    normal_text = parse_html(html, config=normal_cfg).text()
    for cell in cells:
        baseline_hits = normal_text.count(cell)
        assert baseline_hits == 1, (
            f"baseline emits {cell!r} {baseline_hits} times"
        )


class TestSectionDetectionRegressions:
"""Regression tests for section detection bugs."""
Expand Down
Loading