diff --git a/edgar/documents/document.py b/edgar/documents/document.py index 3a054816..a507f38e 100644 --- a/edgar/documents/document.py +++ b/edgar/documents/document.py @@ -115,6 +115,40 @@ def text(self, **kwargs) -> str: # Clean up boundary artifacts (page numbers, next section headers) return self._clean_boundary_artifacts(text) + def markdown(self) -> str: + """Render this section to Markdown. + + Mirrors :meth:`text` but preserves syntax for tables (pipe + format) and lists (bullet / numbered markers) by routing + through the same renderer used by :meth:`Document.to_markdown`. + + For heading/pattern-based sections the cached node tree is + rendered directly. For TOC-based sections (whose node has no + children — content is fetched lazily from the original HTML) + this method currently falls back to :meth:`text` because + correctly extracting a TOC section's HTML subtree without + leaking adjacent sections or losing structural wrappers + (the enclosing HTML elements) is non-trivial — every shape of + anchor nesting requires careful handling. The fallback is + safe (no regression vs ``text()``) but does not deliver the + table/list-preserving benefit on TOC sections. Adding full + TOC markdown support is tracked as a follow-up. + """ + if self.detection_method == 'toc' and self._text_extractor is not None: + # Conservative fallback — see docstring. Returns the same + # output as `Section.text()` so callers get correct text + # rather than risk leakage of adjacent-section markup. + return self.text() + + # Heading/pattern-based sections: render the cached node tree. + # Apply the same boundary-artifact cleanup as `text()` so page + # numbers and bleed-in next-item headers don't leak into the + # markdown output. 
+ from edgar.documents.renderers.markdown import MarkdownRenderer + renderer = MarkdownRenderer() + rendered = renderer.render_node(self.node) + return self._clean_boundary_artifacts(rendered) + def _clean_boundary_artifacts(self, text: str) -> str: """ Remove common artifacts at section boundaries. @@ -134,10 +168,10 @@ def _clean_boundary_artifacts(self, text: str) -> str: # Pattern: page number followed by PART header followed by Item number # e.g., "\n\n 16\n\n PART I\n\nItem 1A\n\n" (this is a page break artifact) text = re.sub( - r'\n\s*\d{1,3}\s*\n\s*PART\s+[IVX]+\s*\n\s*Item\s+\d+[A-Za-z]?(?:,\s*\d+[A-Za-z]?)?\s*\n', + r'\n\s*\d{1,3}\s*\n\s*(?:#{1,6}\s+|\*\*\s*)?PART\s+[IVX]+(?:\s*\*\*)?\s*\n\s*(?:#{1,6}\s+|\*\*\s*)?Item\s+\d+[A-Za-z]?(?:\\?\.)?(?:,\s*\d+[A-Za-z]?(?:\\?\.)?)?(?:\s*\*\*)?\s*\n', '\n\n', text, - flags=re.IGNORECASE + flags=re.IGNORECASE, ) # 1b. Remove interior page headers for 20-F (Table of Contents format) @@ -152,12 +186,15 @@ def _clean_boundary_artifacts(self, text: str) -> str: # 2a. Remove trailing page footer for 10-K/10-Q (PART + Item at end) # Pattern: page number + PART + Item header at end (next section bleeding in) - # e.g., "\n\n 29\n\n PART I\n\nItem 1B, 1C" at end + # e.g., "\n\n 29\n\n PART I\n\nItem 1B, 1C" at end. Optional + # markdown decorations (``#``, ``**``) and backslash-escaped + # periods accommodate ``MarkdownRenderer`` output where each + # of PART or Item may have been rendered as a heading or bold. text = re.sub( - r'\n\s*\d{1,3}\s*\n\s*PART\s+[IVX]+\s*\n\s*Item\s+\d+[A-Za-z]?(?:,\s*\d+[A-Za-z]?)?\s*$', + r'\n\s*\d{1,3}\s*\n\s*(?:#{1,6}\s+|\*\*\s*)?PART\s+[IVX]+(?:\s*\*\*)?\s*\n\s*(?:#{1,6}\s+|\*\*\s*)?Item\s+\d+[A-Za-z]?(?:\\?\.)?(?:,\s*\d+[A-Za-z]?(?:\\?\.)?)?(?:\s*\*\*)?\s*$', '', text, - flags=re.IGNORECASE + flags=re.IGNORECASE, ) # 2b. Remove trailing page footer for 20-F (Table of Contents at end) @@ -169,9 +206,31 @@ def _clean_boundary_artifacts(self, text: str) -> str: ) # 3. 
Remove trailing Item headers if they appear at the very end - # (without preceding PART header) - # e.g., "\n\nItem 15" or "\n\nITEM 15." - text = re.sub(r'\n\s*Item\s+\d+[A-Za-z]?\.?\s*$', '', text, flags=re.IGNORECASE) + # (without preceding PART header). Matches: + # - "Item 15", "ITEM 15." (plain text) + # - "Item 15\." (markdown-escaped period) + # - "# Item 15", "## Item 15" (markdown heading) + # - "**Item 15**", "**Item 15.**" (markdown bold) + # The ``MarkdownRenderer`` can produce any of these depending on + # how the next-item bleed-in was structured in the HTML. + text = re.sub( + r'\n\s*#{1,6}\s*Item\s+\d+[A-Za-z]?(?:\\?\.)?\s*$', + '', + text, + flags=re.IGNORECASE, + ) + text = re.sub( + r'\n\s*\*\*\s*Item\s+\d+[A-Za-z]?(?:\\?\.)?\s*\*\*\s*$', + '', + text, + flags=re.IGNORECASE, + ) + text = re.sub( + r'\n\s*Item\s+\d+[A-Za-z]?(?:\\?\.)?\s*$', + '', + text, + flags=re.IGNORECASE, + ) # 4. Remove trailing page numbers: whitespace followed by 1-3 digits at end # e.g., "\n\n 100" or "\n 92" diff --git a/tests/test_section_markdown.py b/tests/test_section_markdown.py new file mode 100644 index 00000000..146c142e --- /dev/null +++ b/tests/test_section_markdown.py @@ -0,0 +1,359 @@ +""" +Tests for ``Section.markdown()``. + +`Section.text()` walks the HTML subtree and emits newline-joined cell +content — tables and bullet lists are flattened to space/newline +soup. The whole-document `Filing.markdown()` (and `Document.to_markdown`) +preserves table pipe syntax and list markers but is whole-document +only — there is no per-item slice. + +`Section.markdown()` closes that gap: same scope as ``text()`` (one +section) but the same renderer as ``Document.to_markdown`` so tables +and lists keep their syntax. + +Downstream effect: per-item chunkers can call ``section.markdown()`` +to get a structured per-item view instead of paying either the cost +of `text()` (flat) or the cost of `filing.markdown()` (no item +boundaries). 
+""" +from __future__ import annotations + +import pytest + +from edgar.documents import HTMLParser +from edgar.documents.config import ParserConfig + + +def _parse_html(html: str, form: str = "10-K"): + return HTMLParser(ParserConfig(form=form)).parse(html) + + +class TestSectionMarkdownTablePreservation: + """A `
<table>` in section HTML should render as pipe-delimited + markdown, not flattened cell-by-cell text.""" + + HTML_WITH_TABLE = """ + +

Item 1. Business

+

We operate three segments.

+ +

Item 2. Properties

+

Issuer Purchases of Equity Securities:

+
+ + + + + + + + +
PeriodTotal SharesAvg Price
April1,000$50.00
May2,000$52.50
June1,500$53.25
+ +

Item 3. Legal Proceedings

+

From time to time...

+ + """ + + @pytest.fixture + def properties_section(self): + doc = _parse_html(self.HTML_WITH_TABLE) + sections = doc.sections + # Find the section containing the table — section keys vary by + # detection method; locate by title substring. + for name, section in sections.items(): + if "properties" in name.lower() or "item_2" in name.lower(): + return section + pytest.skip("Test HTML did not produce an Item 2 section") + + def test_markdown_returns_str(self, properties_section): + result = properties_section.markdown() + assert isinstance(result, str) + assert result # non-empty + + def test_markdown_preserves_pipe_syntax(self, properties_section): + """The hallmark: pipes show up between columns.""" + result = properties_section.markdown() + assert "|" in result, ( + f"Section.markdown() must preserve pipe-table syntax; got:\n{result[:500]}" + ) + + def test_markdown_includes_cell_values(self, properties_section): + """Sanity: actual cell contents are present.""" + result = properties_section.markdown() + for value in ("Period", "Total Shares", "April", "1,000", "$50.00"): + assert value in result, f"missing cell value {value!r} in markdown:\n{result}" + + def test_text_does_not_preserve_pipe_syntax(self, properties_section): + """Negative control: confirm `text()` does NOT have pipes — + otherwise the contrast we care about isn't there to begin with.""" + text = properties_section.text() + # Cells appear in text() but not column-delimited + assert "Period" in text # still has the words + assert "|" not in text, ( + f"text() should not have pipes (otherwise markdown isn't adding anything); " + f"got:\n{text[:500]}" + ) + + +class TestSectionMarkdownListPreservation: + """Bullet lists in section HTML should render with list markers.""" + + HTML_WITH_BULLETS = """ + +

Item 1A. Risk Factors

+

Our business is subject to the following risks:

+ + +

Item 2. Properties

+

Our principal facilities are located worldwide.

+ + """ + + @pytest.fixture + def risk_factors_section(self): + doc = _parse_html(self.HTML_WITH_BULLETS) + sections = doc.sections + for name, section in sections.items(): + if "risk_factors" in name.lower() or "item_1a" in name.lower(): + return section + pytest.skip("Test HTML did not produce an Item 1A section") + + def test_bullets_preserved_in_markdown(self, risk_factors_section): + result = risk_factors_section.markdown() + # Markdown bullets can render as '-', '*', or '+' — accept any. + bullet_lines = [ + line for line in result.splitlines() + if line.lstrip().startswith(("-", "*", "+")) + ] + assert len(bullet_lines) >= 3, ( + f"expected at least 3 bullet lines; got {len(bullet_lines)}:\n{result[:500]}" + ) + + def test_bullet_text_preserved(self, risk_factors_section): + result = risk_factors_section.markdown() + for phrase in ( + "decreased demand in the restaurant business", + "volatility in commodity costs", + "foreign currency exchange rate fluctuations", + ): + assert phrase in result, f"missing bullet text {phrase!r}" + + +class TestSectionMarkdownAcrossDetectionPaths: + """Markdown should work whether the section came from heading-based, + pattern-based, or TOC-based detection.""" + + def test_pattern_based_section(self): + """Pattern-extractor sections have `detection_method='pattern'` + and a populated `node`. Markdown renders the node tree.""" + html = """ + +

Item 1. Business

+

We design products.

+ + + +
SegmentRevenue
Auto$100M
+

Item 2. Properties

+

Facilities worldwide.

+ + """ + doc = _parse_html(html) + # Find any section that has a table in it. + found = False + for name, section in doc.sections.items(): + md = section.markdown() + if "|" in md and "Segment" in md: + found = True + # Must include the cell values too. + assert "Auto" in md + assert "$100M" in md + break + assert found, "no section produced pipe-table markdown — synthetic HTML may need adjustment" + + def test_toc_section_falls_back_to_text(self): + """TOC-based sections fall back to ``text()`` output. + + Per the docstring on :meth:`Section.markdown`, the TOC path + is conservative — extracting a TOC section's HTML subtree + cleanly (without leaking adjacent sections or losing + the structural HTML wrapper elements when anchors + cross those boundaries) is non-trivial. The fallback returns + the same string ``Section.text()`` would produce so callers + get correct content rather than risk corruption. Full TOC + markdown support is tracked as a follow-up. + """ + from edgar.documents.document import Section as _Section + from edgar.documents.nodes import SectionNode + + # Synthetic TOC section with a stub text extractor. + expected = "This is the TOC section text." + sect = _Section( + name="part_i_item_1", + title="Item 1", + node=SectionNode(section_name="part_i_item_1"), + detection_method="toc", + _text_extractor=lambda name, **kw: expected, + ) + assert sect.markdown() == expected + + def test_boundary_artifacts_cleaned_from_markdown(self): + """Regression for codex review round 7: SEC page-break + artifacts (``\\n\\nPART X\\nItem N\\n``) and trailing + next-item headers must be stripped from markdown output just + as ``Section.text()`` strips them. Without the cleanup, the + markdown path would leak boundary noise that the text path + already removes. 
+ """ + from edgar.documents.document import Section as _Section + from edgar.documents.nodes import SectionNode + + # Build a Section with a stub renderer-friendly node, then + # assert cleanup applies to the rendered output. + sect = _Section( + name="item_1", + title="Item 1", + node=SectionNode(section_name="item_1"), + detection_method="heading", + ) + # Exercise _clean_boundary_artifacts directly via markdown's + # cleanup hook by feeding text that contains the boundary + # artifact pattern. + raw = "Body content here.\n\n 100\n\n PART II\n\nItem 5" + cleaned = sect._clean_boundary_artifacts(raw) + # Trailing PART/Item artifact must be stripped. + assert "PART II" not in cleaned, ( + f"boundary cleanup must strip trailing PART artifact; got:\n{cleaned}" + ) + assert "Item 5" not in cleaned + assert "Body content here" in cleaned + + def test_boundary_artifacts_cleaned_when_markdown_escaped(self): + """Regression for codex review round 8: ``MarkdownRenderer`` + escapes periods as ``\\.``, so a trailing ``Item 5.`` in the + raw text becomes ``Item 5\\.`` after rendering. The cleanup + regex must accept the optional backslash so it strips both + the plain (``text()``) and markdown-escaped variants. + """ + from edgar.documents.document import Section as _Section + from edgar.documents.nodes import SectionNode + + sect = _Section( + name="item_1", + title="Item 1", + node=SectionNode(section_name="item_1"), + detection_method="heading", + ) + # The renderer would produce this trailing artifact shape: + markdown_escaped = "Body content here.\n\nItem 5\\." + cleaned = sect._clean_boundary_artifacts(markdown_escaped) + assert "Item 5" not in cleaned, ( + f"backslash-escaped trailing Item header must be stripped; got:\n{cleaned!r}" + ) + assert "Body content here" in cleaned + + # And the interior PART + Item\. variant in markdown form: + with_interior = ( + "Body content here.\n\n 16\n\n PART I\n\nItem 1A\\.\n\nMore body." 
+ ) + cleaned_interior = sect._clean_boundary_artifacts(with_interior) + assert "PART I" not in cleaned_interior, ( + f"interior PART + Item\\. artifact must be stripped; got:\n{cleaned_interior!r}" + ) + assert "Body content here" in cleaned_interior + assert "More body" in cleaned_interior + + def test_boundary_artifacts_cleaned_when_markdown_decorated(self): + """Regression for codex review round 9: ``MarkdownRenderer`` + can emit trailing next-item headers as ``# Item 5``, + ``## Item 5\\.``, or ``**Item 5\\.**`` depending on the source + HTML structure. All those decorated forms must be stripped by + the cleanup, not just bare ``Item 5``. + """ + from edgar.documents.document import Section as _Section + from edgar.documents.nodes import SectionNode + + sect = _Section( + name="item_1", + title="Item 1", + node=SectionNode(section_name="item_1"), + detection_method="heading", + ) + # Markdown heading form (``# Item 5``): + for header_md in ("# Item 5", "## Item 5", "### Item 5\\.", "## ITEM 1A"): + raw = f"Body content here.\n\n{header_md}" + cleaned = sect._clean_boundary_artifacts(raw) + assert "Item" not in cleaned.split("Body content here.", 1)[1], ( + f"markdown-heading boundary header {header_md!r} must be stripped; " + f"got:\n{cleaned!r}" + ) + assert "Body content here" in cleaned + + # Markdown bold form (``**Item 5**``): + for bold_md in ("**Item 5**", "**Item 5.**", "**Item 5\\.**", "**ITEM 1A**"): + raw = f"Body content here.\n\n{bold_md}" + cleaned = sect._clean_boundary_artifacts(raw) + assert "Item" not in cleaned.split("Body content here.", 1)[1], ( + f"markdown-bold boundary header {bold_md!r} must be stripped; " + f"got:\n{cleaned!r}" + ) + assert "Body content here" in cleaned + + def test_boundary_cleanup_handles_decorated_part_plus_item(self): + """Regression for codex review round 10: a full page-footer + artifact (page-number + PART + Item) can be rendered as + ``100\\n\\n# PART II\\n\\n# Item 5`` after markdown processing. 
+ The combined cleanup regex must strip the whole footer block, + not just the trailing Item line — otherwise the page number + and PART line are left behind. + """ + from edgar.documents.document import Section as _Section + from edgar.documents.nodes import SectionNode + + sect = _Section( + name="item_1", + title="Item 1", + node=SectionNode(section_name="item_1"), + detection_method="heading", + ) + + # Trailing footer: page + heading-decorated PART + heading-decorated Item. + raw_heading = "Body content here.\n\n 100\n\n# PART II\n\n# Item 5" + cleaned = sect._clean_boundary_artifacts(raw_heading) + for noise in ("PART II", "Item 5", "100"): + assert noise not in cleaned, ( + f"footer noise {noise!r} leaked through after decorated PART/Item;\n" + f"got: {cleaned!r}" + ) + assert "Body content here" in cleaned + + # Bold-decorated variant. + raw_bold = "Body content here.\n\n 29\n\n**PART I**\n\n**Item 1A\\.**" + cleaned_bold = sect._clean_boundary_artifacts(raw_bold) + for noise in ("PART I", "Item 1A", "29"): + assert noise not in cleaned_bold, ( + f"footer noise {noise!r} leaked through after bold PART/Item;\n" + f"got: {cleaned_bold!r}" + ) + assert "Body content here" in cleaned_bold + + def test_markdown_is_idempotent(self): + """Calling markdown() twice should return the same string.""" + html = """ + +

Item 1. Business

+

We operate three segments.

+ + + """ + doc = _parse_html(html) + for name, section in doc.sections.items(): + a = section.markdown() + b = section.markdown() + assert a == b, "markdown() must be deterministic across calls" + break