diff --git a/edgar/documents/document.py b/edgar/documents/document.py
index 3a054816..a507f38e 100644
--- a/edgar/documents/document.py
+++ b/edgar/documents/document.py
@@ -115,6 +115,40 @@ def text(self, **kwargs) -> str:
# Clean up boundary artifacts (page numbers, next section headers)
return self._clean_boundary_artifacts(text)
+    def markdown(self) -> str:
+        """Return this section rendered as Markdown.
+
+        Counterpart to :meth:`text` that keeps table syntax (pipe
+        format) and list markers (bullet / numbered) by going through
+        the same renderer that :meth:`Document.to_markdown` uses.
+
+        Two paths exist:
+
+        * Heading/pattern-based sections render their cached node tree
+          and then receive the same boundary-artifact cleanup as
+          :meth:`text`, so page numbers and bleed-in next-item headers
+          stay out of the result.
+        * TOC-based sections that carry a text extractor have a node
+          without children (their content is fetched lazily from the
+          original HTML). Cutting that HTML subtree out without leaking
+          adjacent sections or dropping structural wrapper elements is
+          non-trivial, so for now these return exactly what
+          :meth:`text` returns. That is no regression versus ``text()``
+          but tables and lists are not preserved; full TOC Markdown
+          support is tracked as a follow-up.
+
+        Returns:
+            The section content as a Markdown string (plain text on
+            the TOC fallback path).
+        """
+        uses_lazy_toc_text = (
+            self.detection_method == 'toc'
+            and self._text_extractor is not None
+        )
+        if not uses_lazy_toc_text:
+            # Imported at call time rather than at module level
+            # (presumably to avoid an import cycle -- confirm).
+            from edgar.documents.renderers.markdown import MarkdownRenderer
+
+            node_markdown = MarkdownRenderer().render_node(self.node)
+            # Same cleanup as `text()` so boundary noise does not leak
+            # into the Markdown output.
+            return self._clean_boundary_artifacts(node_markdown)
+
+        # Conservative fallback: hand back the `text()` output instead
+        # of risking leakage of adjacent-section markup.
+        return self.text()
+
def _clean_boundary_artifacts(self, text: str) -> str:
"""
Remove common artifacts at section boundaries.
@@ -134,10 +168,10 @@ def _clean_boundary_artifacts(self, text: str) -> str:
# Pattern: page number followed by PART header followed by Item number
# e.g., "\n\n 16\n\n PART I\n\nItem 1A\n\n" (this is a page break artifact)
text = re.sub(
- r'\n\s*\d{1,3}\s*\n\s*PART\s+[IVX]+\s*\n\s*Item\s+\d+[A-Za-z]?(?:,\s*\d+[A-Za-z]?)?\s*\n',
+ r'\n\s*\d{1,3}\s*\n\s*(?:#{1,6}\s+|\*\*\s*)?PART\s+[IVX]+(?:\s*\*\*)?\s*\n\s*(?:#{1,6}\s+|\*\*\s*)?Item\s+\d+[A-Za-z]?(?:\\?\.)?(?:,\s*\d+[A-Za-z]?(?:\\?\.)?)?(?:\s*\*\*)?\s*\n',
'\n\n',
text,
- flags=re.IGNORECASE
+ flags=re.IGNORECASE,
)
# 1b. Remove interior page headers for 20-F (Table of Contents format)
@@ -152,12 +186,15 @@ def _clean_boundary_artifacts(self, text: str) -> str:
# 2a. Remove trailing page footer for 10-K/10-Q (PART + Item at end)
# Pattern: page number + PART + Item header at end (next section bleeding in)
- # e.g., "\n\n 29\n\n PART I\n\nItem 1B, 1C" at end
+ # e.g., "\n\n 29\n\n PART I\n\nItem 1B, 1C" at end. Optional
+ # markdown decorations (``#``, ``**``) and backslash-escaped
+ # periods accommodate ``MarkdownRenderer`` output where each
+ # of PART or Item may have been rendered as a heading or bold.
text = re.sub(
- r'\n\s*\d{1,3}\s*\n\s*PART\s+[IVX]+\s*\n\s*Item\s+\d+[A-Za-z]?(?:,\s*\d+[A-Za-z]?)?\s*$',
+ r'\n\s*\d{1,3}\s*\n\s*(?:#{1,6}\s+|\*\*\s*)?PART\s+[IVX]+(?:\s*\*\*)?\s*\n\s*(?:#{1,6}\s+|\*\*\s*)?Item\s+\d+[A-Za-z]?(?:\\?\.)?(?:,\s*\d+[A-Za-z]?(?:\\?\.)?)?(?:\s*\*\*)?\s*$',
'',
text,
- flags=re.IGNORECASE
+ flags=re.IGNORECASE,
)
# 2b. Remove trailing page footer for 20-F (Table of Contents at end)
@@ -169,9 +206,31 @@ def _clean_boundary_artifacts(self, text: str) -> str:
)
# 3. Remove trailing Item headers if they appear at the very end
- # (without preceding PART header)
- # e.g., "\n\nItem 15" or "\n\nITEM 15."
- text = re.sub(r'\n\s*Item\s+\d+[A-Za-z]?\.?\s*$', '', text, flags=re.IGNORECASE)
+ # (without preceding PART header). Matches:
+ # - "Item 15", "ITEM 15." (plain text)
+ # - "Item 15\." (markdown-escaped period)
+ # - "# Item 15", "## Item 15" (markdown heading)
+ # - "**Item 15**", "**Item 15.**" (markdown bold)
+ # The ``MarkdownRenderer`` can produce any of these depending on
+ # how the next-item bleed-in was structured in the HTML.
+ text = re.sub(
+ r'\n\s*#{1,6}\s*Item\s+\d+[A-Za-z]?(?:\\?\.)?\s*$',
+ '',
+ text,
+ flags=re.IGNORECASE,
+ )
+ text = re.sub(
+ r'\n\s*\*\*\s*Item\s+\d+[A-Za-z]?(?:\\?\.)?\s*\*\*\s*$',
+ '',
+ text,
+ flags=re.IGNORECASE,
+ )
+ text = re.sub(
+ r'\n\s*Item\s+\d+[A-Za-z]?(?:\\?\.)?\s*$',
+ '',
+ text,
+ flags=re.IGNORECASE,
+ )
# 4. Remove trailing page numbers: whitespace followed by 1-3 digits at end
# e.g., "\n\n 100" or "\n 92"
diff --git a/tests/test_section_markdown.py b/tests/test_section_markdown.py
new file mode 100644
index 00000000..146c142e
--- /dev/null
+++ b/tests/test_section_markdown.py
@@ -0,0 +1,359 @@
+"""
+Tests for ``Section.markdown()``.
+
+`Section.text()` walks the HTML subtree and emits newline-joined cell
+content — tables and bullet lists are flattened to space/newline
+soup. The whole-document `Filing.markdown()` (and `Document.to_markdown`)
+preserves table pipe syntax and list markers but is whole-document
+only — there is no per-item slice.
+
+`Section.markdown()` closes that gap: same scope as ``text()`` (one
+section) but the same renderer as ``Document.to_markdown`` so tables
+and lists keep their syntax.
+
+Downstream effect: per-item chunkers can call ``section.markdown()``
+to get a structured per-item view instead of paying either the cost
+of `text()` (flat) or the cost of `filing.markdown()` (no item
+boundaries).
+"""
+from __future__ import annotations
+
+import pytest
+
+from edgar.documents import HTMLParser
+from edgar.documents.config import ParserConfig
+
+
+def _parse_html(html: str, form: str = "10-K"):
+    """Parse *html* with an ``HTMLParser`` configured for *form*.
+
+    Returns whatever ``HTMLParser.parse`` returns for the given markup.
+    """
+    config = ParserConfig(form=form)
+    parser = HTMLParser(config)
+    return parser.parse(html)
+
+
+class TestSectionMarkdownTablePreservation:
+ """A `<table>` in section HTML should render as pipe-delimited
+ markdown, not flattened cell-by-cell text."""
+
+ HTML_WITH_TABLE = """
+
+ Item 1. Business
+ We operate three segments.
+
+ Item 2. Properties
+ Issuer Purchases of Equity Securities:
+
+
+ | Period | Total Shares | Avg Price |
+
+
+ | April | 1,000 | $50.00 |
+ | May | 2,000 | $52.50 |
+ | June | 1,500 | $53.25 |
+
+
+
+ Item 3. Legal Proceedings
+ From time to time...
+
+ """
+
+    @pytest.fixture
+    def properties_section(self):
+        """Provide the parsed section that holds the table (Item 2 / Properties).
+
+        Skips the requesting test when the synthetic HTML yields no
+        matching section.
+        """
+        parsed = _parse_html(self.HTML_WITH_TABLE)
+        # Section keys vary by detection method, so match on a
+        # lowercase substring of the key rather than an exact name.
+        for key, candidate in parsed.sections.items():
+            lowered = key.lower()
+            if "properties" in lowered or "item_2" in lowered:
+                return candidate
+        pytest.skip("Test HTML did not produce an Item 2 section")
+
+    def test_markdown_returns_str(self, properties_section):
+        """``markdown()`` yields a non-empty ``str``."""
+        rendered = properties_section.markdown()
+        assert isinstance(rendered, str)
+        assert len(rendered) > 0  # must not be empty
+
+    def test_markdown_preserves_pipe_syntax(self, properties_section):
+        """Column separators (``|``) must survive in the rendered table."""
+        rendered = properties_section.markdown()
+        has_pipe = "|" in rendered
+        assert has_pipe, (
+            f"Section.markdown() must preserve pipe-table syntax; got:\n{rendered[:500]}"
+        )
+
+    def test_markdown_includes_cell_values(self, properties_section):
+        """Header and body cell contents appear in the rendered Markdown."""
+        expected_cells = ("Period", "Total Shares", "April", "1,000", "$50.00")
+        result = properties_section.markdown()
+        for value in expected_cells:
+            assert value in result, f"missing cell value {value!r} in markdown:\n{result}"
+
+    def test_text_does_not_preserve_pipe_syntax(self, properties_section):
+        """Negative control for the pipe-syntax test.
+
+        ``text()`` must contain the cell words but no ``|`` separators;
+        otherwise there would be no contrast for ``markdown()`` to add.
+        """
+        text = properties_section.text()
+        # The words are still present in text() ...
+        assert "Period" in text
+        # ... but they are not column-delimited.
+        contains_pipe = "|" in text
+        assert not contains_pipe, (
+            f"text() should not have pipes (otherwise markdown isn't adding anything); "
+            f"got:\n{text[:500]}"
+        )
+
+
+class TestSectionMarkdownListPreservation:
+ """Bullet lists in section HTML should render with list markers."""
+
+ HTML_WITH_BULLETS = """
+
+ Item 1A. Risk Factors
+ Our business is subject to the following risks:
+
+ - decreased demand in the restaurant business
+ - volatility in commodity costs
+ - foreign currency exchange rate fluctuations
+
+
+ Item 2. Properties
+ Our principal facilities are located worldwide.
+
+ """
+
+    @pytest.fixture
+    def risk_factors_section(self):
+        """Provide the parsed section that holds the bullet list (Item 1A).
+
+        Skips the requesting test when the synthetic HTML yields no
+        matching section.
+        """
+        wanted_tokens = ("risk_factors", "item_1a")
+        parsed = _parse_html(self.HTML_WITH_BULLETS)
+        for key, candidate in parsed.sections.items():
+            if any(token in key.lower() for token in wanted_tokens):
+                return candidate
+        pytest.skip("Test HTML did not produce an Item 1A section")
+
+    def test_bullets_preserved_in_markdown(self, risk_factors_section):
+        """Each of the three source bullets renders as a Markdown list item."""
+        result = risk_factors_section.markdown()
+        # Markdown bullets can render as '-', '*', or '+' -- accept any,
+        # but require the space that follows a list marker. Without it,
+        # bold text (``**...**``) and horizontal rules (``---``) would be
+        # counted as bullets and could mask a missing list item.
+        bullet_markers = ("- ", "* ", "+ ")
+        bullet_lines = [
+            line for line in result.splitlines()
+            if line.lstrip().startswith(bullet_markers)
+        ]
+        assert len(bullet_lines) >= 3, (
+            f"expected at least 3 bullet lines; got {len(bullet_lines)}:\n{result[:500]}"
+        )
+
+    def test_bullet_text_preserved(self, risk_factors_section):
+        """The text of every source bullet appears in the rendered Markdown."""
+        expected_phrases = (
+            "decreased demand in the restaurant business",
+            "volatility in commodity costs",
+            "foreign currency exchange rate fluctuations",
+        )
+        rendered = risk_factors_section.markdown()
+        for phrase in expected_phrases:
+            assert phrase in rendered, f"missing bullet text {phrase!r}"
+
+
+class TestSectionMarkdownAcrossDetectionPaths:
+ """Markdown should work whether the section came from heading-based,
+ pattern-based, or TOC-based detection."""
+
+ def test_pattern_based_section(self):
+ """Pattern-extractor sections have `detection_method='pattern'`
+ and a populated `node`. Markdown renders the node tree.
+
+ Passes when at least one parsed section renders Markdown that
+ contains both a pipe character and the ``Segment`` header; that
+ section must then also contain the ``Auto`` and ``$100M`` cells.
+ """
+ # NOTE(review): nothing below asserts ``detection_method`` -- the
+ # docstring's statement about the pattern path is not verified here.
+ html = """
+
+ Item 1. Business
+ We design products.
+
+ | Segment | Revenue |
+ | Auto | $100M |
+
+ Item 2. Properties
+ Facilities worldwide.
+
+ """
+ doc = _parse_html(html)
+ # Find any section that has a table in it.
+ found = False
+ for name, section in doc.sections.items():
+ md = section.markdown()
+ if "|" in md and "Segment" in md:
+ found = True
+ # Must include the cell values too.
+ assert "Auto" in md
+ assert "$100M" in md
+ # Only the first matching section is inspected.
+ break
+ # Fail (rather than pass vacuously) when no section had a table.
+ assert found, "no section produced pipe-table markdown — synthetic HTML may need adjustment"
+
+    def test_toc_section_falls_back_to_text(self):
+        """A TOC-detected section returns its ``text()`` output from ``markdown()``.
+
+        Per the docstring on :meth:`Section.markdown`, the TOC path is
+        deliberately conservative: extracting a TOC section's HTML
+        subtree cleanly (without leaking adjacent sections or losing
+        structural wrapper elements when anchors cross those
+        boundaries) is non-trivial. The fallback returns the same
+        string ``Section.text()`` would produce, so callers get correct
+        content rather than risking corruption. Full TOC markdown
+        support is tracked as a follow-up.
+        """
+        from edgar.documents.document import Section as _Section
+        from edgar.documents.nodes import SectionNode
+
+        expected = "This is the TOC section text."
+
+        def _stub_extractor(name, **kw):
+            # Stand-in text extractor: always yields the expected text.
+            return expected
+
+        # Synthetic TOC section wired to the stub extractor.
+        toc_section = _Section(
+            name="part_i_item_1",
+            title="Item 1",
+            node=SectionNode(section_name="part_i_item_1"),
+            detection_method="toc",
+            _text_extractor=_stub_extractor,
+        )
+        rendered = toc_section.markdown()
+        assert rendered == expected
+
+    def test_boundary_artifacts_cleaned_from_markdown(self):
+        """Regression for codex review round 7.
+
+        SEC page-break artifacts (page number, ``PART X`` line, ``Item N``
+        line) and trailing next-item headers must be stripped from
+        markdown output just as ``Section.text()`` strips them; without
+        the cleanup the markdown path would leak boundary noise that
+        the text path already removes.
+
+        The cleanup is exercised by calling
+        ``_clean_boundary_artifacts`` directly on text that contains
+        the artifact pattern.
+        """
+        from edgar.documents.document import Section as _Section
+        from edgar.documents.nodes import SectionNode
+
+        # Heading-detected section around an empty stub node.
+        section = _Section(
+            name="item_1",
+            title="Item 1",
+            node=SectionNode(section_name="item_1"),
+            detection_method="heading",
+        )
+        noisy = "Body content here.\n\n 100\n\n PART II\n\nItem 5"
+        cleaned = section._clean_boundary_artifacts(noisy)
+
+        # The trailing PART/Item artifact must be gone ...
+        part_leaked = "PART II" in cleaned
+        assert not part_leaked, (
+            f"boundary cleanup must strip trailing PART artifact; got:\n{cleaned}"
+        )
+        assert "Item 5" not in cleaned
+        # ... while the real body text is kept.
+        assert "Body content here" in cleaned
+
+    def test_boundary_artifacts_cleaned_when_markdown_escaped(self):
+        """Regression for codex review round 8.
+
+        ``MarkdownRenderer`` escapes periods as ``\\.``, so a trailing
+        ``Item 5.`` in the raw text becomes ``Item 5\\.`` after
+        rendering. The cleanup regex must accept the optional backslash
+        so that both the plain (``text()``) and the markdown-escaped
+        variants are stripped.
+        """
+        from edgar.documents.document import Section as _Section
+        from edgar.documents.nodes import SectionNode
+
+        section = _Section(
+            name="item_1",
+            title="Item 1",
+            node=SectionNode(section_name="item_1"),
+            detection_method="heading",
+        )
+
+        # Case 1: trailing artifact in the shape the renderer produces.
+        trailing_input = "Body content here.\n\nItem 5\\."
+        cleaned = section._clean_boundary_artifacts(trailing_input)
+        item_leaked = "Item 5" in cleaned
+        assert not item_leaked, (
+            f"backslash-escaped trailing Item header must be stripped; got:\n{cleaned!r}"
+        )
+        assert "Body content here" in cleaned
+
+        # Case 2: interior page number + PART + escaped Item header.
+        interior_input = "Body content here.\n\n 16\n\n PART I\n\nItem 1A\\.\n\nMore body."
+        cleaned_interior = section._clean_boundary_artifacts(interior_input)
+        part_leaked = "PART I" in cleaned_interior
+        assert not part_leaked, (
+            f"interior PART + Item\\. artifact must be stripped; got:\n{cleaned_interior!r}"
+        )
+        # Text on both sides of the removed artifact must survive.
+        assert "Body content here" in cleaned_interior
+        assert "More body" in cleaned_interior
+
+    def test_boundary_artifacts_cleaned_when_markdown_decorated(self):
+        """Regression for codex review round 9: ``MarkdownRenderer``
+        can emit trailing next-item headers as ``# Item 5``,
+        ``## Item 5\\.``, or ``**Item 5\\.**`` depending on the source
+        HTML structure. All those decorated forms must be stripped by
+        the cleanup, not just bare ``Item 5``.
+
+        The leftover check is case-insensitive so that the upper-case
+        inputs (``## ITEM 1A``, ``**ITEM 1A**``) are really verified; a
+        case-sensitive search for ``"Item"`` can never find ``"ITEM"``
+        and would pass even if nothing were stripped.
+        """
+        from edgar.documents.document import Section as _Section
+        from edgar.documents.nodes import SectionNode
+
+        sect = _Section(
+            name="item_1",
+            title="Item 1",
+            node=SectionNode(section_name="item_1"),
+            detection_method="heading",
+        )
+        body = "Body content here."
+        # (decoration kind used in the failure message, header variants)
+        cases = (
+            ("heading", ("# Item 5", "## Item 5", "### Item 5\\.", "## ITEM 1A")),
+            ("bold", ("**Item 5**", "**Item 5.**", "**Item 5\\.**", "**ITEM 1A**")),
+        )
+        for kind, headers in cases:
+            for header_md in headers:
+                raw = f"{body}\n\n{header_md}"
+                cleaned = sect._clean_boundary_artifacts(raw)
+                # Check the body first: the split below would raise an
+                # unhelpful IndexError if the body had been removed.
+                assert body in cleaned, (
+                    f"body text lost while cleaning {header_md!r}; got:\n{cleaned!r}"
+                )
+                tail = cleaned.split(body, 1)[1]
+                assert "item" not in tail.lower(), (
+                    f"markdown-{kind} boundary header {header_md!r} must be stripped; "
+                    f"got:\n{cleaned!r}"
+                )
+
+    def test_boundary_cleanup_handles_decorated_part_plus_item(self):
+        """Regression for codex review round 10.
+
+        A full page-footer artifact (page number + PART + Item) can come
+        out of markdown processing as ``100``, ``# PART II`` and
+        ``# Item 5`` on separate lines. The combined cleanup regex must
+        strip the whole footer block, not just the trailing Item line;
+        otherwise the page number and PART line are left behind.
+        """
+        from edgar.documents.document import Section as _Section
+        from edgar.documents.nodes import SectionNode
+
+        section = _Section(
+            name="item_1",
+            title="Item 1",
+            node=SectionNode(section_name="item_1"),
+            detection_method="heading",
+        )
+
+        # (label used in the failure message, raw input, fragments that
+        # must not survive the cleanup)
+        scenarios = (
+            # Page number + heading-decorated PART + heading-decorated Item.
+            (
+                "decorated",
+                "Body content here.\n\n 100\n\n# PART II\n\n# Item 5",
+                ("PART II", "Item 5", "100"),
+            ),
+            # Bold-decorated variant.
+            (
+                "bold",
+                "Body content here.\n\n 29\n\n**PART I**\n\n**Item 1A\\.**",
+                ("PART I", "Item 1A", "29"),
+            ),
+        )
+        for label, raw, noises in scenarios:
+            cleaned = section._clean_boundary_artifacts(raw)
+            for noise in noises:
+                assert noise not in cleaned, (
+                    f"footer noise {noise!r} leaked through after {label} PART/Item;\n"
+                    f"got: {cleaned!r}"
+                )
+            assert "Body content here" in cleaned
+
+    def test_markdown_is_idempotent(self):
+        """Calling markdown() twice should return the same string.
+
+        Every parsed section is checked. When the synthetic HTML yields
+        no sections at all the test is skipped, so it cannot pass
+        without having compared anything.
+        """
+        html = """
+
+ Item 1. Business
+ We operate three segments.
+
+
+ """
+        doc = _parse_html(html)
+        checked = 0
+        for name, section in doc.sections.items():
+            first = section.markdown()
+            second = section.markdown()
+            assert first == second, (
+                f"markdown() must be deterministic across calls (section {name!r})"
+            )
+            checked += 1
+        if not checked:
+            # Same convention as the section fixtures in this module:
+            # missing sections mean the fixture HTML needs attention.
+            pytest.skip("Test HTML did not produce any sections")