diff --git a/edgar/_filings.py b/edgar/_filings.py
index bdeb15d8..eca88a89 100644
--- a/edgar/_filings.py
+++ b/edgar/_filings.py
@@ -2034,13 +2034,14 @@ def open(self):
     @lru_cache(maxsize=1)
     def sections(self) -> List[str]:
         html = self.html()
-        if html is None:
-            raise ValueError(
-                f"Filing {self.accession_no} ({self.form}, filed {self.filing_date}) "
-                "has no HTML primary document — likely a pre-2001 SGML/text submission. "
-                "Use filing.text() to access the raw submission content."
-            )
-        return html_sections(html)
+        if html is not None:
+            return html_sections(html)
+        # Old text-only filings (pre-2002) — chunk on <PAGE> markers and blank lines.
+        text = self.text()
+        if not text:
+            return []
+        chunks = [c.strip() for c in re.split(r"<PAGE>|\n\s*\n", text) if len(c.strip()) >= 50]
+        return chunks if chunks else [text]
 
     @cached_property
     def __get_bm25_search_index(self):
@@ -2083,29 +2084,19 @@ def grep(self, pattern: str, *, regex: bool = False, document: Optional[str] = N
         from edgar.search.grep import GrepResult, _grep_text
 
         all_matches = []
+        found_any_text = False
 
         try:
             attachments = self.attachments
         except Exception:
-            return GrepResult(pattern, [])
+            attachments = []
 
         for attachment in attachments:
-            # Filter by document if specified
-            if document:
-                if document.lower() == "primary":
-                    if attachment.sequence_number != "1":
-                        continue
-                else:
-                    # Match by document_type (e.g. "EX-10.1") or document filename
-                    doc_type = (attachment.document_type or "").upper()
-                    if document.upper() not in doc_type and document.lower() not in (attachment.document or "").lower():
-                        continue
-
-            # Skip binary/non-text attachments
+            if document and not self._attachment_matches(attachment, document):
+                continue
             if attachment.empty or attachment.is_binary():
                 continue
 
-            # Get text content
             try:
                 text = attachment.text()
             except Exception as e:
@@ -2115,20 +2106,49 @@ def grep(self, pattern: str, *, regex: bool = False, document: Optional[str] = N
             if not text:
                 continue
 
-            # Determine location label
-            doc_type = attachment.document_type or ""
-            if attachment.sequence_number == "1":
-                location = "primary"
-            elif doc_type:
-                location = doc_type
-            else:
-                location = attachment.document or f"doc-{attachment.sequence_number}"
+            found_any_text = True
+            location = self._attachment_location(attachment)
+            all_matches.extend(_grep_text(text, pattern, location, regex=regex))
 
-            matches = _grep_text(text, pattern, location, regex=regex)
-            all_matches.extend(matches)
+        # Old text filings: SGML returns empty attachment shells, fall back to filing.text().
+        if not found_any_text and (document is None or document.lower() == "primary"):
+            all_matches.extend(self._grep_filing_text(pattern, regex))
 
         return GrepResult(pattern, all_matches)
 
+    @staticmethod
+    def _attachment_matches(attachment, document: str) -> bool:
+        """Whether `attachment` satisfies grep()'s `document` filter."""
+        if document.lower() == "primary":
+            return attachment.sequence_number == "1"
+        doc_type = (attachment.document_type or "").upper()
+        if document.upper() in doc_type:
+            return True
+        return document.lower() in (attachment.document or "").lower()
+
+    @staticmethod
+    def _attachment_location(attachment) -> str:
+        """Label for an attachment in grep result locations."""
+        if attachment.sequence_number == "1":
+            return "primary"
+        if attachment.document_type:
+            return attachment.document_type
+        return attachment.document or f"doc-{attachment.sequence_number}"
+
+    def _grep_filing_text(self, pattern: str, regex: bool) -> list:
+        """Grep the combined filing text as a 'primary' document.
+
+        Used by grep() when no attachment yields usable text — covers older
+        plain-text filings whose SGML decomposition emits empty shells.
+        """
+        from edgar.search.grep import _grep_text
+        try:
+            text = self.text()
+        except Exception as e:
+            log.debug(f"grep: could not extract filing text: {e}")
+            return []
+        return _grep_text(text, pattern, "primary", regex=regex) if text else []
+
     @property
     def filing_url(self) -> str:
         return f"{self.base_dir}/{self.document.document}"
diff --git a/tests/issues/regression/test_issue_819_search_grep_on_text_filings.py b/tests/issues/regression/test_issue_819_search_grep_on_text_filings.py
new file mode 100644
index 00000000..df2a4b7d
--- /dev/null
+++ b/tests/issues/regression/test_issue_819_search_grep_on_text_filings.py
@@ -0,0 +1,60 @@
+"""Regression test for #819 — search()/grep() failed on plain-text filings."""
+
+import pytest
+
+from edgar import Company
+
+
+PCG_TEXT_10K_ACCESSION = "0000929624-00-000321"
+
+
+def _pcg_oldest_10k():
+    filings = Company("PCG").get_filings(form="10-K")
+    target = next(
+        (f for f in filings if f.accession_no == PCG_TEXT_10K_ACCESSION),
+        None,
+    )
+    assert target is not None, f"PCG 10-K {PCG_TEXT_10K_ACCESSION} missing from EDGAR results"
+    return target
+
+
+@pytest.mark.network
+def test_text_filing_html_is_none_but_text_is_populated():
+    filing = _pcg_oldest_10k()
+    assert filing.html() is None
+    text = filing.text()
+    assert text is not None
+    assert len(text) > 100_000
+    assert "employees" in text.lower()
+
+
+@pytest.mark.network
+def test_sections_falls_back_to_text_for_plain_text_filings():
+    filing = _pcg_oldest_10k()
+    sections = filing.sections()
+    assert isinstance(sections, list)
+    assert len(sections) > 1
+    assert any("employees" in s.lower() for s in sections)
+
+
+@pytest.mark.network
+def test_search_works_on_plain_text_filing():
+    filing = _pcg_oldest_10k()
+    results = filing.search("employees")
+    assert results is not None
+    assert len(results) >= 1
+
+
+@pytest.mark.network
+def test_grep_works_on_plain_text_filing():
+    filing = _pcg_oldest_10k()
+    matches = filing.grep("employees")
+    assert len(matches) >= 1
+
+
+@pytest.mark.network
+def test_search_grep_still_work_on_html_filings():
+    newest = Company("PCG").get_filings(form="10-K")[0]
+    assert newest.html() is not None
+    assert len(newest.search("employees")) > 0
+    assert len(newest.grep("employees")) > 0
diff --git a/tests/issues/regression/test_issue_819_text_filing_search.py b/tests/issues/regression/test_issue_819_text_filing_search.py
deleted file mode 100644
index f718f506..00000000
--- a/tests/issues/regression/test_issue_819_text_filing_search.py
+++ /dev/null
@@ -1,66 +0,0 @@
-"""
-Regression test for GitHub Issue #819 (quick-win partial fix):
-Filing.search() raised bare AssertionError on pre-2001 SGML/text filings
-because Filing.sections() asserted that filing.html() was non-null with no
-helpful message.
-
-This test covers only the assertion → ValueError change. The deeper fix
-(making sections() and grep() actually work on text filings) is scheduled
-for 5.32.0.
-"""
-
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from edgar._filings import Filing
-
-
-def _make_filing_no_html() -> Filing:
-    """Build a minimal Filing instance whose .html() returns None."""
-    filing = Filing(
-        cik=12345,
-        company="TEST",
-        form="10-K",
-        filing_date="2000-03-08",
-        accession_no="0000000000-00-000000",
-    )
-    return filing
-
-
-class TestSectionsRaisesUsefulError:
-    """sections() must raise a legible ValueError when html() returns None."""
-
-    def test_sections_raises_value_error_not_assertion(self):
-        filing = _make_filing_no_html()
-        with patch.object(filing, "html", return_value=None):
-            with pytest.raises(ValueError) as exc_info:
-                filing.sections()
-
-        msg = str(exc_info.value)
-        assert "no HTML primary document" in msg
-        assert "filing.text()" in msg
-        assert filing.accession_no in msg
-
-    def test_sections_does_not_raise_assertion_error(self):
-        """Specifically — no bare AssertionError (the original GH #819 symptom)."""
-        filing = _make_filing_no_html()
-        with patch.object(filing, "html", return_value=None):
-            try:
-                filing.sections()
-            except ValueError:
-                pass  # expected
-            except AssertionError:
-                pytest.fail("sections() must not raise bare AssertionError")
-
-    def test_search_surfaces_useful_error_on_text_filing(self):
-        """search() delegates to sections() — the error must propagate legibly."""
-        filing = _make_filing_no_html()
-        with patch.object(filing, "html", return_value=None):
-            with pytest.raises(ValueError) as exc_info:
-                filing.search("employees")
-
-        msg = str(exc_info.value)
-        assert "filing.text()" in msg, (
-            "Error message must point users at the working workaround"
-        )