Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 51 additions & 31 deletions edgar/_filings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2034,13 +2034,14 @@ def open(self):
@lru_cache(maxsize=1)
def sections(self) -> List[str]:
html = self.html()
if html is None:
raise ValueError(
f"Filing {self.accession_no} ({self.form}, filed {self.filing_date}) "
"has no HTML primary document — likely a pre-2001 SGML/text submission. "
"Use filing.text() to access the raw submission content."
)
return html_sections(html)
if html is not None:
return html_sections(html)
# Old text-only filings (pre-2002) — chunk on <PAGE> markers.
text = self.text()
if not text:
return []
chunks = [c.strip() for c in re.split(r"<PAGE>|\n\s*\n", text) if len(c.strip()) >= 50]
return chunks if chunks else [text]

@cached_property
def __get_bm25_search_index(self):
Expand Down Expand Up @@ -2083,29 +2084,19 @@ def grep(self, pattern: str, *, regex: bool = False, document: Optional[str] = N
from edgar.search.grep import GrepResult, _grep_text

all_matches = []
found_any_text = False

try:
attachments = self.attachments
except Exception:
return GrepResult(pattern, [])
attachments = []

for attachment in attachments:
# Filter by document if specified
if document:
if document.lower() == "primary":
if attachment.sequence_number != "1":
continue
else:
# Match by document_type (e.g. "EX-10.1") or document filename
doc_type = (attachment.document_type or "").upper()
if document.upper() not in doc_type and document.lower() not in (attachment.document or "").lower():
continue

# Skip binary/non-text attachments
if document and not self._attachment_matches(attachment, document):
continue
if attachment.empty or attachment.is_binary():
continue

# Get text content
try:
text = attachment.text()
except Exception as e:
Expand All @@ -2115,20 +2106,49 @@ def grep(self, pattern: str, *, regex: bool = False, document: Optional[str] = N
if not text:
continue

# Determine location label
doc_type = attachment.document_type or ""
if attachment.sequence_number == "1":
location = "primary"
elif doc_type:
location = doc_type
else:
location = attachment.document or f"doc-{attachment.sequence_number}"
found_any_text = True
location = self._attachment_location(attachment)
all_matches.extend(_grep_text(text, pattern, location, regex=regex))

matches = _grep_text(text, pattern, location, regex=regex)
all_matches.extend(matches)
# Old text filings: SGML returns empty attachment shells, fall back to filing.text().
if not found_any_text and (document is None or document.lower() == "primary"):
all_matches.extend(self._grep_filing_text(pattern, regex))

return GrepResult(pattern, all_matches)

@staticmethod
def _attachment_matches(attachment, document: str) -> bool:
    """Decide whether ``attachment`` passes grep()'s ``document`` filter.

    Args:
        attachment: Object exposing ``sequence_number``, ``document_type``
            and ``document`` attributes.
        document: The filter value. ``"primary"`` (any letter case) selects
            only the attachment whose ``sequence_number`` is the string
            ``"1"``. Any other value is matched, ignoring case, as a
            substring of the attachment's ``document_type`` or, failing
            that, of its ``document`` name.

    Returns:
        True when the attachment satisfies the filter, otherwise False.
    """
    wanted = document.lower()
    if wanted == "primary":
        return attachment.sequence_number == "1"

    # A missing document_type / document is treated as an empty string.
    type_label = (attachment.document_type or "").upper()
    return (
        document.upper() in type_label
        or wanted in (attachment.document or "").lower()
    )

@staticmethod
def _attachment_location(attachment) -> str:
    """Build the location label used for an attachment in grep results.

    The attachment whose ``sequence_number`` is the string ``"1"`` is
    labelled ``"primary"``. Every other attachment is labelled with the
    first non-empty value among its ``document_type``, its ``document``
    name, and the synthetic ``"doc-<sequence_number>"``.
    """
    if attachment.sequence_number == "1":
        return "primary"
    return (
        attachment.document_type
        or attachment.document
        or f"doc-{attachment.sequence_number}"
    )

def _grep_filing_text(self, pattern: str, regex: bool) -> list:
    """Search the combined filing text, reporting hits under "primary".

    grep() falls back to this when no attachment yields usable text, which
    covers older plain-text filings whose SGML decomposition emits empty
    shells.

    Args:
        pattern: The pattern handed through to ``_grep_text``.
        regex: Forwarded to ``_grep_text`` as its ``regex`` keyword.

    Returns:
        The matches produced by ``_grep_text``, or an empty list when the
        filing text is empty or could not be extracted.
    """
    from edgar.search.grep import _grep_text

    try:
        filing_text = self.text()
    except Exception as e:
        # Best effort: a failed extraction means "no matches", not an error.
        log.debug(f"grep: could not extract filing text: {e}")
        return []

    if not filing_text:
        return []
    return _grep_text(filing_text, pattern, "primary", regex=regex)

@property
def filing_url(self) -> str:
    """Return ``self.base_dir`` and ``self.document.document`` joined by "/"."""
    directory = self.base_dir
    file_name = self.document.document
    return f"{directory}/{file_name}"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Regression test for #819 — search()/grep() failed on plain-text filings."""

import pytest

from edgar import Company


# Accession number of the PG&E CORP 10-K filed 2000-03-08. The tests in this
# module expect html() to be None and text() to be populated for this filing.
PCG_TEXT_10K_ACCESSION = "0000929624-00-000321"


def _pcg_oldest_10k():
    """Return the PG&E CORP plain-text 10-K filed 2000-03-08.

    The filing is constructed directly from its known metadata rather than
    being searched for in ``Company("PCG").get_filings(form="10-K")``. This
    avoids fetching and scanning the whole filing list for one fixed
    accession number, and removes the dependency on that listing still
    containing the filing.

    Returns:
        A ``Filing`` for accession number ``PCG_TEXT_10K_ACCESSION``.
    """
    # Imported locally so the helper does not rely on the module's import block.
    from edgar import Filing

    return Filing(
        form="10-K",
        filing_date="2000-03-08",
        company="PG&E CORP",
        cik=1004980,
        accession_no=PCG_TEXT_10K_ACCESSION,
    )
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what I do in tests
filing = Filing(form='10-K', filing_date='2000-03-08', company='PG&E CORP', cik=1004980, accession_no='0000929624-00-000321')

'''
How to find a filing by accession number

f = find("0000929624-00-000321")
str(f)
"Filing(form='10-K', filing_date='2000-03-08', company='PG&E CORP', cik=1004980, accession_no='0000929624-00-000321')"
'''



@pytest.mark.network
def test_text_filing_html_is_none_but_text_is_populated():
    """The plain-text 10-K has no HTML but does have a large text body."""
    old_filing = _pcg_oldest_10k()
    assert old_filing.html() is None

    raw_text = old_filing.text()
    assert raw_text is not None
    assert len(raw_text) > 100_000
    assert "employees" in raw_text.lower()


@pytest.mark.network
def test_sections_falls_back_to_text_for_plain_text_filings():
    """sections() returns several text chunks for the plain-text 10-K."""
    chunks = _pcg_oldest_10k().sections()
    assert isinstance(chunks, list)
    assert len(chunks) > 1
    assert any("employees" in chunk.lower() for chunk in chunks)


@pytest.mark.network
def test_search_works_on_plain_text_filing():
    """search() yields at least one result on the plain-text 10-K."""
    hits = _pcg_oldest_10k().search("employees")
    assert hits is not None
    assert len(hits) >= 1


@pytest.mark.network
def test_grep_works_on_plain_text_filing():
    """grep() yields at least one match on the plain-text 10-K."""
    found = _pcg_oldest_10k().grep("employees")
    assert len(found) >= 1


@pytest.mark.network
def test_search_grep_still_work_on_html_filings():
    """search() and grep() keep working on a 10-K that has HTML."""
    # Index 0 of the PCG 10-K listing, which the original names "newest".
    latest_10k = Company("PCG").get_filings(form="10-K")[0]
    assert latest_10k.html() is not None
    assert len(latest_10k.search("employees")) > 0
    assert len(latest_10k.grep("employees")) > 0
66 changes: 0 additions & 66 deletions tests/issues/regression/test_issue_819_text_filing_search.py

This file was deleted.

Loading