diff --git a/README.md b/README.md index b634b60..46a894d 100644 --- a/README.md +++ b/README.md @@ -496,6 +496,19 @@ supporting_text: "protein [X] functions in cell cycle regulation" - `[gene name]` - Clarifications - `[...]` - Omitted content markers +If your corpus uses square brackets as literal source text, configure patterns +that should be preserved instead of stripped: + +```yaml +validation: + literal_bracket_patterns: + - "\\d" # keep [2Fe-2S], [+21], [30S], [Ca2+] + - "^\\S" # keep tight brackets like [poly(A)+], strip [ editorial ] +``` + +Patterns are matched against the content inside `[...]`. If any pattern matches, +that bracketed text is kept verbatim during validation. + ### Omitted Text `...` Use ellipsis for gaps in quoted text: diff --git a/docs/concepts/editorial-conventions.md b/docs/concepts/editorial-conventions.md index c9a90a9..ff61be5 100644 --- a/docs/concepts/editorial-conventions.md +++ b/docs/concepts/editorial-conventions.md @@ -70,6 +70,30 @@ Nested brackets are not recommended and may not work as expected: ✅ "MUC1 [mucin 1, a membrane protein] blocks targeting" ``` +### Literal Brackets in Source Text + +Some corpora use square brackets as part of the quoted source text itself: + +``` +Reference: "polyadenylated [poly(A)+] RNA export" +Quote: "polyadenylated [poly(A)+] RNA export" +``` + +By default, linkml-reference-validator strips all bracketed text for backward +compatibility. To preserve literal brackets, configure +`literal_bracket_patterns` in your validation config: + +```yaml +validation: + literal_bracket_patterns: + - "\\d" # keep [2Fe-2S], [+21], [30S], [Ca2+] + - "^\\S" # keep tight brackets like [poly(A)+], strip [ editorial ] +``` + +Each pattern is matched against the content inside `[...]`. If any pattern +matches, that bracketed text is kept during validation. If none match, the +content is treated as an editorial insertion and stripped. + ## Ellipsis `...` Use ellipsis (three dots) to indicate **omitted text** between parts of a quote. diff --git a/src/linkml_reference_validator/models.py b/src/linkml_reference_validator/models.py index c16ea3e..49f5a97 100644 --- a/src/linkml_reference_validator/models.py +++ b/src/linkml_reference_validator/models.py @@ -352,6 +352,11 @@ class ReferenceValidationConfig(BaseModel): ['SRA', 'MGNIFY'] >>> config.unknown_prefix_severity + >>> config = ReferenceValidationConfig( + ... literal_bracket_patterns=[r"\d", r"^[A-Z]$"] + ... ) + >>> config.literal_bracket_patterns + ['\\d', '^[A-Z]$'] """ cache_dir: Path = Field( @@ -385,6 +390,15 @@ class ReferenceValidationConfig(BaseModel): ge=1, description="Regex capture group number containing the reference ID", ) + literal_bracket_patterns: list[str] = Field( + default_factory=list, + description=( + "Regular expressions matched against the content inside square brackets. " + "If any pattern matches, the bracketed text is treated as literal source " + "text and preserved during supporting text validation. " + "If no patterns are configured, all bracketed text is stripped." + ), + ) reference_prefix_map: dict[str, str] = Field( default_factory=dict, description=( diff --git a/src/linkml_reference_validator/validation/supporting_text_validator.py b/src/linkml_reference_validator/validation/supporting_text_validator.py index 90d98e2..7eb936d 100644 --- a/src/linkml_reference_validator/validation/supporting_text_validator.py +++ b/src/linkml_reference_validator/validation/supporting_text_validator.py @@ -23,7 +23,8 @@ class SupportingTextValidator: appears in the referenced publication using deterministic substring matching. Supports: - - Editorial notes in [square brackets] that are ignored + - Editorial notes in [square brackets] that are ignored by default + - Configurable literal bracket patterns for bracketed source text - Multi-part quotes with "..." separators indicating omitted text Examples: @@ -46,6 +47,9 @@ def __init__(self, config: ReferenceValidationConfig): """ self.config = config self.fetcher = ReferenceFetcher(config) + self._literal_bracket_regexes = [ + re.compile(pattern) for pattern in config.literal_bracket_patterns + ] def validate_title( self, @@ -287,7 +291,7 @@ def find_text_in_reference( return self._substring_match(query_parts, reference.content, supporting_text) def _split_query(self, text: str) -> list[str]: - """Split query into parts separated by ... removing [...] editorial notes. + """Split query into parts while stripping editorial brackets by default. Args: text: Query text @@ -302,12 +306,28 @@ def _split_query(self, text: str) -> list[str]: ['protein functions', 'in cells'] >>> validator._split_query("protein [important] functions") ['protein functions'] + >>> config = ReferenceValidationConfig(literal_bracket_patterns=[r"\\d"]) + >>> validator = SupportingTextValidator(config) + >>> validator._split_query("protein [important] binds [2Fe-2S] cluster") + ['protein binds [2Fe-2S] cluster'] >>> validator._split_query("[editorial note]") [] """ - text_without_brackets = re.sub(r"\[.*?\]", " ", text) + if not self._literal_bracket_regexes: + text_without_brackets = re.sub(r"\[.*?\]", " ", text) + else: + + def replace_bracket(match: re.Match[str]) -> str: + """Preserve configured literal bracket content, strip editorial notes.""" + content = match.group(1) + if any(regex.search(content) for regex in self._literal_bracket_regexes): + return match.group(0) + return " " + + text_without_brackets = re.sub(r"\[(.*?)\]", replace_bracket, text) + parts = re.split(r"\s*\.{2,}\s*", text_without_brackets) - parts = [p.strip() for p in parts if p.strip()] + parts = [re.sub(r"\s+", " ", p).strip() for p in parts if p.strip()] return parts def _substring_match( diff --git a/tests/test_models.py b/tests/test_models.py index 3068ef6..2c1bc13 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -16,6 +16,7 @@ def test_config_defaults(): config = ReferenceValidationConfig() assert config.cache_dir == Path("references_cache") assert config.rate_limit_delay == 0.5 + assert config.literal_bracket_patterns == [] def test_config_custom_values(): @@ -23,9 +24,11 @@ def test_config_custom_values(): config = ReferenceValidationConfig( cache_dir=Path("/tmp/cache"), rate_limit_delay=1.0, + literal_bracket_patterns=[r"\d"], ) assert config.cache_dir == Path("/tmp/cache") assert config.rate_limit_delay == 1.0 + assert config.literal_bracket_patterns == [r"\d"] def test_config_get_cache_dir(tmp_path): diff --git a/tests/test_supporting_text_validator.py b/tests/test_supporting_text_validator.py index 35d5352..6ec64e8 100644 --- a/tests/test_supporting_text_validator.py +++ b/tests/test_supporting_text_validator.py @@ -96,6 +96,20 @@ def test_split_query_with_brackets(validator): assert "important" not in parts[0] +def test_split_query_keeps_literal_brackets_when_pattern_matches(tmp_path): + """Test splitting query keeps configured literal bracket content.""" + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + literal_bracket_patterns=[r"\d"], + ) + validator = SupportingTextValidator(config) + + parts = validator._split_query("protein [important] binds [2Fe-2S] cluster") + + assert parts == ["protein binds [2Fe-2S] cluster"] + + def test_substring_match_found(validator): """Test substring matching when text is found.""" match = validator._substring_match( @@ -163,6 +177,37 @@ def test_find_text_empty_query_after_brackets(validator): assert "empty" in match.error_message.lower() +def test_find_text_in_reference_literal_brackets_require_config(validator): + """Test literal bracket content is still stripped by default.""" + ref = ReferenceContent( + reference_id="PMID:123", + content="The [2Fe-2S] cluster is required for activity.", + ) + + match = validator.find_text_in_reference("The [2Fe-2S] cluster", ref) + + assert match.found is False + + +def test_find_text_in_reference_keeps_literal_brackets_with_config(tmp_path): + """Test literal bracket content can be preserved through configuration.""" + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + literal_bracket_patterns=[r"\d"], + ) + validator = SupportingTextValidator(config) + ref = ReferenceContent( + reference_id="PMID:123", + content="The [2Fe-2S] cluster is required for activity.", + ) + + match = validator.find_text_in_reference("The [2Fe-2S] cluster", ref) + + assert match.found is True + assert match.similarity_score == 1.0 + + def test_validate_success(validator, mocker): """Test successful validation.""" mock_fetch = mocker.patch.object(validator.fetcher, "fetch") diff --git a/tests/test_validation_config.py b/tests/test_validation_config.py index a3dcd36..48f41bd 100644 --- a/tests/test_validation_config.py +++ b/tests/test_validation_config.py @@ -36,3 +36,20 @@ def test_load_validation_config_ignores_repair_only(tmp_path): config = load_validation_config(config_file) assert config.reference_prefix_map == {} + + +def test_load_validation_config_literal_bracket_patterns(tmp_path): + """Should load literal bracket patterns from validation config.""" + config_file = tmp_path / ".linkml-reference-validator.yaml" + config_file.write_text( + """ +validation: + literal_bracket_patterns: + - "\\\\d" + - "^\\\\S" +""" + ) + + config = load_validation_config(config_file) + + assert config.literal_bracket_patterns == [r"\d", r"^\S"]