Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,19 @@ supporting_text: "protein [X] functions in cell cycle regulation"
- `[gene name]` - Clarifications
- `[...]` - Omitted content markers

If your corpus uses square brackets as literal source text, configure patterns
that should be preserved instead of stripped:

```yaml
validation:
  literal_bracket_patterns:
    - "\\d" # keep [2Fe-2S], [+21], [30S], [Ca2+]
    - "^\\S" # keep tight brackets like [poly(A)+], strip [ editorial ]
```

Patterns are matched against the content inside `[...]`. If any pattern matches,
that bracketed text is preserved (not stripped) before normalization and validation.
Copy link

Copilot AI Apr 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The README says preserved bracketed text is kept "verbatim during validation", but the validator normalizes punctuation/whitespace before substring matching (and _split_query() also collapses whitespace). Consider rewording to “preserved (not stripped) before normalization/validation” to avoid implying exact byte-for-byte matching.

Suggested change
that bracketed text is kept verbatim during validation.
that bracketed text is preserved (not stripped) before normalization and validation.

Copilot uses AI. Check for mistakes.

### Omitted Text `...`

Use ellipsis for gaps in quoted text:
Expand Down
24 changes: 24 additions & 0 deletions docs/concepts/editorial-conventions.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,30 @@ Nested brackets are not recommended and may not work as expected:
✅ "MUC1 [mucin 1, a membrane protein] blocks targeting"
```

### Literal Brackets in Source Text

Some corpora use square brackets as part of the quoted source text itself:

```
Reference: "polyadenylated [poly(A)+] RNA export"
Quote: "polyadenylated [poly(A)+] RNA export"
```

By default, linkml-reference-validator strips all bracketed text for backward
compatibility. To preserve literal brackets, configure
`literal_bracket_patterns` in your validation config:

```yaml
validation:
  literal_bracket_patterns:
    - "\\d" # keep [2Fe-2S], [+21], [30S], [Ca2+]
    - "^\\S" # keep tight brackets like [poly(A)+], strip [ editorial ]
```

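Each pattern is matched against the content inside `[...]`. If any pattern
matches, that bracketed text is kept during validation. If none match, the
content is treated as an editorial insertion and stripped. Choose patterns with
care: a broad pattern such as `^\S` also matches ordinary editorial notes like
`[important]`, so with it configured only insertions written with a leading
space (for example `[ sic ]`) are stripped.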

## Ellipsis `...`

Use ellipsis (three dots) to indicate **omitted text** between parts of a quote.
Expand Down
14 changes: 14 additions & 0 deletions src/linkml_reference_validator/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,11 @@ class ReferenceValidationConfig(BaseModel):
['SRA', 'MGNIFY']
>>> config.unknown_prefix_severity
<ValidationSeverity.WARNING: 'WARNING'>
>>> config = ReferenceValidationConfig(
... literal_bracket_patterns=[r"\d", r"^[A-Z]$"]
... )
>>> config.literal_bracket_patterns
['\\d', '^[A-Z]$']
"""

cache_dir: Path = Field(
Expand Down Expand Up @@ -385,6 +390,15 @@ class ReferenceValidationConfig(BaseModel):
ge=1,
description="Regex capture group number containing the reference ID",
)
literal_bracket_patterns: list[str] = Field(
default_factory=list,
description=(
"Regular expressions matched against the content inside square brackets. "
"If any pattern matches, the bracketed text is treated as literal source "
"text and preserved during supporting text validation. "
"If no patterns are configured, all bracketed text is stripped."
),
)
reference_prefix_map: dict[str, str] = Field(
default_factory=dict,
description=(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ class SupportingTextValidator:
appears in the referenced publication using deterministic substring matching.

Supports:
- Editorial notes in [square brackets] that are ignored
- Editorial notes in [square brackets] that are ignored by default
- Configurable literal bracket patterns for bracketed source text
- Multi-part quotes with "..." separators indicating omitted text

Examples:
Expand All @@ -46,6 +47,9 @@ def __init__(self, config: ReferenceValidationConfig):
"""
self.config = config
self.fetcher = ReferenceFetcher(config)
self._literal_bracket_regexes = [
re.compile(pattern) for pattern in config.literal_bracket_patterns
]

def validate_title(
self,
Expand Down Expand Up @@ -287,7 +291,7 @@ def find_text_in_reference(
return self._substring_match(query_parts, reference.content, supporting_text)

def _split_query(self, text: str) -> list[str]:
"""Split query into parts separated by ... removing [...] editorial notes.
"""Split query into parts while stripping editorial brackets by default.

Args:
text: Query text
Expand All @@ -302,12 +306,28 @@ def _split_query(self, text: str) -> list[str]:
['protein functions', 'in cells']
>>> validator._split_query("protein [important] functions")
['protein functions']
>>> config = ReferenceValidationConfig(literal_bracket_patterns=[r"\\d"])
Copy link

Copilot AI Apr 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the _split_query() docstring example, literal_bracket_patterns=[r"\\d"] compiles a pattern that matches a literal \d sequence, not digits. This example likely intends r"\d" so the shown output is reproducible.

Suggested change
>>> config = ReferenceValidationConfig(literal_bracket_patterns=[r"\\d"])
>>> config = ReferenceValidationConfig(literal_bracket_patterns=[r"\d"])

Copilot uses AI. Check for mistakes.
>>> validator = SupportingTextValidator(config)
>>> validator._split_query("protein [important] binds [2Fe-2S] cluster")
['protein binds [2Fe-2S] cluster']
>>> validator._split_query("[editorial note]")
[]
"""
text_without_brackets = re.sub(r"\[.*?\]", " ", text)
if not self._literal_bracket_regexes:
text_without_brackets = re.sub(r"\[.*?\]", " ", text)
else:

def replace_bracket(match: re.Match[str]) -> str:
"""Preserve configured literal bracket content, strip editorial notes."""
content = match.group(1)
if any(regex.search(content) for regex in self._literal_bracket_regexes):
return match.group(0)
return " "

text_without_brackets = re.sub(r"\[(.*?)\]", replace_bracket, text)

parts = re.split(r"\s*\.{2,}\s*", text_without_brackets)
parts = [p.strip() for p in parts if p.strip()]
parts = [re.sub(r"\s+", " ", p).strip() for p in parts if p.strip()]
return parts

def _substring_match(
Expand Down
3 changes: 3 additions & 0 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,19 @@ def test_config_defaults():
config = ReferenceValidationConfig()
assert config.cache_dir == Path("references_cache")
assert config.rate_limit_delay == 0.5
assert config.literal_bracket_patterns == []


def test_config_custom_values():
    """Explicitly supplied settings replace the defaults on the config model."""
    custom_cache = Path("/tmp/cache")

    config = ReferenceValidationConfig(
        cache_dir=custom_cache,
        rate_limit_delay=1.0,
        literal_bracket_patterns=[r"\d"],
    )

    # Each overridden field must round-trip unchanged.
    assert config.cache_dir == Path("/tmp/cache")
    assert config.rate_limit_delay == 1.0
    assert config.literal_bracket_patterns == [r"\d"]


def test_config_get_cache_dir(tmp_path):
Expand Down
45 changes: 45 additions & 0 deletions tests/test_supporting_text_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,20 @@ def test_split_query_with_brackets(validator):
assert "important" not in parts[0]


def test_split_query_keeps_literal_brackets_when_pattern_matches(tmp_path):
    """Bracketed text matching a configured pattern survives query splitting."""
    settings = ReferenceValidationConfig(
        cache_dir=tmp_path / "cache",
        rate_limit_delay=0.0,
        literal_bracket_patterns=[r"\d"],
    )
    query = "protein [important] binds [2Fe-2S] cluster"

    # "[important]" contains no digit and is stripped; "[2Fe-2S]" matches \d
    # and must be kept in the resulting part.
    result = SupportingTextValidator(settings)._split_query(query)

    assert result == ["protein binds [2Fe-2S] cluster"]


def test_substring_match_found(validator):
"""Test substring matching when text is found."""
match = validator._substring_match(
Expand Down Expand Up @@ -163,6 +177,37 @@ def test_find_text_empty_query_after_brackets(validator):
assert "empty" in match.error_message.lower()


def test_find_text_in_reference_literal_brackets_require_config(validator):
    """Without configured patterns, bracketed query text is still stripped."""
    reference = ReferenceContent(
        reference_id="PMID:123",
        content="The [2Fe-2S] cluster is required for activity.",
    )
    query = "The [2Fe-2S] cluster"

    result = validator.find_text_in_reference(query, reference)

    # The bracketed part is removed from the query, so the remaining text is
    # not expected to match the reference.
    assert result.found is False


def test_find_text_in_reference_keeps_literal_brackets_with_config(tmp_path):
    """A configured pattern lets bracketed source text take part in matching."""
    settings = ReferenceValidationConfig(
        cache_dir=tmp_path / "cache",
        rate_limit_delay=0.0,
        literal_bracket_patterns=[r"\d"],
    )
    reference = ReferenceContent(
        reference_id="PMID:123",
        content="The [2Fe-2S] cluster is required for activity.",
    )
    literal_validator = SupportingTextValidator(settings)

    result = literal_validator.find_text_in_reference(
        "The [2Fe-2S] cluster", reference
    )

    # "[2Fe-2S]" contains a digit, so it is kept and the quote matches exactly.
    assert result.found is True
    assert result.similarity_score == 1.0


def test_validate_success(validator, mocker):
"""Test successful validation."""
mock_fetch = mocker.patch.object(validator.fetcher, "fetch")
Expand Down
17 changes: 17 additions & 0 deletions tests/test_validation_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,20 @@ def test_load_validation_config_ignores_repair_only(tmp_path):
config = load_validation_config(config_file)

assert config.reference_prefix_map == {}


def test_load_validation_config_literal_bracket_patterns(tmp_path):
    """Should load literal bracket patterns from validation config."""
    config_file = tmp_path / ".linkml-reference-validator.yaml"
    # The patterns must be nested under ``validation``; without the indentation
    # YAML parses ``literal_bracket_patterns`` as a top-level sibling key.
    # Escaping: ``\\\\d`` in this Python literal is written to disk as ``\\d``,
    # which a double-quoted YAML scalar decodes to the regex ``\d``.
    config_file.write_text(
        """
validation:
  literal_bracket_patterns:
    - "\\\\d"
    - "^\\\\S"
"""
    )

    config = load_validation_config(config_file)

    assert config.literal_bracket_patterns == [r"\d", r"^\S"]
Loading