Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions dapytains/app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ def collection_view(
}, ), mimetype="application/ld+json", status=200)


def document_view(resource, ref, start, end, tree, media, transformer: Transformer) -> Response:
def document_view(resource, ref, start, end, tree, media, transformer: Transformer,
include_header: bool = False, include_standoff: bool = False) -> Response:
if not resource:
return msg_4xx("Resource parameter was not provided")

Expand Down Expand Up @@ -124,7 +125,8 @@ def document_view(resource, ref, start, end, tree, media, transformer: Transform
return Response(content, mimetype="application/xml")

doc = Document(collection.filepath)
passage = doc.get_passage(ref_or_start=ref or start, end=end, tree=tree)
passage = doc.get_passage(ref_or_start=ref or start, end=end, tree=tree,
include_header=include_header, include_standoff=include_standoff)
if media and media != "application/xml":
return transformer.transform(media, collection, passage)
else:
Expand Down Expand Up @@ -211,7 +213,9 @@ def get_templates(
def create_app(
app: Flask,
use_query: bool = False,
media_transformer: Transformer = Transformer()
media_transformer: Transformer = Transformer(),
include_header: bool = False,
include_standoff: bool = False,
) -> (Flask, SQLAlchemy):
"""

Expand Down Expand Up @@ -270,7 +274,8 @@ def document_route():
end = request.args.get("end")
tree = request.args.get("tree")
media = request.args.get("mediaType")
return document_view(resource, ref, start, end, tree, media=media, transformer=media_transformer)
return document_view(resource, ref, start, end, tree, media=media, transformer=media_transformer,
include_header=include_header, include_standoff=include_standoff)

return app, db

Expand Down
124 changes: 123 additions & 1 deletion dapytains/tei/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,54 @@

COPY_UNTIL_END = -1
_namespace = re.compile(r"Q{(?P<namespace>[^}]+)}(?P<tagname>.+)")
_ID_REF = re.compile(r'#([\w.\-:]+)')
_XML_ID = "{http://www.w3.org/XML/1998/namespace}id"


def _collect_id_refs(element: Element) -> set:
    """Gather every ``#fragment`` pointer found in the attributes of a tree.

    The element itself and all nodes yielded by ``element.iter()`` are
    inspected; each attribute value is scanned with ``_ID_REF`` so a single
    attribute holding several pointers contributes all of them.

    :param element: Root of the tree to scan
    :return: Set of referenced identifiers as captured by ``_ID_REF``
    """
    return {
        fragment
        for node in element.iter()
        for attr_value in node.attrib.values()
        for fragment in _ID_REF.findall(attr_value)
    }


def _collect_xml_ids(element: Element) -> set:
    """Gather every non-empty ``@xml:id`` declared in a tree.

    :param element: Root of the tree to scan (the root itself is included)
    :return: Set of declared identifiers; missing or empty ids are skipped
    """
    declared = (node.get(_XML_ID) for node in element.iter())
    return {identifier for identifier in declared if identifier}


def _filter_standoff(element: Element, passage_refs: set, passage_xml_ids: set) -> bool:
    """Prune a standOff subtree in place, keeping only what the passage needs.

    The element is considered relevant, and left untouched together with its
    whole subtree, when either:
      - its own @xml:id is listed in ``passage_refs`` (something points to it), or
      - one of its attributes points to an id in ``passage_xml_ids``
        (it targets a passage element, e.g. target="#w55").

    Otherwise each child is examined recursively: irrelevant children are
    removed from ``element`` and the element is relevant only if at least one
    child survived.

    ``passage_refs`` is expected to be already expanded to a fixed point, so
    that transitive standOff→standOff pointers (e.g. @ana="#fs05") are
    part of it before this function runs.

    :param element: standOff element (or descendant) to filter, mutated in place
    :param passage_refs: Identifiers that are pointed to
    :param passage_xml_ids: Identifiers declared inside the passage
    :return: True when the element must be kept by its parent
    """
    xml_id = element.get(_XML_ID)
    if xml_id and xml_id in passage_refs:
        return True

    for attr_value in element.attrib.values():
        for target in _ID_REF.findall(attr_value):
            if target in passage_xml_ids:
                return True

    has_relevant_child = False
    # Iterate over a snapshot: children are removed from the live element.
    for child in list(element):
        if not _filter_standoff(child, passage_refs, passage_xml_ids):
            element.remove(child)
            continue
        has_relevant_child = True
    return has_relevant_child


def xpath_split(string: str) -> List[str]:
Expand Down Expand Up @@ -558,12 +606,30 @@ def __init__(self, file_path: str, processor: Optional[saxonlib.PySaxonProcessor

self.default_tree: str = default

def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tree: Optional[str] = None) -> Element:
def get_passage(
self,
ref_or_start: Optional[str],
end: Optional[str] = None,
tree: Optional[str] = None,
include_header: bool = False,
include_standoff: bool = False,
) -> Element:
""" Retrieve a given passage from the document

:param ref_or_start: First element of a range or single ref
:param end: End of a range
:param tree: Name of a specific tree
:param include_header: Prepend the full teiHeader to the result (default False)
:param include_standoff: Append filtered standOff content to the result (default False).
Three reference directions are resolved:
(1) passage → standOff: attributes such as @corresp/@ref in the passage pointing to
standOff @xml:id values;
(2) standOff → passage: @target/@from/@to etc. on standOff elements pointing to
@xml:id values present in the passage;
(3) transitive standOff → standOff: @ana and similar on already-included standOff
elements (fixed-point expansion until stable).
Note: when include_header is False, teiHeader entries referenced only from standOff
(e.g. taxonomy categories via @ana) are not individually extracted.
"""
if ref_or_start and not end:
start, end = ref_or_start, None
Expand Down Expand Up @@ -617,6 +683,62 @@ def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tr
processor=self.xml_processor
)
objectify.deannotate(root, cleanup_namespaces=True)

if include_header:
xp = get_xpath_proc(self.xml, processor=self.xml_processor)
header_nodes = xpath_eval(xp, "/TEI/teiHeader")
if header_nodes:
xq = self.xml_processor.new_xquery_processor()
xq.set_context(xdm_item=header_nodes[0])
header_str = xq.run_query_to_string(query_text=(
"declare namespace output = 'http://www.w3.org/2010/xslt-xquery-serialization';"
"declare option output:omit-xml-declaration 'yes';"
"."
))
if header_str and header_str.startswith("<"):
root.insert(0, fromstring(header_str))

if include_standoff:
passage_refs = _collect_id_refs(root)
passage_xml_ids = _collect_xml_ids(root)

xp = get_xpath_proc(self.xml, processor=self.xml_processor)
standoff_elems = []
for so_node in xpath_eval(xp, "/TEI/standOff"):
xq = self.xml_processor.new_xquery_processor()
xq.set_context(xdm_item=so_node)
so_str = xq.run_query_to_string(query_text=(
"declare namespace output = 'http://www.w3.org/2010/xslt-xquery-serialization';"
"declare option output:omit-xml-declaration 'yes';"
"."
))
if so_str and so_str.startswith("<"):
standoff_elems.append(fromstring(so_str))

if standoff_elems:
# Fixed-point: expand all_refs until included standOff elements
# reveal no further references (handles @ana and similar).
all_refs = set(passage_refs)
prev_size = -1
while len(all_refs) != prev_size:
prev_size = len(all_refs)
for so_elem in standoff_elems:
for node in so_elem.iter():
own_id = node.get(_XML_ID)
is_relevant = (own_id and own_id in all_refs) or any(
ref in passage_xml_ids
for val in node.attrib.values()
for ref in _ID_REF.findall(val)
)
if is_relevant:
for val in node.attrib.values():
all_refs.update(_ID_REF.findall(val))

for so_elem in standoff_elems:
_filter_standoff(so_elem, all_refs, passage_xml_ids)
if len(so_elem) or so_elem.text:
root.append(so_elem)

return root

def get_reffs(self, tree: Optional[str] = None):
Expand Down
9 changes: 9 additions & 0 deletions tests/catalog/standoff-catalog.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<collection identifier="https://foo.bar/standoff-root">
<title>Standoff test collection</title>
<members>
<resource identifier="https://foo.bar/standoff" filepath="../tei/tei_with_standoff.xml">
<title>Standoff document</title>
</resource>
</members>
</collection>
53 changes: 53 additions & 0 deletions tests/tei/tei_with_standoff.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<encodingDesc>
<refsDecl>
<citeStructure unit="section" match="//body/div" use="@n"/>
</refsDecl>
</encodingDesc>
</teiHeader>
<text>
<body>
<div n="1">
<p>
<w xml:id="w1">Martin</w>
<w xml:id="w2">Luther</w>
<w xml:id="w3">King</w>
<persName ref="#MLK">Martin Luther King</persName>
spoke in
<placeName corresp="#LATL">Atlanta</placeName>
and
<placeName corresp="#LBHM">Birmingham</placeName>.
</p>
</div>
<div n="2">
<p>
<w xml:id="w7">Other</w>
<w xml:id="w8">content</w>
<placeName corresp="#LDAL">Dallas</placeName>
</p>
</div>
</body>
</text>
<standOff>
<listPlace>
<place xml:id="LATL"><placeName>Atlanta</placeName><location><geo>33.755 -84.39</geo></location></place>
<place xml:id="LBHM"><placeName>Birmingham</placeName><location><geo>33.653333 -86.808889</geo></location></place>
<place xml:id="LDAL"><placeName>Dallas</placeName><location><geo>32.783 -96.8</geo></location></place>
</listPlace>
<listPerson>
<person xml:id="MLK"><persName>Martin Luther King Jr.</persName></person>
<person xml:id="JFK"><persName>John F. Kennedy</persName></person>
</listPerson>
<spanGrp type="pos">
<span xml:id="ann1" target="#w1" ana="#pos-NNP"/>
<span xml:id="ann2" target="#w2" ana="#pos-NNP"/>
<span xml:id="ann3" target="#w3" ana="#pos-NNP"/>
<span xml:id="ann4" target="#w7" ana="#pos-JJ"/>
</spanGrp>
<fvLib>
<fs xml:id="pos-NNP"><f name="pos"><string>NNP</string></f></fs>
<fs xml:id="pos-JJ"><f name="pos"><string>JJ</string></f></fs>
</fvLib>
</standOff>
</TEI>
133 changes: 133 additions & 0 deletions tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import urllib

basedir = os.path.abspath(os.path.dirname(__file__))
STANDOFF_RESOURCE = "https://foo.bar/standoff"
BASE_URI = "http://localhost"
CONTEXT_URL = "https://dtsapi.org/context/v1.0.json"
DTS_VERSION = "1.0"
Expand Down Expand Up @@ -172,3 +173,135 @@ def test_collection(client):
'title': 'My First Collection',
'totalChildren': 1,
'totalParents': 1} == response.get_json()


# ── document endpoint fixtures ────────────────────────────────────────────────

def _make_standoff_app(include_header=False, include_standoff=False):
    """Create a Flask app serving the standoff test catalog.

    The app is built through ``create_app`` with the given flags, bound to the
    ``app_standoff.db`` SQLite file next to this test module, and populated
    with ``catalog/standoff-catalog.xml``.

    :param include_header: Forwarded to ``create_app``
    :param include_standoff: Forwarded to ``create_app``
    :return: Tuple ``(flask_app, db)``
    """
    flask_app, db = create_app(
        Flask(__name__),
        include_header=include_header,
        include_standoff=include_standoff,
    )

    db_path = os.path.join(basedir, 'app_standoff.db')
    flask_app.config.update({
        'SQLALCHEMY_DATABASE_URI': f'sqlite:///{db_path}',
        'SQLALCHEMY_TRACK_MODIFICATIONS': False,
    })
    db.init_app(flask_app)

    # Schema creation and catalog ingestion both go through the database,
    # hence the application context.
    with flask_app.app_context():
        db.create_all()
        catalog, _ = parse(f"{basedir}/catalog/standoff-catalog.xml")
        store_catalog(catalog)

    return flask_app, db


def _standoff_client(include_header, include_standoff):
    """Yield a test client for a standoff app, then tear its database down.

    Shared generator behind the ``standoff_client_*`` fixtures, which only
    differ by the two flags handed to ``_make_standoff_app``.

    :param include_header: Forwarded to ``_make_standoff_app``
    :param include_standoff: Forwarded to ``_make_standoff_app``
    """
    flask_app, db = _make_standoff_app(
        include_header=include_header,
        include_standoff=include_standoff,
    )
    yield flask_app.test_client()
    # Teardown: runs once the test using the fixture has finished.
    with flask_app.app_context():
        db.session.remove()
        db.drop_all()


@pytest.fixture
def standoff_client_default():
    """Document endpoint with both flags off (default behaviour)."""
    yield from _standoff_client(include_header=False, include_standoff=False)


@pytest.fixture
def standoff_client_header():
    """Document endpoint with include_header=True only."""
    yield from _standoff_client(include_header=True, include_standoff=False)


@pytest.fixture
def standoff_client_standoff():
    """Document endpoint with include_standoff=True only."""
    yield from _standoff_client(include_header=False, include_standoff=True)


@pytest.fixture
def standoff_client_both():
    """Document endpoint with both include_header=True and include_standoff=True."""
    yield from _standoff_client(include_header=True, include_standoff=True)


# ── helpers ───────────────────────────────────────────────────────────────────

def _get_doc(client, ref):
    """GET the document endpoint for the standoff resource at ``ref``.

    Both query parameters are percent-encoded, so a reference containing
    characters that are special in a query string (``&``, ``#``, ``+``,
    spaces) is sent intact instead of corrupting the request.

    :param client: Flask test client
    :param ref: Citable reference to request
    :return: The response returned by ``client.get``
    """
    from urllib.parse import urlencode
    query = urlencode({"resource": STANDOFF_RESOURCE, "ref": ref})
    return client.get(f"/document/?{query}")


# ── document endpoint tests ───────────────────────────────────────────────────

def test_document_default_no_header_no_standoff(standoff_client_default):
    """With default flags the response carries the passage only."""
    response = _get_doc(standoff_client_default, "1")
    assert response.status_code == 200

    xml = response.data.decode()
    # Neither optional section is emitted...
    assert "<teiHeader" not in xml
    assert "<standOff" not in xml
    # ...while the passage itself is present.
    assert 'xml:id="w1"' in xml


def test_document_include_header(standoff_client_header):
    """include_header=True puts teiHeader ahead of text and adds no standOff."""
    response = _get_doc(standoff_client_header, "1")
    assert response.status_code == 200

    xml = response.data.decode()
    assert "<teiHeader" in xml
    assert "<standOff" not in xml

    header_pos = xml.index("<teiHeader")
    text_pos = xml.index("<text")
    assert header_pos < text_pos


def test_document_include_standoff(standoff_client_standoff):
    """include_standoff=True returns only the standOff entries tied to the passage.

    For div n="1" of the fixture document:
      - case 1: the passage points to LATL, LBHM and MLK;
      - case 2: spans ann1–ann3 target its word tokens w1–w3;
      - case 3: those spans carry @ana="#pos-NNP", pulling pos-NNP in transitively.
    LDAL, JFK, ann4 and pos-JJ relate to div n="2" only and must be absent.
    """
    response = _get_doc(standoff_client_standoff, "1")
    assert response.status_code == 200
    xml = response.data.decode()

    assert "<teiHeader" not in xml
    assert "<standOff" in xml

    # Each entry: (snippets that must be present, snippets that must be absent)
    expectations = (
        # case 1: passage → standOff
        (
            ('xml:id="LATL"', 'xml:id="LBHM"', 'xml:id="MLK"'),
            ('xml:id="LDAL"', 'xml:id="JFK"'),
        ),
        # case 2: standOff → passage
        (
            ('target="#w1"', 'target="#w2"', 'target="#w3"'),
            ('target="#w7"',),
        ),
        # case 3: transitive standOff → standOff
        (
            ('xml:id="pos-NNP"',),
            ('xml:id="pos-JJ"',),
        ),
    )
    for kept, dropped in expectations:
        for snippet in kept:
            assert snippet in xml
        for snippet in dropped:
            assert snippet not in xml

    text_pos = xml.index("<text")
    standoff_pos = xml.index("<standOff")
    assert text_pos < standoff_pos


def test_document_include_header_and_standoff(standoff_client_both):
    """Both flags on: sections come out as teiHeader, then text, then standOff."""
    response = _get_doc(standoff_client_both, "1")
    assert response.status_code == 200

    xml = response.data.decode()
    assert "<teiHeader" in xml
    assert "<standOff" in xml

    header_pos = xml.index("<teiHeader")
    text_pos = xml.index("<text")
    standoff_pos = xml.index("<standOff")
    assert header_pos < text_pos
    assert text_pos < standoff_pos
Loading
Loading