From bf04dbae4910fdc0baaf2499e8370bc8e5a651f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Tue, 12 May 2026 10:24:12 +0200 Subject: [PATCH] Add teiHeader and standOff inclusion to get_passage() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces two opt-in parameters on Document.get_passage(): - include_header: prepends the full teiHeader to the reconstructed passage. - include_standoff: appends a filtered standOff, resolving three reference directions — passage → standOff (corresp/ref), standOff → passage (target/from/to), and transitive standOff → standOff (ana and similar attributes) via fixed-point expansion until the reference set stabilises. Both flags are exposed on create_app() in the Flask layer so a deployment can opt in server-wide without changing per-request logic. Adds a dedicated TEI test fixture (tei_with_standoff.xml) and tests covering all three linking cases, header/standOff ordering, and the document HTTP endpoint for each flag combination. --- dapytains/app/app.py | 13 ++- dapytains/tei/document.py | 124 ++++++++++++++++++++++++++- tests/catalog/standoff-catalog.xml | 9 ++ tests/tei/tei_with_standoff.xml | 53 ++++++++++++ tests/test_app.py | 133 +++++++++++++++++++++++++++++ tests/test_tei.py | 60 +++++++++++++ 6 files changed, 387 insertions(+), 5 deletions(-) create mode 100644 tests/catalog/standoff-catalog.xml create mode 100644 tests/tei/tei_with_standoff.xml diff --git a/dapytains/app/app.py b/dapytains/app/app.py index a47b7de..f12184e 100644 --- a/dapytains/app/app.py +++ b/dapytains/app/app.py @@ -89,7 +89,8 @@ def collection_view( }, ), mimetype="application/ld+json", status=200) -def document_view(resource, ref, start, end, tree, media, transformer: Transformer) -> Response: +def document_view(resource, ref, start, end, tree, media, transformer: Transformer, + include_header: bool = False, include_standoff: bool = False) -> Response: if not resource: return msg_4xx("Resource parameter was not provided") @@ -124,7 +125,8 @@ def document_view(resource, ref, start, end, tree, media, transformer: Transform return Response(content, mimetype="application/xml") doc = Document(collection.filepath) - passage = doc.get_passage(ref_or_start=ref or start, end=end, tree=tree) + passage = doc.get_passage(ref_or_start=ref or start, end=end, tree=tree, + include_header=include_header, include_standoff=include_standoff) if media and media != "application/xml": return transformer.transform(media, collection, passage) else: @@ -211,7 +213,9 @@ def get_templates( def create_app( app: Flask, use_query: bool = False, - media_transformer: Transformer = Transformer() + media_transformer: Transformer = Transformer(), + include_header: bool = False, + include_standoff: bool = False, ) -> (Flask, SQLAlchemy): """ @@ -270,7 +274,8 @@ def document_route(): end = request.args.get("end") tree = request.args.get("tree") media = request.args.get("mediaType") - return document_view(resource, ref, start, end, tree, media=media, transformer=media_transformer) + return document_view(resource, ref, start, end, tree, media=media, transformer=media_transformer, + include_header=include_header, include_standoff=include_standoff) return app, db diff --git a/dapytains/tei/document.py b/dapytains/tei/document.py index 2d24e74..8a260cd 100644 --- a/dapytains/tei/document.py +++ b/dapytains/tei/document.py @@ -12,6 +12,54 @@ COPY_UNTIL_END = -1 _namespace = re.compile(r"Q{(?P[^}]+)}(?P.+)") +_ID_REF = re.compile(r'#([\w.\-:]+)') +_XML_ID = "{http://www.w3.org/XML/1998/namespace}id" + + +def _collect_id_refs(element: Element) -> set: + """All #fragment values referenced in any attribute of the element tree.""" + refs = set() + for node in element.iter(): + for value in node.attrib.values(): + refs.update(_ID_REF.findall(value)) + return refs + + +def _collect_xml_ids(element: Element) -> set: + """All @xml:id values declared anywhere in the element tree.""" + ids = set() + for node in element.iter(): + xml_id = node.get(_XML_ID) + if xml_id: + ids.add(xml_id) + return ids + + +def _filter_standoff(element: Element, passage_refs: set, passage_xml_ids: set) -> bool: + """Recursively prune standOff in-place, keeping only relevant elements. + + An element is kept (returns True) when: + - its @xml:id is in passage_refs (passage points to it), or + - any of its attribute values reference an id in passage_xml_ids + (it points to a passage element, e.g. target="#w55"), or + - at least one of its descendants satisfies either condition. + + Call after the fixed-point expansion of passage_refs so that transitive + standOff→standOff references (e.g. @ana="#fs05") are already folded in. + """ + own_id = element.get(_XML_ID) + if own_id and own_id in passage_refs: + return True + for value in element.attrib.values(): + if any(ref in passage_xml_ids for ref in _ID_REF.findall(value)): + return True + keep = False + for child in list(element): + if _filter_standoff(child, passage_refs, passage_xml_ids): + keep = True + else: + element.remove(child) + return keep def xpath_split(string: str) -> List[str]: @@ -558,12 +606,30 @@ def __init__(self, file_path: str, processor: Optional[saxonlib.PySaxonProcessor self.default_tree: str = default - def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tree: Optional[str] = None) -> Element: + def get_passage( + self, + ref_or_start: Optional[str], + end: Optional[str] = None, + tree: Optional[str] = None, + include_header: bool = False, + include_standoff: bool = False, + ) -> Element: """ Retrieve a given passage from the document :param ref_or_start: First element of a range or single ref :param end: End of a range :param tree: Name of a specific tree + :param include_header: Prepend the full teiHeader to the result (default False) + :param include_standoff: Append filtered standOff content to the result (default False). + Three reference directions are resolved: + (1) passage → standOff: attributes such as @corresp/@ref in the passage pointing to + standOff @xml:id values; + (2) standOff → passage: @target/@from/@to etc. on standOff elements pointing to + @xml:id values present in the passage; + (3) transitive standOff → standOff: @ana and similar on already-included standOff + elements (fixed-point expansion until stable). + Note: when include_header is False, teiHeader entries referenced only from standOff + (e.g. taxonomy categories via @ana) are not individually extracted. """ if ref_or_start and not end: start, end = ref_or_start, None @@ -617,6 +683,62 @@ def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tr processor=self.xml_processor ) objectify.deannotate(root, cleanup_namespaces=True) + + if include_header: + xp = get_xpath_proc(self.xml, processor=self.xml_processor) + header_nodes = xpath_eval(xp, "/TEI/teiHeader") + if header_nodes: + xq = self.xml_processor.new_xquery_processor() + xq.set_context(xdm_item=header_nodes[0]) + header_str = xq.run_query_to_string(query_text=( + "declare namespace output = 'http://www.w3.org/2010/xslt-xquery-serialization';" + "declare option output:omit-xml-declaration 'yes';" + "." + )) + if header_str and header_str.startswith("<"): + root.insert(0, fromstring(header_str)) + + if include_standoff: + passage_refs = _collect_id_refs(root) + passage_xml_ids = _collect_xml_ids(root) + + xp = get_xpath_proc(self.xml, processor=self.xml_processor) + standoff_elems = [] + for so_node in xpath_eval(xp, "/TEI/standOff"): + xq = self.xml_processor.new_xquery_processor() + xq.set_context(xdm_item=so_node) + so_str = xq.run_query_to_string(query_text=( + "declare namespace output = 'http://www.w3.org/2010/xslt-xquery-serialization';" + "declare option output:omit-xml-declaration 'yes';" + "." + )) + if so_str and so_str.startswith("<"): + standoff_elems.append(fromstring(so_str)) + + if standoff_elems: + # Fixed-point: expand all_refs until included standOff elements + # reveal no further references (handles @ana and similar). + all_refs = set(passage_refs) + prev_size = -1 + while len(all_refs) != prev_size: + prev_size = len(all_refs) + for so_elem in standoff_elems: + for node in so_elem.iter(): + own_id = node.get(_XML_ID) + is_relevant = (own_id and own_id in all_refs) or any( + ref in passage_xml_ids + for val in node.attrib.values() + for ref in _ID_REF.findall(val) + ) + if is_relevant: + for val in node.attrib.values(): + all_refs.update(_ID_REF.findall(val)) + + for so_elem in standoff_elems: + _filter_standoff(so_elem, all_refs, passage_xml_ids) + if len(so_elem) or so_elem.text: + root.append(so_elem) + return root def get_reffs(self, tree: Optional[str] = None): diff --git a/tests/catalog/standoff-catalog.xml b/tests/catalog/standoff-catalog.xml new file mode 100644 index 0000000..7df5781 --- /dev/null +++ b/tests/catalog/standoff-catalog.xml @@ -0,0 +1,9 @@ + + + Standoff test collection + + + Standoff document + + + diff --git a/tests/tei/tei_with_standoff.xml b/tests/tei/tei_with_standoff.xml new file mode 100644 index 0000000..2736be4 --- /dev/null +++ b/tests/tei/tei_with_standoff.xml @@ -0,0 +1,53 @@ + + + + + + + + + + +
+

+ Martin + Luther + King + Martin Luther King + spoke in + Atlanta + and + Birmingham. +

+
+
+

+ Other + content + Dallas +

+
+ +
+ + + Atlanta33.755 -84.39 + Birmingham33.653333 -86.808889 + Dallas32.783 -96.8 + + + Martin Luther King Jr. + John F. Kennedy + + + + + + + + + NNP + JJ + + +
diff --git a/tests/test_app.py b/tests/test_app.py index 6a6f155..d9a0e38 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -8,6 +8,7 @@ import urllib basedir = os.path.abspath(os.path.dirname(__file__)) +STANDOFF_RESOURCE = "https://foo.bar/standoff" BASE_URI = "http://localhost" CONTEXT_URL = "https://dtsapi.org/context/v1.0.json" DTS_VERSION = "1.0" @@ -172,3 +173,135 @@ def test_collection(client): 'title': 'My First Collection', 'totalChildren': 1, 'totalParents': 1} == response.get_json() + + +# ── document endpoint fixtures ──────────────────────────────────────────────── + +def _make_standoff_app(include_header=False, include_standoff=False): + flask_app = Flask(__name__) + flask_app, db = create_app(flask_app, include_header=include_header, + include_standoff=include_standoff) + db_path = os.path.join(basedir, 'app_standoff.db') + flask_app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}' + flask_app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False + db.init_app(flask_app) + with flask_app.app_context(): + db.create_all() + catalog, _ = parse(f"{basedir}/catalog/standoff-catalog.xml") + store_catalog(catalog) + return flask_app, db + + +@pytest.fixture +def standoff_client_default(): + """Document endpoint with both flags off (default behaviour).""" + flask_app, db = _make_standoff_app(include_header=False, include_standoff=False) + yield flask_app.test_client() + with flask_app.app_context(): + db.session.remove() + db.drop_all() + + +@pytest.fixture +def standoff_client_header(): + """Document endpoint with include_header=True only.""" + flask_app, db = _make_standoff_app(include_header=True, include_standoff=False) + yield flask_app.test_client() + with flask_app.app_context(): + db.session.remove() + db.drop_all() + + +@pytest.fixture +def standoff_client_standoff(): + """Document endpoint with include_standoff=True only.""" + flask_app, db = _make_standoff_app(include_header=False, include_standoff=True) + yield flask_app.test_client() + with flask_app.app_context(): + db.session.remove() + db.drop_all() + + +@pytest.fixture +def standoff_client_both(): + """Document endpoint with both include_header=True and include_standoff=True.""" + flask_app, db = _make_standoff_app(include_header=True, include_standoff=True) + yield flask_app.test_client() + with flask_app.app_context(): + db.session.remove() + db.drop_all() + + +# ── helpers ─────────────────────────────────────────────────────────────────── + +def _get_doc(client, ref): + from urllib.parse import quote_plus + return client.get(f"/document/?resource={quote_plus(STANDOFF_RESOURCE)}&ref={ref}") + + +# ── document endpoint tests ─────────────────────────────────────────────────── + +def test_document_default_no_header_no_standoff(standoff_client_default): + """By default neither teiHeader nor standOff appears in the response.""" + response = _get_doc(standoff_client_default, "1") + assert response.status_code == 200 + body = response.data.decode() + assert "