distributed-text-services · PonteIneptique · May 12, 2026
diff --git a/dapytains/app/app.py b/dapytains/app/app.py
@@ -89,7 +89,8 @@ def collection_view(
     }, ), mimetype="application/ld+json", status=200)
 
 
-def document_view(resource, ref, start, end, tree, media, transformer: Transformer) -> Response:
+def document_view(resource, ref, start, end, tree, media, transformer: Transformer,
+                  include_header: bool = False, include_standoff: bool = False) -> Response:
     if not resource:
         return msg_4xx("Resource parameter was not provided")
 
@@ -124,7 +125,8 @@ def document_view(resource, ref, start, end, tree, media, transformer: Transform
         return Response(content, mimetype="application/xml")
 
     doc = Document(collection.filepath)
-    passage = doc.get_passage(ref_or_start=ref or start, end=end, tree=tree)
+    passage = doc.get_passage(ref_or_start=ref or start, end=end, tree=tree,
+                              include_header=include_header, include_standoff=include_standoff)
     if media and media != "application/xml":
         return transformer.transform(media, collection, passage)
     else:
@@ -211,7 +213,9 @@ def get_templates(
 def create_app(
         app: Flask,
         use_query: bool = False,
-        media_transformer: Transformer = Transformer()
+        media_transformer: Transformer = Transformer(),
+        include_header: bool = False,
+        include_standoff: bool = False,
 ) -> (Flask, SQLAlchemy):
     """
 
@@ -270,7 +274,8 @@ def document_route():
         end = request.args.get("end")
         tree = request.args.get("tree")
         media = request.args.get("mediaType")
-        return document_view(resource, ref, start, end, tree, media=media, transformer=media_transformer)
+        return document_view(resource, ref, start, end, tree, media=media, transformer=media_transformer,
+                             include_header=include_header, include_standoff=include_standoff)
 
     return app, db
 

diff --git a/dapytains/tei/document.py b/dapytains/tei/document.py
@@ -12,6 +12,54 @@
 
 COPY_UNTIL_END = -1
 _namespace = re.compile(r"Q{(?P<namespace>[^}]+)}(?P<tagname>.+)")
+_ID_REF = re.compile(r'#([\w.\-:]+)')  # captures the id after each '#' (word chars, '.', '-', ':')
+_XML_ID = "{http://www.w3.org/XML/1998/namespace}id"  # Clark-notation name of the xml:id attribute
+
+
+def _collect_id_refs(element: Element) -> set:
+    """Gather the id of every ``#id`` pointer in any attribute of element or its descendants."""
+    attr_values = (
+        value for descendant in element.iter() for value in descendant.attrib.values()
+    )
+    # A set collapses repeated pointers to the same id.
+    return {pointer for value in attr_values for pointer in _ID_REF.findall(value)}
+
+
+def _collect_xml_ids(element: Element) -> set:
+    """Return the set of non-empty @xml:id values on element and all of its descendants.
+
+    Nodes without the attribute, or with an empty value, contribute nothing.
+    """
+    declared = (node.get(_XML_ID) for node in element.iter())
+    # Filter on truthiness so that both None (absent) and "" (empty) are skipped.
+    return {xml_id for xml_id in declared if xml_id}
+
+
+def _filter_standoff(element: Element, passage_refs: set, passage_xml_ids: set) -> bool:
+    """Prune a standOff subtree in place and report whether element survives.
+
+    Returns True immediately, leaving the whole subtree untouched, when
+    - the element's own @xml:id is listed in passage_refs, or
+    - any attribute value holds a ``#id`` pointer into passage_xml_ids (e.g. target="#w55").
+    Otherwise each child is filtered recursively, failing children are detached,
+    and the result is True only if at least one child was retained.
+
+    passage_refs is used as given: transitive standOff→standOff references
+    (e.g. @ana="#fs05") must already be folded in by the caller.
+    """
+    identifier = element.get(_XML_ID)
+    if identifier and identifier in passage_refs:
+        return True
+    pointers = (p for value in element.attrib.values() for p in _ID_REF.findall(value))
+    if any(p in passage_xml_ids for p in pointers):
+        return True
+    retained = False
+    for child in tuple(element):  # snapshot: the loop detaches children as it goes
+        if not _filter_standoff(child, passage_refs, passage_xml_ids):
+            element.remove(child)
+            continue
+        retained = True
+    return retained
 
 
 def xpath_split(string: str) -> List[str]:
@@ -558,12 +606,30 @@ def __init__(self, file_path: str, processor: Optional[saxonlib.PySaxonProcessor
 
         self.default_tree: str = default
 
-    def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tree: Optional[str] = None) -> Element:
+    def get_passage(
+        self,
+        ref_or_start: Optional[str],
+        end: Optional[str] = None,
+        tree: Optional[str] = None,
+        include_header: bool = False,
+        include_standoff: bool = False,
+    ) -> Element:
         """ Retrieve a given passage from the document
 
         :param ref_or_start: First element of a range or single ref
         :param end: End of a range
         :param tree: Name of a specific tree
+        :param include_header: Prepend the full teiHeader to the result (default False)
+        :param include_standoff: Append filtered standOff content to the result (default False).
+            Three reference directions are resolved:
+            (1) passage → standOff: attributes such as @corresp/@ref in the passage pointing to
+                standOff @xml:id values;
+            (2) standOff → passage: @target/@from/@to etc. on standOff elements pointing to
+                @xml:id values present in the passage;
+            (3) transitive standOff → standOff: @ana and similar on already-included standOff
+                elements (fixed-point expansion until stable).
+            Note: when include_header is False, teiHeader entries referenced only from standOff
+            (e.g. taxonomy categories via @ana) are not individually extracted.
         """
         if ref_or_start and not end:
             start, end = ref_or_start, None
@@ -617,6 +683,62 @@ def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tr
             processor=self.xml_processor
         )
         objectify.deannotate(root, cleanup_namespaces=True)
+
+        if include_header:
+            xp = get_xpath_proc(self.xml, processor=self.xml_processor)
+            header_nodes = xpath_eval(xp, "/TEI/teiHeader")
+            if header_nodes:
+                xq = self.xml_processor.new_xquery_processor()
+                xq.set_context(xdm_item=header_nodes[0])
+                header_str = xq.run_query_to_string(query_text=(
+                    "declare namespace output = 'http://www.w3.org/2010/xslt-xquery-serialization';"
+                    "declare option output:omit-xml-declaration 'yes';"
+                    "."
+                ))
+                if header_str and header_str.startswith("<"):
+                    root.insert(0, fromstring(header_str))
+
+        if include_standoff:
+            passage_refs = _collect_id_refs(root)
+            passage_xml_ids = _collect_xml_ids(root)
+
+            xp = get_xpath_proc(self.xml, processor=self.xml_processor)
+            standoff_elems = []
+            for so_node in xpath_eval(xp, "/TEI/standOff"):
+                xq = self.xml_processor.new_xquery_processor()
+                xq.set_context(xdm_item=so_node)
+                so_str = xq.run_query_to_string(query_text=(
+                    "declare namespace output = 'http://www.w3.org/2010/xslt-xquery-serialization';"
+                    "declare option output:omit-xml-declaration 'yes';"
+                    "."
+                ))
+                if so_str and so_str.startswith("<"):
+                    standoff_elems.append(fromstring(so_str))
+
+            if standoff_elems:
+                # Fixed-point: expand all_refs until included standOff elements
+                # reveal no further references (handles @ana and similar).
+                all_refs = set(passage_refs)
+                prev_size = -1
+                while len(all_refs) != prev_size:
+                    prev_size = len(all_refs)
+                    for so_elem in standoff_elems:
+                        for node in so_elem.iter():
+                            own_id = node.get(_XML_ID)
+                            is_relevant = (own_id and own_id in all_refs) or any(
+                                ref in passage_xml_ids
+                                for val in node.attrib.values()
+                                for ref in _ID_REF.findall(val)
+                            )
+                            if is_relevant:
+                                for val in node.attrib.values():
+                                    all_refs.update(_ID_REF.findall(val))
+
+                for so_elem in standoff_elems:
+                    _filter_standoff(so_elem, all_refs, passage_xml_ids)
+                    if len(so_elem) or so_elem.text:
+                        root.append(so_elem)
+
         return root
 
     def get_reffs(self, tree: Optional[str] = None):

diff --git a/tests/catalog/standoff-catalog.xml b/tests/catalog/standoff-catalog.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<collection identifier="https://foo.bar/standoff-root">
+    <title>Standoff test collection</title>
+    <members>
+        <resource identifier="https://foo.bar/standoff" filepath="../tei/tei_with_standoff.xml">
+            <title>Standoff document</title>
+        </resource>
+    </members>
+</collection>
diff --git a/tests/tei/tei_with_standoff.xml b/tests/tei/tei_with_standoff.xml
@@ -0,0 +1,53 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0">
+  <teiHeader>
+    <encodingDesc>
+      <refsDecl>
+        <citeStructure unit="section" match="//body/div" use="@n"/>
+      </refsDecl>
+    </encodingDesc>
+  </teiHeader>
+  <text>
+    <body>
+      <div n="1">
+        <p>
+          <w xml:id="w1">Martin</w>
+          <w xml:id="w2">Luther</w>
+          <w xml:id="w3">King</w>
+          <persName ref="#MLK">Martin Luther King</persName>
+          spoke in
+          <placeName corresp="#LATL">Atlanta</placeName>
+          and
+          <placeName corresp="#LBHM">Birmingham</placeName>.
+        </p>
+      </div>
+      <div n="2">
+        <p>
+          <w xml:id="w7">Other</w>
+          <w xml:id="w8">content</w>
+          <placeName corresp="#LDAL">Dallas</placeName>
+        </p>
+      </div>
+    </body>
+  </text>
+  <standOff>
+    <listPlace>
+      <place xml:id="LATL"><placeName>Atlanta</placeName><location><geo>33.755 -84.39</geo></location></place>
+      <place xml:id="LBHM"><placeName>Birmingham</placeName><location><geo>33.653333 -86.808889</geo></location></place>
+      <place xml:id="LDAL"><placeName>Dallas</placeName><location><geo>32.783 -96.8</geo></location></place>
+    </listPlace>
+    <listPerson>
+      <person xml:id="MLK"><persName>Martin Luther King Jr.</persName></person>
+      <person xml:id="JFK"><persName>John F. Kennedy</persName></person>
+    </listPerson>
+    <spanGrp type="pos">
+      <span xml:id="ann1" target="#w1" ana="#pos-NNP"/>
+      <span xml:id="ann2" target="#w2" ana="#pos-NNP"/>
+      <span xml:id="ann3" target="#w3" ana="#pos-NNP"/>
+      <span xml:id="ann4" target="#w7" ana="#pos-JJ"/>
+    </spanGrp>
+    <fvLib>
+      <fs xml:id="pos-NNP"><f name="pos"><string>NNP</string></f></fs>
+      <fs xml:id="pos-JJ"><f name="pos"><string>JJ</string></f></fs>
+    </fvLib>
+  </standOff>
+</TEI>
diff --git a/tests/test_app.py b/tests/test_app.py
@@ -8,6 +8,7 @@
 import urllib
 
 basedir = os.path.abspath(os.path.dirname(__file__))
+STANDOFF_RESOURCE = "https://foo.bar/standoff"
 BASE_URI = "http://localhost"
 CONTEXT_URL = "https://dtsapi.org/context/v1.0.json"
 DTS_VERSION = "1.0"
@@ -172,3 +173,135 @@ def test_collection(client):
             'title': 'My First Collection',
             'totalChildren': 1,
             'totalParents': 1} == response.get_json()
+
+
+# ── document endpoint fixtures ────────────────────────────────────────────────
+
+def _make_standoff_app(include_header=False, include_standoff=False):
+    """Create an app on the app_standoff.db SQLite file and load the standoff catalog into it."""
+    flask_app, db = create_app(Flask(__name__), include_header=include_header,
+                               include_standoff=include_standoff)
+    db_path = os.path.join(basedir, 'app_standoff.db')
+    flask_app.config.update(SQLALCHEMY_DATABASE_URI=f'sqlite:///{db_path}',
+                            SQLALCHEMY_TRACK_MODIFICATIONS=False)
+    db.init_app(flask_app)
+    with flask_app.app_context():
+        db.create_all()
+        catalog, _ = parse(f"{basedir}/catalog/standoff-catalog.xml")
+        store_catalog(catalog)
+    return flask_app, db
+
+
+@pytest.fixture
+def standoff_client_default():
+    """Test client for an app built with include_header=False and include_standoff=False."""
+    app, database = _make_standoff_app(include_header=False, include_standoff=False)
+    yield app.test_client()
+    with app.app_context():  # teardown: release the session, then drop every table
+        database.session.remove()
+        database.drop_all()
+
+
+@pytest.fixture
+def standoff_client_header():
+    """Test client for an app built with include_header=True and include_standoff=False."""
+    app, database = _make_standoff_app(include_header=True, include_standoff=False)
+    yield app.test_client()
+    with app.app_context():  # teardown: release the session, then drop every table
+        database.session.remove()
+        database.drop_all()
+
+
+@pytest.fixture
+def standoff_client_standoff():
+    """Test client for an app built with include_header=False and include_standoff=True."""
+    app, database = _make_standoff_app(include_header=False, include_standoff=True)
+    yield app.test_client()
+    with app.app_context():  # teardown: release the session, then drop every table
+        database.session.remove()
+        database.drop_all()
+
+
+@pytest.fixture
+def standoff_client_both():
+    """Test client for an app built with include_header=True and include_standoff=True."""
+    app, database = _make_standoff_app(include_header=True, include_standoff=True)
+    yield app.test_client()
+    with app.app_context():  # teardown: release the session, then drop every table
+        database.session.remove()
+        database.drop_all()
+
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+def _get_doc(client, ref):
+    """GET /document/ for the standoff resource; the client URL-encodes both query values."""
+    return client.get("/document/", query_string={"resource": STANDOFF_RESOURCE, "ref": ref})
+
+
+# ── document endpoint tests ───────────────────────────────────────────────────
+
+def test_document_default_no_header_no_standoff(standoff_client_default):
+    """With both flags off the response carries the passage but no teiHeader or standOff."""
+    reply = _get_doc(standoff_client_default, "1")
+    assert reply.status_code == 200
+    xml = reply.data.decode()
+    # The requested passage itself must be served...
+    assert 'xml:id="w1"' in xml
+    # ...while neither optional section is attached.
+    assert not any(tag in xml for tag in ("<teiHeader", "<standOff"))
+
+
+def test_document_include_header(standoff_client_header):
+    """With include_header=True the teiHeader is served ahead of text, without any standOff."""
+    reply = _get_doc(standoff_client_header, "1")
+    assert reply.status_code == 200
+    xml = reply.data.decode()
+    assert "<standOff" not in xml
+    # str.find returns -1 when absent, so the lower bound doubles as the presence check.
+    assert 0 <= xml.find("<teiHeader") < xml.index("<text")
+
+
+def test_document_include_standoff(standoff_client_standoff):
+    """include_standoff=True attaches only the standOff entries relevant to the passage.
+
+    For div n="1": LATL, LBHM and MLK are pointed at by the passage (case 1);
+    ann1–ann3 target its tokens w1–w3 (case 2); their @ana="#pos-NNP" brings in
+    pos-NNP transitively (case 3).  LDAL, JFK, ann4 and pos-JJ belong to
+    div n="2" only and must be absent.
+    """
+    reply = _get_doc(standoff_client_standoff, "1")
+    assert reply.status_code == 200
+    xml = reply.data.decode()
+
+    assert "<standOff" in xml
+    assert "<teiHeader" not in xml
+    assert xml.index("<text") < xml.index("<standOff")
+
+    expected = [
+        # case 1: passage → standOff
+        'xml:id="LATL"', 'xml:id="LBHM"', 'xml:id="MLK"',
+        # case 2: standOff → passage
+        'target="#w1"', 'target="#w2"', 'target="#w3"',
+        # case 3: transitive standOff → standOff
+        'xml:id="pos-NNP"',
+    ]
+    excluded = [
+        'xml:id="LDAL"', 'xml:id="JFK"',  # case 1
+        'target="#w7"',  # case 2
+        'xml:id="pos-JJ"',  # case 3
+    ]
+    for fragment in expected:
+        assert fragment in xml
+    for fragment in excluded:
+        assert fragment not in xml
+
+
+def test_document_include_header_and_standoff(standoff_client_both):
+    """With both flags on, the sections arrive in the order teiHeader, text, standOff."""
+    reply = _get_doc(standoff_client_both, "1")
+    assert reply.status_code == 200
+    xml = reply.data.decode()
+    # str.find yields -1 for a missing tag, which breaks the chain at its first link.
+    head, text, tail = (xml.find(tag) for tag in ("<teiHeader", "<text", "<standOff"))
+    assert 0 <= head < text < tail