implement bytestream-handling & serialization methods

OGuggenbuehl · OGuggenbuehl · commit db8f46975a1c · 2025-07-11T15:57:07.000+02:00
simplify bytestream handling and add test

Signed-off-by: Oliver Guggenbühl <oliver.guggenbuehl@deepset.ai>
diff --git a/docling_haystack/converter.py b/docling_haystack/converter.py
@@ -5,6 +5,7 @@
 
 """Docling Haystack converter module."""
 
+import tempfile
 from abc import ABC, abstractmethod
 from enum import Enum
 from pathlib import Path
@@ -13,7 +14,10 @@
 from docling.chunking import BaseChunk, BaseChunker, HybridChunker
 from docling.datamodel.document import DoclingDocument
 from docling.document_converter import DocumentConverter
-from haystack import Document, component
+from haystack import Document, component, default_from_dict, default_to_dict, logging
+from haystack.dataclasses.byte_stream import ByteStream
+
+logger = logging.getLogger(__name__)
 
 
 class ExportType(str, Enum):
@@ -100,42 +104,109 @@ def __init__(
             )
         self._meta_extractor = meta_extractor or MetaExtractor()
 
+    def _handle_bytestream(self, bytestream: ByteStream) -> tuple[str, bool]:
+        """Write the ByteStream data to a new temporary file and return ``(path, True)``."""
+        suffix = (  # "." + meta["file_extension"] when that key is present and truthy, otherwise no suffix
+            f".{bytestream.meta.get('file_extension', '')}"  # NOTE(review): a value already starting with "." would yield a double dot - confirm callers
+            if bytestream.meta.get("file_extension")
+            else None
+        )
+        temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)  # delete=False keeps the file on disk after close(); run() unlinks it
+        temp_file.write(bytestream.data)
+        temp_file.close()
+        return temp_file.name, True  # the flag is always True in this implementation
+
     @component.output_types(documents=list[Document])
     def run(
         self,
-        paths: Iterable[Union[Path, str]],
+        paths: Iterable[Union[Path, str, ByteStream]],
     ):
         """Run the DoclingConverter.
 
         Args:
-            paths: The input document locations, either as local paths or URLs.
+            paths: The input document locations, either as local paths, URLs, or ByteStream objects.
 
         Returns:
             list[Document]: The output Haystack Documents.
         """
         documents: list[Document] = []
-        for filepath in paths:
-            dl_doc = self._converter.convert(
-                source=filepath,
-                **self._convert_kwargs,
-            ).document
-
-            if self._export_type == ExportType.DOC_CHUNKS:
-                chunk_iter = self._chunker.chunk(dl_doc=dl_doc)
-                hs_docs = [
-                    Document(
-                        content=self._chunker.serialize(chunk=chunk),
-                        meta=self._meta_extractor.extract_chunk_meta(chunk=chunk),
+        temp_files = []  # Track temporary files to clean up later
+
+        try:  # outer try/finally: temp files are removed on every exit path
+            for source in paths:
+                try:  # per-source guard: any failure below is logged and the source is skipped
+                    if isinstance(source, ByteStream):
+                        filepath, is_temp = self._handle_bytestream(source)
+                        if is_temp:  # always True with the current _handle_bytestream
+                            temp_files.append(filepath)
+                    else:
+                        filepath = str(source)  # Path and str inputs are handed to the converter as plain strings
+
+                    dl_doc = self._converter.convert(
+                        source=filepath,
+                        **self._convert_kwargs,
+                    ).document
+
+                    if self._export_type == ExportType.DOC_CHUNKS:
+                        chunk_iter = self._chunker.chunk(dl_doc=dl_doc)
+                        hs_docs = [
+                            Document(
+                                content=self._chunker.serialize(chunk=chunk),
+                                meta=self._meta_extractor.extract_chunk_meta(
+                                    chunk=chunk
+                                ),
+                            )
+                            for chunk in chunk_iter
+                        ]
+                        documents.extend(hs_docs)
+                    elif self._export_type == ExportType.MARKDOWN:
+                        hs_doc = Document(
+                            content=dl_doc.export_to_markdown(**self._md_export_kwargs),
+                            meta=self._meta_extractor.extract_dl_doc_meta(
+                                dl_doc=dl_doc
+                            ),
+                        )
+                        documents.append(hs_doc)
+                    else:
+                        raise RuntimeError(  # NOTE(review): caught by the enclosing `except Exception`, so this is logged rather than propagated
+                            f"Unexpected export type: {self._export_type}"
+                        )
+                except Exception as e:  # broad catch: a failing source does not abort the remaining ones
+                    logger.warning(
+                        "Could not process {source}. Skipping it. Error: {error}",
+                        source=source,
+                        error=e,
                     )
-                    for chunk in chunk_iter
-                ]
-                documents.extend(hs_docs)
-            elif self._export_type == ExportType.MARKDOWN:
-                hs_doc = Document(
-                    content=dl_doc.export_to_markdown(**self._md_export_kwargs),
-                    meta=self._meta_extractor.extract_dl_doc_meta(dl_doc=dl_doc),
-                )
-                documents.append(hs_doc)
-            else:
-                raise RuntimeError(f"Unexpected export type: {self._export_type}")
-        return {"documents": documents}
+            return {"documents": documents}  # dict keyed by the declared output name "documents"
+        finally:  # cleanup
+            for temp_file in temp_files:
+                try:
+                    Path(temp_file).unlink()
+                except Exception as e:
+                    logger.debug(f"Failed to delete temporary file {temp_file}: {e}")  # NOTE(review): f-string here, keyword placeholders in the warning above - confirm which style the logger expects
+
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Serialize the component to a dictionary for pipeline persistence.
+
+        Returns:
+            dict[str, Any]: Result of ``default_to_dict`` carrying ``convert_kwargs`` and ``md_export_kwargs``.
+        """
+        return default_to_dict(  # NOTE(review): only these two kwargs are passed - confirm the other init arguments need no persistence
+            self,
+            convert_kwargs=self._convert_kwargs,
+            md_export_kwargs=self._md_export_kwargs,
+        )
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter":
+        """
+        Deserialize the component from a dictionary.
+
+        Args:
+            data: Dictionary representation of the component, e.g. the output of ``to_dict``.
+
+        Returns:
+            DoclingConverter: A new instance built by ``default_from_dict``.
+        """
+        return default_from_dict(cls, data)
diff --git a/test/test_converter.py b/test/test_converter.py
@@ -3,6 +3,7 @@
 
 from docling.chunking import HybridChunker
 from docling.datamodel.document import DoclingDocument
+from haystack.dataclasses.byte_stream import ByteStream
 
 from docling_haystack.converter import DoclingConverter, ExportType
 
@@ -80,3 +81,80 @@ def test_convert_markdown(monkeypatch):
     with open(EXPECTED_OUT_FILE) as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
+
+
+def test_serialization_deserialization():
+    """Test that to_dict/from_dict round-trip convert_kwargs and md_export_kwargs."""
+    converter = DoclingConverter(
+        convert_kwargs={"optimize_ocr": True},
+        md_export_kwargs={"image_placeholder": "[IMAGE]"},
+    )
+
+    # serialize the component to dict
+    serialized = converter.to_dict()
+
+    assert "init_parameters" in serialized
+    assert serialized["init_parameters"].get("convert_kwargs") == {"optimize_ocr": True}
+
+    md_export_kwargs = serialized["init_parameters"].get("md_export_kwargs", {})
+    assert md_export_kwargs.get("image_placeholder") == "[IMAGE]"
+
+    # deserialize back to component
+    deserialized = DoclingConverter.from_dict(serialized)
+    assert deserialized._convert_kwargs == {"optimize_ocr": True}  # checks the private attribute set from the init argument
+
+    assert deserialized._md_export_kwargs.get("image_placeholder") == "[IMAGE]"
+
+
+def test_bytestream_handling(monkeypatch):
+    """Test conversion from a ByteStream with DocumentConverter mocked out."""
+    with open("test/data/2408.09869v5.md", "rb") as f:
+        data = f.read()
+
+    bytestream = ByteStream(
+        data=data,
+        meta={"file_extension": "md", "filename": "test_file.md"},
+    )
+    convert_mock = MagicMock()  # stands in for DocumentConverter.convert
+
+    with open("test/data/2408.09869v5.json") as f:
+        data_json = f.read()
+    mock_dl_doc = DoclingDocument.model_validate_json(data_json)  # canned document returned by the mocked convert call
+
+    mock_response = MagicMock()
+    mock_response.document = mock_dl_doc
+    convert_mock.return_value = mock_response
+
+    monkeypatch.setattr(
+        "docling.document_converter.DocumentConverter.__init__",
+        lambda *args, **kwargs: None,
+    )
+    monkeypatch.setattr(
+        "docling.document_converter.DocumentConverter.convert",
+        convert_mock,  # use our mock that captures the filepath
+    )
+
+    def mock_extract_meta(self, dl_doc):
+        return {"custom_field": "test_value"}
+
+    monkeypatch.setattr(
+        "docling_haystack.converter.MetaExtractor.extract_dl_doc_meta",
+        mock_extract_meta,
+    )
+
+    converter = DoclingConverter(
+        export_type=ExportType.MARKDOWN,
+    )
+
+    # ByteStream directly in the paths parameter
+    result = converter.run(paths=[bytestream])
+    documents = result["documents"]
+
+    assert convert_mock.called
+    filepath_arg = convert_mock.call_args[1]["source"]  # keyword arguments of the last convert call
+    assert isinstance(filepath_arg, str)
+    assert filepath_arg.endswith(".md")  # suffix built from meta["file_extension"]
+
+    assert len(documents) > 0
+    assert documents[0].meta.get("custom_field") == "test_value"
+    assert len(documents[0].content) > 0