Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion docling_haystack/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from docling.chunking import BaseChunk, BaseChunker, HybridChunker
from docling.datamodel.document import DoclingDocument
from docling.document_converter import DocumentConverter
from haystack import Document, component
from haystack import Document, component, default_from_dict, default_to_dict


class ExportType(str, Enum):
Expand Down Expand Up @@ -139,3 +139,29 @@ def run(
else:
raise RuntimeError(f"Unexpected export type: {self._export_type}")
return {"documents": documents}

def to_dict(self) -> dict[str, Any]:
    """
    Serialize the component to a dictionary for pipeline persistence.

    The conversion keyword arguments and the Markdown export keyword
    arguments stored on the instance are handed to ``default_to_dict`` as
    the component's init parameters.

    Returns:
        dict[str, Any]: The dictionary built by ``default_to_dict`` for
        this component.
    """
    # NOTE(review): only these two settings are forwarded; other instance
    # state (e.g. ``self._export_type``) is not part of the output, so it
    # would not survive a round trip -- confirm this is intended.
    init_params: dict[str, Any] = {
        "convert_kwargs": self._convert_kwargs,
        "md_export_kwargs": self._md_export_kwargs,
    }
    return default_to_dict(self, **init_params)

@classmethod
def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter":
    """
    Rebuild a component from its dictionary representation.

    Args:
        data: Dictionary representation of the component; it is passed
            unchanged to ``default_from_dict``.

    Returns:
        DoclingConverter: The object that ``default_from_dict`` produces
        for ``cls`` from ``data``.
    """
    restored = default_from_dict(cls, data)
    return restored
23 changes: 23 additions & 0 deletions test/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,26 @@ def test_convert_markdown(monkeypatch):
with open(EXPECTED_OUT_FILE) as f:
exp_data = json.load(fp=f)
assert exp_data == act_data


def test_serialization_deserialization():
    """Round-trip a converter through ``to_dict`` and ``from_dict``."""
    # Expected values are kept separate from the dicts handed to the
    # constructor so the assertions compare against independent objects.
    expected_convert_kwargs = {"optimize_ocr": True}
    expected_placeholder = "[IMAGE]"

    converter = DoclingConverter(
        convert_kwargs={"optimize_ocr": True},
        md_export_kwargs={"image_placeholder": "[IMAGE]"},
    )

    # --- component -> dict ---
    serialized = converter.to_dict()
    assert "init_parameters" in serialized

    init_params = serialized["init_parameters"]
    assert init_params.get("convert_kwargs") == expected_convert_kwargs

    serialized_md_kwargs = init_params.get("md_export_kwargs", {})
    assert serialized_md_kwargs.get("image_placeholder") == expected_placeholder

    # --- dict -> component ---
    deserialized = DoclingConverter.from_dict(serialized)
    assert deserialized._convert_kwargs == expected_convert_kwargs

    restored_md_kwargs = deserialized._md_export_kwargs
    assert restored_md_kwargs.get("image_placeholder") == expected_placeholder