Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 89 additions & 27 deletions workshops/2025_12_04/docling_lab_2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,56 +16,63 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"I0000 00:00:1764342422.086269 24419707 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2mAudited \u001b[1m10 packages\u001b[0m \u001b[2min 61ms\u001b[0m\u001b[0m\n"
"/usr/bin/sh: 1: uv: not found\n"
]
}
],
"source": [
"!uv pip install langchain-docling langchain-core langchain-huggingface sentence-transformers langchain_milvus \"pymilvus[milvus_lite]\" langchain-text-splitters langchain-classic langchain-openai python-dotenv"
"!uv pip install langchain-docling langchain-core langchain-huggingface sentence-transformers langchain_milvus \"pymilvus[milvus_lite]\" langchain-text-splitters langchain-classic langchain-openai python-dotenv langchain_ibm"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/dol/codes/docling-workshops/workshops/2025_12_04/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
"/usr/local/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import logging\n",
"import os\n",
"import requests\n",
"\n",
"from dotenv import load_dotenv\n",
"from langchain_core.prompts import PromptTemplate\n",
"\n",
"load_dotenv()\n",
"api_key = os.environ.get(\"WX_API_KEY\")\n",
"project_id = os.environ.get(\"WX_PROJECT_ID\")\n",
"\n",
"\n",
"logging.basicConfig(level=logging.ERROR)\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Confirm the .env values were loaded WITHOUT echoing the secrets into the saved notebook output\n",
"print(f\"WX_API_KEY set: {bool(api_key)}\")\n",
"print(f\"WX_PROJECT_ID set: {bool(project_id)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -121,21 +128,58 @@
"## RAG pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Setting the parameters to access the watsonx.ai model"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from langchain_ibm import ChatWatsonx\n",
"\n",
"model_id = \"ibm/granite-4-h-small\"\n",
"base_url = \"https://us-south.ml.cloud.ibm.com\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Building the RAG pipeline"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from langchain_classic.chains import create_retrieval_chain\n",
"from langchain_classic.chains.combine_documents import create_stuff_documents_chain\n",
"from langchain_openai import ChatOpenAI\n",
"from langchain_ibm import ChatWatsonx\n",
"\n",
"def clip_text(text, limit=100):\n",
"    \"\"\"Return `text` unchanged when it has at most `limit` characters; otherwise its first `limit` characters followed by '...'.\"\"\"\n",
"    if len(text) <= limit:\n",
"        return text\n",
"    return f\"{text[:limit]}...\"\n",
"\n",
"def do_rag(*, retriever, question, lm_model_id, lm_prompt, lm_base_url=\"http://localhost:1234/v1\", lm_api_key=\"none\"):\n",
" llm = ChatOpenAI(model=lm_model_id, base_url=lm_base_url, api_key=lm_api_key)\n",
" generation_params = {\n",
" \"temperature\": 0.7, # 0.0 (deterministic) to 1.0 (creative)\n",
" \"max_tokens\": 1000, # Maximum output length\n",
" \"top_p\": 0.9, # Nucleus sampling threshold\n",
" }\n",
" llm = ChatWatsonx(\n",
" model_id=model_id,\n",
" url=base_url,\n",
" project_id=project_id,\n",
" apikey=api_key,\n",
" params=generation_params # Pass the structured params\n",
" ) \n",
" question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=lm_prompt)\n",
" rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n",
" resp_dict = rag_chain.invoke({\"input\": question})\n",
Expand Down Expand Up @@ -166,20 +210,38 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Running an end-to-end example:\n"
"Running an end-to-end example in English:\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m[INFO] 2025-12-02 17:01:05,967 [RapidOCR] base.py:22: Using engine_name: torch\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:05,978 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:06,765 [RapidOCR] download_file.py:82: Download size: 13.83MB\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:07,396 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:07,399 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:07,823 [RapidOCR] base.py:22: Using engine_name: torch\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:07,824 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls_infer.pth\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:08,824 [RapidOCR] download_file.py:82: Download size: 0.56MB\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:08,881 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:08,883 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:08,947 [RapidOCR] base.py:22: Using engine_name: torch\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:08,947 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.pth\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:09,855 [RapidOCR] download_file.py:82: Download size: 25.67MB\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:10,924 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_rec_infer.pth\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:01:10,928 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_rec_infer.pth\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:02:31,866 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/resources/fonts/FZYTK.TTF\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:02:32,811 [RapidOCR] download_file.py:82: Download size: 3.09MB\u001b[0m\n",
"\u001b[32m[INFO] 2025-12-02 17:02:32,979 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/site-packages/rapidocr/models/FZYTK.TTF\u001b[0m\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (619 > 512). Running this sequence through the model will result in indexing errors\n",
"/Users/dol/codes/docling-workshops/workshops/2025_12_04/.venv/lib/python3.12/site-packages/milvus_lite/__init__.py:15: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n",
"/usr/local/lib/python3.12/site-packages/milvus_lite/__init__.py:15: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n",
" from pkg_resources import DistributionNotFound, get_distribution\n"
]
}
Expand All @@ -202,7 +264,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand All @@ -213,25 +275,25 @@
"Briefly name the main AI models used in Docling.\n",
"\n",
"Answer:\n",
"The primary AI models integrated into **Docling** are:\n",
"Based on the provided context, the two main AI models used in Docling are:\n",
"\n",
"1. **Layout Analysis Model** – an accurate objectdetector for identifying page elements such as headings, paragraphs, images, etc. \n",
"2. **TableFormer** – a state‑of‑theart table structure recognition model that extracts tables and their internal layout from documents.\n"
"1. Layout analysis model - an accurate object-detector for page elements\n",
"2. TableFormer - a state-of-the-art table structure recognition model\n"
]
}
],
"source": [
"rag_result = do_rag(\n",
" retriever=retriever,\n",
" question=\"Briefly name the main AI models used in Docling.\",\n",
" lm_model_id=\"openai/gpt-oss-20b\",\n",
" lm_model_id=\"ibm/granite-4-h-small\",\n",
" lm_prompt=PromptTemplate.from_template(\"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {input}\\nAnswer:\\n\"),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -270,7 +332,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "2025_12_04",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -284,9 +346,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}