diff --git a/workshops/2025_12_04/docling_lab_2.ipynb b/workshops/2025_12_04/docling_lab_2.ipynb index 1ee69ca..80f55ae 100644 --- a/workshops/2025_12_04/docling_lab_2.ipynb +++ b/workshops/2025_12_04/docling_lab_2.ipynb @@ -16,39 +16,31 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 1, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", - "I0000 00:00:1764342422.086269 24419707 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2mAudited \u001b[1m10 packages\u001b[0m \u001b[2min 61ms\u001b[0m\u001b[0m\n" + "/usr/bin/sh: 1: uv: not found\n" ] } ], "source": [ - "!uv pip install langchain-docling langchain-core langchain-huggingface sentence-transformers langchain_milvus \"pymilvus[milvus_lite]\" langchain-text-splitters langchain-classic langchain-openai python-dotenv" + "!uv pip install langchain-docling langchain-core langchain-huggingface sentence-transformers langchain_milvus \"pymilvus[milvus_lite]\" langchain-text-splitters langchain-classic langchain-openai python-dotenv langchain_ibm" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/Users/dol/codes/docling-workshops/workshops/2025_12_04/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + "/usr/local/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } @@ -56,16 +48,31 @@ "source": [ "import logging\n", "import os\n", + "import requests\n", "\n", "from dotenv import load_dotenv\n", "from langchain_core.prompts import PromptTemplate\n", "\n", "load_dotenv()\n", + "api_key = os.environ.get(\"WX_API_KEY\")\n", + "project_id = os.environ.get(\"WX_PROJECT_ID\")\n", + "\n", "\n", "logging.basicConfig(level=logging.ERROR)\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# to check that your .env file has been read correctly\n", + "print(api_key)\n", + "print(project_id)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -121,21 +128,58 @@ "## RAG pipeline" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### setting the parameters to access the wx.ai model" + ] + }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], + "source": [ + "from langchain_ibm import ChatWatsonx\n", + "\n", + "model_id = \"ibm/granite-4-h-small\"\n", + "base_url = \"https://us-south.ml.cloud.ibm.com\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### building the RAG pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], "source": [ "from langchain_classic.chains import create_retrieval_chain\n", "from langchain_classic.chains.combine_documents import create_stuff_documents_chain\n", - "from langchain_openai import ChatOpenAI\n", + "from langchain_ibm import ChatWatsonx\n", "\n", "def clip_text(text, limit=100):\n", " return f\"{text[:limit]}...\" if len(text) > limit else text\n", "\n", "def do_rag(*, retriever, question, lm_model_id, lm_prompt, lm_base_url=\"http://localhost:1234/v1\", lm_api_key=\"none\"):\n", - " llm = ChatOpenAI(model=lm_model_id, base_url=lm_base_url, api_key=lm_api_key)\n", + " generation_params = {\n", + " \"temperature\": 0.7, # 0.0 (deterministic) to 1.0 (creative)\n", + " \"max_tokens\": 1000, # Maximum output length\n", + " \"top_p\": 0.9, # Nucleus sampling threshold\n", + " }\n", + " llm = ChatWatsonx(\n", + " model_id=model_id,\n", + " url=base_url,\n", + " project_id=project_id,\n", + " apikey=api_key,\n", + " params=generation_params # Pass the structured params\n", + " ) \n", " question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=lm_prompt)\n", " rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n", " resp_dict = rag_chain.invoke({\"input\": question})\n", @@ -166,20 +210,38 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Running an end-to-end example:\n" + "Running an end-to-end example in English:\n" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ + "\u001b[32m[INFO] 2025-12-02 17:01:05,967 [RapidOCR] base.py:22: Using engine_name: torch\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:05,978 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:06,765 [RapidOCR] download_file.py:82: Download size: 13.83MB\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:07,396 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:07,399 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:07,823 [RapidOCR] base.py:22: Using engine_name: torch\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:07,824 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls_infer.pth\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:08,824 [RapidOCR] download_file.py:82: Download size: 0.56MB\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:08,881 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:08,883 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:08,947 [RapidOCR] base.py:22: Using engine_name: torch\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:08,947 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.pth\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:09,855 [RapidOCR] download_file.py:82: Download size: 25.67MB\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:10,924 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_rec_infer.pth\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:01:10,928 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_rec_infer.pth\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:02:31,866 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/resources/fonts/FZYTK.TTF\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:02:32,811 [RapidOCR] download_file.py:82: Download size: 3.09MB\u001b[0m\n", + "\u001b[32m[INFO] 2025-12-02 17:02:32,979 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/site-packages/rapidocr/models/FZYTK.TTF\u001b[0m\n", "Token indices sequence length is longer than the specified maximum sequence length for this model (619 > 512). Running this sequence through the model will result in indexing errors\n", - "/Users/dol/codes/docling-workshops/workshops/2025_12_04/.venv/lib/python3.12/site-packages/milvus_lite/__init__.py:15: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n", + "/usr/local/lib/python3.12/site-packages/milvus_lite/__init__.py:15: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n", " from pkg_resources import DistributionNotFound, get_distribution\n" ] } @@ -202,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -213,10 +275,10 @@ "Briefly name the main AI models used in Docling.\n", "\n", "Answer:\n", - "The primary AI models integrated into **Docling** are:\n", + "Based on the provided context, the two main AI models used in Docling are:\n", "\n", - "1. **Layout Analysis Model** – an accurate object‑detector for identifying page elements such as headings, paragraphs, images, etc. \n", - "2. **TableFormer** – a state‑of‑the‑art table structure recognition model that extracts tables and their internal layout from documents.\n" + "1. Layout analysis model - an accurate object-detector for page elements\n", + "2. TableFormer - a state-of-the-art table structure recognition model\n" ] } ], @@ -224,14 +286,14 @@ "rag_result = do_rag(\n", " retriever=retriever,\n", " question=\"Briefly name the main AI models used in Docling.\",\n", - " lm_model_id=\"openai/gpt-oss-20b\",\n", + " lm_model_id=\"ibm/granite-4-h-small\",\n", " lm_prompt=PromptTemplate.from_template(\"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {input}\\nAnswer:\\n\"),\n", ")" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -270,7 +332,7 @@ ], "metadata": { "kernelspec": { - "display_name": "2025_12_04", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -284,9 +346,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.9" + "version": "3.12.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }