def split_text_into_sentences(text: str, language: str) -> list[str]:
    """
    Splits the given text into a list of sentences using NLTK's sentence tokenizer.

    Prefers the pretrained, language-aware punkt models. If NLTK cannot locate
    its punkt data for *language* (raised as LookupError), falls back to an
    untrained PunktSentenceTokenizer so sentence splitting still works instead
    of crashing with "punkt not found".

    Args:
        text (str): The input text to split into sentences.
        language (str): The language of the text for the sentence tokenizer.

    Returns:
        list[str]: A list of sentences.
    """
    try:
        # Preferred path: uses the downloaded, language-specific punkt models.
        return nltk.sent_tokenize(text, language=language)
    except LookupError:
        # Fallback when the punkt data cannot be found. An untrained
        # PunktSentenceTokenizer needs no downloaded resources.
        # NOTE(review): this path ignores `language` and relies on default
        # (English-biased) punctuation heuristics — acceptable as a
        # best-effort fallback, not as the primary path.
        from nltk.tokenize.punkt import PunktSentenceTokenizer
        return PunktSentenceTokenizer().tokenize(text)