def split_text_into_sentences(text: str, language: str) -> list[str]:
    """
    Splits the given text into a list of sentences using NLTK's sentence tokenizer.

    Prefers the pretrained, language-aware punkt models. If NLTK cannot locate
    its punkt data for *language* (raised as LookupError), falls back to an
    untrained PunktSentenceTokenizer so sentence splitting still works instead
    of crashing with "punkt not found".

    Args:
        text (str): The input text to split into sentences.
        language (str): The language of the text for the sentence tokenizer.

    Returns:
        list[str]: A list of sentences.
    """
    try:
        # Preferred path: uses the downloaded, language-specific punkt models.
        return nltk.sent_tokenize(text, language=language)
    except LookupError:
        # Fallback when the punkt data cannot be found. An untrained
        # PunktSentenceTokenizer needs no downloaded resources.
        # NOTE(review): this path ignores `language` and relies on default
        # (English-biased) punctuation heuristics — acceptable as a
        # best-effort fallback, not as the primary path.
        from nltk.tokenize.punkt import PunktSentenceTokenizer
        return PunktSentenceTokenizer().tokenize(text)