21 changes: 19 additions & 2 deletions .env_example
@@ -1,4 +1,21 @@
OPENAI_API_KEY=
## Django settings
# In production, add your allowed hostnames here
DJANGO_ALLOWED_HOSTS=localhost,127.0.0.1
# Set to False in production
DJANGO_DEBUG=True
# Fill with a secret key
DJANGO_SECRET_KEY=some-secret-key-CHANGE-THIS-IN-PRODUCTION-41auGrAJ7X

# Path to the documents to be indexed
FRAG_RAW_DB=/path/to/your/actual/data
# Path to the database directory
FRAG_DB_DIRECTORY=/path/to/your/actual/database
# File types to be indexed. Only pdf and txt are supported
FRAG_FILE_TYPES=pdf,txt

DJANGO_KEY=
# Ollama host: use this if you are running Ollama on a remote server
# To use the local Ollama server, set it to localhost or comment it out
OLLAMA_HOST=http://ip.address.or.url:11434

# API keys
OPENAI_API_KEY=your_openai_api_key
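
These variables are read with python-dotenv elsewhere in this PR (config/config_loader.py calls load_dotenv, and cl-tools/chat.py reads FRAG_DB_DIRECTORY). A minimal sketch of that pattern, with illustrative fallback values:

```python
import os

from dotenv import load_dotenv

load_dotenv(".env")  # pull the key=value pairs above into the process environment

db_directory = os.environ.get("FRAG_DB_DIRECTORY")  # vector DB location
file_types = os.environ.get("FRAG_FILE_TYPES", "pdf,txt").split(",")  # ['pdf', 'txt']
# Fall back to a local Ollama server when OLLAMA_HOST is commented out
ollama_host = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
```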
9 changes: 9 additions & 0 deletions .gitignore
@@ -7,4 +7,13 @@ chroma_data
# Ignore Django migrations
**/migrations/*.py
!**/migrations/__init__.py
*.sock

# Space to store the database and logs
data/
logs/

# Ignore actual config files but keep examples
config/*.yaml
!config/*.example.yaml

24 changes: 18 additions & 6 deletions cl-tools/chat.py
@@ -1,21 +1,33 @@
import os
import sys
from dotenv import load_dotenv
from openai import OpenAI

# Add the parent directory to sys.path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

# Import configurations using the new config loader
from config.config_loader import get_embedding_config, get_llm_config
from retrieval.main import ChromaRetriever
from config.embedding_config import model_name, db_directory, collection_name

from llm.main import Responder, OpenAIResponder
from config.llm_config import llm_model, prompt, openai_model, use_openai

# Get configurations
embedding_config = get_embedding_config()
llm_config = get_llm_config()

# Extract embedding configuration values
model_name = embedding_config['model_name']
collection_name = embedding_config['collection_name']

load_dotenv(os.path.join(parent_dir, '.env'))
# Extract LLM configuration values
llm_model = llm_config['llm_model']
prompt = llm_config['prompt']
openai_model = llm_config['openai_model']
use_openai = llm_config['use_openai']
number_docs_response = llm_config['number_docs_response']

# Extract DB location from environment (defined in .env file)
db_directory = os.environ.get("FRAG_DB_DIRECTORY")

openai_client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
@@ -26,7 +38,7 @@ def main():
    retriever = ChromaRetriever(embedding_model=model_name,
                                db_path=db_directory,
                                db_collection=collection_name,
                                n_results=5)
                                n_results=number_docs_response)

    user_query = str(input("Ask a question. Type quit to exit: "))
    if user_query.lower() == "quit":
12 changes: 11 additions & 1 deletion cl-tools/search.py
@@ -6,9 +6,19 @@
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

# Import configurations using the new config loader
from config.config_loader import get_embedding_config
from retrieval.main import ChromaRetriever
from config.embedding_config import model_name, db_directory, collection_name

# Get the configuration
embedding_config = get_embedding_config()

# Extract configuration values
model_name = embedding_config['model_name']
collection_name = embedding_config['collection_name']

# Extract DB location from environment (defined in .env file)
db_directory = os.environ.get("FRAG_DB_DIRECTORY")

def create_argument_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description='Script to perform vectorDB semantic search')
65 changes: 65 additions & 0 deletions config/config_loader.py
@@ -0,0 +1,65 @@
import os
import yaml
import logging
from pathlib import Path
from dotenv import load_dotenv

root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
load_dotenv(os.path.join(root_dir, '.env')) # Load environment variables from .env file


class ConfigLoader:
    """
    Loads configuration from YAML files with environment variable override support.
    """

    def __init__(self):
        self.config_dir = Path(__file__).parent
        self._config_cache = {}

    def load_config(self, name):
        """
        Load configuration from a YAML file.

        Args:
            name: Name of the configuration file without extension (e.g., 'embedding_config')

        Returns:
            dict: Configuration as a dictionary
        """
        # If this file was already loaded, serve the version from the cache
        if name in self._config_cache:
            return self._config_cache[name]

        # Check if the config file exists and load it
        config_path = self.config_dir / f"{name}.yaml"
        if not config_path.exists():
            raise FileNotFoundError(f"Configuration file {config_path} not found")
        with open(config_path, 'r') as f:
            config = yaml.safe_load(f)

        # Process any path expansions like ~ for home directory
        self._process_paths(config)

        self._config_cache[name] = config
        return config

    def _process_paths(self, config):
        """Process any paths in the config to expand user paths."""
        for key, value in config.items():
            if isinstance(value, str) and '~' in value:
                config[key] = os.path.expanduser(value)
            elif isinstance(value, dict):
                self._process_paths(value)

# Create a singleton instance for use throughout the application
config_loader = ConfigLoader()

# Helper functions to easily access specific configs
def get_embedding_config():
    """Get the embedding configuration."""
    return config_loader.load_config('embedding_config')

def get_llm_config():
    """Get the LLM configuration."""
    return config_loader.load_config('llm_config')
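
A usage sketch for the loader, assuming the *.example.yaml files below have been copied to config/embedding_config.yaml and config/llm_config.yaml (the .gitignore rules above keep those real configs untracked):

```python
from config.config_loader import get_embedding_config, get_llm_config

embedding_config = get_embedding_config()  # parsed once, then served from the cache
print(embedding_config["model_name"])      # e.g. "Lajavaness/bilingual-embedding-large"

llm_config = get_llm_config()
print(llm_config["llm_model"])             # e.g. "llama3.1:8b"
```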
21 changes: 21 additions & 0 deletions config/embedding_config.example.yaml
@@ -0,0 +1,21 @@
# Embedding and vector database configuration

# Embedding model to use
model_name: "Lajavaness/bilingual-embedding-large"

# Vector database type
vector_db: "chromaDB" # Allowed Values ['chromaDB', 'FAISS']. Only ChromaDB works now

# Collection name in the vector database
collection_name: "my_collection"

# Language for the tokenizer
# Supported languages: czech, danish, dutch, english, estonian, finnish, french,
# german, greek, italian, norwegian, polish, portuguese, russian, slovene,
# spanish, swedish, turkish
data_language: "english"

# Number of sentences each chunk will contain
chunk_size: 20

# Number of sentences consecutive chunks share; must be less than chunk_size
overlap_size: 5
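
With chunk_size 20 and overlap_size 5, consecutive chunks share 5 sentences, so each window advances 15 sentences. A hypothetical sketch of that relationship; the repo's actual chunker may differ:

```python
# Hypothetical sentence-window chunking illustrating chunk_size / overlap_size
sentences = [f"sentence {i}" for i in range(50)]
chunk_size, overlap_size = 20, 5
step = chunk_size - overlap_size  # 15 new sentences per chunk
chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), step)]
print(len(chunks))  # 4 chunks, starting at sentences 0, 15, 30 and 45
```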
18 changes: 0 additions & 18 deletions config/embedding_config.py

This file was deleted.

31 changes: 31 additions & 0 deletions config/llm_config.example.yaml
@@ -0,0 +1,31 @@
# LLM configuration settings

# LLM model to use with Ollama
llm_model: "llama3.1:8b"

# Whether to use OpenAI (true) or Ollama (false)
use_openai: false

# OpenAI model to use if use_openai is true
openai_model: "gpt-4o"

# Number of documents to retrieve for generating a response
number_docs_response: 12

# Prompt template for the RAG system
prompt: |
  DOCUMENTS:

  {data}


  QUESTION:
  {query}


  INSTRUCTIONS:
  Answer the user's QUESTION using the DOCUMENTS text above.
  Keep your answer grounded in the facts of the DOCUMENTS.
  If the DOCUMENTS don't contain the facts to answer the QUESTION, return 'No answer found'.

record_data: true
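
The {data} and {query} placeholders suggest ordinary string formatting; whether the repo's Responder fills them with str.format is an assumption. A self-contained sketch:

```python
# Filling the RAG prompt template; str.format is an assumption about the Responder
prompt_template = (
    "DOCUMENTS:\n\n{data}\n\n\n"
    "QUESTION:\n{query}\n\n\n"
    "INSTRUCTIONS:\nAnswer the user's QUESTION using the DOCUMENTS text above."
)
retrieved_docs = ["First retrieved chunk.", "Second retrieved chunk."]  # illustrative
final_prompt = prompt_template.format(
    data="\n\n".join(retrieved_docs),
    query="What do the documents say?",
)
print(final_prompt)
```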
25 changes: 0 additions & 25 deletions config/llm_config.py

This file was deleted.
