From 6c8d6d094007cbcbe591f74cbcf6b4dc4e0ea566 Mon Sep 17 00:00:00 2001 From: dongho Date: Wed, 10 Dec 2025 12:46:17 +0000 Subject: [PATCH] Add ChromaDB implementation --- backend/chroma_client.py | 165 ++++++++++++++++++++++++++++++++ backend/requirements.txt | 1 + docker-compose.yml | 34 +++++-- news_crawler/chroma_client.py | 165 ++++++++++++++++++++++++++++++++ news_crawler/config.py | 5 + news_crawler/crawler_service.py | 20 ++++ news_crawler/requirements.txt | 1 + 7 files changed, 384 insertions(+), 7 deletions(-) create mode 100644 backend/chroma_client.py create mode 100644 news_crawler/chroma_client.py diff --git a/backend/chroma_client.py b/backend/chroma_client.py new file mode 100644 index 0000000..7476c0b --- /dev/null +++ b/backend/chroma_client.py @@ -0,0 +1,165 @@ +""" +ChromaDB Client for storing and retrieving document embeddings +""" +import chromadb +from chromadb.config import Settings +from chromadb.utils import embedding_functions +import logging +import os +import time + +class ChromaClient: + """ + Client for interacting with ChromaDB vector database. + Uses Ollama for generating embeddings if available, otherwise falls back to default. + """ + + def __init__(self, host, port, collection_name='munich_news_articles', ollama_base_url=None): + """ + Initialize ChromaDB client + + Args: + host: ChromaDB host (e.g. 'localhost' or 'chromadb') + port: ChromaDB port (default 8000) + collection_name: Name of the collection to use + ollama_base_url: Optional URL for Ollama embedding function + """ + self.host = host + self.port = port + self.collection_name = collection_name + self.client = None + self.collection = None + + # Setup embedding function + # We prefer using a local embedding model compatible with Ollama or SentenceTransformers + # For simplicity in this stack, we can use the default SentenceTransformer (all-MiniLM-L6-v2) + # which is downloaded automatically by chromadb utils. + # Alternatively, we could define a custom function using Ollama's /api/embeddings + self.embedding_function = embedding_functions.DefaultEmbeddingFunction() + + def connect(self): + """Establish connection to ChromaDB""" + try: + self.client = chromadb.HttpClient( + host=self.host, + port=self.port, + settings=Settings(allow_reset=True, anonymized_telemetry=False) + ) + + # Create or get collection + self.collection = self.client.get_or_create_collection( + name=self.collection_name, + embedding_function=self.embedding_function, + metadata={"hnsw:space": "cosine"} + ) + print(f"✓ Connected to ChromaDB at {self.host}:{self.port}") + return True + except Exception as e: + print(f"⚠ Could not connect to ChromaDB: {e}") + return False + + def add_articles(self, articles): + """ + Add articles to the vector database + + Args: + articles: List of dictionaries containing article data. + Must have 'link' (used as ID), 'title', 'content', etc. 
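+                      A hypothetical example item (field values illustrative only):
+                      {'link': 'https://example.com/article-1', 'title': 'Example title',
+                       'content': 'Full article text...', 'summary': 'Optional summary',
+                       'source': 'example.com', 'category': 'general'}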
+ """ + if not self.client or not self.collection: + if not self.connect(): + return False + + if not articles: + return True + + ids = [] + documents = [] + metadatas = [] + + for article in articles: + # Skip if critical data missing + if not article.get('link') or not article.get('content'): + continue + + # Use link as unique ID + article_id = article.get('link') + + # Prepare text for embedding (Title + Summary + Start of Content) + # This gives semantic search a good overview + title = article.get('title', '') + summary = article.get('summary') or '' + content_snippet = article.get('content', '')[:1000] + + text_to_embed = f"{title}\n\n{summary}\n\n{content_snippet}" + + # robust metadata (flat dict, no nested objects) + metadata = { + "title": title[:100], # Truncate for metadata limits + "url": article_id, + "source": article.get('source', 'unknown'), + "category": article.get('category', 'general'), + "published_at": str(article.get('published_at', '')), + "mongo_id": str(article.get('_id', '')) + } + + ids.append(article_id) + documents.append(text_to_embed) + metadatas.append(metadata) + + if not ids: + return True + + try: + self.collection.upsert( + ids=ids, + documents=documents, + metadatas=metadatas + ) + print(f"✓ Indexed {len(ids)} articles in ChromaDB") + return True + except Exception as e: + print(f"✗ Failed to index in ChromaDB: {e}") + return False + + def search(self, query_text, n_results=5, where=None): + """ + Search for relevant articles + + Args: + query_text: The search query + n_results: Number of results to return + where: Metadata filter dict (e.g. {"category": "sports"}) + """ + if not self.client or not self.collection: + if not self.connect(): + return [] + + try: + results = self.collection.query( + query_texts=[query_text], + n_results=n_results, + where=where + ) + + # Format results into a nice list of dicts + formatted_results = [] + if results and results['ids']: + for i, id in enumerate(results['ids'][0]): + item = { + 'id': id, + 'document': results['documents'][0][i] if results['documents'] else None, + 'metadata': results['metadatas'][0][i] if results['metadatas'] else {}, + 'distance': results['distances'][0][i] if results['distances'] else 0 + } + formatted_results.append(item) + + return formatted_results + except Exception as e: + print(f"✗ Search failed: {e}") + return [] + +if __name__ == "__main__": + # Test client + client = ChromaClient(host='localhost', port=8000) + client.connect() diff --git a/backend/requirements.txt b/backend/requirements.txt index 51643e0..c6033e9 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -7,3 +7,4 @@ requests==2.31.0 Jinja2==3.1.2 redis==5.0.1 +chromadb diff --git a/docker-compose.yml b/docker-compose.yml index 202e8fb..f1c8f97 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -38,7 +38,7 @@ services: # count: all # capabilities: [gpu] healthcheck: - test: ["CMD-SHELL", "ollama list || exit 1"] + test: [ "CMD-SHELL", "ollama list || exit 1" ] interval: 30s timeout: 10s retries: 3 @@ -72,7 +72,7 @@ services: networks: - munich-news-network healthcheck: - test: ["CMD", "redis-cli", "ping"] + test: [ "CMD", "redis-cli", "ping" ] interval: 30s timeout: 10s retries: 3 @@ -100,6 +100,24 @@ services: timeout: 10s retries: 3 + # ChromaDB - Vector Database for AI features + chromadb: + image: chromadb/chroma:latest + container_name: munich-news-chromadb + restart: unless-stopped + # No ports exposed - only accessible within Docker network + environment: + - IS_PERSISTENT=TRUE + 
volumes: + - chromadb_data:/chroma/chroma + networks: + - munich-news-network + healthcheck: + test: [ "CMD", "curl", "-f", "http://localhost:8000/api/v1/heartbeat" ] + interval: 30s + timeout: 10s + retries: 3 + # News Crawler - Runs at 6 AM Berlin time crawler: build: @@ -120,7 +138,7 @@ services: networks: - munich-news-network healthcheck: - test: ["CMD", "python", "-c", "import sys; sys.exit(0)"] + test: [ "CMD", "python", "-c", "import sys; sys.exit(0)" ] interval: 1m timeout: 10s retries: 3 @@ -149,7 +167,7 @@ services: - munich-news-network - proxy healthcheck: - test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"] + test: [ "CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')" ] interval: 30s timeout: 10s retries: 3 @@ -188,7 +206,7 @@ services: networks: - munich-news-network healthcheck: - test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"] + test: [ "CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')" ] interval: 30s timeout: 10s retries: 3 @@ -214,7 +232,7 @@ services: networks: - munich-news-network healthcheck: - test: ["CMD", "python", "-c", "import sys; sys.exit(0)"] + test: [ "CMD", "python", "-c", "import sys; sys.exit(0)" ] interval: 1m timeout: 10s retries: 3 @@ -239,7 +257,7 @@ services: - munich-news-network - proxy healthcheck: - test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000"] + test: [ "CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000" ] interval: 30s timeout: 10s retries: 3 @@ -264,6 +282,8 @@ volumes: driver: local ollama_data: driver: local + chromadb_data: + driver: local networks: munich-news-network: diff --git a/news_crawler/chroma_client.py b/news_crawler/chroma_client.py new file mode 100644 index 0000000..7476c0b --- /dev/null +++ b/news_crawler/chroma_client.py @@ -0,0 +1,165 @@ +""" +ChromaDB Client for storing and retrieving document embeddings +""" +import chromadb +from chromadb.config import Settings +from chromadb.utils import embedding_functions +import logging +import os +import time + +class ChromaClient: + """ + Client for interacting with ChromaDB vector database. + Uses Ollama for generating embeddings if available, otherwise falls back to default. + """ + + def __init__(self, host, port, collection_name='munich_news_articles', ollama_base_url=None): + """ + Initialize ChromaDB client + + Args: + host: ChromaDB host (e.g. 'localhost' or 'chromadb') + port: ChromaDB port (default 8000) + collection_name: Name of the collection to use + ollama_base_url: Optional URL for Ollama embedding function + """ + self.host = host + self.port = port + self.collection_name = collection_name + self.client = None + self.collection = None + + # Setup embedding function + # We prefer using a local embedding model compatible with Ollama or SentenceTransformers + # For simplicity in this stack, we can use the default SentenceTransformer (all-MiniLM-L6-v2) + # which is downloaded automatically by chromadb utils. 
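+        # (The model weights are fetched on first use, so the very first upsert or
+        # query after a fresh start can be slow; the ollama_base_url argument is
+        # accepted but not wired up yet, so the default function is always used.)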
+ # Alternatively, we could define a custom function using Ollama's /api/embeddings + self.embedding_function = embedding_functions.DefaultEmbeddingFunction() + + def connect(self): + """Establish connection to ChromaDB""" + try: + self.client = chromadb.HttpClient( + host=self.host, + port=self.port, + settings=Settings(allow_reset=True, anonymized_telemetry=False) + ) + + # Create or get collection + self.collection = self.client.get_or_create_collection( + name=self.collection_name, + embedding_function=self.embedding_function, + metadata={"hnsw:space": "cosine"} + ) + print(f"✓ Connected to ChromaDB at {self.host}:{self.port}") + return True + except Exception as e: + print(f"⚠ Could not connect to ChromaDB: {e}") + return False + + def add_articles(self, articles): + """ + Add articles to the vector database + + Args: + articles: List of dictionaries containing article data. + Must have 'link' (used as ID), 'title', 'content', etc. + """ + if not self.client or not self.collection: + if not self.connect(): + return False + + if not articles: + return True + + ids = [] + documents = [] + metadatas = [] + + for article in articles: + # Skip if critical data missing + if not article.get('link') or not article.get('content'): + continue + + # Use link as unique ID + article_id = article.get('link') + + # Prepare text for embedding (Title + Summary + Start of Content) + # This gives semantic search a good overview + title = article.get('title', '') + summary = article.get('summary') or '' + content_snippet = article.get('content', '')[:1000] + + text_to_embed = f"{title}\n\n{summary}\n\n{content_snippet}" + + # robust metadata (flat dict, no nested objects) + metadata = { + "title": title[:100], # Truncate for metadata limits + "url": article_id, + "source": article.get('source', 'unknown'), + "category": article.get('category', 'general'), + "published_at": str(article.get('published_at', '')), + "mongo_id": str(article.get('_id', '')) + } + + ids.append(article_id) + documents.append(text_to_embed) + metadatas.append(metadata) + + if not ids: + return True + + try: + self.collection.upsert( + ids=ids, + documents=documents, + metadatas=metadatas + ) + print(f"✓ Indexed {len(ids)} articles in ChromaDB") + return True + except Exception as e: + print(f"✗ Failed to index in ChromaDB: {e}") + return False + + def search(self, query_text, n_results=5, where=None): + """ + Search for relevant articles + + Args: + query_text: The search query + n_results: Number of results to return + where: Metadata filter dict (e.g. 
{"category": "sports"}) + """ + if not self.client or not self.collection: + if not self.connect(): + return [] + + try: + results = self.collection.query( + query_texts=[query_text], + n_results=n_results, + where=where + ) + + # Format results into a nice list of dicts + formatted_results = [] + if results and results['ids']: + for i, id in enumerate(results['ids'][0]): + item = { + 'id': id, + 'document': results['documents'][0][i] if results['documents'] else None, + 'metadata': results['metadatas'][0][i] if results['metadatas'] else {}, + 'distance': results['distances'][0][i] if results['distances'] else 0 + } + formatted_results.append(item) + + return formatted_results + except Exception as e: + print(f"✗ Search failed: {e}") + return [] + +if __name__ == "__main__": + # Test client + client = ChromaClient(host='localhost', port=8000) + client.connect() diff --git a/news_crawler/config.py b/news_crawler/config.py index f9b6463..65f95d1 100644 --- a/news_crawler/config.py +++ b/news_crawler/config.py @@ -34,6 +34,11 @@ class Config: MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/') DB_NAME = 'munich_news' + # ChromaDB Configuration + CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb') + CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000')) + CHROMA_COLLECTION = 'munich_news_articles' + # Ollama Configuration OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434') OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest') diff --git a/news_crawler/crawler_service.py b/news_crawler/crawler_service.py index 1540a71..2f08c33 100644 --- a/news_crawler/crawler_service.py +++ b/news_crawler/crawler_service.py @@ -14,7 +14,9 @@ from rss_utils import extract_article_url, extract_article_summary, extract_publ from config import Config from ollama_client import OllamaClient from article_clustering import ArticleClusterer +from article_clustering import ArticleClusterer from cluster_summarizer import create_cluster_summaries +from chroma_client import ChromaClient # Load environment variables load_dotenv(dotenv_path='../.env') @@ -38,6 +40,13 @@ ollama_client = OllamaClient( # Initialize Article Clusterer (will be initialized after ollama_client) article_clusterer = None +# Initialize ChromaDB client +chroma_client = ChromaClient( + host=Config.CHROMA_HOST, + port=Config.CHROMA_PORT, + collection_name=Config.CHROMA_COLLECTION +) + # Print configuration on startup if __name__ != '__main__': Config.print_config() @@ -440,6 +449,17 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10 crawled_count += 1 print(f" ✓ Saved ({article_data.get('word_count', 0)} words)") + # Index in ChromaDB + try: + # Add mongo _id to article doc for reference + saved_article = articles_collection.find_one({'link': article_url}) + if saved_article: + article_doc['_id'] = str(saved_article['_id']) + chroma_client.add_articles([article_doc]) + except Exception as e: + print(f" ⚠ Failed to index in ChromaDB: {e}") + + except DuplicateKeyError: print(f" ⚠ Duplicate key error") except Exception as e: diff --git a/news_crawler/requirements.txt b/news_crawler/requirements.txt index 1c6e14f..eec975b 100644 --- a/news_crawler/requirements.txt +++ b/news_crawler/requirements.txt @@ -7,3 +7,4 @@ python-dotenv==1.0.0 schedule==1.2.0 pytz==2023.3 redis==5.0.1 +chromadb