update chromadb version

Add ChromaDB implementation
2025-12-10 12:46:47 +00:00 · 2025-12-10 12:46:17 +00:00
7 changed files with 384 additions and 7 deletions
--- a/backend/chroma_client.py
+++ b/backend/chroma_client.py
@@ -0,0 +1,165 @@
 """
 ChromaDB Client for storing and retrieving document embeddings
 """
 import chromadb
 from chromadb.config import Settings
 from chromadb.utils import embedding_functions
 import logging
 import os
 import time
 class ChromaClient:
    """
    Client for interacting with ChromaDB vector database.
    Uses Ollama for generating embeddings if available, otherwise falls back to default.
    """
    def __init__(self, host, port, collection_name='munich_news_articles', ollama_base_url=None):
        """
        Initialize ChromaDB client
        Args:
            host: ChromaDB host (e.g. 'localhost' or 'chromadb')
            port: ChromaDB port (default 8000)
            collection_name: Name of the collection to use
            ollama_base_url: Optional URL for Ollama embedding function
        """
        self.host = host
        self.port = port
        self.collection_name = collection_name
        self.client = None
        self.collection = None
        # Setup embedding function
        # We prefer using a local embedding model compatible with Ollama or SentenceTransformers
        # For simplicity in this stack, we can use the default SentenceTransformer (all-MiniLM-L6-v2)
        # which is downloaded automatically by chromadb utils.
        # Alternatively, we could define a custom function using Ollama's /api/embeddings
        self.embedding_function = embedding_functions.DefaultEmbeddingFunction()
    def connect(self):
        """Establish connection to ChromaDB"""
        try:
            self.client = chromadb.HttpClient(
                host=self.host,
                port=self.port,
                settings=Settings(allow_reset=True, anonymized_telemetry=False)
            )
            # Create or get collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                embedding_function=self.embedding_function,
                metadata={"hnsw:space": "cosine"}
            )
            print(f"✓ Connected to ChromaDB at {self.host}:{self.port}")
            return True
        except Exception as e:
            print(f"⚠ Could not connect to ChromaDB: {e}")
            return False
    def add_articles(self, articles):
        """
        Add articles to the vector database
        Args:
            articles: List of dictionaries containing article data.
                     Must have 'link' (used as ID), 'title', 'content', etc.
        """
        if not self.client or not self.collection:
            if not self.connect():
                return False
        if not articles:
            return True
        ids = []
        documents = []
        metadatas = []
        for article in articles:
            # Skip if critical data missing
            if not article.get('link') or not article.get('content'):
                continue
            # Use link as unique ID
            article_id = article.get('link')
            # Prepare text for embedding (Title + Summary + Start of Content)
            # This gives semantic search a good overview
            title = article.get('title', '')
            summary = article.get('summary') or ''
            content_snippet = article.get('content', '')[:1000]
            text_to_embed = f"{title}\n\n{summary}\n\n{content_snippet}"
            # robust metadata (flat dict, no nested objects)
            metadata = {
                "title": title[:100], # Truncate for metadata limits
                "url": article_id,
                "source": article.get('source', 'unknown'),
                "category": article.get('category', 'general'),
                "published_at": str(article.get('published_at', '')),
                "mongo_id": str(article.get('_id', ''))
            }
            ids.append(article_id)
            documents.append(text_to_embed)
            metadatas.append(metadata)
        if not ids:
            return True
        try:
            self.collection.upsert(
                ids=ids,
                documents=documents,
                metadatas=metadatas
            )
            print(f"✓ Indexed {len(ids)} articles in ChromaDB")
            return True
        except Exception as e:
            print(f"✗ Failed to index in ChromaDB: {e}")
            return False
    def search(self, query_text, n_results=5, where=None):
        """
        Search for relevant articles
        Args:
            query_text: The search query
            n_results: Number of results to return
            where: Metadata filter dict (e.g. {"category": "sports"})
        """
        if not self.client or not self.collection:
            if not self.connect():
                return []
        try:
            results = self.collection.query(
                query_texts=[query_text],
                n_results=n_results,
                where=where
            )
            # Format results into a nice list of dicts
            formatted_results = []
            if results and results['ids']:
                for i, id in enumerate(results['ids'][0]):
                    item = {
                        'id': id,
                        'document': results['documents'][0][i] if results['documents'] else None,
                        'metadata': results['metadatas'][0][i] if results['metadatas'] else {},
                        'distance': results['distances'][0][i] if results['distances'] else 0
                    }
                    formatted_results.append(item)
            return formatted_results
        except Exception as e:
            print(f"✗ Search failed: {e}")
            return []
 if __name__ == "__main__":
    # Manual smoke test: try to reach a ChromaDB server on localhost:8000.
    # connect() prints the outcome and returns False rather than raising.
    client = ChromaClient(host='localhost', port=8000)
    client.connect()
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -7,3 +7,4 @@ requests==2.31.0
 Jinja2==3.1.2
 redis==5.0.1
 chromadb>=0.4.0
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -100,6 +100,24 @@ services:
      timeout: 10s
      retries: 3
  # ChromaDB - Vector Database for AI features
  chromadb:
    # NOTE(review): `latest` is unpinned while the Python clients only require
    # chromadb>=0.4.0 - consider pinning both to matching versions so client
    # and server cannot drift apart.
    image: chromadb/chroma:latest
    container_name: munich-news-chromadb
    restart: unless-stopped
    # No ports exposed - only accessible within Docker network
    environment:
      - IS_PERSISTENT=TRUE
    volumes:
      # NOTE(review): assumes the image persists its data under /chroma/chroma -
      # confirm for the image version actually pulled.
      - chromadb_data:/chroma/chroma
    networks:
      - munich-news-network
    healthcheck:
      # NOTE(review): assumes `curl` is available inside the image and that the
      # server serves /api/v1/heartbeat on port 8000 - confirm both for the
      # pulled image version, otherwise the container is reported unhealthy.
      test: [ "CMD", "curl", "-f", "http://localhost:8000/api/v1/heartbeat" ]
      interval: 30s
      timeout: 10s
      retries: 3
  # News Crawler - Runs at 6 AM Berlin time
  crawler:
    build:
@@ -264,6 +282,8 @@ volumes:
    driver: local
  ollama_data:
    driver: local
  chromadb_data:
    driver: local
 networks:
  munich-news-network:
--- a/news_crawler/chroma_client.py
+++ b/news_crawler/chroma_client.py
@@ -0,0 +1,165 @@
 """
 ChromaDB Client for storing and retrieving document embeddings
 """
 import chromadb
 from chromadb.config import Settings
 from chromadb.utils import embedding_functions
 import logging
 import os
 import time
 class ChromaClient:
    """
    Client for interacting with ChromaDB vector database.
    Uses Ollama for generating embeddings if available, otherwise falls back to default.
    """
    def __init__(self, host, port, collection_name='munich_news_articles', ollama_base_url=None):
        """
        Initialize ChromaDB client
        Args:
            host: ChromaDB host (e.g. 'localhost' or 'chromadb')
            port: ChromaDB port (default 8000)
            collection_name: Name of the collection to use
            ollama_base_url: Optional URL for Ollama embedding function
        """
        self.host = host
        self.port = port
        self.collection_name = collection_name
        self.client = None
        self.collection = None
        # Setup embedding function
        # We prefer using a local embedding model compatible with Ollama or SentenceTransformers
        # For simplicity in this stack, we can use the default SentenceTransformer (all-MiniLM-L6-v2)
        # which is downloaded automatically by chromadb utils.
        # Alternatively, we could define a custom function using Ollama's /api/embeddings
        self.embedding_function = embedding_functions.DefaultEmbeddingFunction()
    def connect(self):
        """Establish connection to ChromaDB"""
        try:
            self.client = chromadb.HttpClient(
                host=self.host,
                port=self.port,
                settings=Settings(allow_reset=True, anonymized_telemetry=False)
            )
            # Create or get collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                embedding_function=self.embedding_function,
                metadata={"hnsw:space": "cosine"}
            )
            print(f"✓ Connected to ChromaDB at {self.host}:{self.port}")
            return True
        except Exception as e:
            print(f"⚠ Could not connect to ChromaDB: {e}")
            return False
    def add_articles(self, articles):
        """
        Add articles to the vector database
        Args:
            articles: List of dictionaries containing article data.
                     Must have 'link' (used as ID), 'title', 'content', etc.
        """
        if not self.client or not self.collection:
            if not self.connect():
                return False
        if not articles:
            return True
        ids = []
        documents = []
        metadatas = []
        for article in articles:
            # Skip if critical data missing
            if not article.get('link') or not article.get('content'):
                continue
            # Use link as unique ID
            article_id = article.get('link')
            # Prepare text for embedding (Title + Summary + Start of Content)
            # This gives semantic search a good overview
            title = article.get('title', '')
            summary = article.get('summary') or ''
            content_snippet = article.get('content', '')[:1000]
            text_to_embed = f"{title}\n\n{summary}\n\n{content_snippet}"
            # robust metadata (flat dict, no nested objects)
            metadata = {
                "title": title[:100], # Truncate for metadata limits
                "url": article_id,
                "source": article.get('source', 'unknown'),
                "category": article.get('category', 'general'),
                "published_at": str(article.get('published_at', '')),
                "mongo_id": str(article.get('_id', ''))
            }
            ids.append(article_id)
            documents.append(text_to_embed)
            metadatas.append(metadata)
        if not ids:
            return True
        try:
            self.collection.upsert(
                ids=ids,
                documents=documents,
                metadatas=metadatas
            )
            print(f"✓ Indexed {len(ids)} articles in ChromaDB")
            return True
        except Exception as e:
            print(f"✗ Failed to index in ChromaDB: {e}")
            return False
    def search(self, query_text, n_results=5, where=None):
        """
        Search for relevant articles
        Args:
            query_text: The search query
            n_results: Number of results to return
            where: Metadata filter dict (e.g. {"category": "sports"})
        """
        if not self.client or not self.collection:
            if not self.connect():
                return []
        try:
            results = self.collection.query(
                query_texts=[query_text],
                n_results=n_results,
                where=where
            )
            # Format results into a nice list of dicts
            formatted_results = []
            if results and results['ids']:
                for i, id in enumerate(results['ids'][0]):
                    item = {
                        'id': id,
                        'document': results['documents'][0][i] if results['documents'] else None,
                        'metadata': results['metadatas'][0][i] if results['metadatas'] else {},
                        'distance': results['distances'][0][i] if results['distances'] else 0
                    }
                    formatted_results.append(item)
            return formatted_results
        except Exception as e:
            print(f"✗ Search failed: {e}")
            return []
 if __name__ == "__main__":
    # Manual smoke test: try to reach a ChromaDB server on localhost:8000.
    # connect() prints the outcome and returns False rather than raising.
    client = ChromaClient(host='localhost', port=8000)
    client.connect()
--- a/news_crawler/config.py
+++ b/news_crawler/config.py
@@ -34,6 +34,11 @@ class Config:
    MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
    DB_NAME = 'munich_news'
    # ChromaDB Configuration
    CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb')
    CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000'))
    CHROMA_COLLECTION = 'munich_news_articles'
    # Ollama Configuration
    OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
    OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
--- a/news_crawler/crawler_service.py
+++ b/news_crawler/crawler_service.py
@@ -14,7 +14,9 @@ from rss_utils import extract_article_url, extract_article_summary, extract_publ
 from config import Config
 from ollama_client import OllamaClient
 from article_clustering import ArticleClusterer
 from article_clustering import ArticleClusterer
 from cluster_summarizer import create_cluster_summaries
 from chroma_client import ChromaClient
 # Load environment variables
 load_dotenv(dotenv_path='../.env')
@@ -38,6 +40,13 @@ ollama_client = OllamaClient(
 # Initialize Article Clusterer (will be initialized after ollama_client)
 article_clusterer = None
 # Initialize ChromaDB client
 chroma_client = ChromaClient(
    host=Config.CHROMA_HOST,
    port=Config.CHROMA_PORT,
    collection_name=Config.CHROMA_COLLECTION
 )
 # Print configuration on startup
 if __name__ != '__main__':
    Config.print_config()
@@ -440,6 +449,17 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
                    crawled_count += 1
                    print(f"   ✓ Saved ({article_data.get('word_count', 0)} words)")
                    # Index in ChromaDB
                    try:
                        # Add mongo _id to article doc for reference
                        saved_article = articles_collection.find_one({'link': article_url})
                        if saved_article:
                            article_doc['_id'] = str(saved_article['_id'])
                            chroma_client.add_articles([article_doc])
                    except Exception as e:
                        print(f"   ⚠ Failed to index in ChromaDB: {e}")
                except DuplicateKeyError:
                    print(f"   ⚠ Duplicate key error")
                except Exception as e:
--- a/news_crawler/requirements.txt
+++ b/news_crawler/requirements.txt
@@ -7,3 +7,4 @@ python-dotenv==1.0.0
 schedule==1.2.0
 pytz==2023.3
 redis==5.0.1
 chromadb>=0.4.0
Author	SHA1	Message	Date
dongho	50b9888004	update chromadb version	2025-12-10 12:46:47 +00:00
dongho	6c8d6d0940	Add ChromaDB implementation	2025-12-10 12:46:17 +00:00