From 6c8d6d094007cbcbe591f74cbcf6b4dc4e0ea566 Mon Sep 17 00:00:00 2001 From: dongho Date: Wed, 10 Dec 2025 12:46:17 +0000 Subject: [PATCH] Add ChromaDB implementation --- backend/chroma_client.py | 165 ++++++++++++++++++++++++++++++++ backend/requirements.txt | 1 + docker-compose.yml | 34 +++++-- news_crawler/chroma_client.py | 165 ++++++++++++++++++++++++++++++++ news_crawler/config.py | 5 + news_crawler/crawler_service.py | 20 ++++ news_crawler/requirements.txt | 1 + 7 files changed, 384 insertions(+), 7 deletions(-) create mode 100644 backend/chroma_client.py create mode 100644 news_crawler/chroma_client.py diff --git a/backend/chroma_client.py b/backend/chroma_client.py new file mode 100644 index 0000000..7476c0b --- /dev/null +++ b/backend/chroma_client.py @@ -0,0 +1,165 @@ +""" +ChromaDB Client for storing and retrieving document embeddings +""" +import chromadb +from chromadb.config import Settings +from chromadb.utils import embedding_functions +import logging +import os +import time + +class ChromaClient: + """ + Client for interacting with ChromaDB vector database. + Uses Ollama for generating embeddings if available, otherwise falls back to default. + """ + + def __init__(self, host, port, collection_name='munich_news_articles', ollama_base_url=None): + """ + Initialize ChromaDB client + + Args: + host: ChromaDB host (e.g. 'localhost' or 'chromadb') + port: ChromaDB port (default 8000) + collection_name: Name of the collection to use + ollama_base_url: Optional URL for Ollama embedding function + """ + self.host = host + self.port = port + self.collection_name = collection_name + self.client = None + self.collection = None + + # Setup embedding function + # We prefer using a local embedding model compatible with Ollama or SentenceTransformers + # For simplicity in this stack, we can use the default SentenceTransformer (all-MiniLM-L6-v2) + # which is downloaded automatically by chromadb utils. + # Alternatively, we could define a custom function using Ollama's /api/embeddings + self.embedding_function = embedding_functions.DefaultEmbeddingFunction() + + def connect(self): + """Establish connection to ChromaDB""" + try: + self.client = chromadb.HttpClient( + host=self.host, + port=self.port, + settings=Settings(allow_reset=True, anonymized_telemetry=False) + ) + + # Create or get collection + self.collection = self.client.get_or_create_collection( + name=self.collection_name, + embedding_function=self.embedding_function, + metadata={"hnsw:space": "cosine"} + ) + print(f"✓ Connected to ChromaDB at {self.host}:{self.port}") + return True + except Exception as e: + print(f"⚠ Could not connect to ChromaDB: {e}") + return False + + def add_articles(self, articles): + """ + Add articles to the vector database + + Args: + articles: List of dictionaries containing article data. + Must have 'link' (used as ID), 'title', 'content', etc. 
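+                      A hypothetical example item (field values illustrative only):
+                      {'link': 'https://example.com/article-1', 'title': 'Example title',
+                       'content': 'Full article text...', 'summary': 'Optional summary',
+                       'source': 'example.com', 'category': 'general'}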
+ """ + if not self.client or not self.collection: + if not self.connect(): + return False + + if not articles: + return True + + ids = [] + documents = [] + metadatas = [] + + for article in articles: + # Skip if critical data missing + if not article.get('link') or not article.get('content'): + continue + + # Use link as unique ID + article_id = article.get('link') + + # Prepare text for embedding (Title + Summary + Start of Content) + # This gives semantic search a good overview + title = article.get('title', '') + summary = article.get('summary') or '' + content_snippet = article.get('content', '')[:1000] + + text_to_embed = f"{title}\n\n{summary}\n\n{content_snippet}" + + # robust metadata (flat dict, no nested objects) + metadata = { + "title": title[:100], # Truncate for metadata limits + "url": article_id, + "source": article.get('source', 'unknown'), + "category": article.get('category', 'general'), + "published_at": str(article.get('published_at', '')), + "mongo_id": str(article.get('_id', '')) + } + + ids.append(article_id) + documents.append(text_to_embed) + metadatas.append(metadata) + + if not ids: + return True + + try: + self.collection.upsert( + ids=ids, + documents=documents, + metadatas=metadatas + ) + print(f"✓ Indexed {len(ids)} articles in ChromaDB") + return True + except Exception as e: + print(f"✗ Failed to index in ChromaDB: {e}") + return False + + def search(self, query_text, n_results=5, where=None): + """ + Search for relevant articles + + Args: + query_text: The search query + n_results: Number of results to return + where: Metadata filter dict (e.g. {"category": "sports"}) + """ + if not self.client or not self.collection: + if not self.connect(): + return [] + + try: + results = self.collection.query( + query_texts=[query_text], + n_results=n_results, + where=where + ) + + # Format results into a nice list of dicts + formatted_results = [] + if results and results['ids']: + for i, id in enumerate(results['ids'][0]): + item = { + 'id': id, + 'document': results['documents'][0][i] if results['documents'] else None, + 'metadata': results['metadatas'][0][i] if results['metadatas'] else {}, + 'distance': results['distances'][0][i] if results['distances'] else 0 + } + formatted_results.append(item) + + return formatted_results + except Exception as e: + print(f"✗ Search failed: {e}") + return [] + +if __name__ == "__main__": + # Test client + client = ChromaClient(host='localhost', port=8000) + client.connect() diff --git a/backend/requirements.txt b/backend/requirements.txt index 51643e0..c6033e9 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -7,3 +7,4 @@ requests==2.31.0 Jinja2==3.1.2 redis==5.0.1 +chromadb diff --git a/docker-compose.yml b/docker-compose.yml index 202e8fb..f1c8f97 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -38,7 +38,7 @@ services: # count: all # capabilities: [gpu] healthcheck: - test: ["CMD-SHELL", "ollama list || exit 1"] + test: [ "CMD-SHELL", "ollama list || exit 1" ] interval: 30s timeout: 10s retries: 3 @@ -72,7 +72,7 @@ services: networks: - munich-news-network healthcheck: - test: ["CMD", "redis-cli", "ping"] + test: [ "CMD", "redis-cli", "ping" ] interval: 30s timeout: 10s retries: 3 @@ -100,6 +100,24 @@ services: timeout: 10s retries: 3 + # ChromaDB - Vector Database for AI features + chromadb: + image: chromadb/chroma:latest + container_name: munich-news-chromadb + restart: unless-stopped + # No ports exposed - only accessible within Docker network + environment: + - IS_PERSISTENT=TRUE + 
volumes: + - chromadb_data:/chroma/chroma + networks: + - munich-news-network + healthcheck: + test: [ "CMD", "curl", "-f", "http://localhost:8000/api/v1/heartbeat" ] + interval: 30s + timeout: 10s + retries: 3 + # News Crawler - Runs at 6 AM Berlin time crawler: build: @@ -120,7 +138,7 @@ services: networks: - munich-news-network healthcheck: - test: ["CMD", "python", "-c", "import sys; sys.exit(0)"] + test: [ "CMD", "python", "-c", "import sys; sys.exit(0)" ] interval: 1m timeout: 10s retries: 3 @@ -149,7 +167,7 @@ services: - munich-news-network - proxy healthcheck: - test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"] + test: [ "CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')" ] interval: 30s timeout: 10s retries: 3 @@ -188,7 +206,7 @@ services: networks: - munich-news-network healthcheck: - test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')"] + test: [ "CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5002/health')" ] interval: 30s timeout: 10s retries: 3 @@ -214,7 +232,7 @@ services: networks: - munich-news-network healthcheck: - test: ["CMD", "python", "-c", "import sys; sys.exit(0)"] + test: [ "CMD", "python", "-c", "import sys; sys.exit(0)" ] interval: 1m timeout: 10s retries: 3 @@ -239,7 +257,7 @@ services: - munich-news-network - proxy healthcheck: - test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000"] + test: [ "CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000" ] interval: 30s timeout: 10s retries: 3 @@ -264,6 +282,8 @@ volumes: driver: local ollama_data: driver: local + chromadb_data: + driver: local networks: munich-news-network: diff --git a/news_crawler/chroma_client.py b/news_crawler/chroma_client.py new file mode 100644 index 0000000..7476c0b --- /dev/null +++ b/news_crawler/chroma_client.py @@ -0,0 +1,165 @@ +""" +ChromaDB Client for storing and retrieving document embeddings +""" +import chromadb +from chromadb.config import Settings +from chromadb.utils import embedding_functions +import logging +import os +import time + +class ChromaClient: + """ + Client for interacting with ChromaDB vector database. + Uses Ollama for generating embeddings if available, otherwise falls back to default. + """ + + def __init__(self, host, port, collection_name='munich_news_articles', ollama_base_url=None): + """ + Initialize ChromaDB client + + Args: + host: ChromaDB host (e.g. 'localhost' or 'chromadb') + port: ChromaDB port (default 8000) + collection_name: Name of the collection to use + ollama_base_url: Optional URL for Ollama embedding function + """ + self.host = host + self.port = port + self.collection_name = collection_name + self.client = None + self.collection = None + + # Setup embedding function + # We prefer using a local embedding model compatible with Ollama or SentenceTransformers + # For simplicity in this stack, we can use the default SentenceTransformer (all-MiniLM-L6-v2) + # which is downloaded automatically by chromadb utils. 
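+        # (The model weights are fetched on first use, so the very first upsert or
+        # query after a fresh start can be slow; the ollama_base_url argument is
+        # accepted but not wired up yet, so the default function is always used.)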
+ # Alternatively, we could define a custom function using Ollama's /api/embeddings + self.embedding_function = embedding_functions.DefaultEmbeddingFunction() + + def connect(self): + """Establish connection to ChromaDB""" + try: + self.client = chromadb.HttpClient( + host=self.host, + port=self.port, + settings=Settings(allow_reset=True, anonymized_telemetry=False) + ) + + # Create or get collection + self.collection = self.client.get_or_create_collection( + name=self.collection_name, + embedding_function=self.embedding_function, + metadata={"hnsw:space": "cosine"} + ) + print(f"✓ Connected to ChromaDB at {self.host}:{self.port}") + return True + except Exception as e: + print(f"⚠ Could not connect to ChromaDB: {e}") + return False + + def add_articles(self, articles): + """ + Add articles to the vector database + + Args: + articles: List of dictionaries containing article data. + Must have 'link' (used as ID), 'title', 'content', etc. + """ + if not self.client or not self.collection: + if not self.connect(): + return False + + if not articles: + return True + + ids = [] + documents = [] + metadatas = [] + + for article in articles: + # Skip if critical data missing + if not article.get('link') or not article.get('content'): + continue + + # Use link as unique ID + article_id = article.get('link') + + # Prepare text for embedding (Title + Summary + Start of Content) + # This gives semantic search a good overview + title = article.get('title', '') + summary = article.get('summary') or '' + content_snippet = article.get('content', '')[:1000] + + text_to_embed = f"{title}\n\n{summary}\n\n{content_snippet}" + + # robust metadata (flat dict, no nested objects) + metadata = { + "title": title[:100], # Truncate for metadata limits + "url": article_id, + "source": article.get('source', 'unknown'), + "category": article.get('category', 'general'), + "published_at": str(article.get('published_at', '')), + "mongo_id": str(article.get('_id', '')) + } + + ids.append(article_id) + documents.append(text_to_embed) + metadatas.append(metadata) + + if not ids: + return True + + try: + self.collection.upsert( + ids=ids, + documents=documents, + metadatas=metadatas + ) + print(f"✓ Indexed {len(ids)} articles in ChromaDB") + return True + except Exception as e: + print(f"✗ Failed to index in ChromaDB: {e}") + return False + + def search(self, query_text, n_results=5, where=None): + """ + Search for relevant articles + + Args: + query_text: The search query + n_results: Number of results to return + where: Metadata filter dict (e.g. 
{"category": "sports"}) + """ + if not self.client or not self.collection: + if not self.connect(): + return [] + + try: + results = self.collection.query( + query_texts=[query_text], + n_results=n_results, + where=where + ) + + # Format results into a nice list of dicts + formatted_results = [] + if results and results['ids']: + for i, id in enumerate(results['ids'][0]): + item = { + 'id': id, + 'document': results['documents'][0][i] if results['documents'] else None, + 'metadata': results['metadatas'][0][i] if results['metadatas'] else {}, + 'distance': results['distances'][0][i] if results['distances'] else 0 + } + formatted_results.append(item) + + return formatted_results + except Exception as e: + print(f"✗ Search failed: {e}") + return [] + +if __name__ == "__main__": + # Test client + client = ChromaClient(host='localhost', port=8000) + client.connect() diff --git a/news_crawler/config.py b/news_crawler/config.py index f9b6463..65f95d1 100644 --- a/news_crawler/config.py +++ b/news_crawler/config.py @@ -34,6 +34,11 @@ class Config: MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/') DB_NAME = 'munich_news' + # ChromaDB Configuration + CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb') + CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000')) + CHROMA_COLLECTION = 'munich_news_articles' + # Ollama Configuration OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434') OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest') diff --git a/news_crawler/crawler_service.py b/news_crawler/crawler_service.py index 1540a71..2f08c33 100644 --- a/news_crawler/crawler_service.py +++ b/news_crawler/crawler_service.py @@ -14,7 +14,9 @@ from rss_utils import extract_article_url, extract_article_summary, extract_publ from config import Config from ollama_client import OllamaClient from article_clustering import ArticleClusterer +from article_clustering import ArticleClusterer from cluster_summarizer import create_cluster_summaries +from chroma_client import ChromaClient # Load environment variables load_dotenv(dotenv_path='../.env') @@ -38,6 +40,13 @@ ollama_client = OllamaClient( # Initialize Article Clusterer (will be initialized after ollama_client) article_clusterer = None +# Initialize ChromaDB client +chroma_client = ChromaClient( + host=Config.CHROMA_HOST, + port=Config.CHROMA_PORT, + collection_name=Config.CHROMA_COLLECTION +) + # Print configuration on startup if __name__ != '__main__': Config.print_config() @@ -440,6 +449,17 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10 crawled_count += 1 print(f" ✓ Saved ({article_data.get('word_count', 0)} words)") + # Index in ChromaDB + try: + # Add mongo _id to article doc for reference + saved_article = articles_collection.find_one({'link': article_url}) + if saved_article: + article_doc['_id'] = str(saved_article['_id']) + chroma_client.add_articles([article_doc]) + except Exception as e: + print(f" ⚠ Failed to index in ChromaDB: {e}") + + except DuplicateKeyError: print(f" ⚠ Duplicate key error") except Exception as e: diff --git a/news_crawler/requirements.txt b/news_crawler/requirements.txt index 1c6e14f..eec975b 100644 --- a/news_crawler/requirements.txt +++ b/news_crawler/requirements.txt @@ -7,3 +7,4 @@ python-dotenv==1.0.0 schedule==1.2.0 pytz==2023.3 redis==5.0.1 +chromadb