Compare commits
2 Commits
57f37c8dc0
...
50b9888004
| Author | SHA1 | Date | |
|---|---|---|---|
| 50b9888004 | |||
| 6c8d6d0940 |
165
backend/chroma_client.py
Normal file
165
backend/chroma_client.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
"""
|
||||||
|
ChromaDB Client for storing and retrieving document embeddings
|
||||||
|
"""
|
||||||
|
import chromadb
|
||||||
|
from chromadb.config import Settings
|
||||||
|
from chromadb.utils import embedding_functions
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
|
class ChromaClient:
    """
    Client for storing and searching article embeddings in ChromaDB over HTTP.

    Embeddings are generated by chromadb's default embedding function.
    The ``ollama_base_url`` argument is accepted so callers can already pass
    it, but it is currently not used by this class.
    """

    def __init__(self, host, port, collection_name='munich_news_articles', ollama_base_url=None):
        """
        Initialize the client without opening a connection.

        The connection is opened by connect(), which add_articles() and
        search() call on demand when no collection is available yet.

        Args:
            host: ChromaDB host (e.g. 'localhost' or 'chromadb')
            port: ChromaDB port (e.g. 8000)
            collection_name: Name of the collection to use
            ollama_base_url: Optional URL for an Ollama embedding function
                (currently ignored; the default embedding function is used)
        """
        self.host = host
        self.port = port
        self.collection_name = collection_name
        self.client = None
        self.collection = None

        # Setup embedding function
        # For simplicity in this stack we use chromadb's default embedding
        # function (SentenceTransformer all-MiniLM-L6-v2 per the original
        # author's note). A custom function backed by Ollama's
        # /api/embeddings could be plugged in here later.
        self.embedding_function = embedding_functions.DefaultEmbeddingFunction()

    def connect(self):
        """
        Establish the connection to ChromaDB and get or create the collection.

        Returns:
            True on success, False if the client or collection could not be
            set up (the error is printed, not raised).
        """
        try:
            self.client = chromadb.HttpClient(
                host=self.host,
                port=self.port,
                settings=Settings(allow_reset=True, anonymized_telemetry=False)
            )

            # Create or get collection (cosine distance for the HNSW index)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                embedding_function=self.embedding_function,
                metadata={"hnsw:space": "cosine"}
            )
            print(f"✓ Connected to ChromaDB at {self.host}:{self.port}")
            return True
        except Exception as e:
            print(f"⚠ Could not connect to ChromaDB: {e}")
            return False

    def _ensure_connected(self):
        """Return True if a collection is available, connecting first if needed."""
        if self.client is not None and self.collection is not None:
            return True
        return self.connect()

    @staticmethod
    def _build_record(article):
        """
        Turn one article dict into an (id, document, metadata) triple.

        Returns None when the article lacks a 'link' or 'content' value.
        """
        article_id = article.get('link')
        content = article.get('content')
        if not article_id or not content:
            return None

        # Keys may be present with a None value, so coerce with `or`
        # instead of relying on dict.get() defaults.
        title = article.get('title') or ''
        summary = article.get('summary') or ''
        content_snippet = content[:1000]

        # Text for embedding: Title + Summary + Start of Content.
        # This gives semantic search a good overview of the article.
        text_to_embed = f"{title}\n\n{summary}\n\n{content_snippet}"

        # Flat metadata only (no nested objects, no None values).
        metadata = {
            "title": title[:100],  # Truncate for metadata limits
            "url": article_id,
            "source": article.get('source') or 'unknown',
            "category": article.get('category') or 'general',
            "published_at": str(article.get('published_at', '')),
            "mongo_id": str(article.get('_id', ''))
        }
        return article_id, text_to_embed, metadata

    def add_articles(self, articles):
        """
        Add (upsert) articles to the vector database.

        Articles without a 'link' or 'content' are skipped. If the same link
        occurs more than once in the batch, only the last occurrence is sent,
        because a single upsert call is rejected when it contains duplicate IDs.

        Args:
            articles: List of dictionaries containing article data.
                     Must have 'link' (used as ID) and 'content'; 'title',
                     'summary', 'source', 'category', 'published_at' and
                     '_id' are used when present.

        Returns:
            True if the articles were indexed or there was nothing to index,
            False if the connection or the upsert failed.
        """
        if not self._ensure_connected():
            return False

        if not articles:
            return True

        # Keyed by ID so duplicates within the batch collapse (last one wins)
        # while first-seen order is kept.
        records = {}
        for article in articles:
            record = self._build_record(article)
            if record is None:
                continue
            article_id, document, metadata = record
            records[article_id] = (document, metadata)

        if not records:
            return True

        ids = list(records)
        documents = [records[article_id][0] for article_id in ids]
        metadatas = [records[article_id][1] for article_id in ids]

        try:
            self.collection.upsert(
                ids=ids,
                documents=documents,
                metadatas=metadatas
            )
            print(f"✓ Indexed {len(ids)} articles in ChromaDB")
            return True
        except Exception as e:
            print(f"✗ Failed to index in ChromaDB: {e}")
            return False

    def search(self, query_text, n_results=5, where=None):
        """
        Search for relevant articles.

        Args:
            query_text: The search query
            n_results: Number of results to return
            where: Metadata filter dict (e.g. {"category": "sports"});
                None or an empty dict means no filter

        Returns:
            List of dicts with keys 'id', 'document', 'metadata' and
            'distance', in the order returned by ChromaDB. An empty list is
            returned when the connection or the query fails.
        """
        if not self._ensure_connected():
            return []

        try:
            results = self.collection.query(
                query_texts=[query_text],
                n_results=n_results,
                # An empty dict is not a valid filter; treat it as "no filter".
                where=where or None
            )

            # Format results into a list of dicts. Chroma returns one inner
            # list per query text; only one query text is sent, so index 0.
            formatted_results = []
            if results and results['ids']:
                documents = results['documents']
                metadatas = results['metadatas']
                distances = results['distances']
                for i, doc_id in enumerate(results['ids'][0]):
                    formatted_results.append({
                        'id': doc_id,
                        'document': documents[0][i] if documents else None,
                        'metadata': metadatas[0][i] if metadatas else {},
                        'distance': distances[0][i] if distances else 0
                    })

            return formatted_results
        except Exception as e:
            print(f"✗ Search failed: {e}")
            return []
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: try to reach a ChromaDB server on localhost:8000.
    # connect() prints the outcome itself, so its return value is not used.
    client = ChromaClient("localhost", 8000)
    client.connect()
|
||||||
@@ -7,3 +7,4 @@ requests==2.31.0
|
|||||||
Jinja2==3.1.2
|
Jinja2==3.1.2
|
||||||
redis==5.0.1
|
redis==5.0.1
|
||||||
|
|
||||||
|
chromadb>=0.4.0
|
||||||
|
|||||||
@@ -100,6 +100,24 @@ services:
|
|||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 3
|
||||||
|
|
||||||
|
# ChromaDB - Vector Database for AI features
|
||||||
|
chromadb:
|
||||||
|
image: chromadb/chroma:latest
|
||||||
|
container_name: munich-news-chromadb
|
||||||
|
restart: unless-stopped
|
||||||
|
# No ports exposed - only accessible within Docker network
|
||||||
|
environment:
|
||||||
|
- IS_PERSISTENT=TRUE
|
||||||
|
volumes:
|
||||||
|
- chromadb_data:/chroma/chroma
|
||||||
|
networks:
|
||||||
|
- munich-news-network
|
||||||
|
healthcheck:
|
||||||
|
test: [ "CMD", "curl", "-f", "http://localhost:8000/api/v1/heartbeat" ]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
# News Crawler - Runs at 6 AM Berlin time
|
# News Crawler - Runs at 6 AM Berlin time
|
||||||
crawler:
|
crawler:
|
||||||
build:
|
build:
|
||||||
@@ -264,6 +282,8 @@ volumes:
|
|||||||
driver: local
|
driver: local
|
||||||
ollama_data:
|
ollama_data:
|
||||||
driver: local
|
driver: local
|
||||||
|
chromadb_data:
|
||||||
|
driver: local
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
munich-news-network:
|
munich-news-network:
|
||||||
|
|||||||
165
news_crawler/chroma_client.py
Normal file
165
news_crawler/chroma_client.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
"""
|
||||||
|
ChromaDB Client for storing and retrieving document embeddings
|
||||||
|
"""
|
||||||
|
import chromadb
|
||||||
|
from chromadb.config import Settings
|
||||||
|
from chromadb.utils import embedding_functions
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
|
class ChromaClient:
    """
    Client for storing and searching article embeddings in ChromaDB over HTTP.

    Embeddings are generated by chromadb's default embedding function.
    The ``ollama_base_url`` argument is accepted so callers can already pass
    it, but it is currently not used by this class.
    """

    def __init__(self, host, port, collection_name='munich_news_articles', ollama_base_url=None):
        """
        Initialize the client without opening a connection.

        The connection is opened by connect(), which add_articles() and
        search() call on demand when no collection is available yet.

        Args:
            host: ChromaDB host (e.g. 'localhost' or 'chromadb')
            port: ChromaDB port (e.g. 8000)
            collection_name: Name of the collection to use
            ollama_base_url: Optional URL for an Ollama embedding function
                (currently ignored; the default embedding function is used)
        """
        self.host = host
        self.port = port
        self.collection_name = collection_name
        self.client = None
        self.collection = None

        # Setup embedding function
        # For simplicity in this stack we use chromadb's default embedding
        # function (SentenceTransformer all-MiniLM-L6-v2 per the original
        # author's note). A custom function backed by Ollama's
        # /api/embeddings could be plugged in here later.
        self.embedding_function = embedding_functions.DefaultEmbeddingFunction()

    def connect(self):
        """
        Establish the connection to ChromaDB and get or create the collection.

        Returns:
            True on success, False if the client or collection could not be
            set up (the error is printed, not raised).
        """
        try:
            self.client = chromadb.HttpClient(
                host=self.host,
                port=self.port,
                settings=Settings(allow_reset=True, anonymized_telemetry=False)
            )

            # Create or get collection (cosine distance for the HNSW index)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                embedding_function=self.embedding_function,
                metadata={"hnsw:space": "cosine"}
            )
            print(f"✓ Connected to ChromaDB at {self.host}:{self.port}")
            return True
        except Exception as e:
            print(f"⚠ Could not connect to ChromaDB: {e}")
            return False

    def _ensure_connected(self):
        """Return True if a collection is available, connecting first if needed."""
        if self.client is not None and self.collection is not None:
            return True
        return self.connect()

    @staticmethod
    def _build_record(article):
        """
        Turn one article dict into an (id, document, metadata) triple.

        Returns None when the article lacks a 'link' or 'content' value.
        """
        article_id = article.get('link')
        content = article.get('content')
        if not article_id or not content:
            return None

        # Keys may be present with a None value, so coerce with `or`
        # instead of relying on dict.get() defaults.
        title = article.get('title') or ''
        summary = article.get('summary') or ''
        content_snippet = content[:1000]

        # Text for embedding: Title + Summary + Start of Content.
        # This gives semantic search a good overview of the article.
        text_to_embed = f"{title}\n\n{summary}\n\n{content_snippet}"

        # Flat metadata only (no nested objects, no None values).
        metadata = {
            "title": title[:100],  # Truncate for metadata limits
            "url": article_id,
            "source": article.get('source') or 'unknown',
            "category": article.get('category') or 'general',
            "published_at": str(article.get('published_at', '')),
            "mongo_id": str(article.get('_id', ''))
        }
        return article_id, text_to_embed, metadata

    def add_articles(self, articles):
        """
        Add (upsert) articles to the vector database.

        Articles without a 'link' or 'content' are skipped. If the same link
        occurs more than once in the batch, only the last occurrence is sent,
        because a single upsert call is rejected when it contains duplicate IDs.

        Args:
            articles: List of dictionaries containing article data.
                     Must have 'link' (used as ID) and 'content'; 'title',
                     'summary', 'source', 'category', 'published_at' and
                     '_id' are used when present.

        Returns:
            True if the articles were indexed or there was nothing to index,
            False if the connection or the upsert failed.
        """
        if not self._ensure_connected():
            return False

        if not articles:
            return True

        # Keyed by ID so duplicates within the batch collapse (last one wins)
        # while first-seen order is kept.
        records = {}
        for article in articles:
            record = self._build_record(article)
            if record is None:
                continue
            article_id, document, metadata = record
            records[article_id] = (document, metadata)

        if not records:
            return True

        ids = list(records)
        documents = [records[article_id][0] for article_id in ids]
        metadatas = [records[article_id][1] for article_id in ids]

        try:
            self.collection.upsert(
                ids=ids,
                documents=documents,
                metadatas=metadatas
            )
            print(f"✓ Indexed {len(ids)} articles in ChromaDB")
            return True
        except Exception as e:
            print(f"✗ Failed to index in ChromaDB: {e}")
            return False

    def search(self, query_text, n_results=5, where=None):
        """
        Search for relevant articles.

        Args:
            query_text: The search query
            n_results: Number of results to return
            where: Metadata filter dict (e.g. {"category": "sports"});
                None or an empty dict means no filter

        Returns:
            List of dicts with keys 'id', 'document', 'metadata' and
            'distance', in the order returned by ChromaDB. An empty list is
            returned when the connection or the query fails.
        """
        if not self._ensure_connected():
            return []

        try:
            results = self.collection.query(
                query_texts=[query_text],
                n_results=n_results,
                # An empty dict is not a valid filter; treat it as "no filter".
                where=where or None
            )

            # Format results into a list of dicts. Chroma returns one inner
            # list per query text; only one query text is sent, so index 0.
            formatted_results = []
            if results and results['ids']:
                documents = results['documents']
                metadatas = results['metadatas']
                distances = results['distances']
                for i, doc_id in enumerate(results['ids'][0]):
                    formatted_results.append({
                        'id': doc_id,
                        'document': documents[0][i] if documents else None,
                        'metadata': metadatas[0][i] if metadatas else {},
                        'distance': distances[0][i] if distances else 0
                    })

            return formatted_results
        except Exception as e:
            print(f"✗ Search failed: {e}")
            return []
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: try to reach a ChromaDB server on localhost:8000.
    # connect() prints the outcome itself, so its return value is not used.
    client = ChromaClient("localhost", 8000)
    client.connect()
|
||||||
@@ -34,6 +34,11 @@ class Config:
|
|||||||
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
|
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
|
||||||
DB_NAME = 'munich_news'
|
DB_NAME = 'munich_news'
|
||||||
|
|
||||||
|
# ChromaDB Configuration
|
||||||
|
CHROMA_HOST = os.getenv('CHROMA_HOST', 'chromadb')
|
||||||
|
CHROMA_PORT = int(os.getenv('CHROMA_PORT', '8000'))
|
||||||
|
CHROMA_COLLECTION = 'munich_news_articles'
|
||||||
|
|
||||||
# Ollama Configuration
|
# Ollama Configuration
|
||||||
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
|
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
|
||||||
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
|
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
|
||||||
|
|||||||
@@ -14,7 +14,9 @@ from rss_utils import extract_article_url, extract_article_summary, extract_publ
|
|||||||
from config import Config
|
from config import Config
|
||||||
from ollama_client import OllamaClient
|
from ollama_client import OllamaClient
|
||||||
from article_clustering import ArticleClusterer
|
from article_clustering import ArticleClusterer
|
||||||
|
from article_clustering import ArticleClusterer
|
||||||
from cluster_summarizer import create_cluster_summaries
|
from cluster_summarizer import create_cluster_summaries
|
||||||
|
from chroma_client import ChromaClient
|
||||||
|
|
||||||
# Load environment variables
|
# Load environment variables
|
||||||
load_dotenv(dotenv_path='../.env')
|
load_dotenv(dotenv_path='../.env')
|
||||||
@@ -38,6 +40,13 @@ ollama_client = OllamaClient(
|
|||||||
# Initialize Article Clusterer (will be initialized after ollama_client)
|
# Initialize Article Clusterer (will be initialized after ollama_client)
|
||||||
article_clusterer = None
|
article_clusterer = None
|
||||||
|
|
||||||
|
# Initialize ChromaDB client
|
||||||
|
chroma_client = ChromaClient(
|
||||||
|
host=Config.CHROMA_HOST,
|
||||||
|
port=Config.CHROMA_PORT,
|
||||||
|
collection_name=Config.CHROMA_COLLECTION
|
||||||
|
)
|
||||||
|
|
||||||
# Print configuration on startup
|
# Print configuration on startup
|
||||||
if __name__ != '__main__':
|
if __name__ != '__main__':
|
||||||
Config.print_config()
|
Config.print_config()
|
||||||
@@ -440,6 +449,17 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
|
|||||||
crawled_count += 1
|
crawled_count += 1
|
||||||
print(f" ✓ Saved ({article_data.get('word_count', 0)} words)")
|
print(f" ✓ Saved ({article_data.get('word_count', 0)} words)")
|
||||||
|
|
||||||
|
# Index in ChromaDB
|
||||||
|
try:
|
||||||
|
# Add mongo _id to article doc for reference
|
||||||
|
saved_article = articles_collection.find_one({'link': article_url})
|
||||||
|
if saved_article:
|
||||||
|
article_doc['_id'] = str(saved_article['_id'])
|
||||||
|
chroma_client.add_articles([article_doc])
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ⚠ Failed to index in ChromaDB: {e}")
|
||||||
|
|
||||||
|
|
||||||
except DuplicateKeyError:
|
except DuplicateKeyError:
|
||||||
print(f" ⚠ Duplicate key error")
|
print(f" ⚠ Duplicate key error")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -7,3 +7,4 @@ python-dotenv==1.0.0
|
|||||||
schedule==1.2.0
|
schedule==1.2.0
|
||||||
pytz==2023.3
|
pytz==2023.3
|
||||||
redis==5.0.1
|
redis==5.0.1
|
||||||
|
chromadb>=0.4.0
|
||||||
|
|||||||
Reference in New Issue
Block a user