Add ChromaDB implementation

This commit is contained in:
2025-12-10 12:46:17 +00:00
parent 57f37c8dc0
commit 6c8d6d0940
7 changed files with 384 additions and 7 deletions

View File

@@ -14,7 +14,9 @@ from rss_utils import extract_article_url, extract_article_summary, extract_publ
from config import Config
from ollama_client import OllamaClient
from article_clustering import ArticleClusterer
from article_clustering import ArticleClusterer
from cluster_summarizer import create_cluster_summaries
from chroma_client import ChromaClient
# Load environment variables
load_dotenv(dotenv_path='../.env')
@@ -38,6 +40,13 @@ ollama_client = OllamaClient(
# Initialize Article Clusterer (will be initialized after ollama_client)
article_clusterer = None
# Initialize ChromaDB client
# The host, port and collection name are all read from Config rather than
# hard-coded here. The resulting `chroma_client` is the object whose
# `add_articles(...)` is called further down when a crawled article is indexed.
chroma_client = ChromaClient(
host=Config.CHROMA_HOST,
port=Config.CHROMA_PORT,
collection_name=Config.CHROMA_COLLECTION
)
# Print configuration on startup
# NOTE: the guard is `!=`, not `==`: the configuration is printed when this
# file is loaded under a name other than '__main__' (i.e. imported), and is
# skipped when the file is executed directly as a script.
if __name__ != '__main__':
Config.print_config()
@@ -440,6 +449,17 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
crawled_count += 1
print(f" ✓ Saved ({article_data.get('word_count', 0)} words)")
# Index in ChromaDB
try:
# Add mongo _id to article doc for reference
saved_article = articles_collection.find_one({'link': article_url})
if saved_article:
article_doc['_id'] = str(saved_article['_id'])
chroma_client.add_articles([article_doc])
except Exception as e:
print(f" ⚠ Failed to index in ChromaDB: {e}")
except DuplicateKeyError:
print(f" ⚠ Duplicate key error")
except Exception as e: