Add ChromaDB implementation

2025-12-10 12:46:17 +00:00
parent 57f37c8dc0
commit 6c8d6d0940
7 changed files with 383 additions and 7 deletions
--- a/news_crawler/crawler_service.py
+++ b/news_crawler/crawler_service.py
@@ -14,7 +14,8 @@ from rss_utils import extract_article_url, extract_article_summary, extract_publ
 from config import Config
 from ollama_client import OllamaClient
 from article_clustering import ArticleClusterer
 from cluster_summarizer import create_cluster_summaries
+from chroma_client import ChromaClient

 # Load environment variables
 load_dotenv(dotenv_path='../.env')
@@ -38,6 +40,13 @@ ollama_client = OllamaClient(
 # Initialize Article Clusterer (will be initialized after ollama_client)
 article_clusterer = None

+# Build the shared ChromaDB client at import time; host, port and collection name come from Config
+chroma_client = ChromaClient(
+    host=Config.CHROMA_HOST,
+    port=Config.CHROMA_PORT,
+    collection_name=Config.CHROMA_COLLECTION
+)
+
 # Print configuration on startup
 if __name__ != '__main__':
    Config.print_config()
@@ -440,6 +449,17 @@ def crawl_rss_feed(feed_url, feed_name, feed_category='general', max_articles=10
                    crawled_count += 1
                    print(f"   ✓ Saved ({article_data.get('word_count', 0)} words)")
                    
+                    # Index in ChromaDB
+                    try:
+                        # Add mongo _id to article doc for reference
+                        saved_article = articles_collection.find_one({'link': article_url})
+                        if saved_article:
+                            article_doc['_id'] = str(saved_article['_id'])
+                            chroma_client.add_articles([article_doc])
+                    except Exception as e:
+                        print(f"   ⚠ Failed to index in ChromaDB: {e}")
+                    
+                    
                except DuplicateKeyError:
                    print(f"   ⚠ Duplicate key error")
                except Exception as e: