import feedparser
from datetime import datetime
from pymongo.errors import DuplicateKeyError
from database import articles_collection, rss_feeds_collection
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
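
# Note on assumptions: database and utils.rss_utils are project-local modules,
# not third-party packages. database is assumed to expose already-configured
# pymongo Collection objects, and utils.rss_utils the three feed-entry helpers
# imported above.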


def get_active_rss_feeds():
    """Get all active RSS feeds from the database."""
    feeds = []
    cursor = rss_feeds_collection.find({'active': True})
    for feed in cursor:
        feeds.append({
            'name': feed.get('name', ''),
            'url': feed.get('url', '')
        })
    return feeds
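
# Illustrative sketch (not from the original module): get_active_rss_feeds()
# assumes feed documents in rss_feeds_collection look roughly like this,
# with made-up example values:
#
#   {'name': 'Example Munich Feed',
#    'url': 'https://example.com/rss',
#    'active': True}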


def fetch_munich_news():
    """Fetch news from Munich news sources."""
    articles = []

    # Get RSS feeds from the database instead of a hardcoded list
    sources = get_active_rss_feeds()

    for source in sources:
        try:
            feed = feedparser.parse(source['url'])
            for entry in feed.entries[:5]:  # Take the top 5 entries from each source
                # Extract the article URL using the utility function
                article_url = extract_article_url(entry)

                if not article_url:
                    print(f" ⚠ No valid URL for: {entry.get('title', 'Unknown')[:50]}")
                    continue  # Skip entries without a valid URL

                # Extract the summary and truncate it to 200 characters
                summary = extract_article_summary(entry)
                if summary:
                    summary = summary[:200] + '...' if len(summary) > 200 else summary

                articles.append({
                    'title': entry.get('title', ''),
                    'link': article_url,
                    'summary': summary,
                    'source': source['name'],
                    'published': extract_published_date(entry)
                })
        except Exception as e:
            print(f"Error fetching from {source['name']}: {e}")

    return articles
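
# Note (added, not in the original): feedparser generally does not raise on
# malformed feeds, so the try/except above mostly catches other failures.
# feedparser instead sets feed.bozo and stores the problem in
# feed.bozo_exception; a stricter loop could check that flag, e.g.:
#
#   feed = feedparser.parse(source['url'])
#   if feed.bozo:
#       print(f"Malformed feed from {source['name']}: {feed.bozo_exception}")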


def save_articles_to_db(articles):
    """Save articles to MongoDB, avoiding duplicates."""
    saved_count = 0

    for article in articles:
        try:
            # Prepare the article document
            article_doc = {
                'title': article.get('title', ''),
                'link': article.get('link', ''),
                'summary': article.get('summary', ''),
                'source': article.get('source', ''),
                'published_at': article.get('published', ''),
                'created_at': datetime.utcnow()
            }

            # Use update_one with upsert to handle duplicates:
            # insert if the link doesn't exist, leave the document untouched if it does
            result = articles_collection.update_one(
                {'link': article_doc['link']},
                {'$setOnInsert': article_doc},  # Only set fields on insert; never modify existing docs
                upsert=True
            )
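
            # Added note: even with upsert, two concurrent runs can race between
            # the match and the insert when 'link' carries a unique index, in
            # which case MongoDB raises DuplicateKeyError; the handler below
            # treats that as "already saved" rather than an error.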

            if result.upserted_id:
                saved_count += 1

        except DuplicateKeyError:
            # The link already exists; skip it
            pass
        except Exception as e:
            print(f"Error saving article {article.get('link', 'unknown')}: {e}")

    if saved_count > 0:
        print(f"Saved {saved_count} new articles to the database")