import feedparser
from datetime import datetime
from pymongo.errors import DuplicateKeyError
from database import articles_collection, rss_feeds_collection
from utils.rss_utils import extract_article_url, extract_article_summary, extract_published_date
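
# Note on assumptions: database and utils.rss_utils are project-local modules,
# not third-party packages. database is assumed to expose already-configured
# pymongo Collection objects, and utils.rss_utils the three feed-entry helpers
# imported above.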


def get_active_rss_feeds():
    """Get all active RSS feeds from the database."""
    feeds = []
    cursor = rss_feeds_collection.find({'active': True})
    for feed in cursor:
        feeds.append({
            'name': feed.get('name', ''),
            'url': feed.get('url', '')
        })
    return feeds
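
# Illustrative sketch (not from the original module): get_active_rss_feeds()
# assumes feed documents in rss_feeds_collection look roughly like this,
# with made-up example values:
#
#   {'name': 'Example Munich Feed',
#    'url': 'https://example.com/rss',
#    'active': True}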


def fetch_munich_news():
    """Fetch news from Munich news sources."""
    articles = []

    # Get RSS feeds from the database instead of a hardcoded list
    sources = get_active_rss_feeds()

    for source in sources:
        try:
            feed = feedparser.parse(source['url'])
            for entry in feed.entries[:5]:  # Take the top 5 entries from each source
                # Extract the article URL using the utility function
                article_url = extract_article_url(entry)

                if not article_url:
                    print(f" ⚠ No valid URL for: {entry.get('title', 'Unknown')[:50]}")
                    continue  # Skip entries without a valid URL

                # Extract the summary and truncate it to 200 characters
                summary = extract_article_summary(entry)
                if summary:
                    summary = summary[:200] + '...' if len(summary) > 200 else summary

                articles.append({
                    'title': entry.get('title', ''),
                    'link': article_url,
                    'summary': summary,
                    'source': source['name'],
                    'published': extract_published_date(entry)
                })
        except Exception as e:
            print(f"Error fetching from {source['name']}: {e}")

    return articles
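
# Note (added, not in the original): feedparser generally does not raise on
# malformed feeds, so the try/except above mostly catches other failures.
# feedparser instead sets feed.bozo and stores the problem in
# feed.bozo_exception; a stricter loop could check that flag, e.g.:
#
#   feed = feedparser.parse(source['url'])
#   if feed.bozo:
#       print(f"Malformed feed from {source['name']}: {feed.bozo_exception}")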


def save_articles_to_db(articles):
    """Save articles to MongoDB, avoiding duplicates."""
    saved_count = 0

    for article in articles:
        try:
            # Prepare the article document
            article_doc = {
                'title': article.get('title', ''),
                'link': article.get('link', ''),
                'summary': article.get('summary', ''),
                'source': article.get('source', ''),
                'published_at': article.get('published', ''),
                'created_at': datetime.utcnow()
            }

            # Use update_one with upsert to handle duplicates:
            # insert if the link doesn't exist, leave the document untouched if it does
            result = articles_collection.update_one(
                {'link': article_doc['link']},
                {'$setOnInsert': article_doc},  # Only set fields on insert; never modify existing docs
                upsert=True
            )
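
            # Added note: even with upsert, two concurrent runs can race between
            # the match and the insert when 'link' carries a unique index, in
            # which case MongoDB raises DuplicateKeyError; the handler below
            # treats that as "already saved" rather than an error.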

            if result.upserted_id:
                saved_count += 1

        except DuplicateKeyError:
            # The link already exists; skip it
            pass
        except Exception as e:
            print(f"Error saving article {article.get('link', 'unknown')}: {e}")

    if saved_count > 0:
        print(f"Saved {saved_count} new articles to the database")