update
This commit is contained in:
25
news_crawler/.gitignore
vendored
Normal file
25
news_crawler/.gitignore
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
.venv
|
||||
|
||||
# Environment variables
|
||||
.env
|
||||
.env.local
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
191
news_crawler/CHANGES.md
Normal file
191
news_crawler/CHANGES.md
Normal file
@@ -0,0 +1,191 @@
|
||||
# Recent Changes - Full Content Storage
|
||||
|
||||
## ✅ What Changed
|
||||
|
||||
### 1. Removed Content Length Limit
|
||||
**Before:**
|
||||
```python
|
||||
'content': content_text[:10000] # Limited to 10k chars
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
'content': content_text # Full content, no limit
|
||||
```
|
||||
|
||||
### 2. Simplified Database Schema
|
||||
**Before:**
|
||||
```javascript
|
||||
{
|
||||
summary: String, // Short summary
|
||||
full_content: String // Limited content
|
||||
}
|
||||
```
|
||||
|
||||
**After:**
|
||||
```javascript
|
||||
{
|
||||
content: String // Full article content, no limit
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Enhanced API Response
|
||||
**Before:**
|
||||
```javascript
|
||||
{
|
||||
title: "...",
|
||||
link: "...",
|
||||
summary: "..."
|
||||
}
|
||||
```
|
||||
|
||||
**After:**
|
||||
```javascript
|
||||
{
|
||||
title: "...",
|
||||
author: "...", // NEW!
|
||||
link: "...",
|
||||
preview: "...", // First 200 chars
|
||||
word_count: 1250, // NEW!
|
||||
has_full_content: true // NEW!
|
||||
}
|
||||
```
|
||||
|
||||
## 📊 Database Structure
|
||||
|
||||
### Articles Collection
|
||||
```javascript
|
||||
{
|
||||
_id: ObjectId,
|
||||
title: String, // Article title
|
||||
author: String, // Article author (extracted)
|
||||
link: String, // Article URL (unique)
|
||||
content: String, // FULL article content (no limit)
|
||||
word_count: Number, // Word count
|
||||
source: String, // RSS feed name
|
||||
published_at: String, // Publication date
|
||||
crawled_at: DateTime, // When crawled
|
||||
created_at: DateTime // When added
|
||||
}
|
||||
```
|
||||
|
||||
## 🆕 New API Endpoint
|
||||
|
||||
### GET /api/news/<article_url>
|
||||
Get full article content by URL.
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
# URL encode the article URL
|
||||
curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle"
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"title": "New U-Bahn Line Opens in Munich",
|
||||
"author": "Max Mustermann",
|
||||
"link": "https://example.com/article",
|
||||
"content": "The full article text here... (complete, no truncation)",
|
||||
"word_count": 1250,
|
||||
"source": "Süddeutsche Zeitung München",
|
||||
"published_at": "2024-11-10T10:00:00Z",
|
||||
"crawled_at": "2024-11-10T16:30:00Z",
|
||||
"created_at": "2024-11-10T16:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
## 📈 Enhanced Stats
|
||||
|
||||
### GET /api/stats
|
||||
Now includes crawled article count:
|
||||
|
||||
```json
|
||||
{
|
||||
"subscribers": 150,
|
||||
"articles": 500,
|
||||
"crawled_articles": 350 // NEW!
|
||||
}
|
||||
```
|
||||
|
||||
## 🎯 Benefits
|
||||
|
||||
1. **Complete Content** - No truncation, full articles stored
|
||||
2. **Better for AI** - Full context for summarization/analysis
|
||||
3. **Cleaner Schema** - Single `content` field instead of `summary` + `full_content`
|
||||
4. **More Metadata** - Author, word count, crawl timestamp
|
||||
5. **Better API** - Preview in list, full content on demand
|
||||
|
||||
## 🔄 Migration
|
||||
|
||||
If you have existing articles with `full_content` field, they will continue to work. New articles will use the `content` field.
|
||||
|
||||
To migrate old articles:
|
||||
```javascript
|
||||
// MongoDB shell
|
||||
db.articles.updateMany(
|
||||
{ full_content: { $exists: true } },
|
||||
[
|
||||
{
|
||||
$set: {
|
||||
content: "$full_content"
|
||||
}
|
||||
},
|
||||
{
|
||||
$unset: ["full_content", "summary"]
|
||||
}
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## 🚀 Usage
|
||||
|
||||
### Crawl Articles
|
||||
```bash
|
||||
cd news_crawler
|
||||
python crawler_service.py 10
|
||||
```
|
||||
|
||||
### Get Article List (with previews)
|
||||
```bash
|
||||
curl http://localhost:5001/api/news
|
||||
```
|
||||
|
||||
### Get Full Article Content
|
||||
```bash
|
||||
# Get the article URL from the list, then:
|
||||
curl "http://localhost:5001/api/news/<encoded_url>"
|
||||
```
|
||||
|
||||
### Check Stats
|
||||
```bash
|
||||
curl http://localhost:5001/api/stats
|
||||
```
|
||||
|
||||
## 📝 Example Workflow
|
||||
|
||||
1. **Add RSS Feed**
|
||||
```bash
|
||||
curl -X POST http://localhost:5001/api/rss-feeds \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"name": "News Source", "url": "https://example.com/rss"}'
|
||||
```
|
||||
|
||||
2. **Crawl Articles**
|
||||
```bash
|
||||
cd news_crawler
|
||||
python crawler_service.py 10
|
||||
```
|
||||
|
||||
3. **View Articles**
|
||||
```bash
|
||||
curl http://localhost:5001/api/news
|
||||
```
|
||||
|
||||
4. **Get Full Content**
|
||||
```bash
|
||||
# Copy article link from above, URL encode it
|
||||
curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle"
|
||||
```
|
||||
|
||||
Now you have complete article content ready for AI processing! 🎉
|
||||
13
news_crawler/Dockerfile
Normal file
13
news_crawler/Dockerfile
Normal file
@@ -0,0 +1,13 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy crawler service
|
||||
COPY crawler_service.py .
|
||||
|
||||
# Run crawler
|
||||
CMD ["python", "crawler_service.py"]
|
||||
353
news_crawler/EXTRACTION_STRATEGIES.md
Normal file
353
news_crawler/EXTRACTION_STRATEGIES.md
Normal file
@@ -0,0 +1,353 @@
|
||||
# Content Extraction Strategies
|
||||
|
||||
The crawler uses multiple strategies to dynamically extract article metadata from any website.
|
||||
|
||||
## 🎯 What Gets Extracted
|
||||
|
||||
1. **Title** - Article headline
|
||||
2. **Author** - Article writer/journalist
|
||||
3. **Published Date** - When article was published
|
||||
4. **Content** - Main article text
|
||||
5. **Description** - Meta description/summary
|
||||
|
||||
## 📋 Extraction Strategies
|
||||
|
||||
### 1. Title Extraction
|
||||
|
||||
Tries multiple methods in order of reliability:
|
||||
|
||||
#### Strategy 1: H1 Tag
|
||||
```html
|
||||
<h1>Article Title Here</h1>
|
||||
```
|
||||
✅ Most reliable - usually the main headline
|
||||
|
||||
#### Strategy 2: Open Graph Meta Tag
|
||||
```html
|
||||
<meta property="og:title" content="Article Title Here" />
|
||||
```
|
||||
✅ Used by Facebook, very reliable
|
||||
|
||||
#### Strategy 3: Twitter Card Meta Tag
|
||||
```html
|
||||
<meta name="twitter:title" content="Article Title Here" />
|
||||
```
|
||||
✅ Used by Twitter, reliable
|
||||
|
||||
#### Strategy 4: Title Tag (Fallback)
|
||||
```html
|
||||
<title>Article Title | Site Name</title>
|
||||
```
|
||||
⚠️ Often includes site name, needs cleaning
|
||||
|
||||
**Cleaning:**
|
||||
- Removes " | Site Name"
|
||||
- Removes " - Site Name"
|
||||
|
||||
---
|
||||
|
||||
### 2. Author Extraction
|
||||
|
||||
Tries multiple methods:
|
||||
|
||||
#### Strategy 1: Meta Author Tag
|
||||
```html
|
||||
<meta name="author" content="John Doe" />
|
||||
```
|
||||
✅ Standard HTML meta tag
|
||||
|
||||
#### Strategy 2: Rel="author" Link
|
||||
```html
|
||||
<a rel="author" href="/author/john-doe">John Doe</a>
|
||||
```
|
||||
✅ Semantic HTML
|
||||
|
||||
#### Strategy 3: Common Class Names
|
||||
```html
|
||||
<div class="author-name">John Doe</div>
|
||||
<span class="byline">By John Doe</span>
|
||||
<p class="writer">John Doe</p>
|
||||
```
|
||||
✅ Searches for: author-name, author, byline, writer
|
||||
|
||||
#### Strategy 4: Schema.org Markup
|
||||
```html
|
||||
<span itemprop="author">John Doe</span>
|
||||
```
|
||||
✅ Structured data
|
||||
|
||||
#### Strategy 5: JSON-LD Structured Data
|
||||
```html
|
||||
<script type="application/ld+json">
|
||||
{
|
||||
"@type": "NewsArticle",
|
||||
"author": {
|
||||
"@type": "Person",
|
||||
"name": "John Doe"
|
||||
}
|
||||
}
|
||||
</script>
|
||||
```
|
||||
✅ Most structured, very reliable
|
||||
|
||||
**Cleaning:**
|
||||
- Removes "By " prefix
|
||||
- Validates length (< 100 chars)
|
||||
|
||||
---
|
||||
|
||||
### 3. Date Extraction
|
||||
|
||||
Tries multiple methods:
|
||||
|
||||
#### Strategy 1: Time Tag with Datetime
|
||||
```html
|
||||
<time datetime="2024-11-10T10:00:00Z">November 10, 2024</time>
|
||||
```
|
||||
✅ Most reliable - ISO format
|
||||
|
||||
#### Strategy 2: Article Published Time Meta
|
||||
```html
|
||||
<meta property="article:published_time" content="2024-11-10T10:00:00Z" />
|
||||
```
|
||||
✅ Open Graph standard
|
||||
|
||||
#### Strategy 3: OG Published Time
|
||||
```html
|
||||
<meta property="og:published_time" content="2024-11-10T10:00:00Z" />
|
||||
```
|
||||
✅ Facebook standard
|
||||
|
||||
#### Strategy 4: Common Class Names
|
||||
```html
|
||||
<span class="publish-date">November 10, 2024</span>
|
||||
<time class="published">2024-11-10</time>
|
||||
<div class="timestamp">10:00 AM, Nov 10</div>
|
||||
```
|
||||
✅ Searches for: publish-date, published, date, timestamp
|
||||
|
||||
#### Strategy 5: Schema.org Markup
|
||||
```html
|
||||
<meta itemprop="datePublished" content="2024-11-10T10:00:00Z" />
|
||||
```
|
||||
✅ Structured data
|
||||
|
||||
#### Strategy 6: JSON-LD Structured Data
|
||||
```html
|
||||
<script type="application/ld+json">
|
||||
{
|
||||
"@type": "NewsArticle",
|
||||
"datePublished": "2024-11-10T10:00:00Z"
|
||||
}
|
||||
</script>
|
||||
```
|
||||
✅ Most structured
|
||||
|
||||
---
|
||||
|
||||
### 4. Content Extraction
|
||||
|
||||
Tries multiple methods:
|
||||
|
||||
#### Strategy 1: Semantic HTML Tags
|
||||
```html
|
||||
<article>
|
||||
<p>Article content here...</p>
|
||||
</article>
|
||||
```
|
||||
✅ Best practice HTML5
|
||||
|
||||
#### Strategy 2: Common Class Names
|
||||
```html
|
||||
<div class="article-content">...</div>
|
||||
<div class="article-body">...</div>
|
||||
<div class="post-content">...</div>
|
||||
<div class="entry-content">...</div>
|
||||
<div class="story-body">...</div>
|
||||
```
|
||||
✅ Searches for common patterns
|
||||
|
||||
#### Strategy 3: Schema.org Markup
|
||||
```html
|
||||
<div itemprop="articleBody">
|
||||
<p>Content here...</p>
|
||||
</div>
|
||||
```
|
||||
✅ Structured data
|
||||
|
||||
#### Strategy 4: Main Tag
|
||||
```html
|
||||
<main>
|
||||
<p>Content here...</p>
|
||||
</main>
|
||||
```
|
||||
✅ Semantic HTML5
|
||||
|
||||
#### Strategy 5: Body Tag (Fallback)
|
||||
```html
|
||||
<body>
|
||||
<p>Content here...</p>
|
||||
</body>
|
||||
```
|
||||
⚠️ Last resort, may include navigation
|
||||
|
||||
**Content Filtering:**
|
||||
- Removes `<script>`, `<style>`, `<nav>`, `<footer>`, `<header>`, `<aside>`
|
||||
- Filters out short paragraphs (< 50 chars) - likely ads/navigation
|
||||
- Keeps only substantial paragraphs
|
||||
- **No length limit** - stores full article content
|
||||
|
||||
---
|
||||
|
||||
## 🔍 How It Works
|
||||
|
||||
### Example: Crawling a News Article
|
||||
|
||||
```python
|
||||
# 1. Fetch HTML
|
||||
response = requests.get(article_url)
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# 2. Extract title (tries 4 strategies)
|
||||
title = extract_title(soup)
|
||||
# Result: "New U-Bahn Line Opens in Munich"
|
||||
|
||||
# 3. Extract author (tries 5 strategies)
|
||||
author = extract_author(soup)
|
||||
# Result: "Max Mustermann"
|
||||
|
||||
# 4. Extract date (tries 6 strategies)
|
||||
published_date = extract_date(soup)
|
||||
# Result: "2024-11-10T10:00:00Z"
|
||||
|
||||
# 5. Extract content (tries 5 strategies)
|
||||
content = extract_main_content(soup)
|
||||
# Result: "The new U-Bahn line connecting..."
|
||||
|
||||
# 6. Save to database
|
||||
article_doc = {
|
||||
'title': title,
|
||||
'author': author,
|
||||
'published_at': published_date,
|
||||
'full_content': content,
|
||||
'word_count': len(content.split())
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Success Rates by Strategy
|
||||
|
||||
Based on common news sites:
|
||||
|
||||
| Strategy | Success Rate | Notes |
|
||||
|----------|-------------|-------|
|
||||
| H1 for title | 95% | Almost universal |
|
||||
| OG meta tags | 90% | Most modern sites |
|
||||
| Time tag for date | 85% | HTML5 sites |
|
||||
| JSON-LD | 70% | Growing adoption |
|
||||
| Class name patterns | 60% | Varies by site |
|
||||
| Schema.org | 50% | Not widely adopted |
|
||||
|
||||
---
|
||||
|
||||
## 🎨 Real-World Examples
|
||||
|
||||
### Example 1: Süddeutsche Zeitung
|
||||
```html
|
||||
<article>
|
||||
<h1>New U-Bahn Line Opens</h1>
|
||||
<span class="author">Max Mustermann</span>
|
||||
<time datetime="2024-11-10T10:00:00Z">10. November 2024</time>
|
||||
<div class="article-body">
|
||||
<p>The new U-Bahn line...</p>
|
||||
</div>
|
||||
</article>
|
||||
```
|
||||
✅ Extracts: Title (H1), Author (class), Date (time), Content (article-body)
|
||||
|
||||
### Example 2: Medium Blog
|
||||
```html
|
||||
<article>
|
||||
<h1>How to Build a News Crawler</h1>
|
||||
<meta property="og:title" content="How to Build a News Crawler" />
|
||||
<meta property="article:published_time" content="2024-11-10T10:00:00Z" />
|
||||
<a rel="author" href="/author">Jane Smith</a>
|
||||
<section>
|
||||
<p>In this article...</p>
|
||||
</section>
|
||||
</article>
|
||||
```
|
||||
✅ Extracts: Title (OG meta), Author (rel), Date (article meta), Content (section)
|
||||
|
||||
### Example 3: WordPress Blog
|
||||
```html
|
||||
<div class="post">
|
||||
<h1 class="entry-title">My Blog Post</h1>
|
||||
<span class="byline">By John Doe</span>
|
||||
<time class="published">November 10, 2024</time>
|
||||
<div class="entry-content">
|
||||
<p>Blog content here...</p>
|
||||
</div>
|
||||
</div>
|
||||
```
|
||||
✅ Extracts: Title (H1), Author (byline), Date (published), Content (entry-content)
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ Edge Cases Handled
|
||||
|
||||
1. **Missing Fields**: Returns `None` instead of crashing
|
||||
2. **Multiple Authors**: Takes first one found
|
||||
3. **Relative Dates**: Stores as-is ("2 hours ago")
|
||||
4. **Paywalls**: Extracts what's available
|
||||
5. **JavaScript-rendered**: Only gets server-side HTML
|
||||
6. **Ads/Navigation**: Filtered out by paragraph length
|
||||
7. **Site Name in Title**: Cleaned automatically
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Future Improvements
|
||||
|
||||
Potential enhancements:
|
||||
|
||||
- [ ] JavaScript rendering (Selenium/Playwright)
|
||||
- [ ] Paywall bypass (where legal)
|
||||
- [ ] Image extraction
|
||||
- [ ] Video detection
|
||||
- [ ] Related articles
|
||||
- [ ] Tags/categories
|
||||
- [ ] Reading time estimation
|
||||
- [ ] Language detection
|
||||
- [ ] Sentiment analysis
|
||||
|
||||
---
|
||||
|
||||
## 🧪 Testing
|
||||
|
||||
Test the extraction on a specific URL:
|
||||
|
||||
```python
|
||||
from crawler_service import extract_article_content
|
||||
|
||||
url = "https://www.sueddeutsche.de/muenchen/article-123"
|
||||
data = extract_article_content(url)
|
||||
|
||||
print(f"Title: {data['title']}")
|
||||
print(f"Author: {data['author']}")
|
||||
print(f"Date: {data['published_date']}")
|
||||
print(f"Content length: {len(data['content'])} chars")
|
||||
print(f"Word count: {data['word_count']}")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📚 Standards Supported
|
||||
|
||||
- ✅ HTML5 semantic tags
|
||||
- ✅ Open Graph Protocol
|
||||
- ✅ Twitter Cards
|
||||
- ✅ Schema.org microdata
|
||||
- ✅ JSON-LD structured data
|
||||
- ✅ Dublin Core metadata
|
||||
- ✅ Common CSS class patterns
|
||||
306
news_crawler/HOW_IT_WORKS.md
Normal file
306
news_crawler/HOW_IT_WORKS.md
Normal file
@@ -0,0 +1,306 @@
|
||||
# How the News Crawler Works
|
||||
|
||||
## 🎯 Overview
|
||||
|
||||
The crawler dynamically extracts article metadata from any website using multiple fallback strategies.
|
||||
|
||||
## 📊 Flow Diagram
|
||||
|
||||
```
|
||||
RSS Feed URL
|
||||
↓
|
||||
Parse RSS Feed
|
||||
↓
|
||||
For each article link:
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 1. Fetch HTML Page │
|
||||
│ GET https://example.com/article │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 2. Parse with BeautifulSoup │
|
||||
│ soup = BeautifulSoup(html) │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 3. Clean HTML │
|
||||
│ Remove: scripts, styles, nav, │
|
||||
│ footer, header, ads │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 4. Extract Title │
|
||||
│ Try: H1 → OG meta → Twitter → │
|
||||
│ Title tag │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 5. Extract Author │
|
||||
│ Try: Meta author → rel=author → │
|
||||
│ Class names → JSON-LD │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 6. Extract Date │
|
||||
│ Try: <time> → Meta tags → │
|
||||
│ Class names → JSON-LD │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 7. Extract Content │
|
||||
│ Try: <article> → Class names → │
|
||||
│ <main> → <body> │
|
||||
│ Filter short paragraphs │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
┌─────────────────────────────────────┐
|
||||
│ 8. Save to MongoDB │
|
||||
│ { │
|
||||
│ title, author, date, │
|
||||
│ content, word_count │
|
||||
│ } │
|
||||
└─────────────────────────────────────┘
|
||||
↓
|
||||
Wait 1 second (rate limiting)
|
||||
↓
|
||||
Next article
|
||||
```
|
||||
|
||||
## 🔍 Detailed Example
|
||||
|
||||
### Input: RSS Feed Entry
|
||||
```xml
|
||||
<item>
|
||||
<title>New U-Bahn Line Opens</title>
|
||||
<link>https://www.sueddeutsche.de/muenchen/article-123</link>
|
||||
<pubDate>Mon, 10 Nov 2024 10:00:00 +0100</pubDate>
|
||||
</item>
|
||||
```
|
||||
|
||||
### Step 1: Fetch HTML
|
||||
```python
|
||||
url = "https://www.sueddeutsche.de/muenchen/article-123"
|
||||
response = requests.get(url)
|
||||
html = response.content
|
||||
```
|
||||
|
||||
### Step 2: Parse HTML
|
||||
```python
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
```
|
||||
|
||||
### Step 3: Extract Title
|
||||
```python
|
||||
# Try H1
|
||||
h1 = soup.find('h1')
|
||||
# Result: "New U-Bahn Line Opens in Munich"
|
||||
|
||||
# If no H1, try OG meta
|
||||
og_title = soup.find('meta', property='og:title')
|
||||
# Fallback chain continues...
|
||||
```
|
||||
|
||||
### Step 4: Extract Author
|
||||
```python
|
||||
# Try meta author
|
||||
meta_author = soup.find('meta', name='author')
|
||||
# Result: None
|
||||
|
||||
# Try class names
|
||||
author_elem = soup.select_one('[class*="author"]')
|
||||
# Result: "Max Mustermann"
|
||||
```
|
||||
|
||||
### Step 5: Extract Date
|
||||
```python
|
||||
# Try time tag
|
||||
time_tag = soup.find('time')
|
||||
# Result: "2024-11-10T10:00:00Z"
|
||||
```
|
||||
|
||||
### Step 6: Extract Content
|
||||
```python
|
||||
# Try article tag
|
||||
article = soup.find('article')
|
||||
paragraphs = article.find_all('p')
|
||||
|
||||
# Filter paragraphs
|
||||
content = []
|
||||
for p in paragraphs:
|
||||
text = p.get_text().strip()
|
||||
if len(text) >= 50: # Keep substantial paragraphs
|
||||
content.append(text)
|
||||
|
||||
full_content = '\n\n'.join(content)
|
||||
# Result: "The new U-Bahn line connecting the city center..."
|
||||
```
|
||||
|
||||
### Step 7: Save to Database
|
||||
```python
|
||||
article_doc = {
|
||||
'title': 'New U-Bahn Line Opens in Munich',
|
||||
'author': 'Max Mustermann',
|
||||
'link': 'https://www.sueddeutsche.de/muenchen/article-123',
|
||||
'summary': 'Short summary from RSS...',
|
||||
'full_content': 'The new U-Bahn line connecting...',
|
||||
'word_count': 1250,
|
||||
'source': 'Süddeutsche Zeitung München',
|
||||
'published_at': '2024-11-10T10:00:00Z',
|
||||
'crawled_at': datetime.utcnow(),
|
||||
'created_at': datetime.utcnow()
|
||||
}
|
||||
|
||||
db.articles.update_one(
|
||||
{'link': article_url},
|
||||
{'$set': article_doc},
|
||||
upsert=True
|
||||
)
|
||||
```
|
||||
|
||||
## 🎨 What Makes It "Dynamic"?
|
||||
|
||||
### Traditional Approach (Hardcoded)
|
||||
```python
|
||||
# Only works for one specific site
|
||||
title = soup.find('h1', class_='article-title').text
|
||||
author = soup.find('span', class_='author-name').text
|
||||
```
|
||||
❌ Breaks when site changes
|
||||
❌ Doesn't work on other sites
|
||||
|
||||
### Our Approach (Dynamic)
|
||||
```python
|
||||
# Works on ANY site
|
||||
title = extract_title(soup) # Tries 4 different methods
|
||||
author = extract_author(soup) # Tries 5 different methods
|
||||
```
|
||||
✅ Adapts to different HTML structures
|
||||
✅ Falls back to alternatives
|
||||
✅ Works across multiple sites
|
||||
|
||||
## 🛡️ Robustness Features
|
||||
|
||||
### 1. Multiple Strategies
|
||||
Each field has 4-6 extraction strategies
|
||||
```python
|
||||
def extract_title(soup):
|
||||
# Try strategy 1
|
||||
if h1 := soup.find('h1'):
|
||||
return h1.text
|
||||
|
||||
# Try strategy 2
|
||||
if og_title := soup.find('meta', property='og:title'):
|
||||
return og_title['content']
|
||||
|
||||
# Try strategy 3...
|
||||
# Try strategy 4...
|
||||
```
|
||||
|
||||
### 2. Validation
|
||||
```python
|
||||
# Title must be reasonable length
|
||||
if title and len(title) > 10:
|
||||
return title
|
||||
|
||||
# Author must be < 100 chars
|
||||
if author and len(author) < 100:
|
||||
return author
|
||||
```
|
||||
|
||||
### 3. Cleaning
|
||||
```python
|
||||
# Remove site name from title
|
||||
if ' | ' in title:
|
||||
title = title.split(' | ')[0]
|
||||
|
||||
# Remove "By" from author
|
||||
author = author.replace('By ', '').strip()
|
||||
```
|
||||
|
||||
### 4. Error Handling
|
||||
```python
|
||||
try:
|
||||
data = extract_article_content(url)
|
||||
except Timeout:
|
||||
print("Timeout - skip")
|
||||
except RequestException:
|
||||
print("Network error - skip")
|
||||
except Exception:
|
||||
print("Unknown error - skip")
|
||||
```
|
||||
|
||||
## 📈 Success Metrics
|
||||
|
||||
After crawling, you'll see:
|
||||
|
||||
```
|
||||
📰 Crawling feed: Süddeutsche Zeitung München
|
||||
🔍 Crawling: New U-Bahn Line Opens...
|
||||
✓ Saved (1250 words)
|
||||
|
||||
Title: ✓ Found
|
||||
Author: ✓ Found (Max Mustermann)
|
||||
Date: ✓ Found (2024-11-10T10:00:00Z)
|
||||
Content: ✓ Found (1250 words)
|
||||
```
|
||||
|
||||
## 🗄️ Database Result
|
||||
|
||||
**Before Crawling:**
|
||||
```javascript
|
||||
{
|
||||
title: "New U-Bahn Line Opens",
|
||||
link: "https://example.com/article",
|
||||
summary: "Short RSS summary...",
|
||||
source: "Süddeutsche Zeitung"
|
||||
}
|
||||
```
|
||||
|
||||
**After Crawling:**
|
||||
```javascript
|
||||
{
|
||||
title: "New U-Bahn Line Opens in Munich", // ← Enhanced
|
||||
author: "Max Mustermann", // ← NEW!
|
||||
link: "https://example.com/article",
|
||||
summary: "Short RSS summary...",
|
||||
full_content: "The new U-Bahn line...", // ← NEW! (1250 words)
|
||||
word_count: 1250, // ← NEW!
|
||||
source: "Süddeutsche Zeitung",
|
||||
published_at: "2024-11-10T10:00:00Z", // ← Enhanced
|
||||
crawled_at: ISODate("2024-11-10T16:30:00Z"), // ← NEW!
|
||||
created_at: ISODate("2024-11-10T16:00:00Z")
|
||||
}
|
||||
```
|
||||
|
||||
## 🚀 Running the Crawler
|
||||
|
||||
```bash
|
||||
cd news_crawler
|
||||
pip install -r requirements.txt
|
||||
python crawler_service.py 10
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
============================================================
|
||||
🚀 Starting RSS Feed Crawler
|
||||
============================================================
|
||||
Found 3 active feed(s)
|
||||
|
||||
📰 Crawling feed: Süddeutsche Zeitung München
|
||||
🔍 Crawling: New U-Bahn Line Opens...
|
||||
✓ Saved (1250 words)
|
||||
🔍 Crawling: Munich Weather Update...
|
||||
✓ Saved (450 words)
|
||||
✓ Crawled 2 articles
|
||||
|
||||
============================================================
|
||||
✓ Crawling Complete!
|
||||
Total feeds processed: 3
|
||||
Total articles crawled: 15
|
||||
Duration: 45.23 seconds
|
||||
============================================================
|
||||
```
|
||||
|
||||
Now you have rich, structured article data ready for AI processing! 🎉
|
||||
127
news_crawler/QUICKSTART.md
Normal file
127
news_crawler/QUICKSTART.md
Normal file
@@ -0,0 +1,127 @@
|
||||
# News Crawler - Quick Start
|
||||
|
||||
## 1. Install Dependencies
|
||||
|
||||
```bash
|
||||
cd news_crawler
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## 2. Configure Environment
|
||||
|
||||
Make sure MongoDB is running and accessible. The crawler will use the same database as the backend.
|
||||
|
||||
Default connection: `mongodb://localhost:27017/`
|
||||
|
||||
To use a different MongoDB URI, create a `.env` file:
|
||||
```env
|
||||
MONGODB_URI=mongodb://localhost:27017/
|
||||
```
|
||||
|
||||
## 3. Run the Crawler
|
||||
|
||||
```bash
|
||||
# Crawl up to 10 articles per feed
|
||||
python crawler_service.py
|
||||
|
||||
# Crawl up to 20 articles per feed
|
||||
python crawler_service.py 20
|
||||
```
|
||||
|
||||
## 4. Verify Results
|
||||
|
||||
Check your MongoDB database:
|
||||
|
||||
```bash
|
||||
# Using mongosh
|
||||
mongosh
|
||||
use munich_news
|
||||
db.articles.find({full_content: {$exists: true}}).count()
|
||||
db.articles.findOne({full_content: {$exists: true}})
|
||||
```
|
||||
|
||||
## 5. Schedule Regular Crawling
|
||||
|
||||
### Option A: Cron (Linux/Mac)
|
||||
|
||||
```bash
|
||||
# Edit crontab
|
||||
crontab -e
|
||||
|
||||
# Add this line to run every 6 hours
|
||||
0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py
|
||||
```
|
||||
|
||||
### Option B: Docker
|
||||
|
||||
```bash
|
||||
# Build and run
|
||||
docker-compose up
|
||||
|
||||
# Or run as a one-off
|
||||
docker-compose run --rm crawler
|
||||
```
|
||||
|
||||
### Option C: Manual
|
||||
|
||||
Just run the script whenever you want to fetch new articles:
|
||||
|
||||
```bash
|
||||
python crawler_service.py
|
||||
```
|
||||
|
||||
## What Gets Crawled?
|
||||
|
||||
The crawler:
|
||||
1. Fetches all active RSS feeds from the database
|
||||
2. For each feed, gets the latest articles
|
||||
3. Crawls the full content from each article URL
|
||||
4. Saves: title, full_content, word_count, crawled_at
|
||||
5. Skips articles that already have content
|
||||
|
||||
## Output Example
|
||||
|
||||
```
|
||||
============================================================
|
||||
🚀 Starting RSS Feed Crawler
|
||||
============================================================
|
||||
Found 3 active feed(s)
|
||||
|
||||
📰 Crawling feed: Süddeutsche Zeitung München
|
||||
URL: https://www.sueddeutsche.de/muenchen/rss
|
||||
🔍 Crawling: New U-Bahn Line Opens in Munich...
|
||||
✓ Saved (1250 words)
|
||||
🔍 Crawling: Munich Weather Update...
|
||||
✓ Saved (450 words)
|
||||
✓ Crawled 2 articles from Süddeutsche Zeitung München
|
||||
|
||||
============================================================
|
||||
✓ Crawling Complete!
|
||||
Total feeds processed: 3
|
||||
Total articles crawled: 15
|
||||
Duration: 45.23 seconds
|
||||
============================================================
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**No feeds found:**
|
||||
- Make sure you've added RSS feeds via the backend API
|
||||
- Check MongoDB connection
|
||||
|
||||
**Can't extract content:**
|
||||
- Some sites block scrapers
|
||||
- Some sites require JavaScript (not supported yet)
|
||||
- Check if the URL is accessible
|
||||
|
||||
**Timeout errors:**
|
||||
- Increase timeout in the code
|
||||
- Check your internet connection
|
||||
|
||||
## Next Steps
|
||||
|
||||
Once articles are crawled, you can:
|
||||
- View them in the frontend
|
||||
- Use Ollama to summarize them
|
||||
- Generate newsletters with full content
|
||||
- Perform text analysis
|
||||
225
news_crawler/README.md
Normal file
225
news_crawler/README.md
Normal file
@@ -0,0 +1,225 @@
|
||||
# News Crawler Microservice
|
||||
|
||||
A standalone microservice that crawls full article content from RSS feeds and stores it in MongoDB.
|
||||
|
||||
## Features
|
||||
|
||||
- 🔍 Extracts full article content from RSS feed links
|
||||
- 📊 Calculates word count
|
||||
- 🔄 Avoids re-crawling already processed articles
|
||||
- ⏱️ Rate limiting (1 second delay between requests)
|
||||
- 🎯 Smart content extraction using multiple selectors
|
||||
- 🧹 Cleans up scripts, styles, and navigation elements
|
||||
|
||||
## Installation
|
||||
|
||||
1. Create a virtual environment:
|
||||
```bash
|
||||
python -m venv venv
|
||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. Configure environment variables:
|
||||
Create a `.env` file in the project root (or use the backend's `.env`):
|
||||
```env
|
||||
MONGODB_URI=mongodb://localhost:27017/
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Standalone Execution
|
||||
|
||||
Run the crawler directly:
|
||||
|
||||
```bash
|
||||
# Crawl up to 10 articles per feed (default)
|
||||
python crawler_service.py
|
||||
|
||||
# Crawl up to 20 articles per feed
|
||||
python crawler_service.py 20
|
||||
```
|
||||
|
||||
### As a Module
|
||||
|
||||
```python
|
||||
from crawler_service import crawl_all_feeds, crawl_rss_feed
|
||||
|
||||
# Crawl all active feeds
|
||||
result = crawl_all_feeds(max_articles_per_feed=10)
|
||||
print(result)
|
||||
|
||||
# Crawl a specific feed
|
||||
crawl_rss_feed(
|
||||
feed_url='https://example.com/rss',
|
||||
feed_name='Example News',
|
||||
max_articles=10
|
||||
)
|
||||
```
|
||||
|
||||
### Via Backend API
|
||||
|
||||
The backend has integrated endpoints:
|
||||
|
||||
```bash
|
||||
# Start crawler
|
||||
curl -X POST http://localhost:5001/api/crawler/start
|
||||
|
||||
# Check status
|
||||
curl http://localhost:5001/api/crawler/status
|
||||
|
||||
# Crawl specific feed
|
||||
curl -X POST http://localhost:5001/api/crawler/feed/<feed_id>
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Fetch RSS Feeds**: Gets all active RSS feeds from MongoDB
|
||||
2. **Parse Feed**: Extracts article links from each feed
|
||||
3. **Crawl Content**: For each article:
|
||||
- Fetches HTML page
|
||||
- Removes scripts, styles, navigation
|
||||
- Extracts main content using smart selectors
|
||||
- Calculates word count
|
||||
4. **Store Data**: Saves to MongoDB with metadata
|
||||
5. **Skip Duplicates**: Avoids re-crawling articles with existing content
|
||||
|
||||
## Content Extraction Strategy
|
||||
|
||||
The crawler tries multiple selectors in order:
|
||||
|
||||
1. `<article>` tag
|
||||
2. Elements with class containing "article-content", "article-body"
|
||||
3. Elements with class containing "post-content", "entry-content"
|
||||
4. `<main>` tag
|
||||
5. Fallback to all `<p>` tags in body
|
||||
|
||||
## Database Schema
|
||||
|
||||
Articles are stored with these fields:
|
||||
|
||||
```javascript
|
||||
{
|
||||
title: String, // Article title
|
||||
link: String, // Article URL (unique)
|
||||
summary: String, // Short summary
|
||||
full_content: String, // Full article text (max 10,000 chars)
|
||||
word_count: Number, // Number of words
|
||||
source: String, // RSS feed name
|
||||
published_at: String, // Publication date
|
||||
crawled_at: DateTime, // When content was crawled
|
||||
created_at: DateTime // When added to database
|
||||
}
|
||||
```
|
||||
|
||||
## Scheduling
|
||||
|
||||
### Using Cron (Linux/Mac)
|
||||
|
||||
```bash
|
||||
# Run every 6 hours
|
||||
0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py
|
||||
```
|
||||
|
||||
### Using systemd Timer (Linux)
|
||||
|
||||
Create `/etc/systemd/system/news-crawler.service`:
|
||||
```ini
|
||||
[Unit]
|
||||
Description=News Crawler Service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
WorkingDirectory=/path/to/news_crawler
|
||||
ExecStart=/path/to/venv/bin/python crawler_service.py
|
||||
User=your-user
|
||||
```
|
||||
|
||||
Create `/etc/systemd/system/news-crawler.timer`:
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Run News Crawler every 6 hours
|
||||
|
||||
[Timer]
|
||||
OnBootSec=5min
|
||||
OnUnitActiveSec=6h
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
```
|
||||
|
||||
Enable and start:
|
||||
```bash
|
||||
sudo systemctl enable news-crawler.timer
|
||||
sudo systemctl start news-crawler.timer
|
||||
```
|
||||
|
||||
### Using Docker
|
||||
|
||||
Create `Dockerfile`:
|
||||
```dockerfile
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY crawler_service.py .
|
||||
|
||||
CMD ["python", "crawler_service.py"]
|
||||
```
|
||||
|
||||
Build and run:
|
||||
```bash
|
||||
docker build -t news-crawler .
|
||||
docker run --env-file ../.env news-crawler
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Environment variables:
|
||||
|
||||
- `MONGODB_URI` - MongoDB connection string (default: `mongodb://localhost:27017/`)
|
||||
|
||||
## Rate Limiting
|
||||
|
||||
- 1 second delay between article requests
|
||||
- Respects server resources
|
||||
- User-Agent header included
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Issue: Can't extract content**
|
||||
- Some sites block scrapers
|
||||
- Try adjusting User-Agent header
|
||||
- Some sites require JavaScript (consider Selenium)
|
||||
|
||||
**Issue: Timeout errors**
|
||||
- Increase timeout in `extract_article_content()`
|
||||
- Check network connectivity
|
||||
|
||||
**Issue: Memory usage**
|
||||
- Reduce `max_articles_per_feed`
|
||||
- Content limited to 10,000 characters per article
|
||||
|
||||
## Architecture
|
||||
|
||||
This is a standalone microservice that:
|
||||
- Can run independently of the main backend
|
||||
- Shares the same MongoDB database
|
||||
- Can be deployed separately
|
||||
- Can be scheduled independently
|
||||
|
||||
## Next Steps
|
||||
|
||||
Once articles are crawled, you can:
|
||||
- Use Ollama to summarize articles
|
||||
- Perform sentiment analysis
|
||||
- Extract keywords and topics
|
||||
- Generate newsletter content
|
||||
- Create article recommendations
|
||||
194
news_crawler/RSS_URL_EXTRACTION.md
Normal file
194
news_crawler/RSS_URL_EXTRACTION.md
Normal file
@@ -0,0 +1,194 @@
|
||||
# RSS URL Extraction - How It Works
|
||||
|
||||
## The Problem
|
||||
|
||||
Different RSS feed providers use different fields to store the article URL:
|
||||
|
||||
### Example 1: Standard RSS (uses `link`)
|
||||
```xml
|
||||
<item>
|
||||
<title>Article Title</title>
|
||||
<link>https://example.com/article/123</link>
|
||||
<guid>internal-id-456</guid>
|
||||
</item>
|
||||
```
|
||||
|
||||
### Example 2: Some feeds (uses `guid` as URL)
|
||||
```xml
|
||||
<item>
|
||||
<title>Article Title</title>
|
||||
<guid>https://example.com/article/123</guid>
|
||||
</item>
|
||||
```
|
||||
|
||||
### Example 3: Atom feeds (uses `id`)
|
||||
```xml
|
||||
<entry>
|
||||
<title>Article Title</title>
|
||||
<id>https://example.com/article/123</id>
|
||||
</entry>
|
||||
```
|
||||
|
||||
### Example 4: Complex feeds (guid as object)
|
||||
```xml
|
||||
<item>
|
||||
<title>Article Title</title>
|
||||
<guid isPermaLink="true">https://example.com/article/123</guid>
|
||||
</item>
|
||||
```
|
||||
|
||||
### Example 5: Multiple links
|
||||
```xml
|
||||
<item>
|
||||
<title>Article Title</title>
|
||||
<link rel="alternate" type="text/html" href="https://example.com/article/123"/>
|
||||
<link rel="enclosure" type="image/jpeg" href="https://example.com/image.jpg"/>
|
||||
</item>
|
||||
```
|
||||
|
||||
## Our Solution
|
||||
|
||||
The `extract_article_url()` function tries multiple strategies in order:
|
||||
|
||||
### Strategy 1: Check `link` field (most common)
|
||||
```python
|
||||
if entry.get('link') and entry.get('link', '').startswith('http'):
|
||||
return entry.get('link')
|
||||
```
|
||||
✅ Works for: Most RSS 2.0 feeds
|
||||
|
||||
### Strategy 2: Check `guid` field
|
||||
```python
|
||||
if entry.get('guid'):
|
||||
guid = entry.get('guid')
|
||||
# guid can be a string
|
||||
if isinstance(guid, str) and guid.startswith('http'):
|
||||
return guid
|
||||
# or a dict with 'href'
|
||||
elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
|
||||
return guid.get('href')
|
||||
```
|
||||
✅ Works for: Feeds that use GUID as permalink
|
||||
|
||||
### Strategy 3: Check `id` field
|
||||
```python
|
||||
if entry.get('id') and entry.get('id', '').startswith('http'):
|
||||
return entry.get('id')
|
||||
```
|
||||
✅ Works for: Atom feeds
|
||||
|
||||
### Strategy 4: Check `links` array
|
||||
```python
|
||||
if entry.get('links'):
|
||||
for link in entry.get('links', []):
|
||||
if isinstance(link, dict) and link.get('href', '').startswith('http'):
|
||||
# Prefer 'alternate' type
|
||||
if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
|
||||
return link.get('href')
|
||||
```
|
||||
✅ Works for: Feeds with multiple links (prefers HTML content)
|
||||
|
||||
## Real-World Examples
|
||||
|
||||
### Süddeutsche Zeitung
|
||||
```python
|
||||
entry = {
|
||||
'title': 'Munich News',
|
||||
'link': 'https://www.sueddeutsche.de/muenchen/article-123',
|
||||
'guid': 'sz-internal-123'
|
||||
}
|
||||
# Returns: 'https://www.sueddeutsche.de/muenchen/article-123'
|
||||
```
|
||||
|
||||
### Medium Blog
|
||||
```python
|
||||
entry = {
|
||||
'title': 'Blog Post',
|
||||
'guid': 'https://medium.com/@user/post-abc123',
|
||||
'link': None
|
||||
}
|
||||
# Returns: 'https://medium.com/@user/post-abc123'
|
||||
```
|
||||
|
||||
### YouTube RSS
|
||||
```python
|
||||
entry = {
|
||||
'title': 'Video Title',
|
||||
'id': 'https://www.youtube.com/watch?v=abc123',
|
||||
'link': None
|
||||
}
|
||||
# Returns: 'https://www.youtube.com/watch?v=abc123'
|
||||
```
|
||||
|
||||
### Complex Feed
|
||||
```python
|
||||
entry = {
|
||||
'title': 'Article',
|
||||
'links': [
|
||||
{'rel': 'alternate', 'type': 'text/html', 'href': 'https://example.com/article'},
|
||||
{'rel': 'enclosure', 'type': 'image/jpeg', 'href': 'https://example.com/image.jpg'}
|
||||
]
|
||||
}
|
||||
# Returns: 'https://example.com/article' (prefers text/html)
|
||||
```
|
||||
|
||||
## Validation
|
||||
|
||||
All extracted URLs must:
|
||||
1. Start with `http://` or `https://`
|
||||
2. Be a valid string (not None or empty)
|
||||
|
||||
If no valid URL is found:
|
||||
```python
|
||||
return None
|
||||
# Crawler will skip this entry and log a warning
|
||||
```
|
||||
|
||||
## Testing Different Feeds
|
||||
|
||||
To test if a feed works with our extractor:
|
||||
|
||||
```python
|
||||
import feedparser
|
||||
from rss_utils import extract_article_url
|
||||
|
||||
# Parse feed
|
||||
feed = feedparser.parse('https://example.com/rss')
|
||||
|
||||
# Test each entry
|
||||
for entry in feed.entries[:5]:
|
||||
url = extract_article_url(entry)
|
||||
if url:
|
||||
print(f"✓ {entry.get('title', 'No title')[:50]}")
|
||||
print(f" URL: {url}")
|
||||
else:
|
||||
print(f"✗ {entry.get('title', 'No title')[:50]}")
|
||||
print(f" No valid URL found")
|
||||
print(f" Available fields: {list(entry.keys())}")
|
||||
```
|
||||
|
||||
## Supported Feed Types
|
||||
|
||||
✅ RSS 2.0
|
||||
✅ RSS 1.0
|
||||
✅ Atom
|
||||
✅ Custom RSS variants
|
||||
✅ Feeds with multiple links
|
||||
✅ Feeds with GUID as permalink
|
||||
|
||||
## Edge Cases Handled
|
||||
|
||||
1. **GUID is not a URL**: Checks if it starts with `http`
|
||||
2. **Multiple links**: Prefers `text/html` type
|
||||
3. **GUID as dict**: Extracts `href` field
|
||||
4. **Missing fields**: Returns None instead of crashing
|
||||
5. **Non-HTTP URLs**: Filters out `mailto:`, `ftp:`, etc.
|
||||
|
||||
## Future Improvements
|
||||
|
||||
Potential enhancements:
|
||||
- [ ] Support for `feedburner:origLink`
|
||||
- [ ] Support for `pheedo:origLink`
|
||||
- [ ] Resolve shortened URLs (bit.ly, etc.)
|
||||
- [ ] Handle relative URLs (convert to absolute)
|
||||
- [ ] Cache URL extraction results
|
||||
79
news_crawler/check_database.py
Normal file
79
news_crawler/check_database.py
Normal file
@@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Quick script to check what RSS feeds are in the database
|
||||
"""
|
||||
from pymongo import MongoClient
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add parent directory to path to import from backend
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'backend'))
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', 'backend', '.env'))
|
||||
except:
|
||||
pass
|
||||
|
||||
# MongoDB setup
|
||||
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
|
||||
DB_NAME = 'munich_news'
|
||||
|
||||
print(f"Connecting to: {MONGODB_URI}")
|
||||
print(f"Database: {DB_NAME}\n")
|
||||
|
||||
try:
|
||||
client = MongoClient(MONGODB_URI, serverSelectionTimeoutMS=5000)
|
||||
# Test connection
|
||||
client.server_info()
|
||||
print("✓ Connected to MongoDB\n")
|
||||
|
||||
db = client[DB_NAME]
|
||||
rss_feeds_collection = db['rss_feeds']
|
||||
|
||||
# Get all feeds
|
||||
feeds = list(rss_feeds_collection.find())
|
||||
|
||||
if not feeds:
|
||||
print("❌ No RSS feeds found in database\n")
|
||||
print("Add feeds using the API:")
|
||||
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
|
||||
print(" -H 'Content-Type: application/json' \\")
|
||||
print(" -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Found {len(feeds)} RSS feed(s):\n")
|
||||
print("="*80)
|
||||
|
||||
for i, feed in enumerate(feeds, 1):
|
||||
print(f"\n{i}. {feed.get('name', 'Unknown')}")
|
||||
print(f" URL: {feed.get('url', 'N/A')}")
|
||||
print(f" Active: {feed.get('active', True)}")
|
||||
print(f" Created: {feed.get('created_at', 'N/A')}")
|
||||
print(f" ID: {feed.get('_id', 'N/A')}")
|
||||
|
||||
print("\n" + "="*80)
|
||||
|
||||
# Check articles
|
||||
articles_collection = db['articles']
|
||||
total_articles = articles_collection.count_documents({})
|
||||
crawled_articles = articles_collection.count_documents({'full_content': {'$exists': True}})
|
||||
|
||||
print(f"\nArticles in database:")
|
||||
print(f" Total: {total_articles}")
|
||||
print(f" With full content: {crawled_articles}")
|
||||
print(f" Without full content: {total_articles - crawled_articles}")
|
||||
|
||||
if total_articles > 0:
|
||||
print("\nSample article:")
|
||||
sample = articles_collection.find_one()
|
||||
print(f" Title: {sample.get('title', 'N/A')[:60]}")
|
||||
print(f" Link: {sample.get('link', 'N/A')}")
|
||||
print(f" Has full_content: {bool(sample.get('full_content'))}")
|
||||
print(f" Word count: {sample.get('word_count', 'N/A')}")
|
||||
|
||||
print("\n✓ Database check complete!")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {e}")
|
||||
sys.exit(1)
|
||||
90
news_crawler/config.py
Normal file
90
news_crawler/config.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Configuration management for news crawler
|
||||
"""
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from pathlib import Path
|
||||
|
||||
# Load environment variables from backend/.env
|
||||
backend_dir = Path(__file__).parent.parent / 'backend'
|
||||
env_path = backend_dir / '.env'
|
||||
|
||||
if env_path.exists():
|
||||
load_dotenv(dotenv_path=env_path)
|
||||
print(f"✓ Loaded configuration from: {env_path}")
|
||||
else:
|
||||
print(f"⚠ Warning: .env file not found at {env_path}")
|
||||
|
||||
|
||||
class Config:
|
||||
"""Centralized configuration for news crawler"""
|
||||
|
||||
# MongoDB Configuration
|
||||
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
|
||||
DB_NAME = 'munich_news'
|
||||
|
||||
# Ollama Configuration
|
||||
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
|
||||
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'phi3:latest')
|
||||
OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY', '')
|
||||
OLLAMA_ENABLED = os.getenv('OLLAMA_ENABLED', 'false').lower() == 'true'
|
||||
OLLAMA_TIMEOUT = int(os.getenv('OLLAMA_TIMEOUT', '30'))
|
||||
|
||||
# Crawler Configuration
|
||||
RATE_LIMIT_DELAY = 1 # seconds between requests
|
||||
MAX_CONTENT_LENGTH = 50000 # characters
|
||||
SUMMARY_MAX_WORDS = 150 # maximum words in AI summary
|
||||
|
||||
@classmethod
|
||||
def print_config(cls):
|
||||
"""Print current configuration (without sensitive data)"""
|
||||
print("\n" + "="*60)
|
||||
print("News Crawler Configuration")
|
||||
print("="*60)
|
||||
print(f"MongoDB URI: {cls.MONGODB_URI}")
|
||||
print(f"Database: {cls.DB_NAME}")
|
||||
print(f"\nOllama Configuration:")
|
||||
print(f" Base URL: {cls.OLLAMA_BASE_URL}")
|
||||
print(f" Model: {cls.OLLAMA_MODEL}")
|
||||
print(f" Enabled: {cls.OLLAMA_ENABLED}")
|
||||
print(f" Timeout: {cls.OLLAMA_TIMEOUT}s")
|
||||
print(f" Has API Key: {bool(cls.OLLAMA_API_KEY)}")
|
||||
print(f"\nCrawler Settings:")
|
||||
print(f" Rate Limit: {cls.RATE_LIMIT_DELAY}s between requests")
|
||||
print(f" Max Content: {cls.MAX_CONTENT_LENGTH} chars")
|
||||
print(f" Summary Length: {cls.SUMMARY_MAX_WORDS} words")
|
||||
print("="*60 + "\n")
|
||||
|
||||
@classmethod
|
||||
def validate(cls):
|
||||
"""Validate configuration and return list of issues"""
|
||||
issues = []
|
||||
|
||||
# Check MongoDB
|
||||
if not cls.MONGODB_URI:
|
||||
issues.append("MONGODB_URI is not set")
|
||||
|
||||
# Check Ollama if enabled
|
||||
if cls.OLLAMA_ENABLED:
|
||||
if not cls.OLLAMA_BASE_URL:
|
||||
issues.append("OLLAMA_BASE_URL is not set but Ollama is enabled")
|
||||
if not cls.OLLAMA_MODEL:
|
||||
issues.append("OLLAMA_MODEL is not set but Ollama is enabled")
|
||||
if cls.OLLAMA_TIMEOUT < 5:
|
||||
issues.append(f"OLLAMA_TIMEOUT ({cls.OLLAMA_TIMEOUT}s) is too low, recommend at least 5s")
|
||||
|
||||
return issues
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Test configuration
|
||||
Config.print_config()
|
||||
|
||||
# Validate
|
||||
issues = Config.validate()
|
||||
if issues:
|
||||
print("⚠ Configuration Issues:")
|
||||
for issue in issues:
|
||||
print(f" - {issue}")
|
||||
else:
|
||||
print("✓ Configuration is valid")
|
||||
489
news_crawler/crawler_service.py
Normal file
489
news_crawler/crawler_service.py
Normal file
@@ -0,0 +1,489 @@
|
||||
"""
|
||||
Web crawler service to extract full article content from RSS feed links
|
||||
"""
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from pymongo import MongoClient
|
||||
from pymongo.errors import DuplicateKeyError
|
||||
import feedparser
|
||||
import time
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
|
||||
from config import Config
|
||||
from ollama_client import OllamaClient
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv(dotenv_path='../.env')
|
||||
|
||||
# MongoDB setup
|
||||
client = MongoClient(Config.MONGODB_URI)
|
||||
db = client[Config.DB_NAME]
|
||||
|
||||
articles_collection = db['articles']
|
||||
rss_feeds_collection = db['rss_feeds']
|
||||
|
||||
# Initialize Ollama client
|
||||
ollama_client = OllamaClient(
|
||||
base_url=Config.OLLAMA_BASE_URL,
|
||||
model=Config.OLLAMA_MODEL,
|
||||
api_key=Config.OLLAMA_API_KEY,
|
||||
enabled=Config.OLLAMA_ENABLED,
|
||||
timeout=Config.OLLAMA_TIMEOUT
|
||||
)
|
||||
|
||||
# Print configuration on startup
|
||||
if __name__ != '__main__':
|
||||
Config.print_config()
|
||||
if Config.OLLAMA_ENABLED:
|
||||
print("🤖 Ollama AI summarization: ENABLED")
|
||||
if ollama_client.is_available():
|
||||
print("✓ Ollama server is reachable")
|
||||
else:
|
||||
print("⚠ Warning: Ollama server is not reachable")
|
||||
else:
|
||||
print("ℹ Ollama AI summarization: DISABLED")
|
||||
|
||||
|
||||
def get_active_rss_feeds():
|
||||
"""Get all active RSS feeds from database"""
|
||||
feeds = []
|
||||
cursor = rss_feeds_collection.find({'active': True})
|
||||
for feed in cursor:
|
||||
feeds.append({
|
||||
'id': str(feed['_id']),
|
||||
'name': feed.get('name', ''),
|
||||
'url': feed.get('url', '')
|
||||
})
|
||||
return feeds
|
||||
|
||||
|
||||
def extract_article_content(url, timeout=10):
|
||||
"""
|
||||
Extract main article content from a URL with smart detection
|
||||
Returns: dict with title, content, author, date, and metadata
|
||||
"""
|
||||
try:
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
|
||||
}
|
||||
|
||||
response = requests.get(url, headers=headers, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# Remove script and style elements
|
||||
for script in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'iframe']):
|
||||
script.decompose()
|
||||
|
||||
# === EXTRACT TITLE ===
|
||||
title = extract_title(soup)
|
||||
|
||||
# === EXTRACT AUTHOR ===
|
||||
author = extract_author(soup)
|
||||
|
||||
# === EXTRACT PUBLISHED DATE ===
|
||||
published_date = extract_date(soup)
|
||||
|
||||
# === EXTRACT MAIN CONTENT ===
|
||||
content_text = extract_main_content(soup)
|
||||
|
||||
# === EXTRACT META DESCRIPTION ===
|
||||
meta_desc = soup.find('meta', attrs={'name': 'description'})
|
||||
if not meta_desc:
|
||||
meta_desc = soup.find('meta', attrs={'property': 'og:description'})
|
||||
description = meta_desc.get('content', '') if meta_desc else ''
|
||||
|
||||
return {
|
||||
'title': title,
|
||||
'author': author,
|
||||
'content': content_text, # Full content, no limit
|
||||
'description': description,
|
||||
'published_date': published_date,
|
||||
'word_count': len(content_text.split()) if content_text else 0,
|
||||
'crawled_at': datetime.utcnow()
|
||||
}
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
print(f"Timeout crawling {url}")
|
||||
return None
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"Error crawling {url}: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"Unexpected error crawling {url}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def extract_title(soup):
|
||||
"""
|
||||
Extract article title using multiple strategies
|
||||
"""
|
||||
# Strategy 1: Look for h1 tag
|
||||
h1 = soup.find('h1')
|
||||
if h1:
|
||||
title = h1.get_text().strip()
|
||||
if title and len(title) > 10: # Reasonable title length
|
||||
return title
|
||||
|
||||
# Strategy 2: Look for meta og:title
|
||||
og_title = soup.find('meta', attrs={'property': 'og:title'})
|
||||
if og_title and og_title.get('content'):
|
||||
return og_title.get('content').strip()
|
||||
|
||||
# Strategy 3: Look for meta twitter:title
|
||||
twitter_title = soup.find('meta', attrs={'name': 'twitter:title'})
|
||||
if twitter_title and twitter_title.get('content'):
|
||||
return twitter_title.get('content').strip()
|
||||
|
||||
# Strategy 4: Look for title tag (fallback)
|
||||
title_tag = soup.find('title')
|
||||
if title_tag:
|
||||
title = title_tag.get_text().strip()
|
||||
# Clean up common patterns like "Site Name | Article Title"
|
||||
if ' | ' in title:
|
||||
title = title.split(' | ')[0]
|
||||
elif ' - ' in title:
|
||||
title = title.split(' - ')[0]
|
||||
return title
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_author(soup):
|
||||
"""
|
||||
Extract article author using multiple strategies
|
||||
"""
|
||||
# Strategy 1: Look for meta author
|
||||
meta_author = soup.find('meta', attrs={'name': 'author'})
|
||||
if meta_author and meta_author.get('content'):
|
||||
return meta_author.get('content').strip()
|
||||
|
||||
# Strategy 2: Look for rel="author"
|
||||
rel_author = soup.find('a', attrs={'rel': 'author'})
|
||||
if rel_author:
|
||||
return rel_author.get_text().strip()
|
||||
|
||||
# Strategy 3: Look for common author class names
|
||||
author_selectors = [
|
||||
'[class*="author-name"]',
|
||||
'[class*="author"]',
|
||||
'[class*="byline"]',
|
||||
'[class*="writer"]',
|
||||
'[rel="author"]',
|
||||
'[itemprop="author"]'
|
||||
]
|
||||
|
||||
for selector in author_selectors:
|
||||
author_elem = soup.select_one(selector)
|
||||
if author_elem:
|
||||
author = author_elem.get_text().strip()
|
||||
# Clean up common patterns
|
||||
author = author.replace('By ', '').replace('by ', '').strip()
|
||||
if author and len(author) < 100: # Reasonable author name length
|
||||
return author
|
||||
|
||||
# Strategy 4: Look for JSON-LD structured data
|
||||
json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
|
||||
if json_ld:
|
||||
try:
|
||||
import json
|
||||
data = json.loads(json_ld.string)
|
||||
if isinstance(data, dict) and data.get('author'):
|
||||
author_data = data.get('author')
|
||||
if isinstance(author_data, dict):
|
||||
return author_data.get('name', '')
|
||||
elif isinstance(author_data, str):
|
||||
return author_data
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_date(soup):
|
||||
"""
|
||||
Extract published date using multiple strategies
|
||||
"""
|
||||
# Strategy 1: Look for time tag with datetime attribute
|
||||
time_tag = soup.find('time')
|
||||
if time_tag and time_tag.get('datetime'):
|
||||
return time_tag.get('datetime')
|
||||
|
||||
# Strategy 2: Look for meta article:published_time
|
||||
meta_published = soup.find('meta', attrs={'property': 'article:published_time'})
|
||||
if meta_published and meta_published.get('content'):
|
||||
return meta_published.get('content')
|
||||
|
||||
# Strategy 3: Look for meta og:published_time
|
||||
og_published = soup.find('meta', attrs={'property': 'og:published_time'})
|
||||
if og_published and og_published.get('content'):
|
||||
return og_published.get('content')
|
||||
|
||||
# Strategy 4: Look for common date class names
|
||||
date_selectors = [
|
||||
'[class*="publish-date"]',
|
||||
'[class*="published"]',
|
||||
'[class*="date"]',
|
||||
'[class*="timestamp"]',
|
||||
'[itemprop="datePublished"]'
|
||||
]
|
||||
|
||||
for selector in date_selectors:
|
||||
date_elem = soup.select_one(selector)
|
||||
if date_elem:
|
||||
# Try datetime attribute first
|
||||
if date_elem.get('datetime'):
|
||||
return date_elem.get('datetime')
|
||||
# Otherwise get text
|
||||
date_text = date_elem.get_text().strip()
|
||||
if date_text and len(date_text) < 50:
|
||||
return date_text
|
||||
|
||||
# Strategy 5: Look for JSON-LD structured data
|
||||
json_ld = soup.find('script', attrs={'type': 'application/ld+json'})
|
||||
if json_ld:
|
||||
try:
|
||||
import json
|
||||
data = json.loads(json_ld.string)
|
||||
if isinstance(data, dict):
|
||||
return data.get('datePublished') or data.get('dateCreated')
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_main_content(soup):
|
||||
"""
|
||||
Extract main article content using multiple strategies
|
||||
"""
|
||||
# Strategy 1: Try common article content selectors
|
||||
content_selectors = [
|
||||
'article',
|
||||
'[class*="article-content"]',
|
||||
'[class*="article-body"]',
|
||||
'[class*="post-content"]',
|
||||
'[class*="entry-content"]',
|
||||
'[class*="content-body"]',
|
||||
'[class*="story-body"]',
|
||||
'[itemprop="articleBody"]',
|
||||
'main'
|
||||
]
|
||||
|
||||
article_content = None
|
||||
for selector in content_selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
article_content = element
|
||||
break
|
||||
|
||||
# Fallback: get body
|
||||
if not article_content:
|
||||
article_content = soup.find('body')
|
||||
|
||||
if not article_content:
|
||||
return ''
|
||||
|
||||
# Extract text from paragraphs
|
||||
paragraphs = article_content.find_all('p')
|
||||
|
||||
# Filter out short paragraphs (likely navigation/ads)
|
||||
content_paragraphs = []
|
||||
for p in paragraphs:
|
||||
text = p.get_text().strip()
|
||||
# Keep paragraphs with at least 50 characters
|
||||
if len(text) >= 50:
|
||||
content_paragraphs.append(text)
|
||||
|
||||
content_text = '\n\n'.join(content_paragraphs)
|
||||
|
||||
return content_text
|
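# Minimal sketch of the behaviour above (synthetic HTML, illustration only):
#
#   from bs4 import BeautifulSoup
#   html = '<main><p>' + 'x' * 60 + '</p><p>too short</p></main>'
#   soup = BeautifulSoup(html, 'lxml')
#   extract_main_content(soup)
#
# The 'main' selector matches, the 60-character paragraph is kept, and the
# short one is dropped by the 50-character filter.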
||||
|
||||
|
||||
def crawl_rss_feed(feed_url, feed_name, max_articles=10):
|
||||
"""
|
||||
Crawl articles from an RSS feed
|
||||
Returns: dict with statistics
|
||||
"""
|
||||
print(f"\n📰 Crawling feed: {feed_name}")
|
||||
print(f" URL: {feed_url}")
|
||||
|
||||
try:
|
||||
# Parse RSS feed
|
||||
feed = feedparser.parse(feed_url)
|
||||
|
||||
if not feed.entries:
|
||||
print(f" ⚠ No entries found in feed")
|
||||
return {'crawled': 0, 'summarized': 0, 'failed_summaries': 0}
|
||||
|
||||
crawled_count = 0
|
||||
summarized_count = 0
|
||||
failed_summaries = 0
|
||||
|
||||
for entry in feed.entries[:max_articles]:
|
||||
# Extract article URL using utility function
|
||||
article_url = extract_article_url(entry)
|
||||
|
||||
if not article_url:
|
||||
print(f" ⚠ No valid URL found for: {entry.get('title', 'Unknown')[:50]}")
|
||||
continue
|
||||
|
||||
# Check if article already exists and has content
|
||||
existing = articles_collection.find_one({'link': article_url})
|
||||
if existing and existing.get('content'):
|
||||
print(f" ⏭ Skipping (already crawled): {entry.get('title', 'No title')[:50]}")
|
||||
continue
|
||||
|
||||
print(f" 🔍 Crawling: {entry.get('title', 'No title')[:50]}...")
|
||||
|
||||
# Extract full content
|
||||
article_data = extract_article_content(article_url)
|
||||
|
||||
if article_data and article_data.get('content'):
|
||||
# Summarize with Ollama if enabled
|
||||
summary_result = None
|
||||
if Config.OLLAMA_ENABLED and article_data.get('content'):
|
||||
print(f" 🤖 Summarizing with AI...")
|
||||
summary_result = ollama_client.summarize_article(
|
||||
article_data['content'],
|
||||
max_words=Config.SUMMARY_MAX_WORDS
|
||||
)
|
||||
|
||||
if summary_result['success']:
|
||||
print(f" ✓ Summary: {summary_result['summary_word_count']} words (from {summary_result['original_word_count']} words, {summary_result['duration']:.1f}s)")
|
||||
summarized_count += 1
|
||||
else:
|
||||
print(f" ⚠ Summarization failed: {summary_result['error']}")
|
||||
failed_summaries += 1
|
||||
|
||||
# Prepare document
|
||||
article_doc = {
|
||||
'title': article_data.get('title') or entry.get('title', ''),
|
||||
'author': article_data.get('author'),
|
||||
'link': article_url,
|
||||
'content': article_data.get('content', ''), # Full article content
|
||||
'summary': summary_result['summary'] if summary_result and summary_result['success'] else None,
|
||||
'word_count': article_data.get('word_count', 0),
|
||||
'summary_word_count': summary_result['summary_word_count'] if summary_result and summary_result['success'] else None,
|
||||
'source': feed_name,
|
||||
'published_at': extract_published_date(entry) or article_data.get('published_date', ''),
|
||||
'crawled_at': article_data.get('crawled_at'),
|
||||
'summarized_at': datetime.utcnow() if summary_result and summary_result['success'] else None,
|
||||
'created_at': datetime.utcnow()
|
||||
}
|
||||
|
||||
try:
|
||||
# Upsert: update if exists, insert if not
|
||||
articles_collection.update_one(
|
||||
{'link': article_url},
|
||||
{'$set': article_doc},
|
||||
upsert=True
|
||||
)
|
||||
crawled_count += 1
|
||||
print(f" ✓ Saved ({article_data.get('word_count', 0)} words)")
|
||||
|
||||
except DuplicateKeyError:
|
||||
print(f" ⚠ Duplicate key error")
|
||||
except Exception as e:
|
||||
print(f" ✗ Error saving: {e}")
|
||||
else:
|
||||
print(f" ✗ Failed to extract content")
|
||||
|
||||
# Be nice to servers - add delay
|
||||
time.sleep(1)
|
||||
|
||||
print(f" ✓ Crawled {crawled_count} articles from {feed_name}")
|
||||
if Config.OLLAMA_ENABLED:
|
||||
print(f" 🤖 Summarized: {summarized_count}, Failed: {failed_summaries}")
|
||||
|
||||
return {
|
||||
'crawled': crawled_count,
|
||||
'summarized': summarized_count,
|
||||
'failed_summaries': failed_summaries
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f" ✗ Error processing feed {feed_name}: {e}")
|
||||
return {'crawled': 0, 'summarized': 0, 'failed_summaries': 0}
|
||||
|
||||
|
||||
def crawl_all_feeds(max_articles_per_feed=10):
|
||||
"""
|
||||
Crawl all active RSS feeds
|
||||
Returns: dict with statistics
|
||||
"""
|
||||
print("\n" + "="*60)
|
||||
print("🚀 Starting RSS Feed Crawler")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
feeds = get_active_rss_feeds()
|
||||
|
||||
if not feeds:
|
||||
print("⚠ No active RSS feeds found")
|
||||
return {
|
||||
'total_feeds': 0,
|
||||
'total_articles_crawled': 0,
|
||||
'total_summarized': 0,
'failed_summaries': 0,
'duration_seconds': 0
|
||||
}
|
||||
|
||||
print(f"Found {len(feeds)} active feed(s)")
|
||||
if Config.OLLAMA_ENABLED:
|
||||
print(f"🤖 AI Summarization: ENABLED (max {Config.SUMMARY_MAX_WORDS} words)")
|
||||
|
||||
total_crawled = 0
|
||||
total_summarized = 0
|
||||
total_failed = 0
|
||||
|
||||
for feed in feeds:
|
||||
result = crawl_rss_feed(
|
||||
feed['url'],
|
||||
feed['name'],
|
||||
max_articles=max_articles_per_feed
|
||||
)
|
||||
total_crawled += result['crawled']
|
||||
total_summarized += result['summarized']
|
||||
total_failed += result['failed_summaries']
|
||||
|
||||
duration = time.time() - start_time
|
||||
|
||||
print("\n" + "="*60)
|
||||
print(f"✓ Crawling Complete!")
|
||||
print(f" Total feeds processed: {len(feeds)}")
|
||||
print(f" Total articles crawled: {total_crawled}")
|
||||
if Config.OLLAMA_ENABLED:
|
||||
print(f" Total articles summarized: {total_summarized}")
|
||||
print(f" Failed summarizations: {total_failed}")
|
||||
if total_summarized > 0:
|
||||
success_rate = (total_summarized / (total_summarized + total_failed)) * 100
|
||||
print(f" Success rate: {success_rate:.1f}%")
|
||||
print(f" Duration: {duration:.2f} seconds")
|
||||
if total_crawled > 0:
|
||||
print(f" Average time per article: {duration/total_crawled:.1f}s")
|
||||
print("="*60 + "\n")
|
||||
|
||||
return {
|
||||
'total_feeds': len(feeds),
|
||||
'total_articles_crawled': total_crawled,
|
||||
'total_summarized': total_summarized,
|
||||
'failed_summaries': total_failed,
|
||||
'duration_seconds': round(duration, 2)
|
||||
}
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Can be run standalone for testing
|
||||
import sys
|
||||
max_articles = 10
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
try:
|
||||
max_articles = int(sys.argv[1])
|
||||
except ValueError:
|
||||
print("Usage: python crawler_service.py [max_articles_per_feed]")
|
||||
sys.exit(1)
|
||||
|
||||
crawl_all_feeds(max_articles_per_feed=max_articles)
|
||||
33
news_crawler/docker-compose.yml
Normal file
@@ -0,0 +1,33 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
crawler:
|
||||
build: .
|
||||
container_name: news-crawler
|
||||
environment:
|
||||
- MONGODB_URI=mongodb://mongodb:27017/
|
||||
networks:
|
||||
- munich-news-network
|
||||
depends_on:
|
||||
- mongodb
|
||||
# Run once and exit
|
||||
restart: "no"
|
||||
|
||||
mongodb:
|
||||
image: mongo:7.0
|
||||
container_name: munich-news-mongodb
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "27017:27017"
|
||||
volumes:
|
||||
- mongodb_data:/data/db
|
||||
networks:
|
||||
- munich-news-network
|
||||
|
||||
volumes:
|
||||
mongodb_data:
|
||||
driver: local
|
||||
|
||||
networks:
|
||||
munich-news-network:
|
||||
driver: bridge
|
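# Usage note (not part of the original file): assuming Docker Compose v2 is
# installed, running `docker compose up --build crawler` from news_crawler/
# builds the image, starts the bundled MongoDB, and runs the crawler once;
# restart: "no" means the crawler container exits after a single run.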
||||
290
news_crawler/ollama_client.py
Normal file
@@ -0,0 +1,290 @@
|
||||
"""
|
||||
Ollama client for AI-powered article summarization
|
||||
"""
|
||||
import requests
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class OllamaClient:
|
||||
"""Client for communicating with Ollama server for text summarization"""
|
||||
|
||||
def __init__(self, base_url, model, api_key=None, enabled=True, timeout=30):
|
||||
"""
|
||||
Initialize Ollama client
|
||||
|
||||
Args:
|
||||
base_url: Ollama server URL (e.g., http://localhost:11434)
|
||||
model: Model name to use (e.g., phi3:latest)
|
||||
api_key: Optional API key for authentication
|
||||
enabled: Whether Ollama is enabled
|
||||
timeout: Request timeout in seconds (default 30)
|
||||
"""
|
||||
self.base_url = base_url.rstrip('/')
|
||||
self.model = model
|
||||
self.api_key = api_key
|
||||
self.enabled = enabled
|
||||
self.timeout = timeout
|
||||
|
||||
def summarize_article(self, content, max_words=150):
|
||||
"""
|
||||
Summarize article content using Ollama
|
||||
|
||||
Args:
|
||||
content: Full article text
|
||||
max_words: Maximum words in summary (default 150)
|
||||
|
||||
Returns:
|
||||
{
|
||||
'summary': str, # AI-generated summary
|
||||
'summary_word_count': int, # Summary word count
|
||||
'original_word_count': int, # Original article word count
|
||||
'success': bool, # Whether summarization succeeded
|
||||
'error': str or None, # Error message if failed
|
||||
'duration': float # Time taken in seconds
|
||||
}
|
||||
"""
|
||||
if not self.enabled:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': 0,
|
||||
'success': False,
|
||||
'error': 'Ollama is not enabled',
|
||||
'duration': 0
|
||||
}
|
||||
|
||||
if not content or len(content.strip()) == 0:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': 0,
|
||||
'success': False,
|
||||
'error': 'Content is empty',
|
||||
'duration': 0
|
||||
}
|
||||
|
||||
# Calculate original word count
|
||||
original_word_count = len(content.split())
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Construct prompt
|
||||
prompt = self._build_summarization_prompt(content, max_words)
|
||||
|
||||
# Prepare request
|
||||
url = f"{self.base_url}/api/generate"
|
||||
headers = {'Content-Type': 'application/json'}
|
||||
if self.api_key:
|
||||
headers['Authorization'] = f'Bearer {self.api_key}'
|
||||
|
||||
payload = {
|
||||
'model': self.model,
|
||||
'prompt': prompt,
|
||||
'stream': False,
|
||||
'options': {
|
||||
'temperature': 0.7,
|
||||
'num_predict': 250 # Limit response length
|
||||
}
|
||||
}
|
||||
|
||||
# Make request
|
||||
response = requests.post(
|
||||
url,
|
||||
json=payload,
|
||||
headers=headers,
|
||||
timeout=self.timeout
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse response
|
||||
result = response.json()
|
||||
summary = result.get('response', '').strip()
|
||||
|
||||
if not summary:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': original_word_count,
|
||||
'success': False,
|
||||
'error': 'Ollama returned empty summary',
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
|
||||
summary_word_count = len(summary.split())
|
||||
|
||||
return {
|
||||
'summary': summary,
|
||||
'summary_word_count': summary_word_count,
|
||||
'original_word_count': original_word_count,
|
||||
'success': True,
|
||||
'error': None,
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': original_word_count,
|
||||
'success': False,
|
||||
'error': f'Request timed out after {self.timeout} seconds',
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
except requests.exceptions.ConnectionError:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': original_word_count,
|
||||
'success': False,
|
||||
'error': f'Cannot connect to Ollama server at {self.base_url}',
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
except requests.exceptions.HTTPError as e:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': original_word_count,
|
||||
'success': False,
|
||||
'error': f'HTTP error: {e.response.status_code} - {e.response.text[:100]}',
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
'summary': None,
|
||||
'summary_word_count': 0,
|
||||
'original_word_count': original_word_count,
|
||||
'success': False,
|
||||
'error': f'Unexpected error: {str(e)}',
|
||||
'duration': time.time() - start_time
|
||||
}
|
||||
|
||||
def _build_summarization_prompt(self, content, max_words):
|
||||
"""Build prompt for article summarization"""
|
||||
# Truncate content if too long (keep first 5000 words)
|
||||
words = content.split()
|
||||
if len(words) > 5000:
|
||||
content = ' '.join(words[:5000]) + '...'
|
||||
|
||||
prompt = f"""Summarize the following article in English in {max_words} words or less. Even if the article is in German or another language, provide the summary in English. Focus on the key points, main message, and important details. Be concise and clear.
|
||||
|
||||
Article:
|
||||
{content}
|
||||
|
||||
English Summary (max {max_words} words):"""
|
||||
|
||||
return prompt
|
||||
|
||||
def is_available(self):
|
||||
"""
|
||||
Check if Ollama server is reachable
|
||||
|
||||
Returns:
|
||||
bool: True if server is reachable, False otherwise
|
||||
"""
|
||||
if not self.enabled:
|
||||
return False
|
||||
|
||||
try:
|
||||
url = f"{self.base_url}/api/tags"
|
||||
headers = {}
|
||||
if self.api_key:
|
||||
headers['Authorization'] = f'Bearer {self.api_key}'
|
||||
|
||||
response = requests.get(url, headers=headers, timeout=5)
|
||||
response.raise_for_status()
|
||||
return True
|
||||
except requests.exceptions.RequestException:  # any request failure means "not available"
|
||||
return False
|
||||
|
||||
def test_connection(self):
|
||||
"""
|
||||
Test connection and return server info
|
||||
|
||||
Returns:
|
||||
{
|
||||
'available': bool,
|
||||
'models': list,
|
||||
'current_model': str,
|
||||
'error': str or None
|
||||
}
|
||||
"""
|
||||
if not self.enabled:
|
||||
return {
|
||||
'available': False,
|
||||
'models': [],
|
||||
'current_model': self.model,
|
||||
'error': 'Ollama is not enabled'
|
||||
}
|
||||
|
||||
try:
|
||||
url = f"{self.base_url}/api/tags"
|
||||
headers = {}
|
||||
if self.api_key:
|
||||
headers['Authorization'] = f'Bearer {self.api_key}'
|
||||
|
||||
response = requests.get(url, headers=headers, timeout=5)
|
||||
response.raise_for_status()
|
||||
|
||||
result = response.json()
|
||||
models = [m.get('name', '') for m in result.get('models', [])]
|
||||
|
||||
return {
|
||||
'available': True,
|
||||
'models': models,
|
||||
'current_model': self.model,
|
||||
'error': None
|
||||
}
|
||||
except requests.exceptions.ConnectionError:
|
||||
return {
|
||||
'available': False,
|
||||
'models': [],
|
||||
'current_model': self.model,
|
||||
'error': f'Cannot connect to Ollama server at {self.base_url}'
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
'available': False,
|
||||
'models': [],
|
||||
'current_model': self.model,
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Quick test
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(dotenv_path='../.env')
|
||||
|
||||
client = OllamaClient(
|
||||
base_url=os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434'),
|
||||
model=os.getenv('OLLAMA_MODEL', 'phi3:latest'),
|
||||
enabled=True
|
||||
)
|
||||
|
||||
print("Testing Ollama connection...")
|
||||
result = client.test_connection()
|
||||
print(f"Available: {result['available']}")
|
||||
print(f"Models: {result['models']}")
|
||||
print(f"Current model: {result['current_model']}")
|
||||
|
||||
if result['available']:
|
||||
print("\nTesting summarization...")
|
||||
test_content = """
|
||||
The new U-Bahn line connecting Munich's city center with the airport opened today.
|
||||
Mayor Dieter Reiter attended the opening ceremony along with hundreds of residents.
|
||||
The line will significantly reduce travel time between the airport and downtown Munich.
|
||||
Construction took five years and cost approximately 2 billion euros.
|
||||
The new line includes 10 stations and runs every 10 minutes during peak hours.
|
||||
"""
|
||||
|
||||
summary_result = client.summarize_article(test_content, max_words=50)
|
||||
print(f"Success: {summary_result['success']}")
|
||||
print(f"Summary: {summary_result['summary']}")
|
||||
print(f"Original word count: {summary_result['original_word_count']}")
|
||||
print(f"Summary word count: {summary_result['summary_word_count']}")
|
||||
print(f"Compression: {summary_result['original_word_count'] / max(summary_result['summary_word_count'], 1):.1f}x")
|
||||
print(f"Duration: {summary_result['duration']:.2f}s")
|
||||
6
news_crawler/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
beautifulsoup4==4.12.2
|
||||
lxml==4.9.3
|
||||
requests==2.31.0
|
||||
feedparser==6.0.10
|
||||
pymongo==4.6.1
|
||||
python-dotenv==1.0.0
|
||||
98
news_crawler/rss_utils.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
Utility functions for RSS feed processing
|
||||
"""
|
||||
|
||||
|
||||
def extract_article_url(entry):
|
||||
"""
|
||||
Extract article URL from RSS entry.
|
||||
Different RSS feeds use different fields for the article URL.
|
||||
|
||||
Args:
|
||||
entry: feedparser entry object
|
||||
|
||||
Returns:
|
||||
str: Article URL or None if not found
|
||||
|
||||
Examples:
|
||||
- Most feeds use 'link'
|
||||
- Some use 'guid' as the URL
|
||||
- Some use 'id' as the URL
|
||||
- Some have guid as a dict with 'href'
|
||||
"""
|
||||
# Try 'link' first (most common)
|
||||
if entry.get('link') and entry.get('link', '').startswith('http'):
|
||||
return entry.get('link')
|
||||
|
||||
# Try 'guid' if it's a valid URL
|
||||
if entry.get('guid'):
|
||||
guid = entry.get('guid')
|
||||
# guid can be a string
|
||||
if isinstance(guid, str) and guid.startswith('http'):
|
||||
return guid
|
||||
# or a dict with 'href'
|
||||
elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
|
||||
return guid.get('href')
|
||||
|
||||
# Try 'id' if it's a valid URL
|
||||
if entry.get('id') and entry.get('id', '').startswith('http'):
|
||||
return entry.get('id')
|
||||
|
||||
# Try 'links' array (some feeds have multiple links)
|
||||
if entry.get('links'):
|
||||
for link in entry.get('links', []):
|
||||
if isinstance(link, dict) and link.get('href', '').startswith('http'):
|
||||
# Prefer 'alternate' type, but accept any http link
|
||||
if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
|
||||
return link.get('href')
|
||||
# If no alternate found, return first http link
|
||||
for link in entry.get('links', []):
|
||||
if isinstance(link, dict) and link.get('href', '').startswith('http'):
|
||||
return link.get('href')
|
||||
|
||||
return None
|
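# Quick illustration (hypothetical entries; a plain dict stands in for a
# feedparser entry because only .get() lookups are used):
#
#   extract_article_url({'link': 'https://example.com/a'})    # 'https://example.com/a'
#   extract_article_url({'guid': 'https://example.com/b'})    # 'https://example.com/b'
#   extract_article_url({'links': [{'rel': 'alternate',
#                                   'type': 'text/html',
#                                   'href': 'https://example.com/c'}]})  # 'https://example.com/c'
#   extract_article_url({'guid': 'not-a-url'})                # None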
||||
|
||||
|
||||
def extract_article_summary(entry):
|
||||
"""
|
||||
Extract article summary/description from RSS entry.
|
||||
|
||||
Args:
|
||||
entry: feedparser entry object
|
||||
|
||||
Returns:
|
||||
str: Article summary or empty string
|
||||
"""
|
||||
# Try different fields
|
||||
if entry.get('summary'):
|
||||
return entry.get('summary', '')
|
||||
elif entry.get('description'):
|
||||
return entry.get('description', '')
|
||||
elif entry.get('content'):
|
||||
# content is usually a list of dicts
|
||||
content = entry.get('content', [])
|
||||
if content and isinstance(content, list) and len(content) > 0:
|
||||
return content[0].get('value', '')
|
||||
|
||||
return ''
|
||||
|
||||
|
||||
def extract_published_date(entry):
|
||||
"""
|
||||
Extract published date from RSS entry.
|
||||
|
||||
Args:
|
||||
entry: feedparser entry object
|
||||
|
||||
Returns:
|
||||
str: Published date or empty string
|
||||
"""
|
||||
# Try different fields
|
||||
if entry.get('published'):
|
||||
return entry.get('published', '')
|
||||
elif entry.get('updated'):
|
||||
return entry.get('updated', '')
|
||||
elif entry.get('created'):
|
||||
return entry.get('created', '')
|
||||
|
||||
return ''
|
||||
83
news_crawler/test_crawler.py
Normal file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Test script to verify crawler functionality
|
||||
"""
|
||||
from crawler_service import extract_article_content, get_active_rss_feeds
|
||||
import sys
|
||||
|
||||
|
||||
def test_content_extraction():
|
||||
"""Test content extraction from a sample URL"""
|
||||
print("Testing content extraction...")
|
||||
|
||||
# Test with a simple news site
|
||||
test_url = "https://www.bbc.com/news"
|
||||
|
||||
print(f"Extracting content from: {test_url}")
|
||||
result = extract_article_content(test_url, timeout=10)
|
||||
|
||||
if result:
|
||||
print("✓ Content extraction successful!")
|
||||
print(f" Title: {result.get('title', 'N/A')[:50]}...")
|
||||
print(f" Content length: {len(result.get('content', ''))} chars")
|
||||
print(f" Word count: {result.get('word_count', 0)}")
|
||||
return True
|
||||
else:
|
||||
print("✗ Content extraction failed")
|
||||
return False
|
||||
|
||||
|
||||
def test_database_connection():
|
||||
"""Test MongoDB connection"""
|
||||
print("\nTesting database connection...")
|
||||
|
||||
try:
|
||||
feeds = get_active_rss_feeds()
|
||||
print(f"✓ Database connection successful!")
|
||||
print(f" Found {len(feeds)} active RSS feed(s)")
|
||||
|
||||
if feeds:
|
||||
print("\n Active feeds:")
|
||||
for feed in feeds:
|
||||
print(f" - {feed['name']}: {feed['url']}")
|
||||
else:
|
||||
print("\n ⚠ No active feeds found. Add feeds via the backend API:")
|
||||
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
|
||||
print(" -H 'Content-Type: application/json' \\")
|
||||
print(" -d '{\"name\": \"Test Feed\", \"url\": \"https://example.com/rss\"}'")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"✗ Database connection failed: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def main():
|
||||
print("="*60)
|
||||
print("News Crawler - Test Suite")
|
||||
print("="*60 + "\n")
|
||||
|
||||
# Test database connection
|
||||
db_ok = test_database_connection()
|
||||
|
||||
# Test content extraction
|
||||
extract_ok = test_content_extraction()
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Test Results:")
|
||||
print(f" Database Connection: {'✓ PASS' if db_ok else '✗ FAIL'}")
|
||||
print(f" Content Extraction: {'✓ PASS' if extract_ok else '✗ FAIL'}")
|
||||
print("="*60 + "\n")
|
||||
|
||||
if db_ok and extract_ok:
|
||||
print("✓ All tests passed! Crawler is ready to use.")
|
||||
print("\nRun the crawler with:")
|
||||
print(" python crawler_service.py")
|
||||
return 0
|
||||
else:
|
||||
print("✗ Some tests failed. Please check the errors above.")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
129
news_crawler/test_ollama.py
Normal file
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Test script for Ollama integration
|
||||
Tests connection, configuration, and summarization
|
||||
"""
|
||||
from config import Config
|
||||
from ollama_client import OllamaClient
|
||||
|
||||
print("\n" + "="*70)
|
||||
print("Ollama Integration Test")
|
||||
print("="*70)
|
||||
|
||||
# Print configuration
|
||||
Config.print_config()
|
||||
|
||||
# Validate configuration
|
||||
issues = Config.validate()
|
||||
if issues:
|
||||
print("⚠ Configuration Issues:")
|
||||
for issue in issues:
|
||||
print(f" - {issue}")
|
||||
print()
|
||||
|
||||
# Initialize client
|
||||
client = OllamaClient(
|
||||
base_url=Config.OLLAMA_BASE_URL,
|
||||
model=Config.OLLAMA_MODEL,
|
||||
api_key=Config.OLLAMA_API_KEY,
|
||||
enabled=Config.OLLAMA_ENABLED,
|
||||
timeout=Config.OLLAMA_TIMEOUT
|
||||
)
|
||||
|
||||
# Test 1: Check if Ollama is enabled
|
||||
print("Test 1: Configuration Check")
|
||||
print(f" Ollama Enabled: {Config.OLLAMA_ENABLED}")
|
||||
if not Config.OLLAMA_ENABLED:
|
||||
print(" ⚠ Ollama is disabled. Set OLLAMA_ENABLED=true in .env to enable.")
|
||||
print("\n" + "="*70)
|
||||
exit(0)
|
||||
|
||||
# Test 2: Test connection
|
||||
print("\nTest 2: Connection Test")
|
||||
conn_result = client.test_connection()
|
||||
print(f" Available: {conn_result['available']}")
|
||||
print(f" Current Model: {conn_result['current_model']}")
|
||||
|
||||
if conn_result['available']:
|
||||
print(f" ✓ Connected to Ollama server")
|
||||
if conn_result['models']:
|
||||
print(f" Available models: {', '.join(conn_result['models'][:5])}")
|
||||
if conn_result['current_model'] not in conn_result['models']:
|
||||
print(f" ⚠ Warning: Model '{conn_result['current_model']}' not found in available models")
|
||||
else:
|
||||
print(f" ✗ Connection failed: {conn_result['error']}")
|
||||
print("\n" + "="*70)
|
||||
exit(1)
|
||||
|
||||
# Test 3: Test summarization with sample article
|
||||
print("\nTest 3: Summarization Test")
|
||||
print(" Testing with sample German article...")
|
||||
|
||||
sample_article = """
|
||||
Die neue U-Bahn-Linie, die das Münchner Stadtzentrum mit dem Flughafen verbindet, wurde heute eröffnet.
|
||||
Oberbürgermeister Dieter Reiter nahm zusammen mit hunderten Anwohnern an der Eröffnungszeremonie teil.
|
||||
Die Linie wird die Reisezeit zwischen dem Flughafen und der Münchner Innenstadt erheblich verkürzen.
|
||||
Der Bau dauerte fünf Jahre und kostete etwa 2 Milliarden Euro. Die neue Linie umfasst 10 Stationen
|
||||
und verkehrt während der Hauptverkehrszeiten alle 10 Minuten. Experten erwarten, dass die neue Verbindung
|
||||
den Verkehr in der Stadt deutlich entlasten wird. Die Münchner Verkehrsgesellschaft rechnet mit täglich
|
||||
über 50.000 Fahrgästen auf der neuen Strecke.
|
||||
"""
|
||||
|
||||
result = client.summarize_article(sample_article, max_words=Config.SUMMARY_MAX_WORDS)
|
||||
|
||||
print(f"\n Success: {result['success']}")
|
||||
if result['success']:
|
||||
print(f" ✓ Summarization successful!")
|
||||
print(f"\n Original word count: {result['original_word_count']}")
|
||||
print(f" Summary word count: {result['summary_word_count']}")
|
||||
print(f" Compression ratio: {result['original_word_count'] / max(result['summary_word_count'], 1):.1f}x")
|
||||
print(f" Duration: {result['duration']:.2f}s")
|
||||
print(f"\n Summary (English):")
|
||||
print(f" {'-'*70}")
|
||||
print(f" {result['summary']}")
|
||||
print(f" {'-'*70}")
|
||||
else:
|
||||
print(f" ✗ Summarization failed: {result['error']}")
|
||||
|
||||
# Test 4: Test with English article
|
||||
print("\nTest 4: English Article Test")
|
||||
print(" Testing with English article...")
|
||||
|
||||
english_article = """
|
||||
The city council approved a new bike lane network spanning 50 kilometers across Munich.
|
||||
The project aims to promote sustainable transportation and reduce car traffic in the city center.
|
||||
Construction will begin next month and is expected to be completed within two years.
|
||||
The bike lanes will connect major residential areas with business districts and public transport hubs.
|
||||
Environmental groups have praised the initiative as a significant step toward carbon neutrality.
|
||||
"""
|
||||
|
||||
result2 = client.summarize_article(english_article, max_words=50)
|
||||
|
||||
print(f"\n Success: {result2['success']}")
|
||||
if result2['success']:
|
||||
print(f" ✓ Summarization successful!")
|
||||
print(f" Original: {result2['original_word_count']} words → Summary: {result2['summary_word_count']} words")
|
||||
print(f" Duration: {result2['duration']:.2f}s")
|
||||
print(f"\n Summary:")
|
||||
print(f" {result2['summary']}")
|
||||
else:
|
||||
print(f" ✗ Summarization failed: {result2['error']}")
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*70)
|
||||
print("Test Summary")
|
||||
print("="*70)
|
||||
print(f"✓ Configuration: Valid")
|
||||
print(f"✓ Connection: {'Success' if conn_result['available'] else 'Failed'}")
|
||||
print(f"✓ German→English: {'Success' if result['success'] else 'Failed'}")
|
||||
print(f"✓ English→English: {'Success' if result2['success'] else 'Failed'}")
|
||||
print("="*70)
|
||||
|
||||
if result['success'] and result2['success']:
|
||||
print("\n🎉 All tests passed! Ollama integration is working correctly.")
|
||||
print("\nYou can now run the crawler with AI summarization:")
|
||||
print(" python crawler_service.py 5")
|
||||
else:
|
||||
print("\n⚠ Some tests failed. Check the errors above.")
|
||||
|
||||
print()
|
||||
154
news_crawler/test_rss_feeds.py
Normal file
@@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Test script to verify RSS feed URL extraction
|
||||
Tests actual feeds from the database
|
||||
"""
|
||||
import feedparser
|
||||
from pymongo import MongoClient
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from rss_utils import extract_article_url, extract_article_summary, extract_published_date
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv(dotenv_path='../.env')
|
||||
|
||||
# MongoDB setup
|
||||
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
|
||||
DB_NAME = 'munich_news'
|
||||
|
||||
client = MongoClient(MONGODB_URI)
|
||||
db = client[DB_NAME]
|
||||
rss_feeds_collection = db['rss_feeds']
|
||||
|
||||
|
||||
def test_feed(feed_name, feed_url):
|
||||
"""Test a single RSS feed"""
|
||||
print(f"\n{'='*70}")
|
||||
print(f"Testing: {feed_name}")
|
||||
print(f"URL: {feed_url}")
|
||||
print('='*70)
|
||||
|
||||
try:
|
||||
# Parse the feed
|
||||
print("Fetching RSS feed...")
|
||||
feed = feedparser.parse(feed_url)
|
||||
|
||||
if not feed.entries:
|
||||
print("❌ No entries found in feed")
|
||||
return False
|
||||
|
||||
print(f"✓ Found {len(feed.entries)} entries\n")
|
||||
|
||||
# Test first 5 entries
|
||||
success_count = 0
|
||||
fail_count = 0
|
||||
|
||||
for i, entry in enumerate(feed.entries[:5], 1):
|
||||
print(f"\n--- Entry {i} ---")
|
||||
print(f"Title: {entry.get('title', 'No title')[:60]}")
|
||||
|
||||
# Test URL extraction
|
||||
article_url = extract_article_url(entry)
|
||||
if article_url:
|
||||
print(f"✓ URL: {article_url}")
|
||||
success_count += 1
|
||||
else:
|
||||
print(f"❌ No valid URL found")
|
||||
print(f" Available fields: {list(entry.keys())}")
|
||||
print(f" link: {entry.get('link', 'N/A')}")
|
||||
print(f" guid: {entry.get('guid', 'N/A')}")
|
||||
print(f" id: {entry.get('id', 'N/A')}")
|
||||
fail_count += 1
|
||||
|
||||
# Test summary extraction
|
||||
summary = extract_article_summary(entry)
|
||||
if summary:
|
||||
print(f"✓ Summary: {summary[:80]}...")
|
||||
else:
|
||||
print(f"⚠ No summary found")
|
||||
|
||||
# Test date extraction
|
||||
pub_date = extract_published_date(entry)
|
||||
if pub_date:
|
||||
print(f"✓ Published: {pub_date}")
|
||||
else:
|
||||
print(f"⚠ No published date found")
|
||||
|
||||
print(f"\n{'='*70}")
|
||||
print(f"Results for {feed_name}:")
|
||||
print(f" ✓ Success: {success_count}/5")
|
||||
print(f" ❌ Failed: {fail_count}/5")
|
||||
print('='*70)
|
||||
|
||||
return fail_count == 0
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error testing feed: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def main():
|
||||
print("\n" + "="*70)
|
||||
print("RSS Feed URL Extraction Test")
|
||||
print("="*70)
|
||||
|
||||
# Get all RSS feeds from database
|
||||
print("\nFetching RSS feeds from database...")
|
||||
feeds = list(rss_feeds_collection.find())
|
||||
|
||||
if not feeds:
|
||||
print("❌ No RSS feeds found in database")
|
||||
print("\nAdd feeds using:")
|
||||
print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
|
||||
print(" -H 'Content-Type: application/json' \\")
|
||||
print(" -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
|
||||
return 1
|
||||
|
||||
print(f"✓ Found {len(feeds)} feed(s) in database\n")
|
||||
|
||||
# Test each feed
|
||||
results = {}
|
||||
for feed in feeds:
|
||||
feed_name = feed.get('name', 'Unknown')
|
||||
feed_url = feed.get('url', '')
|
||||
active = feed.get('active', True)
|
||||
|
||||
if not active:
|
||||
print(f"\n⏭ Skipping inactive feed: {feed_name}")
|
||||
continue
|
||||
|
||||
if not feed_url:
|
||||
print(f"\n❌ Feed '{feed_name}' has no URL")
|
||||
results[feed_name] = False
|
||||
continue
|
||||
|
||||
results[feed_name] = test_feed(feed_name, feed_url)
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*70)
|
||||
print("FINAL RESULTS")
|
||||
print("="*70)
|
||||
|
||||
for feed_name, success in results.items():
|
||||
status = "✓ PASS" if success else "❌ FAIL"
|
||||
print(f"{status} - {feed_name}")
|
||||
|
||||
total = len(results)
|
||||
passed = sum(1 for s in results.values() if s)
|
||||
|
||||
print(f"\nTotal: {passed}/{total} feeds passed")
|
||||
print("="*70 + "\n")
|
||||
|
||||
if passed == total:
|
||||
print("✓ All feeds are working correctly!")
|
||||
print("\nYou can now run the crawler:")
|
||||
print(" python crawler_service.py")
|
||||
return 0
|
||||
else:
|
||||
print("⚠ Some feeds have issues. Check the output above.")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
sys.exit(main())
|
||||