update
@@ -1,191 +0,0 @@
# Recent Changes - Full Content Storage

## ✅ What Changed

### 1. Removed Content Length Limit

**Before:**
```python
'content': content_text[:10000]  # Limited to 10k chars
```

**After:**
```python
'content': content_text  # Full content, no limit
```

### 2. Simplified Database Schema

**Before:**
```javascript
{
  summary: String,       // Short summary
  full_content: String   // Limited content
}
```

**After:**
```javascript
{
  content: String   // Full article content, no limit
}
```

### 3. Enhanced API Response

**Before:**
```javascript
{
  title: "...",
  link: "...",
  summary: "..."
}
```

**After:**
```javascript
{
  title: "...",
  author: "...",           // NEW!
  link: "...",
  preview: "...",          // First 200 chars
  word_count: 1250,        // NEW!
  has_full_content: true   // NEW!
}
```
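
How the list endpoint derives these fields is not shown in this commit; below is a minimal sketch, assuming the stored document follows the schema in the next section (function name hypothetical):

```python
def to_list_item(doc):
    """Build the list-view representation of a stored article (sketch)."""
    content = doc.get('content') or ''
    return {
        'title': doc.get('title'),
        'author': doc.get('author'),
        'link': doc.get('link'),
        'preview': content[:200],  # first 200 chars
        'word_count': doc.get('word_count', len(content.split())),
        'has_full_content': bool(content),
    }
```
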

## 📊 Database Structure

### Articles Collection
```javascript
{
  _id: ObjectId,
  title: String,          // Article title
  author: String,         // Article author (extracted)
  link: String,           // Article URL (unique)
  content: String,        // FULL article content (no limit)
  word_count: Number,     // Word count
  source: String,         // RSS feed name
  published_at: String,   // Publication date
  crawled_at: DateTime,   // When crawled
  created_at: DateTime    // When added
}
```

## 🆕 New API Endpoint

### GET /api/news/<article_url>
Get full article content by URL.

**Example:**
```bash
# URL encode the article URL
curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle"
```
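
The same request from Python, using `urllib.parse.quote` to encode the article link (a sketch; adjust host and port to your setup):

```python
import urllib.parse

import requests

article_url = "https://example.com/article"
encoded = urllib.parse.quote(article_url, safe='')  # also encode ':' and '/'

response = requests.get(f"http://localhost:5001/api/news/{encoded}")
print(response.json().get('word_count'))
```
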

**Response:**
```json
{
  "title": "New U-Bahn Line Opens in Munich",
  "author": "Max Mustermann",
  "link": "https://example.com/article",
  "content": "The full article text here... (complete, no truncation)",
  "word_count": 1250,
  "source": "Süddeutsche Zeitung München",
  "published_at": "2024-11-10T10:00:00Z",
  "crawled_at": "2024-11-10T16:30:00Z",
  "created_at": "2024-11-10T16:00:00Z"
}
```
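
The backend implementation is not part of this document; purely as an illustration, such a route could look like the sketch below, assuming Flask and PyMongo (database and collection names as used elsewhere in these docs):

```python
from urllib.parse import unquote

from flask import Flask, jsonify
from pymongo import MongoClient

app = Flask(__name__)
articles = MongoClient('mongodb://localhost:27017/')['munich_news']['articles']

@app.route('/api/news/<path:article_url>')
def get_article(article_url):
    # <path:...> keeps slashes; unquote handles a still-encoded URL
    doc = articles.find_one({'link': unquote(article_url)}, {'_id': 0})
    if not doc:
        return jsonify({'error': 'Article not found'}), 404
    return jsonify(doc)
```
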

## 📈 Enhanced Stats

### GET /api/stats
Now includes crawled article count:

```json
{
  "subscribers": 150,
  "articles": 500,
  "crawled_articles": 350   // NEW!
}
```
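
A sketch of how the new counter can be computed with PyMongo (collection names assumed; crawled articles are taken to be those with a non-empty `content` field):

```python
from pymongo import MongoClient

db = MongoClient('mongodb://localhost:27017/')['munich_news']

stats = {
    'subscribers': db['subscribers'].count_documents({}),
    'articles': db['articles'].count_documents({}),
    # articles whose full text has already been crawled
    'crawled_articles': db['articles'].count_documents(
        {'content': {'$exists': True, '$ne': ''}}
    ),
}
print(stats)
```
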

## 🎯 Benefits

1. **Complete Content** - No truncation, full articles stored
2. **Better for AI** - Full context for summarization/analysis
3. **Cleaner Schema** - Single `content` field instead of `summary` + `full_content`
4. **More Metadata** - Author, word count, crawl timestamp
5. **Better API** - Preview in list, full content on demand

## 🔄 Migration

If you have existing articles with a `full_content` field, they will continue to work. New articles will use the `content` field.

To migrate old articles:
```javascript
// MongoDB shell
db.articles.updateMany(
  { full_content: { $exists: true } },
  [
    {
      $set: {
        content: "$full_content"
      }
    },
    {
      $unset: ["full_content", "summary"]
    }
  ]
)
```

## 🚀 Usage

### Crawl Articles
```bash
cd news_crawler
python crawler_service.py 10
```

### Get Article List (with previews)
```bash
curl http://localhost:5001/api/news
```

### Get Full Article Content
```bash
# Get the article URL from the list, then:
curl "http://localhost:5001/api/news/<encoded_url>"
```

### Check Stats
```bash
curl http://localhost:5001/api/stats
```

## 📝 Example Workflow

1. **Add RSS Feed**
   ```bash
   curl -X POST http://localhost:5001/api/rss-feeds \
     -H "Content-Type: application/json" \
     -d '{"name": "News Source", "url": "https://example.com/rss"}'
   ```

2. **Crawl Articles**
   ```bash
   cd news_crawler
   python crawler_service.py 10
   ```

3. **View Articles**
   ```bash
   curl http://localhost:5001/api/news
   ```

4. **Get Full Content**
   ```bash
   # Copy article link from above, URL encode it
   curl "http://localhost:5001/api/news/https%3A%2F%2Fexample.com%2Farticle"
   ```

Now you have complete article content ready for AI processing! 🎉
@@ -6,8 +6,20 @@ WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy crawler service
COPY crawler_service.py .
# Copy crawler files
COPY . .

# Run crawler
CMD ["python", "crawler_service.py"]
# Copy backend config files (needed for Config class)
COPY ../backend/config.py /app/config.py
COPY ../backend/ollama_client.py /app/ollama_client.py
COPY ../backend/.env /app/.env

# Make the scheduler executable
RUN chmod +x scheduled_crawler.py

# Set timezone to Berlin
ENV TZ=Europe/Berlin
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Run the scheduled crawler
CMD ["python", "-u", "scheduled_crawler.py"]
@@ -1,353 +0,0 @@
# Content Extraction Strategies

The crawler uses multiple strategies to dynamically extract article metadata from any website.

## 🎯 What Gets Extracted

1. **Title** - Article headline
2. **Author** - Article writer/journalist
3. **Published Date** - When the article was published
4. **Content** - Main article text
5. **Description** - Meta description/summary

## 📋 Extraction Strategies

### 1. Title Extraction

Tries multiple methods in order of reliability:

#### Strategy 1: H1 Tag
```html
<h1>Article Title Here</h1>
```
✅ Most reliable - usually the main headline

#### Strategy 2: Open Graph Meta Tag
```html
<meta property="og:title" content="Article Title Here" />
```
✅ Used by Facebook, very reliable

#### Strategy 3: Twitter Card Meta Tag
```html
<meta name="twitter:title" content="Article Title Here" />
```
✅ Used by Twitter, reliable

#### Strategy 4: Title Tag (Fallback)
```html
<title>Article Title | Site Name</title>
```
⚠️ Often includes site name, needs cleaning

**Cleaning:**
- Removes " | Site Name"
- Removes " - Site Name"

---

### 2. Author Extraction

Tries multiple methods:

#### Strategy 1: Meta Author Tag
```html
<meta name="author" content="John Doe" />
```
✅ Standard HTML meta tag

#### Strategy 2: Rel="author" Link
```html
<a rel="author" href="/author/john-doe">John Doe</a>
```
✅ Semantic HTML

#### Strategy 3: Common Class Names
```html
<div class="author-name">John Doe</div>
<span class="byline">By John Doe</span>
<p class="writer">John Doe</p>
```
✅ Searches for: author-name, author, byline, writer

#### Strategy 4: Schema.org Markup
```html
<span itemprop="author">John Doe</span>
```
✅ Structured data

#### Strategy 5: JSON-LD Structured Data
```html
<script type="application/ld+json">
{
  "@type": "NewsArticle",
  "author": {
    "@type": "Person",
    "name": "John Doe"
  }
}
</script>
```
✅ Most structured, very reliable
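
A minimal sketch of reading the author out of JSON-LD with BeautifulSoup and the standard `json` module (helper name hypothetical; a single JSON-LD object is assumed):

```python
import json

def author_from_json_ld(soup):
    """Return the author name from a JSON-LD block, or None."""
    for script in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(script.string or '')
        except (ValueError, TypeError):
            continue
        author = data.get('author') if isinstance(data, dict) else None
        if isinstance(author, dict):
            return author.get('name')
        if isinstance(author, str):
            return author
    return None
```
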

**Cleaning:**
- Removes "By " prefix
- Validates length (< 100 chars)

---

### 3. Date Extraction

Tries multiple methods:

#### Strategy 1: Time Tag with Datetime
```html
<time datetime="2024-11-10T10:00:00Z">November 10, 2024</time>
```
✅ Most reliable - ISO format

#### Strategy 2: Article Published Time Meta
```html
<meta property="article:published_time" content="2024-11-10T10:00:00Z" />
```
✅ Open Graph standard

#### Strategy 3: OG Published Time
```html
<meta property="og:published_time" content="2024-11-10T10:00:00Z" />
```
✅ Facebook standard

#### Strategy 4: Common Class Names
```html
<span class="publish-date">November 10, 2024</span>
<time class="published">2024-11-10</time>
<div class="timestamp">10:00 AM, Nov 10</div>
```
✅ Searches for: publish-date, published, date, timestamp

#### Strategy 5: Schema.org Markup
```html
<meta itemprop="datePublished" content="2024-11-10T10:00:00Z" />
```
✅ Structured data

#### Strategy 6: JSON-LD Structured Data
```html
<script type="application/ld+json">
{
  "@type": "NewsArticle",
  "datePublished": "2024-11-10T10:00:00Z"
}
</script>
```
✅ Most structured

---

### 4. Content Extraction

Tries multiple methods:

#### Strategy 1: Semantic HTML Tags
```html
<article>
  <p>Article content here...</p>
</article>
```
✅ Best practice HTML5

#### Strategy 2: Common Class Names
```html
<div class="article-content">...</div>
<div class="article-body">...</div>
<div class="post-content">...</div>
<div class="entry-content">...</div>
<div class="story-body">...</div>
```
✅ Searches for common patterns

#### Strategy 3: Schema.org Markup
```html
<div itemprop="articleBody">
  <p>Content here...</p>
</div>
```
✅ Structured data

#### Strategy 4: Main Tag
```html
<main>
  <p>Content here...</p>
</main>
```
✅ Semantic HTML5

#### Strategy 5: Body Tag (Fallback)
```html
<body>
  <p>Content here...</p>
</body>
```
⚠️ Last resort, may include navigation

**Content Filtering** (sketched in code after this list):
- Removes `<script>`, `<style>`, `<nav>`, `<footer>`, `<header>`, `<aside>`
- Filters out short paragraphs (< 50 chars) - likely ads/navigation
- Keeps only substantial paragraphs
- **No length limit** - stores full article content
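
A minimal sketch of this filtering step, assuming BeautifulSoup (helper name hypothetical; tag list and threshold as listed above):

```python
def clean_and_join_paragraphs(container):
    """Strip non-content elements and keep only substantial paragraphs."""
    for tag in container.find_all(['script', 'style', 'nav', 'footer', 'header', 'aside']):
        tag.decompose()

    paragraphs = [
        p.get_text(strip=True)
        for p in container.find_all('p')
        if len(p.get_text(strip=True)) >= 50  # drop ads / navigation snippets
    ]
    return '\n\n'.join(paragraphs)  # full text, no length limit
```
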

---

## 🔍 How It Works

### Example: Crawling a News Article

```python
import requests
from bs4 import BeautifulSoup

# extract_title / extract_author / extract_date / extract_main_content
# are the crawler's own helper functions

# 1. Fetch HTML
response = requests.get(article_url)
soup = BeautifulSoup(response.content, 'html.parser')

# 2. Extract title (tries 4 strategies)
title = extract_title(soup)
# Result: "New U-Bahn Line Opens in Munich"

# 3. Extract author (tries 5 strategies)
author = extract_author(soup)
# Result: "Max Mustermann"

# 4. Extract date (tries 6 strategies)
published_date = extract_date(soup)
# Result: "2024-11-10T10:00:00Z"

# 5. Extract content (tries 5 strategies)
content = extract_main_content(soup)
# Result: "The new U-Bahn line connecting..."

# 6. Save to database
article_doc = {
    'title': title,
    'author': author,
    'published_at': published_date,
    'full_content': content,
    'word_count': len(content.split())
}
```

---

## 📊 Success Rates by Strategy

Based on common news sites:

| Strategy | Success Rate | Notes |
|----------|--------------|-------|
| H1 for title | 95% | Almost universal |
| OG meta tags | 90% | Most modern sites |
| Time tag for date | 85% | HTML5 sites |
| JSON-LD | 70% | Growing adoption |
| Class name patterns | 60% | Varies by site |
| Schema.org | 50% | Not widely adopted |

---

## 🎨 Real-World Examples

### Example 1: Süddeutsche Zeitung
```html
<article>
  <h1>New U-Bahn Line Opens</h1>
  <span class="author">Max Mustermann</span>
  <time datetime="2024-11-10T10:00:00Z">10. November 2024</time>
  <div class="article-body">
    <p>The new U-Bahn line...</p>
  </div>
</article>
```
✅ Extracts: Title (H1), Author (class), Date (time), Content (article-body)

### Example 2: Medium Blog
```html
<article>
  <h1>How to Build a News Crawler</h1>
  <meta property="og:title" content="How to Build a News Crawler" />
  <meta property="article:published_time" content="2024-11-10T10:00:00Z" />
  <a rel="author" href="/author">Jane Smith</a>
  <section>
    <p>In this article...</p>
  </section>
</article>
```
✅ Extracts: Title (OG meta), Author (rel), Date (article meta), Content (section)

### Example 3: WordPress Blog
```html
<div class="post">
  <h1 class="entry-title">My Blog Post</h1>
  <span class="byline">By John Doe</span>
  <time class="published">November 10, 2024</time>
  <div class="entry-content">
    <p>Blog content here...</p>
  </div>
</div>
```
✅ Extracts: Title (H1), Author (byline), Date (published), Content (entry-content)

---

## ⚠️ Edge Cases Handled

1. **Missing Fields**: Returns `None` instead of crashing
2. **Multiple Authors**: Takes first one found
3. **Relative Dates**: Stores as-is ("2 hours ago")
4. **Paywalls**: Extracts what's available
5. **JavaScript-rendered**: Only gets server-side HTML
6. **Ads/Navigation**: Filtered out by paragraph length
7. **Site Name in Title**: Cleaned automatically

---

## 🚀 Future Improvements

Potential enhancements:

- [ ] JavaScript rendering (Selenium/Playwright)
- [ ] Paywall bypass (where legal)
- [ ] Image extraction
- [ ] Video detection
- [ ] Related articles
- [ ] Tags/categories
- [ ] Reading time estimation
- [ ] Language detection
- [ ] Sentiment analysis

---

## 🧪 Testing

Test the extraction on a specific URL:

```python
from crawler_service import extract_article_content

url = "https://www.sueddeutsche.de/muenchen/article-123"
data = extract_article_content(url)

print(f"Title: {data['title']}")
print(f"Author: {data['author']}")
print(f"Date: {data['published_date']}")
print(f"Content length: {len(data['content'])} chars")
print(f"Word count: {data['word_count']}")
```

---

## 📚 Standards Supported

- ✅ HTML5 semantic tags
- ✅ Open Graph Protocol
- ✅ Twitter Cards
- ✅ Schema.org microdata
- ✅ JSON-LD structured data
- ✅ Dublin Core metadata
- ✅ Common CSS class patterns
@@ -1,306 +0,0 @@
# How the News Crawler Works

## 🎯 Overview

The crawler dynamically extracts article metadata from any website using multiple fallback strategies.

## 📊 Flow Diagram

```
RSS Feed URL
    ↓
Parse RSS Feed
    ↓
For each article link:
    ↓
┌─────────────────────────────────────┐
│ 1. Fetch HTML Page                  │
│    GET https://example.com/article  │
└─────────────────────────────────────┘
    ↓
┌─────────────────────────────────────┐
│ 2. Parse with BeautifulSoup         │
│    soup = BeautifulSoup(html)       │
└─────────────────────────────────────┘
    ↓
┌─────────────────────────────────────┐
│ 3. Clean HTML                       │
│    Remove: scripts, styles, nav,    │
│    footer, header, ads              │
└─────────────────────────────────────┘
    ↓
┌─────────────────────────────────────┐
│ 4. Extract Title                    │
│    Try: H1 → OG meta → Twitter →    │
│    Title tag                        │
└─────────────────────────────────────┘
    ↓
┌─────────────────────────────────────┐
│ 5. Extract Author                   │
│    Try: Meta author → rel=author →  │
│    Class names → JSON-LD            │
└─────────────────────────────────────┘
    ↓
┌─────────────────────────────────────┐
│ 6. Extract Date                     │
│    Try: <time> → Meta tags →        │
│    Class names → JSON-LD            │
└─────────────────────────────────────┘
    ↓
┌─────────────────────────────────────┐
│ 7. Extract Content                  │
│    Try: <article> → Class names →   │
│    <main> → <body>                  │
│    Filter short paragraphs          │
└─────────────────────────────────────┘
    ↓
┌─────────────────────────────────────┐
│ 8. Save to MongoDB                  │
│    {                                │
│      title, author, date,           │
│      content, word_count            │
│    }                                │
└─────────────────────────────────────┘
    ↓
Wait 1 second (rate limiting)
    ↓
Next article
```

## 🔍 Detailed Example

### Input: RSS Feed Entry
```xml
<item>
  <title>New U-Bahn Line Opens</title>
  <link>https://www.sueddeutsche.de/muenchen/article-123</link>
  <pubDate>Mon, 10 Nov 2024 10:00:00 +0100</pubDate>
</item>
```

### Step 1: Fetch HTML
```python
url = "https://www.sueddeutsche.de/muenchen/article-123"
response = requests.get(url)
html = response.content
```

### Step 2: Parse HTML
```python
soup = BeautifulSoup(html, 'html.parser')
```

### Step 3: Extract Title
```python
# Try H1
h1 = soup.find('h1')
# Result: "New U-Bahn Line Opens in Munich"

# If no H1, try OG meta
og_title = soup.find('meta', property='og:title')
# Fallback chain continues...
```

### Step 4: Extract Author
```python
# Try meta author (attrs avoids clashing with find()'s own 'name' argument)
meta_author = soup.find('meta', attrs={'name': 'author'})
# Result: None

# Try class names
author_elem = soup.select_one('[class*="author"]')
# Result: "Max Mustermann"
```

### Step 5: Extract Date
```python
# Try time tag
time_tag = soup.find('time')
# Result: "2024-11-10T10:00:00Z"
```

### Step 6: Extract Content
```python
# Try article tag
article = soup.find('article')
paragraphs = article.find_all('p')

# Filter paragraphs
content = []
for p in paragraphs:
    text = p.get_text().strip()
    if len(text) >= 50:  # Keep substantial paragraphs
        content.append(text)

full_content = '\n\n'.join(content)
# Result: "The new U-Bahn line connecting the city center..."
```

### Step 7: Save to Database
```python
article_doc = {
    'title': 'New U-Bahn Line Opens in Munich',
    'author': 'Max Mustermann',
    'link': 'https://www.sueddeutsche.de/muenchen/article-123',
    'summary': 'Short summary from RSS...',
    'full_content': 'The new U-Bahn line connecting...',
    'word_count': 1250,
    'source': 'Süddeutsche Zeitung München',
    'published_at': '2024-11-10T10:00:00Z',
    'crawled_at': datetime.utcnow(),
    'created_at': datetime.utcnow()
}

db.articles.update_one(
    {'link': article_url},
    {'$set': article_doc},
    upsert=True
)
```

## 🎨 What Makes It "Dynamic"?

### Traditional Approach (Hardcoded)
```python
# Only works for one specific site
title = soup.find('h1', class_='article-title').text
author = soup.find('span', class_='author-name').text
```
❌ Breaks when site changes
❌ Doesn't work on other sites

### Our Approach (Dynamic)
```python
# Works on ANY site
title = extract_title(soup)    # Tries 4 different methods
author = extract_author(soup)  # Tries 5 different methods
```
✅ Adapts to different HTML structures
✅ Falls back to alternatives
✅ Works across multiple sites

## 🛡️ Robustness Features

### 1. Multiple Strategies
Each field has 4-6 extraction strategies:
```python
def extract_title(soup):
    # Try strategy 1
    if h1 := soup.find('h1'):
        return h1.text

    # Try strategy 2
    if og_title := soup.find('meta', property='og:title'):
        return og_title['content']

    # Try strategy 3...
    # Try strategy 4...
```

### 2. Validation
```python
# Title must be reasonable length
if title and len(title) > 10:
    return title

# Author must be < 100 chars
if author and len(author) < 100:
    return author
```

### 3. Cleaning
```python
# Remove site name from title
if ' | ' in title:
    title = title.split(' | ')[0]

# Remove "By" from author
author = author.replace('By ', '').strip()
```

### 4. Error Handling
```python
from requests.exceptions import Timeout, RequestException

try:
    data = extract_article_content(url)
except Timeout:
    print("Timeout - skip")
except RequestException:
    print("Network error - skip")
except Exception:
    print("Unknown error - skip")
```

## 📈 Success Metrics

After crawling, you'll see:

```
📰 Crawling feed: Süddeutsche Zeitung München
  🔍 Crawling: New U-Bahn Line Opens...
    ✓ Saved (1250 words)

Title:   ✓ Found
Author:  ✓ Found (Max Mustermann)
Date:    ✓ Found (2024-11-10T10:00:00Z)
Content: ✓ Found (1250 words)
```

## 🗄️ Database Result

**Before Crawling:**
```javascript
{
  title: "New U-Bahn Line Opens",
  link: "https://example.com/article",
  summary: "Short RSS summary...",
  source: "Süddeutsche Zeitung"
}
```

**After Crawling:**
```javascript
{
  title: "New U-Bahn Line Opens in Munich",        // ← Enhanced
  author: "Max Mustermann",                        // ← NEW!
  link: "https://example.com/article",
  summary: "Short RSS summary...",
  full_content: "The new U-Bahn line...",          // ← NEW! (1250 words)
  word_count: 1250,                                // ← NEW!
  source: "Süddeutsche Zeitung",
  published_at: "2024-11-10T10:00:00Z",            // ← Enhanced
  crawled_at: ISODate("2024-11-10T16:30:00Z"),     // ← NEW!
  created_at: ISODate("2024-11-10T16:00:00Z")
}
```

## 🚀 Running the Crawler

```bash
cd news_crawler
pip install -r requirements.txt
python crawler_service.py 10
```

Output:
```
============================================================
🚀 Starting RSS Feed Crawler
============================================================
Found 3 active feed(s)

📰 Crawling feed: Süddeutsche Zeitung München
  🔍 Crawling: New U-Bahn Line Opens...
    ✓ Saved (1250 words)
  🔍 Crawling: Munich Weather Update...
    ✓ Saved (450 words)
  ✓ Crawled 2 articles

============================================================
✓ Crawling Complete!
  Total feeds processed: 3
  Total articles crawled: 15
  Duration: 45.23 seconds
============================================================
```

Now you have rich, structured article data ready for AI processing! 🎉
@@ -1,127 +0,0 @@
# News Crawler - Quick Start

## 1. Install Dependencies

```bash
cd news_crawler
pip install -r requirements.txt
```

## 2. Configure Environment

Make sure MongoDB is running and accessible. The crawler will use the same database as the backend.

Default connection: `mongodb://localhost:27017/`

To use a different MongoDB URI, create a `.env` file:
```env
MONGODB_URI=mongodb://localhost:27017/
```

## 3. Run the Crawler

```bash
# Crawl up to 10 articles per feed
python crawler_service.py

# Crawl up to 20 articles per feed
python crawler_service.py 20
```

## 4. Verify Results

Check your MongoDB database:

```bash
# Using mongosh
mongosh
use munich_news
db.articles.countDocuments({full_content: {$exists: true}})
db.articles.findOne({full_content: {$exists: true}})
```

## 5. Schedule Regular Crawling

### Option A: Cron (Linux/Mac)

```bash
# Edit crontab
crontab -e

# Add this line to run every 6 hours
0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py
```

### Option B: Docker

```bash
# Build and run
docker-compose up

# Or run as a one-off
docker-compose run --rm crawler
```

### Option C: Manual

Just run the script whenever you want to fetch new articles:

```bash
python crawler_service.py
```

## What Gets Crawled?

The crawler (a condensed sketch follows this list):
1. Fetches all active RSS feeds from the database
2. For each feed, gets the latest articles
3. Crawls the full content from each article URL
4. Saves: title, full_content, word_count, crawled_at
5. Skips articles that already have content
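
It reuses `extract_article_content()` from `crawler_service.py`; the other names and the exact update shape below are illustrative, not the verbatim implementation:

```python
import time

import feedparser
from crawler_service import extract_article_content

def crawl_feed(feed, articles_collection, max_articles=10):
    parsed = feedparser.parse(feed['url'])
    for entry in parsed.entries[:max_articles]:
        url = entry.get('link')
        if not url:
            continue
        # skip articles whose content was already crawled
        if articles_collection.find_one({'link': url, 'full_content': {'$exists': True}}):
            continue
        data = extract_article_content(url)
        if not data:
            continue
        articles_collection.update_one(
            {'link': url},
            {'$set': {**data, 'source': feed['name']}},
            upsert=True,
        )
        time.sleep(1)  # rate limiting between requests
```
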

## Output Example

```
============================================================
🚀 Starting RSS Feed Crawler
============================================================
Found 3 active feed(s)

📰 Crawling feed: Süddeutsche Zeitung München
   URL: https://www.sueddeutsche.de/muenchen/rss
  🔍 Crawling: New U-Bahn Line Opens in Munich...
    ✓ Saved (1250 words)
  🔍 Crawling: Munich Weather Update...
    ✓ Saved (450 words)
  ✓ Crawled 2 articles from Süddeutsche Zeitung München

============================================================
✓ Crawling Complete!
  Total feeds processed: 3
  Total articles crawled: 15
  Duration: 45.23 seconds
============================================================
```

## Troubleshooting

**No feeds found:**
- Make sure you've added RSS feeds via the backend API
- Check MongoDB connection

**Can't extract content:**
- Some sites block scrapers
- Some sites require JavaScript (not supported yet)
- Check if the URL is accessible

**Timeout errors:**
- Increase timeout in the code
- Check your internet connection

## Next Steps

Once articles are crawled, you can:
- View them in the frontend
- Use Ollama to summarize them
- Generate newsletters with full content
- Perform text analysis
@@ -1,225 +0,0 @@
# News Crawler Microservice

A standalone microservice that crawls full article content from RSS feeds and stores it in MongoDB.

## Features

- 🔍 Extracts full article content from RSS feed links
- 📊 Calculates word count
- 🔄 Avoids re-crawling already processed articles
- ⏱️ Rate limiting (1 second delay between requests)
- 🎯 Smart content extraction using multiple selectors
- 🧹 Cleans up scripts, styles, and navigation elements

## Installation

1. Create a virtual environment:
   ```bash
   python -m venv venv
   source venv/bin/activate  # On Windows: venv\Scripts\activate
   ```

2. Install dependencies:
   ```bash
   pip install -r requirements.txt
   ```

3. Configure environment variables:
   Create a `.env` file in the project root (or use the backend's `.env`):
   ```env
   MONGODB_URI=mongodb://localhost:27017/
   ```

## Usage

### Standalone Execution

Run the crawler directly:

```bash
# Crawl up to 10 articles per feed (default)
python crawler_service.py

# Crawl up to 20 articles per feed
python crawler_service.py 20
```

### As a Module

```python
from crawler_service import crawl_all_feeds, crawl_rss_feed

# Crawl all active feeds
result = crawl_all_feeds(max_articles_per_feed=10)
print(result)

# Crawl a specific feed
crawl_rss_feed(
    feed_url='https://example.com/rss',
    feed_name='Example News',
    max_articles=10
)
```

### Via Backend API

The backend has integrated endpoints:

```bash
# Start crawler
curl -X POST http://localhost:5001/api/crawler/start

# Check status
curl http://localhost:5001/api/crawler/status

# Crawl specific feed
curl -X POST http://localhost:5001/api/crawler/feed/<feed_id>
```

## How It Works

1. **Fetch RSS Feeds**: Gets all active RSS feeds from MongoDB
2. **Parse Feed**: Extracts article links from each feed
3. **Crawl Content**: For each article:
   - Fetches HTML page
   - Removes scripts, styles, navigation
   - Extracts main content using smart selectors
   - Calculates word count
4. **Store Data**: Saves to MongoDB with metadata
5. **Skip Duplicates**: Avoids re-crawling articles with existing content

## Content Extraction Strategy

The crawler tries multiple selectors in order (see the sketch after this list):

1. `<article>` tag
2. Elements with class containing "article-content", "article-body"
3. Elements with class containing "post-content", "entry-content"
4. `<main>` tag
5. Fallback to all `<p>` tags in body
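
A sketch of that fallback chain (selector list abbreviated; the real implementation lives in `crawler_service.py`):

```python
def find_content_container(soup):
    """Return the first element matching the selector chain, or the page body."""
    selectors = [
        'article',
        '[class*="article-content"]', '[class*="article-body"]',
        '[class*="post-content"]', '[class*="entry-content"]',
        'main',
    ]
    for selector in selectors:
        container = soup.select_one(selector)
        if container:
            return container
    return soup.body  # last resort: all <p> tags in the body
```
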

## Database Schema

Articles are stored with these fields:

```javascript
{
  title: String,          // Article title
  link: String,           // Article URL (unique)
  summary: String,        // Short summary
  full_content: String,   // Full article text (max 10,000 chars)
  word_count: Number,     // Number of words
  source: String,         // RSS feed name
  published_at: String,   // Publication date
  crawled_at: DateTime,   // When content was crawled
  created_at: DateTime    // When added to database
}
```

## Scheduling

### Using Cron (Linux/Mac)

```bash
# Run every 6 hours
0 */6 * * * cd /path/to/news_crawler && /path/to/venv/bin/python crawler_service.py
```

### Using systemd Timer (Linux)

Create `/etc/systemd/system/news-crawler.service`:
```ini
[Unit]
Description=News Crawler Service

[Service]
Type=oneshot
WorkingDirectory=/path/to/news_crawler
ExecStart=/path/to/venv/bin/python crawler_service.py
User=your-user
```

Create `/etc/systemd/system/news-crawler.timer`:
```ini
[Unit]
Description=Run News Crawler every 6 hours

[Timer]
OnBootSec=5min
OnUnitActiveSec=6h

[Install]
WantedBy=timers.target
```

Enable and start:
```bash
sudo systemctl enable news-crawler.timer
sudo systemctl start news-crawler.timer
```

### Using Docker

Create `Dockerfile`:
```dockerfile
FROM python:3.11-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY crawler_service.py .

CMD ["python", "crawler_service.py"]
```

Build and run:
```bash
docker build -t news-crawler .
docker run --env-file ../.env news-crawler
```

## Configuration

Environment variables:

- `MONGODB_URI` - MongoDB connection string (default: `mongodb://localhost:27017/`)

## Rate Limiting

- 1 second delay between article requests
- Respects server resources
- User-Agent header included
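
In code, the rate limiting and the User-Agent header amount to roughly the following (a sketch; the exact header string and timeout may differ):

```python
import time

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (compatible; MunichNewsCrawler/1.0)'}
article_urls = ['https://example.com/article-1', 'https://example.com/article-2']

for url in article_urls:
    response = requests.get(url, headers=HEADERS, timeout=10)
    # ... extract and store the content here ...
    time.sleep(1)  # wait 1 second between requests
```
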

## Troubleshooting

**Issue: Can't extract content**
- Some sites block scrapers
- Try adjusting User-Agent header
- Some sites require JavaScript (consider Selenium)

**Issue: Timeout errors**
- Increase timeout in `extract_article_content()`
- Check network connectivity

**Issue: Memory usage**
- Reduce `max_articles_per_feed`
- Content limited to 10,000 characters per article

## Architecture

This is a standalone microservice that:
- Can run independently of the main backend
- Shares the same MongoDB database
- Can be deployed separately
- Can be scheduled independently

## Next Steps

Once articles are crawled, you can:
- Use Ollama to summarize articles
- Perform sentiment analysis
- Extract keywords and topics
- Generate newsletter content
- Create article recommendations
@@ -1,194 +0,0 @@
# RSS URL Extraction - How It Works

## The Problem

Different RSS feed providers use different fields to store the article URL:

### Example 1: Standard RSS (uses `link`)
```xml
<item>
  <title>Article Title</title>
  <link>https://example.com/article/123</link>
  <guid>internal-id-456</guid>
</item>
```

### Example 2: Some feeds (use `guid` as the URL)
```xml
<item>
  <title>Article Title</title>
  <guid>https://example.com/article/123</guid>
</item>
```

### Example 3: Atom feeds (use `id`)
```xml
<entry>
  <title>Article Title</title>
  <id>https://example.com/article/123</id>
</entry>
```

### Example 4: Complex feeds (guid as object)
```xml
<item>
  <title>Article Title</title>
  <guid isPermaLink="true">https://example.com/article/123</guid>
</item>
```

### Example 5: Multiple links
```xml
<item>
  <title>Article Title</title>
  <link rel="alternate" type="text/html" href="https://example.com/article/123"/>
  <link rel="enclosure" type="image/jpeg" href="https://example.com/image.jpg"/>
</item>
```

## Our Solution

The `extract_article_url()` function tries multiple strategies in order:

### Strategy 1: Check `link` field (most common)
```python
if entry.get('link') and entry.get('link', '').startswith('http'):
    return entry.get('link')
```
✅ Works for: Most RSS 2.0 feeds

### Strategy 2: Check `guid` field
```python
if entry.get('guid'):
    guid = entry.get('guid')
    # guid can be a string
    if isinstance(guid, str) and guid.startswith('http'):
        return guid
    # or a dict with 'href'
    elif isinstance(guid, dict) and guid.get('href', '').startswith('http'):
        return guid.get('href')
```
✅ Works for: Feeds that use GUID as permalink

### Strategy 3: Check `id` field
```python
if entry.get('id') and entry.get('id', '').startswith('http'):
    return entry.get('id')
```
✅ Works for: Atom feeds

### Strategy 4: Check `links` array
```python
if entry.get('links'):
    for link in entry.get('links', []):
        if isinstance(link, dict) and link.get('href', '').startswith('http'):
            # Prefer 'alternate' type
            if link.get('type') == 'text/html' or link.get('rel') == 'alternate':
                return link.get('href')
```
✅ Works for: Feeds with multiple links (prefers HTML content)

## Real-World Examples

### Süddeutsche Zeitung
```python
entry = {
    'title': 'Munich News',
    'link': 'https://www.sueddeutsche.de/muenchen/article-123',
    'guid': 'sz-internal-123'
}
# Returns: 'https://www.sueddeutsche.de/muenchen/article-123'
```

### Medium Blog
```python
entry = {
    'title': 'Blog Post',
    'guid': 'https://medium.com/@user/post-abc123',
    'link': None
}
# Returns: 'https://medium.com/@user/post-abc123'
```

### YouTube RSS
```python
entry = {
    'title': 'Video Title',
    'id': 'https://www.youtube.com/watch?v=abc123',
    'link': None
}
# Returns: 'https://www.youtube.com/watch?v=abc123'
```

### Complex Feed
```python
entry = {
    'title': 'Article',
    'links': [
        {'rel': 'alternate', 'type': 'text/html', 'href': 'https://example.com/article'},
        {'rel': 'enclosure', 'type': 'image/jpeg', 'href': 'https://example.com/image.jpg'}
    ]
}
# Returns: 'https://example.com/article' (prefers text/html)
```

## Validation

All extracted URLs must:
1. Start with `http://` or `https://`
2. Be a valid string (not None or empty)

If no valid URL is found:
```python
return None
# Crawler will skip this entry and log a warning
```
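
Putting the four strategies together, a compact version of `extract_article_url()` could look like this (a sketch that mirrors the snippets above, not necessarily the exact code in `rss_utils.py`):

```python
def extract_article_url(entry):
    """Return the first usable http(s) URL from a feedparser entry, or None."""
    # Strategy 1: plain link field
    link = entry.get('link')
    if isinstance(link, str) and link.startswith('http'):
        return link

    # Strategy 2: guid as a string or as a dict with 'href'
    guid = entry.get('guid')
    if isinstance(guid, str) and guid.startswith('http'):
        return guid
    if isinstance(guid, dict) and guid.get('href', '').startswith('http'):
        return guid['href']

    # Strategy 3: Atom id field
    entry_id = entry.get('id')
    if isinstance(entry_id, str) and entry_id.startswith('http'):
        return entry_id

    # Strategy 4: links array, preferring the HTML alternate link
    for item in entry.get('links', []):
        if not isinstance(item, dict):
            continue
        href = item.get('href', '')
        if href.startswith('http') and (
            item.get('type') == 'text/html' or item.get('rel') == 'alternate'
        ):
            return href

    return None
```
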

## Testing Different Feeds

To test if a feed works with our extractor:

```python
import feedparser
from rss_utils import extract_article_url

# Parse feed
feed = feedparser.parse('https://example.com/rss')

# Test each entry
for entry in feed.entries[:5]:
    url = extract_article_url(entry)
    if url:
        print(f"✓ {entry.get('title', 'No title')[:50]}")
        print(f"  URL: {url}")
    else:
        print(f"✗ {entry.get('title', 'No title')[:50]}")
        print(f"  No valid URL found")
        print(f"  Available fields: {list(entry.keys())}")
```

## Supported Feed Types

- ✅ RSS 2.0
- ✅ RSS 1.0
- ✅ Atom
- ✅ Custom RSS variants
- ✅ Feeds with multiple links
- ✅ Feeds with GUID as permalink

## Edge Cases Handled

1. **GUID is not a URL**: Checks if it starts with `http`
2. **Multiple links**: Prefers `text/html` type
3. **GUID as dict**: Extracts `href` field
4. **Missing fields**: Returns None instead of crashing
5. **Non-HTTP URLs**: Filters out `mailto:`, `ftp:`, etc.

## Future Improvements

Potential enhancements:
- [ ] Support for `feedburner:origLink`
- [ ] Support for `pheedo:origLink`
- [ ] Resolve shortened URLs (bit.ly, etc.)
- [ ] Handle relative URLs (convert to absolute)
- [ ] Cache URL extraction results
@@ -1,33 +0,0 @@
version: '3.8'

services:
  crawler:
    build: .
    container_name: news-crawler
    environment:
      - MONGODB_URI=mongodb://mongodb:27017/
    networks:
      - munich-news-network
    depends_on:
      - mongodb
    # Run once and exit
    restart: "no"

  mongodb:
    image: mongo:7.0
    container_name: munich-news-mongodb
    restart: unless-stopped
    ports:
      - "27017:27017"
    volumes:
      - mongodb_data:/data/db
    networks:
      - munich-news-network

volumes:
  mongodb_data:
    driver: local

networks:
  munich-news-network:
    driver: bridge
@@ -4,3 +4,5 @@ requests==2.31.0
feedparser==6.0.10
pymongo==4.6.1
python-dotenv==1.0.0
schedule==1.2.0
pytz==2023.3
news_crawler/scheduled_crawler.py (new executable file, 75 lines)
@@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""
Scheduled crawler that runs daily at 6 AM Berlin time
"""
import schedule
import time
from datetime import datetime
import pytz
from crawler_service import crawl_all_feeds

# Berlin timezone
BERLIN_TZ = pytz.timezone('Europe/Berlin')

def run_crawler():
    """Run the crawler and log the execution"""
    berlin_time = datetime.now(BERLIN_TZ)
    print(f"\n{'='*60}")
    print(f"🕐 Scheduled crawler started at {berlin_time.strftime('%Y-%m-%d %H:%M:%S %Z')}")
    print(f"{'='*60}\n")

    try:
        # Run crawler with max 20 articles per feed
        result = crawl_all_feeds(max_articles_per_feed=20)

        print(f"\n{'='*60}")
        print(f"✓ Scheduled crawler completed successfully")
        print(f" Articles crawled: {result['total_articles_crawled']}")
        print(f" Duration: {result['duration_seconds']}s")
        print(f"{'='*60}\n")

    except Exception as e:
        print(f"\n{'='*60}")
        print(f"✗ Scheduled crawler failed: {e}")
        print(f"{'='*60}\n")

def main():
    """Main scheduler loop"""
    print("🤖 Munich News Crawler Scheduler")
    print("="*60)
    print("Schedule: Daily at 6:00 AM Berlin time")
    print("Timezone: Europe/Berlin (CET/CEST)")
    print("="*60)

    # Schedule the crawler to run at 6 AM Berlin time
    schedule.every().day.at("06:00").do(run_crawler)

    # Show next run time
    berlin_time = datetime.now(BERLIN_TZ)
    print(f"\nCurrent time (Berlin): {berlin_time.strftime('%Y-%m-%d %H:%M:%S %Z')}")

    # Get next scheduled run
    next_run = schedule.next_run()
    if next_run:
        # Convert to Berlin time for display
        next_run_berlin = next_run.astimezone(BERLIN_TZ)
        print(f"Next scheduled run: {next_run_berlin.strftime('%Y-%m-%d %H:%M:%S %Z')}")

    print("\n⏳ Scheduler is running... (Press Ctrl+C to stop)\n")

    # Run immediately on startup (optional - comment out if you don't want this)
    print("🚀 Running initial crawl on startup...")
    run_crawler()

    # Keep the scheduler running
    while True:
        schedule.run_pending()
        time.sleep(60)  # Check every minute

if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n👋 Scheduler stopped by user")
    except Exception as e:
        print(f"\n\n✗ Scheduler error: {e}")
@@ -1,83 +0,0 @@
#!/usr/bin/env python
"""
Test script to verify crawler functionality
"""
from crawler_service import extract_article_content, get_active_rss_feeds
import sys


def test_content_extraction():
    """Test content extraction from a sample URL"""
    print("Testing content extraction...")

    # Test with a simple news site
    test_url = "https://www.bbc.com/news"

    print(f"Extracting content from: {test_url}")
    result = extract_article_content(test_url, timeout=10)

    if result:
        print("✓ Content extraction successful!")
        print(f" Title: {result.get('title', 'N/A')[:50]}...")
        print(f" Content length: {len(result.get('content', ''))} chars")
        print(f" Word count: {result.get('word_count', 0)}")
        return True
    else:
        print("✗ Content extraction failed")
        return False


def test_database_connection():
    """Test MongoDB connection"""
    print("\nTesting database connection...")

    try:
        feeds = get_active_rss_feeds()
        print(f"✓ Database connection successful!")
        print(f" Found {len(feeds)} active RSS feed(s)")

        if feeds:
            print("\n Active feeds:")
            for feed in feeds:
                print(f" - {feed['name']}: {feed['url']}")
        else:
            print("\n ⚠ No active feeds found. Add feeds via the backend API:")
            print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
            print(" -H 'Content-Type: application/json' \\")
            print(" -d '{\"name\": \"Test Feed\", \"url\": \"https://example.com/rss\"}'")

        return True
    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return False


def main():
    print("="*60)
    print("News Crawler - Test Suite")
    print("="*60 + "\n")

    # Test database connection
    db_ok = test_database_connection()

    # Test content extraction
    extract_ok = test_content_extraction()

    print("\n" + "="*60)
    print("Test Results:")
    print(f" Database Connection: {'✓ PASS' if db_ok else '✗ FAIL'}")
    print(f" Content Extraction: {'✓ PASS' if extract_ok else '✗ FAIL'}")
    print("="*60 + "\n")

    if db_ok and extract_ok:
        print("✓ All tests passed! Crawler is ready to use.")
        print("\nRun the crawler with:")
        print(" python crawler_service.py")
        return 0
    else:
        print("✗ Some tests failed. Please check the errors above.")
        return 1


if __name__ == '__main__':
    sys.exit(main())
@@ -1,129 +0,0 @@
#!/usr/bin/env python
"""
Test script for Ollama integration
Tests connection, configuration, and summarization
"""
from config import Config
from ollama_client import OllamaClient

print("\n" + "="*70)
print("Ollama Integration Test")
print("="*70)

# Print configuration
Config.print_config()

# Validate configuration
issues = Config.validate()
if issues:
    print("⚠ Configuration Issues:")
    for issue in issues:
        print(f" - {issue}")
    print()

# Initialize client
client = OllamaClient(
    base_url=Config.OLLAMA_BASE_URL,
    model=Config.OLLAMA_MODEL,
    api_key=Config.OLLAMA_API_KEY,
    enabled=Config.OLLAMA_ENABLED,
    timeout=Config.OLLAMA_TIMEOUT
)

# Test 1: Check if Ollama is enabled
print("Test 1: Configuration Check")
print(f" Ollama Enabled: {Config.OLLAMA_ENABLED}")
if not Config.OLLAMA_ENABLED:
    print(" ⚠ Ollama is disabled. Set OLLAMA_ENABLED=true in .env to enable.")
    print("\n" + "="*70)
    exit(0)

# Test 2: Test connection
print("\nTest 2: Connection Test")
conn_result = client.test_connection()
print(f" Available: {conn_result['available']}")
print(f" Current Model: {conn_result['current_model']}")

if conn_result['available']:
    print(f" ✓ Connected to Ollama server")
    if conn_result['models']:
        print(f" Available models: {', '.join(conn_result['models'][:5])}")
        if conn_result['current_model'] not in conn_result['models']:
            print(f" ⚠ Warning: Model '{conn_result['current_model']}' not found in available models")
else:
    print(f" ✗ Connection failed: {conn_result['error']}")
    print("\n" + "="*70)
    exit(1)

# Test 3: Test summarization with sample article
print("\nTest 3: Summarization Test")
print(" Testing with sample German article...")

sample_article = """
Die neue U-Bahn-Linie, die das Münchner Stadtzentrum mit dem Flughafen verbindet, wurde heute eröffnet.
Oberbürgermeister Dieter Reiter nahm zusammen mit hunderten Anwohnern an der Eröffnungszeremonie teil.
Die Linie wird die Reisezeit zwischen dem Flughafen und der Münchner Innenstadt erheblich verkürzen.
Der Bau dauerte fünf Jahre und kostete etwa 2 Milliarden Euro. Die neue Linie umfasst 10 Stationen
und verkehrt während der Hauptverkehrszeiten alle 10 Minuten. Experten erwarten, dass die neue Verbindung
den Verkehr in der Stadt deutlich entlasten wird. Die Münchner Verkehrsgesellschaft rechnet mit täglich
über 50.000 Fahrgästen auf der neuen Strecke.
"""

result = client.summarize_article(sample_article, max_words=Config.SUMMARY_MAX_WORDS)

print(f"\n Success: {result['success']}")
if result['success']:
    print(f" ✓ Summarization successful!")
    print(f"\n Original word count: {result['original_word_count']}")
    print(f" Summary word count: {result['summary_word_count']}")
    print(f" Compression ratio: {result['original_word_count'] / max(result['summary_word_count'], 1):.1f}x")
    print(f" Duration: {result['duration']:.2f}s")
    print(f"\n Summary (English):")
    print(f" {'-'*70}")
    print(f" {result['summary']}")
    print(f" {'-'*70}")
else:
    print(f" ✗ Summarization failed: {result['error']}")

# Test 4: Test with English article
print("\nTest 4: English Article Test")
print(" Testing with English article...")

english_article = """
The city council approved a new bike lane network spanning 50 kilometers across Munich.
The project aims to promote sustainable transportation and reduce car traffic in the city center.
Construction will begin next month and is expected to be completed within two years.
The bike lanes will connect major residential areas with business districts and public transport hubs.
Environmental groups have praised the initiative as a significant step toward carbon neutrality.
"""

result2 = client.summarize_article(english_article, max_words=50)

print(f"\n Success: {result2['success']}")
if result2['success']:
    print(f" ✓ Summarization successful!")
    print(f" Original: {result2['original_word_count']} words → Summary: {result2['summary_word_count']} words")
    print(f" Duration: {result2['duration']:.2f}s")
    print(f"\n Summary:")
    print(f" {result2['summary']}")
else:
    print(f" ✗ Summarization failed: {result2['error']}")

# Summary
print("\n" + "="*70)
print("Test Summary")
print("="*70)
print(f"✓ Configuration: Valid")
print(f"✓ Connection: {'Success' if conn_result['available'] else 'Failed'}")
print(f"✓ German→English: {'Success' if result['success'] else 'Failed'}")
print(f"✓ English→English: {'Success' if result2['success'] else 'Failed'}")
print("="*70)

if result['success'] and result2['success']:
    print("\n🎉 All tests passed! Ollama integration is working correctly.")
    print("\nYou can now run the crawler with AI summarization:")
    print(" python crawler_service.py 5")
else:
    print("\n⚠ Some tests failed. Check the errors above.")

print()
@@ -1,154 +0,0 @@
#!/usr/bin/env python
"""
Test script to verify RSS feed URL extraction
Tests actual feeds from the database
"""
import feedparser
from pymongo import MongoClient
import os
from dotenv import load_dotenv
from rss_utils import extract_article_url, extract_article_summary, extract_published_date

# Load environment variables
load_dotenv(dotenv_path='../.env')

# MongoDB setup
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017/')
DB_NAME = 'munich_news'

client = MongoClient(MONGODB_URI)
db = client[DB_NAME]
rss_feeds_collection = db['rss_feeds']


def test_feed(feed_name, feed_url):
    """Test a single RSS feed"""
    print(f"\n{'='*70}")
    print(f"Testing: {feed_name}")
    print(f"URL: {feed_url}")
    print('='*70)

    try:
        # Parse the feed
        print("Fetching RSS feed...")
        feed = feedparser.parse(feed_url)

        if not feed.entries:
            print("❌ No entries found in feed")
            return False

        print(f"✓ Found {len(feed.entries)} entries\n")

        # Test first 5 entries
        success_count = 0
        fail_count = 0

        for i, entry in enumerate(feed.entries[:5], 1):
            print(f"\n--- Entry {i} ---")
            print(f"Title: {entry.get('title', 'No title')[:60]}")

            # Test URL extraction
            article_url = extract_article_url(entry)
            if article_url:
                print(f"✓ URL: {article_url}")
                success_count += 1
            else:
                print(f"❌ No valid URL found")
                print(f" Available fields: {list(entry.keys())}")
                print(f" link: {entry.get('link', 'N/A')}")
                print(f" guid: {entry.get('guid', 'N/A')}")
                print(f" id: {entry.get('id', 'N/A')}")
                fail_count += 1

            # Test summary extraction
            summary = extract_article_summary(entry)
            if summary:
                print(f"✓ Summary: {summary[:80]}...")
            else:
                print(f"⚠ No summary found")

            # Test date extraction
            pub_date = extract_published_date(entry)
            if pub_date:
                print(f"✓ Published: {pub_date}")
            else:
                print(f"⚠ No published date found")

        print(f"\n{'='*70}")
        print(f"Results for {feed_name}:")
        print(f" ✓ Success: {success_count}/5")
        print(f" ❌ Failed: {fail_count}/5")
        print('='*70)

        return fail_count == 0

    except Exception as e:
        print(f"❌ Error testing feed: {e}")
        return False


def main():
    print("\n" + "="*70)
    print("RSS Feed URL Extraction Test")
    print("="*70)

    # Get all RSS feeds from database
    print("\nFetching RSS feeds from database...")
    feeds = list(rss_feeds_collection.find())

    if not feeds:
        print("❌ No RSS feeds found in database")
        print("\nAdd feeds using:")
        print(" curl -X POST http://localhost:5001/api/rss-feeds \\")
        print(" -H 'Content-Type: application/json' \\")
        print(" -d '{\"name\": \"Feed Name\", \"url\": \"https://example.com/rss\"}'")
        return 1

    print(f"✓ Found {len(feeds)} feed(s) in database\n")

    # Test each feed
    results = {}
    for feed in feeds:
        feed_name = feed.get('name', 'Unknown')
        feed_url = feed.get('url', '')
        active = feed.get('active', True)

        if not active:
            print(f"\n⏭ Skipping inactive feed: {feed_name}")
            continue

        if not feed_url:
            print(f"\n❌ Feed '{feed_name}' has no URL")
            results[feed_name] = False
            continue

        results[feed_name] = test_feed(feed_name, feed_url)

    # Summary
    print("\n" + "="*70)
    print("FINAL RESULTS")
    print("="*70)

    for feed_name, success in results.items():
        status = "✓ PASS" if success else "❌ FAIL"
        print(f"{status} - {feed_name}")

    total = len(results)
    passed = sum(1 for s in results.values() if s)

    print(f"\nTotal: {passed}/{total} feeds passed")
    print("="*70 + "\n")

    if passed == total:
        print("✓ All feeds are working correctly!")
        print("\nYou can now run the crawler:")
        print(" python crawler_service.py")
        return 0
    else:
        print("⚠ Some feeds have issues. Check the output above.")
        return 1


if __name__ == '__main__':
    import sys
    sys.exit(main())