This commit is contained in:
2025-11-12 16:26:59 +01:00
parent fe3e502912
commit 804a751fdf
5 changed files with 234 additions and 1 deletions

View File

@@ -112,6 +112,9 @@ class OllamaClient:
'duration': time.time() - start_time
}
# Clean markdown formatting from summary
summary = self._clean_markdown(summary)
summary_word_count = len(summary.split())
return {
@@ -303,6 +306,35 @@ German headline:
return translation
def _clean_markdown(self, text):
    """Remove common markdown formatting from text and return plain prose.

    Handles ATX headers, list markers, bold/italic emphasis, links, images
    and inline code. Emphasis markers are only stripped when they actually
    delimit text, so snake_case identifiers, URLs containing underscores
    and spaced asterisks (e.g. "2 * 3") are left untouched.

    Args:
        text: The text to clean. May be None or empty.

    Returns:
        The cleaned text with surrounding whitespace stripped; an empty
        string when ``text`` is None or empty.
    """
    import re

    if not text:
        return ""

    # Remove markdown headers (##, ###, etc.). Only horizontal whitespace is
    # consumed so the pattern can never swallow a following line break.
    text = re.sub(r'^#{1,6}[ \t]+', '', text, flags=re.MULTILINE)

    # Remove bullet points and numbered list markers. This runs before the
    # emphasis passes so a "* item" bullet is not mistaken for an opening
    # italic marker, and uses [ \t] instead of \s so the blank line in front
    # of a list (the paragraph break) is preserved.
    text = re.sub(r'^[ \t]*[-*+][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

    # Remove bold markers (**text**, __text__). The delimiters must hug
    # non-space characters; the underscore form must also sit on a word
    # boundary so identifiers are not altered.
    text = re.sub(r'\*\*(?=\S)(.+?)(?<=\S)\*\*', r'\1', text, flags=re.DOTALL)
    text = re.sub(
        r'(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)', r'\1', text, flags=re.DOTALL
    )

    # Remove italic markers (*text*, _text_) under the same rules.
    text = re.sub(r'\*(?=\S)([^*]+?)(?<=\S)\*', r'\1', text)
    text = re.sub(r'(?<!\w)_(?=\S)([^_]+?)(?<=\S)_(?!\w)', r'\1', text)

    # Remove markdown links and images: [text](url) / ![alt](url) -> text
    text = re.sub(r'!?\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Remove inline code `text`
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Collapse runs of blank lines into a single paragraph break
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()
def _build_summarization_prompt(self, content, max_words):
"""Build prompt for article summarization"""
# Truncate content if too long (keep first 5000 words)
@@ -319,6 +351,8 @@ Write in the clear, engaging, and authoritative style of New York Times Magazine
- Focus on what matters to readers
- Even if the source is in German or another language, write your summary entirely in English
IMPORTANT: Write in plain text only. Do NOT use markdown formatting (no ##, **, *, bullets, etc.). Just write natural prose.
Article:
{content}