Files
media-downloader/web/backend/routers/press.py
Todd 523f91788e Fix DB paths, add auth to sensitive endpoints, misc bug fixes
- scheduler.py: Use full path for scheduler_state.db instead of relative name
- recycle.py: Use full path for thumbnails.db instead of relative name
- cloud_backup.py, maintenance.py, stats.py: Require admin for config/cleanup/settings endpoints
- press.py: Add auth to press image serving endpoint
- private_gallery.py: Fix _create_pg_job call and add missing secrets import
- appearances.py: Use sync httpx instead of asyncio.run for background thread HTTP call

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-30 08:25:00 -04:00

1099 lines
38 KiB
Python

"""
Press Router
Monitors Google News RSS for news articles mentioning tracked celebrities
(article dicts keep GDELT-compatible field names such as ``seendate``).
Stores complete articles and sends Pushover push notifications.
"""
import asyncio
import hashlib
import json
import threading
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import Dict, List, Optional
import urllib.error
from urllib.parse import urlparse
from pathlib import Path
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import FileResponse
from pydantic import BaseModel
from slowapi import Limiter
from slowapi.util import get_remote_address
from ..core.dependencies import get_current_user, get_app_state
from ..core.exceptions import handle_exceptions
from modules.universal_logger import get_logger
# Module-wide logger obtained from the project logger factory under the 'API' name.
logger = get_logger('API')
# Every route declared in this module is mounted under the /api/press prefix.
router = APIRouter(prefix="/api/press", tags=["Press"])
# Rate limiter keyed on the caller's remote address.
limiter = Limiter(key_func=get_remote_address)
# Thread pool for blocking operations
_executor = ThreadPoolExecutor(max_workers=2)
# Track running fetch jobs: _fetch_running is the flag, _fetch_lock guards
# the places in this module that read or write it.
_fetch_lock = threading.Lock()
_fetch_running = False
# ============================================================================
# PYDANTIC MODELS
# ============================================================================
class PressConfigUpdate(BaseModel):
    """Partial update payload for the press monitor configuration.

    Every field is optional. The update endpoint dumps this model with
    ``exclude_none=True``, so a field left as ``None`` is simply not written.
    """
    # Master on/off switch (presumably read by the scheduler; not read in this module).
    enabled: Optional[bool] = None
    # Interval between scheduled checks, in hours (presumably consumed by the scheduler).
    check_interval_hours: Optional[int] = None
    # Upper bound on articles taken from one news query per celebrity.
    max_records_per_query: Optional[int] = None
    # Whether a push notification is sent when new articles are stored.
    notify_new_articles: Optional[bool] = None
    # IDs of the celebrity profiles to monitor; persisted as a JSON string.
    celebrity_ids: Optional[List[int]] = None
class ReadStatusUpdate(BaseModel):
    """Request body for marking a single article as read or unread."""
    # True marks the article as read, False as unread.
    read: bool
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def _get_db():
    """Return the database handle stored on the shared application state."""
    return get_app_state().db
def _get_config(db) -> Dict:
    """Load the press monitor settings (row ``id = 1`` of ``press_config``).

    Args:
        db: Database wrapper exposing ``get_connection()`` as a context manager.

    Returns:
        A dict with the keys ``enabled``, ``check_interval_hours``,
        ``max_records_per_query``, ``notify_new_articles`` and
        ``celebrity_ids``. Built-in defaults are returned when the row is
        missing; ``celebrity_ids`` falls back to an empty list when the
        column is absent, empty, or not valid JSON.
    """
    with db.get_connection() as conn:
        cur = conn.cursor()
        cur.execute('SELECT * FROM press_config WHERE id = 1')
        record = cur.fetchone()

        if not record:
            return {
                'enabled': True,
                'check_interval_hours': 6,
                'max_records_per_query': 25,
                'notify_new_articles': True,
                'celebrity_ids': [],
            }

        # The column may not exist on older schemas, hence the membership check.
        raw_ids = record['celebrity_ids'] if 'celebrity_ids' in record.keys() else None
        selected_ids = []
        if raw_ids:
            try:
                selected_ids = json.loads(raw_ids)
            except (json.JSONDecodeError, TypeError):
                selected_ids = []

        return {
            'enabled': bool(record['enabled']),
            'check_interval_hours': record['check_interval_hours'],
            'max_records_per_query': record['max_records_per_query'],
            'notify_new_articles': bool(record['notify_new_articles']),
            'celebrity_ids': selected_ids,
        }
def _get_enabled_celebrities(db) -> List[Dict]:
    """Return every enabled celebrity profile as a plain dict, ordered by name.

    Each dict carries the ``id``, ``name`` and ``slug`` columns of
    ``celebrity_profiles``.
    """
    query = '''
        SELECT id, name, slug FROM celebrity_profiles
        WHERE enabled = 1
        ORDER BY name
    '''
    with db.get_connection() as conn:
        cur = conn.cursor()
        cur.execute(query)
        return list(map(dict, cur.fetchall()))
def _decode_google_news_url(google_url: str) -> Optional[str]:
    """Resolve a Google News redirect link to the publisher's article URL.

    URLs that do not mention ``news.google.com`` are returned unchanged.
    Returns ``None`` when the decoder reports failure or raises.
    """
    is_google_redirect = 'news.google.com' in google_url
    if not is_google_redirect:
        return google_url

    try:
        # Imported lazily so the dependency is only needed when decoding happens.
        from googlenewsdecoder import gnewsdecoder
        decoded = gnewsdecoder(google_url, interval=1)
        if decoded.get('status'):
            return decoded['decoded_url']
    except Exception as e:
        logger.debug(f"Failed to decode Google News URL: {e}")
    return None
def fetch_google_news_articles(name: str, max_records: int = 100) -> List[Dict]:
    """Query Google News RSS for articles mentioning ``name`` as an exact phrase.

    Args:
        name: Person name to search for; it is sent as a quoted phrase.
        max_records: Maximum number of articles to return.

    Returns:
        A list of dicts using GDELT-compatible keys: ``title`` (with the
        trailing " - <source>" suffix removed), ``url`` (the Google News
        link), ``seendate`` (``YYYYMMDDTHHMMSSZ`` in UTC, or '' when the
        pubDate is missing or unparsable), ``socialimage`` (always ''),
        ``language`` (always 'en') and ``sourcecountry`` (always '').
        Returns an empty list when all three attempts fail.
    """
    import time
    import urllib.request
    import xml.etree.ElementTree as ET
    from datetime import timezone
    from email.utils import parsedate_to_datetime
    from urllib.parse import quote_plus

    # quote_plus percent-encodes the quotes and any reserved or non-ASCII
    # characters in the name; plain names still become %22First+Last%22.
    query = quote_plus(f'"{name}"')
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            with urllib.request.urlopen(req, timeout=30) as response:
                data = response.read().decode('utf-8')
            root = ET.fromstring(data)
            articles = []
            for item in root.findall('.//item'):
                title_el = item.find('title')
                link_el = item.find('link')
                pub_el = item.find('pubDate')
                source_el = item.find('source')
                if title_el is None or link_el is None:
                    continue
                title = title_el.text or ''
                source_name = source_el.text if source_el is not None else ''
                # Google appends " - <publisher>" to every headline; drop it.
                suffix = f' - {source_name}'
                if source_name and title.endswith(suffix):
                    title = title[:-len(suffix)].strip()
                # Parse pubDate (RFC 2822) to GDELT-compat format
                seendate = ''
                if pub_el is not None and pub_el.text:
                    try:
                        dt = parsedate_to_datetime(pub_el.text)
                        # The format string ends in a literal 'Z', so an aware
                        # timestamp must be converted to UTC first.
                        if dt.tzinfo is not None:
                            dt = dt.astimezone(timezone.utc)
                        seendate = dt.strftime('%Y%m%dT%H%M%SZ')
                    except Exception:
                        # Best effort: a bad date must not discard the article
                        # or trigger a retry of the whole feed.
                        pass
                articles.append({
                    'title': title,
                    'url': link_el.text or '',
                    'seendate': seendate,
                    'socialimage': '',
                    'language': 'en',
                    'sourcecountry': '',
                })
            logger.info(f"Google News: {len(articles)} articles for '{name}'")
            return articles[:max_records]
        except Exception as e:
            if attempt < max_attempts - 1:
                time.sleep(5)
                continue
            logger.error(f"Google News fetch error for '{name}': {e}")
            return []
    return []
def _parse_article_html(raw_html: str, url: str) -> tuple:
    """Parse raw HTML into sanitized article content and a lead image URL.

    The readable body is extracted with readability, reduced to a small
    whitelist of block and inline tags, and interleaved with the article's
    images. Results that look like navigation, consent or other boilerplate
    are rejected.

    Args:
        raw_html: Full HTML document of the article page.
        url: URL the document came from (passed to readability, used in logs).

    Returns:
        ``(content_html, image_url)``. ``content_html`` is ``None`` when no
        usable article body was found. ``image_url`` is the page's
        ``og:image`` (or ``twitter:image``) when present. ``(None, None)``
        is returned if parsing raises.
    """
    from html import escape as escape_attr
    try:
        from readability import Document
        from bs4 import BeautifulSoup
        import bleach

        def build_img_tag(src: str, alt: str) -> str:
            # bleach.clean() output is only safe as element content: it leaves
            # quotes untouched, so attribute values are escaped explicitly.
            return (
                f'<img src="{escape_attr(src, quote=True)}" '
                f'alt="{escape_attr(alt, quote=True)}">'
            )

        # Extract og:image for thumbnail
        og_soup = BeautifulSoup(raw_html, 'html.parser')
        og_image = None
        og_tag = og_soup.find('meta', property='og:image')
        if og_tag and og_tag.get('content'):
            og_image = og_tag['content']
        if not og_image:
            tw_tag = og_soup.find('meta', attrs={'name': 'twitter:image'})
            if tw_tag and tw_tag.get('content'):
                og_image = tw_tag['content']

        doc = Document(raw_html, url=url)
        content_html = doc.summary()
        if not content_html or len(content_html.strip()) < 50:
            return (None, og_image)

        reader_soup = BeautifulSoup(content_html, 'html.parser')
        # Bylines, timestamps, photo credits and share/subscribe prompts.
        junk_text_re = re.compile(
            r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
            r'share or comment|share this article|comment on this|follow us on|'
            r'sign up for|subscribe to|have you got a story|tips@|email us)',
            re.I
        )
        inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
        inline_attrs = {'a': ['href']}

        html_parts = []
        for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
            text = el.get_text(strip=True)
            if len(text) < 30:
                continue
            if junk_text_re.search(text):
                continue
            tag = el.name
            inner = bleach.clean(
                el.decode_contents(), tags=inline_tags,
                attributes=inline_attrs, strip=True, protocols=['http', 'https']
            ).strip()
            if not inner:
                continue
            if tag == 'p':
                html_parts.append(f'<p>{inner}</p>')
            elif tag in ('h2', 'h3', 'h4'):
                html_parts.append(f'<{tag}>{inner}</{tag}>')
            elif tag == 'blockquote':
                html_parts.append(f'<blockquote><p>{inner}</p></blockquote>')
            elif tag in ('ul', 'ol'):
                items = []
                for li in el.find_all('li', recursive=False):
                    li_inner = bleach.clean(
                        li.decode_contents(), tags=inline_tags,
                        attributes=inline_attrs, strip=True, protocols=['http', 'https']
                    ).strip()
                    if li_inner and len(li.get_text(strip=True)) > 10:
                        items.append(f'<li>{li_inner}</li>')
                if items:
                    html_parts.append(f'<{tag}>{"".join(items)}</{tag}>')

        # URL fragments that indicate logos, tracking pixels and similar non-content images.
        junk_img_re = re.compile(r'(logo|icon|pixel|spacer|blank|1x1|svg|avatar|spinner|/ct/|cts\.businesswire)', re.I)
        seen_srcs = set()
        article_images = []
        for img in reader_soup.find_all('img'):
            src = img.get('src', '')
            if src and src.startswith(('http://', 'https://')) and src not in seen_srcs:
                if junk_img_re.search(src):
                    continue
                seen_srcs.add(src)
                alt = (img.get('alt', '') or '').strip()
                article_images.append(build_img_tag(src, alt))

        # If readability found no images, grab first real image from original HTML
        if not article_images:
            orig_soup = BeautifulSoup(raw_html, 'html.parser')
            for noise in orig_soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                             'aside', 'form', 'noscript', 'svg']):
                noise.decompose()
            for img in orig_soup.find_all('img'):
                # Lazy-loading attributes take precedence over a placeholder src.
                src = (img.get('data-src') or img.get('data-lazy-src') or
                       img.get('data-original') or img.get('src') or '')
                if not src or not src.startswith(('http://', 'https://')):
                    continue
                # Same junk filter as the primary image pass.
                if junk_img_re.search(src):
                    continue
                alt = (img.get('alt', '') or '').strip()
                article_images.append(build_img_tag(src, alt))
                break  # Only first real image

        if article_images and html_parts:
            # Spread the images evenly between the text blocks.
            text_count = len(html_parts)
            img_count = len(article_images)
            interval = max(1, text_count // (img_count + 1))
            merged = []
            img_idx = 0
            for i, part in enumerate(html_parts):
                merged.append(part)
                if img_idx < img_count and (i + 1) % interval == 0:
                    merged.append(article_images[img_idx])
                    img_idx += 1
            # Any images that did not get a slot are appended at the end.
            merged.extend(article_images[img_idx:])
            html_parts = merged
        elif article_images and not html_parts:
            html_parts = article_images

        if not html_parts:
            # Last resort: split the readable text into paragraphs.
            text = reader_soup.get_text(separator='\n\n', strip=True)
            if text:
                for para in text.split('\n\n'):
                    para = para.strip()
                    if len(para) > 30:
                        html_parts.append(f'<p>{bleach.clean(para)}</p>')
        if not html_parts:
            return (None, og_image)

        clean_parts = []
        for part in html_parts:
            part_text = BeautifulSoup(part, 'html.parser').get_text(strip=True)
            if len(part_text) > 100:
                # Very long "words" indicate concatenated menu or option text.
                words = part_text.split()
                avg_word_len = len(part_text) / max(len(words), 1)
                if avg_word_len > 12:
                    continue
            clean_parts.append(part)
        if not clean_parts:
            return (None, og_image)

        result = '\n'.join(clean_parts)
        plain_text = BeautifulSoup(result, 'html.parser').get_text(separator=' ', strip=True)
        # Phrases typical of carousels, login walls, cookie banners and country pickers.
        garbage_re = re.compile(
            r'(use (left|right|escape)|arrow keys|navigate between|'
            r'sign (in|up) with|we won.t post|social account|'
            r'accept cookies|cookie policy|privacy policy|terms of (use|service)|'
            r'AlabamaAlaska|CaliforniaColorado|United States of America)',
            re.I
        )
        if len(plain_text) < 200 or garbage_re.search(plain_text):
            return (None, og_image)
        return (result, og_image)
    except Exception as e:
        logger.debug(f"Article parsing failed for {url}: {e}")
        return (None, None)
def _fetch_html_flaresolverr(url: str) -> Optional[str]:
    """Fetch HTML via FlareSolverr (headless browser).

    Posts a ``request.get`` command to the FlareSolverr instance on
    ``localhost:8191``.

    Returns:
        The rendered HTML when FlareSolverr reports ``status == 'ok'`` and
        the document is longer than 500 characters, otherwise ``None``.
        Failures are logged at debug level and never raised.
    """
    try:
        import requests
        resp = requests.post('http://localhost:8191/v1', json={
            'cmd': 'request.get',
            'url': url,
            'maxTimeout': 30000
        }, timeout=45)
        data = resp.json()
        if data.get('status') == 'ok':
            # "solution" or "response" may be present but null.
            html = (data.get('solution') or {}).get('response') or ''
            # Shorter documents are treated as challenge or error stubs.
            if len(html) > 500:
                return html
    except Exception as e:
        logger.debug(f"FlareSolverr fetch failed for {url}: {e}")
    return None
def extract_article_content_with_image(url: str) -> tuple:
    """Extract article content and lead image for ``url``.

    Tries a direct HTTP fetch first and falls back to FlareSolverr when the
    direct fetch fails or yields no usable content.

    Returns:
        ``(content_html, image_url)``; either element may be ``None``. A lead
        image found by the direct fetch is kept even when no content could
        be extracted by either method.
    """
    import urllib.request

    fallback_image = None
    # Try direct fetch first
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        with urllib.request.urlopen(req, timeout=20) as response:
            body = response.read()
            charset = response.headers.get_content_charset() or 'utf-8'
        try:
            raw_html = body.decode(charset, errors='replace')
        except LookupError:
            # The server declared a charset Python does not know.
            raw_html = body.decode('utf-8', errors='replace')
        content, image = _parse_article_html(raw_html, url)
        if content:
            return (content, image)
        fallback_image = image
    except Exception as e:
        logger.debug(f"Direct article fetch failed for {url}: {e}")

    # Fallback to FlareSolverr for bot-protected sites
    raw_html = _fetch_html_flaresolverr(url)
    if raw_html:
        content, image = _parse_article_html(raw_html, url)
        return (content, image or fallback_image)
    return (None, fallback_image)
def extract_article_content(url: str) -> Optional[str]:
    """Legacy wrapper: return only the article content and discard the image."""
    return extract_article_content_with_image(url)[0]
def _select_press_celebrities(db, config: Dict, celebrity_id: Optional[int]) -> List[Dict]:
    """Return the celebrity rows (id, name, slug) that a fetch run should cover."""
    if celebrity_id is not None:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT id, name, slug FROM celebrity_profiles WHERE id = ?', (celebrity_id,))
            row = cursor.fetchone()
            return [dict(row)] if row else []

    # Use celebrity_ids from config if set, otherwise check nobody.
    configured_ids = config.get('celebrity_ids', [])
    if not configured_ids:
        return []
    placeholders = ','.join('?' for _ in configured_ids)
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
            f'SELECT id, name, slug FROM celebrity_profiles WHERE id IN ({placeholders}) ORDER BY name',
            configured_ids
        )
        return [dict(r) for r in cursor.fetchall()]


def _store_press_article(db, celeb_id: int, celeb_name: str, article: Dict) -> bool:
    """Store one feed entry if it is relevant and new; return True when a row was inserted."""
    google_url = article.get('url', '')
    if not google_url:
        return False

    article_title = (article.get('title') or '').strip()
    # Only keep articles where celeb name appears in the title
    if not article_title or celeb_name.lower() not in article_title.lower():
        return False

    # Check for duplicate by title first (cheap check before URL decode)
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
            'SELECT id FROM press_articles WHERE celebrity_id = ? AND title = ?',
            (celeb_id, article_title)
        )
        if cursor.fetchone():
            return False

    # Decode Google News URL to real article URL
    article_url = _decode_google_news_url(google_url)
    if not article_url:
        return False
    url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest()

    # Check for duplicate by URL hash
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('SELECT id FROM press_articles WHERE url_hash = ?', (url_hash,))
        if cursor.fetchone():
            return False

    # Parse domain from real URL; strip only a leading "www.".
    domain = urlparse(article_url).netloc
    if domain.startswith('www.'):
        domain = domain[len('www.'):]

    # Extract article content and og:image
    content, og_image = extract_article_content_with_image(article_url)
    # Cache all inline images in the content to local proxy
    if content:
        content = _cache_content_images(content)
    if content:
        snippet = re.sub(r'<[^>]+>', ' ', content)
        snippet = ' '.join(snippet.split())[:300]
    else:
        snippet = article_title[:300]

    # Cache the og:image locally, fall back to first inline image
    cached_image = cache_press_image(og_image) if og_image else None
    if not cached_image and content:
        m = re.search(r'<img\s+src="(/api/press/images/[^"]+)"', content)
        if m:
            cached_image = m.group(1)

    # Parse published date: YYYYMMDDTHHmmSSZ -> ISO format. Anything else is
    # stored as received.
    published_date = article.get('seendate', '')
    if published_date:
        try:
            published_date = datetime.strptime(published_date, '%Y%m%dT%H%M%SZ').isoformat()
        except (ValueError, TypeError):
            pass

    # Insert article. The stripped title is stored so the title-based
    # duplicate check above can match it on later runs.
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('''
            INSERT INTO press_articles
            (celebrity_id, title, url, url_hash, domain, published_date,
            image_url, language, country, article_content, snippet, notified, read)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0, 0)
        ''', (
            celeb_id,
            article_title,
            article_url,
            url_hash,
            domain,
            published_date,
            cached_image or '',
            article.get('language', ''),
            article.get('sourcecountry', ''),
            content,
            snippet,
        ))
        conn.commit()
    return True


def process_press_articles(db, celebrity_id: Optional[int] = None, send_notifications: bool = True) -> Dict:
    """
    Fetch Google News articles for celebrities, deduplicate, store, and notify.

    Args:
        db: Database wrapper exposing ``get_connection()``.
        celebrity_id: Restrict the run to one celebrity profile. When
            ``None`` the ``celebrity_ids`` from the press config are used.
        send_notifications: When False no push notification is sent, even if
            the config enables them.

    Returns:
        ``{'total_fetched', 'total_new', 'by_celebrity'}`` on success, or
        ``{'error', 'total_fetched': 0, 'total_new': 0}`` when the run failed
        or another run was already in progress.
    """
    global _fetch_running
    import time as _time
    import traceback

    # Claim the "running" flag atomically so overlapping runs cannot start and
    # an early finisher cannot clear the flag under a run that is still going.
    with _fetch_lock:
        if _fetch_running:
            logger.info("Press monitor: fetch already in progress, skipping")
            return {'error': 'Fetch already in progress', 'total_fetched': 0, 'total_new': 0}
        _fetch_running = True

    try:
        config = _get_config(db)
        max_records = config.get('max_records_per_query', 25)
        notify_enabled = config.get('notify_new_articles', True)

        # Get celebrities to check
        celebrities = _select_press_celebrities(db, config, celebrity_id)

        total_new = 0
        total_fetched = 0
        results_by_celebrity = {}
        for idx, celeb in enumerate(celebrities):
            celeb_id = celeb['id']
            celeb_name = celeb['name']
            # Small delay between celebrities
            if idx > 0:
                _time.sleep(2)
            # Fetch from Google News RSS
            articles = fetch_google_news_articles(celeb_name, max_records)
            total_fetched += len(articles)
            new_count = sum(
                1 for article in articles
                if _store_press_article(db, celeb_id, celeb_name, article)
            )
            total_new += new_count
            if new_count > 0:
                results_by_celebrity[celeb_name] = new_count

        # Send notifications for new articles (only for scheduled fetches)
        if send_notifications and notify_enabled and total_new > 0:
            _send_press_notification(db, results_by_celebrity)

        # Mark notified. This runs whether or not a notification was sent, so
        # articles from manual fetches are not left pending.
        if total_new > 0:
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('UPDATE press_articles SET notified = ? WHERE notified = ?', (1, 0))
                conn.commit()

        logger.info(f"Press monitor: fetched {total_fetched}, new {total_new}")
        return {
            'total_fetched': total_fetched,
            'total_new': total_new,
            'by_celebrity': results_by_celebrity,
        }
    except Exception as e:
        logger.error(f"Error in press article processing: {e}")
        logger.debug(f"Traceback: {traceback.format_exc()}")
        return {'error': str(e), 'total_fetched': 0, 'total_new': 0}
    finally:
        with _fetch_lock:
            _fetch_running = False
def _send_press_notification(db, results_by_celebrity: Dict):
    """Push a Pushover summary of newly stored press articles.

    Args:
        db: Database wrapper; its ``db_path`` locates the settings store and
            the object itself is handed to the notifier.
        results_by_celebrity: Mapping of celebrity name to new-article count.

    Any failure is logged and swallowed, so a notification problem never
    aborts the caller.
    """
    try:
        from modules.pushover_notifier import create_notifier_from_config
        from modules.settings_manager import SettingsManager

        all_settings = SettingsManager(str(db.db_path)).get_all()
        notifier = create_notifier_from_config(all_settings, unified_db=db)
        if not notifier:
            return

        def plural(n):
            return '' if n == 1 else 's'

        total = sum(results_by_celebrity.values())
        title = f"📰 Press: {total} new article{plural(total)}"

        # Build rich HTML message matching other notification formats
        lines = [
            f"<b>👤 {name}:</b> {count} article{plural(count)}"
            for name, count in results_by_celebrity.items()
        ]
        discovered_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # The leading newline leaves a blank line before the timestamp.
        lines.append(f"\n<b>⏰ Discovered:</b> {discovered_at}")

        # Context consumed by the notifier; per the original author's note it
        # gets the notification recorded and broadcast to the UI.
        notifier._current_notification_context = {
            'platform': 'press',
            'source': 'GDELT',
            'content_type': 'article',
            'download_count': total,
            'metadata': {'by_celebrity': results_by_celebrity}
        }
        notifier.send_notification(
            title=title,
            message="\n".join(lines),
            priority=0,
            html=True,
        )
    except Exception as e:
        logger.error(f"Failed to send press notification: {e}")
# ============================================================================
# CONFIGURATION ENDPOINTS
# ============================================================================
@router.get("/config")
@limiter.limit("30/minute")
@handle_exceptions
async def get_config(
    request: Request,
    current_user: Dict = Depends(get_current_user)
):
    """Return the current press monitor configuration."""
    return {"success": True, "config": _get_config(_get_db())}
@router.put("/config")
@limiter.limit("10/minute")
@handle_exceptions
async def update_config(
    request: Request,
    config_update: PressConfigUpdate,
    current_user: Dict = Depends(get_current_user)
):
    """Apply a partial update to the press monitor configuration row.

    Only fields that were supplied (non-None) are written. Booleans are
    stored as 0/1 and ``celebrity_ids`` as a JSON string.
    """
    db = _get_db()
    changes = config_update.model_dump(exclude_none=True)
    if not changes:
        return {"success": False, "message": "No fields to update"}

    # Column names come from the model's field names, never from free-form
    # input, so interpolating them into the statement is safe.
    assignments = []
    bound_values = []
    for column, new_value in changes.items():
        if isinstance(new_value, bool):
            new_value = int(new_value)
        elif column == 'celebrity_ids' and isinstance(new_value, list):
            new_value = json.dumps(new_value)
        assignments.append(f"{column} = ?")
        bound_values.append(new_value)
    assignments.append("updated_at = CURRENT_TIMESTAMP")

    statement = f"UPDATE press_config SET {', '.join(assignments)} WHERE id = 1"
    with db.get_connection() as conn:
        conn.cursor().execute(statement, bound_values)
        conn.commit()
    return {"success": True, "message": "Configuration updated"}
# ============================================================================
# CELEBRITY SELECTION ENDPOINT
# ============================================================================
@router.get("/celebrities")
@limiter.limit("30/minute")
@handle_exceptions
async def get_press_celebrities(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """List all enabled celebrities, each with a ``press_enabled`` flag.

    ``press_enabled`` is True when the celebrity's id is listed in the press
    config's ``celebrity_ids``.
    """
    db = _get_db()
    enabled_ids = set(_get_config(db).get('celebrity_ids', []))
    # Reuse the shared helper instead of repeating its query here.
    celebrities = [
        {**celeb, 'press_enabled': celeb['id'] in enabled_ids}
        for celeb in _get_enabled_celebrities(db)
    ]
    return {"success": True, "celebrities": celebrities}
# ============================================================================
# ARTICLE ENDPOINTS
# ============================================================================
@router.get("/articles")
@limiter.limit("30/minute")
@handle_exceptions
async def get_articles(
    request: Request,
    current_user: Dict = Depends(get_current_user),
    celebrity_id: Optional[int] = None,
    domain: Optional[str] = None,
    read: Optional[bool] = None,
    search: Optional[str] = None,
    page: int = 1,
    per_page: int = 50,
):
    """Get a paginated list of press articles, newest first.

    Args:
        celebrity_id: Only articles for this celebrity.
        domain: Only articles from this exact domain.
        read: Only read (True) or unread (False) articles.
        search: Substring matched against title and snippet.
        page: 1-based page number; values below 1 are treated as 1.
        per_page: Page size; values below 1 are treated as 1.

    Returns:
        The articles of the requested page plus ``total``, ``page``,
        ``per_page`` and ``pages``.
    """
    db = _get_db()

    # Clamp paging inputs: a non-positive page gives a negative OFFSET, and
    # SQLite treats a negative LIMIT as "no limit".
    page = max(1, page)
    per_page = max(1, per_page)

    conditions = []
    params = []
    if celebrity_id is not None:
        conditions.append("pa.celebrity_id = ?")
        params.append(celebrity_id)
    if domain is not None:
        conditions.append("pa.domain = ?")
        params.append(domain)
    if read is not None:
        conditions.append("pa.read = ?")
        params.append(1 if read else 0)
    if search:
        conditions.append("(pa.title LIKE ? OR pa.snippet LIKE ?)")
        params.extend([f'%{search}%', f'%{search}%'])
    where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    offset = (page - 1) * per_page

    with db.get_connection() as conn:
        cursor = conn.cursor()
        # Get total count
        cursor.execute(f"SELECT COUNT(*) FROM press_articles pa {where_clause}", params)
        total = cursor.fetchone()[0]
        # Get articles. The id tie-breaker keeps the order stable across
        # pages when several articles share a published_date.
        cursor.execute(f'''
            SELECT pa.id, pa.celebrity_id, pa.title, pa.url, pa.domain,
            pa.published_date, pa.image_url, pa.language, pa.country,
            pa.snippet, pa.fetched_at, pa.read,
            cp.name as celebrity_name
            FROM press_articles pa
            LEFT JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
            {where_clause}
            ORDER BY pa.published_date DESC, pa.id DESC
            LIMIT ? OFFSET ?
        ''', params + [per_page, offset])
        articles = [dict(r) for r in cursor.fetchall()]

    return {
        "success": True,
        "articles": articles,
        "total": total,
        "page": page,
        "per_page": per_page,
        "pages": (total + per_page - 1) // per_page,
    }
@router.get("/articles/{article_id}")
@limiter.limit("30/minute")
@handle_exceptions
async def get_article(
    request: Request,
    article_id: int,
    current_user: Dict = Depends(get_current_user),
):
    """Return one article with all stored columns plus the celebrity's name.

    Raises:
        HTTPException: 404 when no article has the given id.
    """
    query = '''
        SELECT pa.*, cp.name as celebrity_name
        FROM press_articles pa
        LEFT JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
        WHERE pa.id = ?
    '''
    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute(query, (article_id,))
        record = cur.fetchone()

    if record:
        return {"success": True, "article": dict(record)}
    raise HTTPException(status_code=404, detail="Article not found")
@router.patch("/articles/{article_id}/read")
@limiter.limit("30/minute")
@handle_exceptions
async def update_read_status(
    request: Request,
    article_id: int,
    body: ReadStatusUpdate,
    current_user: Dict = Depends(get_current_user),
):
    """Set the read flag of one article.

    Raises:
        HTTPException: 404 when no article has the given id.
    """
    read_flag = 1 if body.read else 0
    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute(
            'UPDATE press_articles SET read = ? WHERE id = ?',
            (read_flag, article_id)
        )
        conn.commit()
        affected = cur.rowcount

    # No row touched means the id does not exist.
    if affected == 0:
        raise HTTPException(status_code=404, detail="Article not found")
    state = 'read' if body.read else 'unread'
    return {"success": True, "message": f"Article marked as {state}"}
@router.post("/articles/mark-all-read")
@limiter.limit("10/minute")
@handle_exceptions
async def mark_all_read(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Flag every unread article as read and report how many rows changed."""
    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute('UPDATE press_articles SET read = 1 WHERE read = 0')
        conn.commit()
        count = cur.rowcount

    suffix = '' if count == 1 else 's'
    return {
        "success": True,
        "message": f"Marked {count} article{suffix} as read",
        "count": count,
    }
@router.delete("/articles/{article_id}")
@limiter.limit("10/minute")
@handle_exceptions
async def delete_article(
    request: Request,
    article_id: int,
    current_user: Dict = Depends(get_current_user),
):
    """Remove one press article.

    Raises:
        HTTPException: 404 when no article has the given id.
    """
    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute('DELETE FROM press_articles WHERE id = ?', (article_id,))
        conn.commit()
        removed = cur.rowcount

    if removed == 0:
        raise HTTPException(status_code=404, detail="Article not found")
    return {"success": True, "message": "Article deleted"}
# ============================================================================
# STATS ENDPOINT
# ============================================================================
@router.get("/stats")
@limiter.limit("30/minute")
@handle_exceptions
async def get_stats(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Return article totals, the unread count, and per-celebrity / per-domain breakdowns.

    The domain breakdown is limited to the ten domains with the most articles.
    """
    with _get_db().get_connection() as conn:
        cur = conn.cursor()

        def scalar(sql):
            # Run a single-value query and return that value.
            cur.execute(sql)
            return cur.fetchone()[0]

        total = scalar('SELECT COUNT(*) FROM press_articles')
        unread = scalar('SELECT COUNT(*) FROM press_articles WHERE read = 0')

        # By celebrity
        cur.execute('''
            SELECT pa.celebrity_id, cp.name, COUNT(*) as count
            FROM press_articles pa
            JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
            GROUP BY pa.celebrity_id, cp.name
            ORDER BY count DESC
        ''')
        by_celebrity = []
        for rec in cur.fetchall():
            by_celebrity.append({'id': rec['celebrity_id'], 'name': rec['name'], 'count': rec['count']})

        # By domain (top 10)
        cur.execute('''
            SELECT domain, COUNT(*) as count
            FROM press_articles
            GROUP BY domain
            ORDER BY count DESC
            LIMIT 10
        ''')
        by_domain = []
        for rec in cur.fetchall():
            by_domain.append({'domain': rec['domain'], 'count': rec['count']})

    stats = {
        "total": total,
        "unread": unread,
        "by_celebrity": by_celebrity,
        "by_domain": by_domain,
    }
    return {"success": True, "stats": stats}
# ============================================================================
# FETCH ENDPOINT
# ============================================================================
@router.post("/fetch")
@limiter.limit("5/minute")
@handle_exceptions
async def trigger_fetch(
    request: Request,
    background_tasks: BackgroundTasks,
    current_user: Dict = Depends(get_current_user),
    celebrity_id: Optional[int] = None,
):
    """Trigger a manual press fetch for all configured celebrities or one of them.

    The fetch runs on the module's thread pool after the response has been
    sent; notifications are suppressed for manual fetches.

    Returns:
        ``success: False`` when a fetch is already running, otherwise
        ``success: True`` once the job has been queued.
    """
    # Advisory check: the flag itself is set by process_press_articles when
    # the job actually starts.
    with _fetch_lock:
        if _fetch_running:
            return {"success": False, "message": "Fetch already in progress"}

    db = _get_db()

    def do_fetch():
        process_press_articles(db, celebrity_id, send_notifications=False)

    # Background tasks that are plain callables run on a worker thread, so
    # hand the job to the executor with its thread-safe submit() rather than
    # calling an event-loop method from outside the loop's thread.
    background_tasks.add_task(_executor.submit, do_fetch)
    return {"success": True, "message": "Fetch started"}
@router.get("/fetch/status")
@limiter.limit("30/minute")
@handle_exceptions
async def get_fetch_status(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Report whether a press fetch job is running right now."""
    running_now = _fetch_running
    return {"success": True, "is_running": running_now}
# ============================================================================
# IMAGE PROXY / CACHE
# ============================================================================
# Directory on disk where downloaded press images are cached.
PRESS_IMAGE_CACHE = Path("/opt/media-downloader/data/press_images")
# Created at import time (parents included) so later reads and writes can
# rely on the directory existing.
PRESS_IMAGE_CACHE.mkdir(parents=True, exist_ok=True)
def _cache_content_images(html_content: str) -> str:
    """Cache every ``<img src="...">`` in the content and point it at the local proxy.

    Each image whose tag starts with a ``src`` attribute is downloaded via
    ``cache_press_image`` and its ``src`` rewritten to the
    ``/api/press/images/...`` path. Tags already pointing at the proxy are
    left alone. Tags whose image cannot be cached are removed, since a
    missing image is preferable to a broken one.

    Args:
        html_content: Article HTML; falsy values are returned unchanged.

    Returns:
        The HTML with image sources rewritten.
    """
    if not html_content:
        return html_content

    import html

    def _replace_img(match):
        full_tag = match.group(0)
        src = match.group(1)
        if not src or src.startswith('/api/press/images/'):
            return full_tag
        # The attribute value is HTML-encoded (e.g. "&amp;" between query
        # parameters); decode it to get the real URL to download.
        cached = cache_press_image(html.unescape(src))
        if cached:
            return full_tag.replace(src, cached)
        return ''  # Remove img if caching failed

    return re.sub(r'<img\s+src="([^"]+)"[^>]*>', _replace_img, html_content)
def _sniff_image_extension(data: bytes) -> Optional[str]:
    """Return the file extension implied by the leading magic bytes, or None if unrecognised."""
    if data.startswith(b'\xff\xd8\xff'):
        return '.jpg'
    if data.startswith(b'\x89PNG\r\n\x1a\n'):
        return '.png'
    if data.startswith((b'GIF87a', b'GIF89a')):
        return '.gif'
    if data[:4] == b'RIFF' and data[8:12] == b'WEBP':
        return '.webp'
    return None


def cache_press_image(image_url: str, use_flaresolverr: bool = False) -> Optional[str]:
    """Download an image and cache it locally.

    Args:
        image_url: Absolute http(s) URL of the image.
        use_flaresolverr: Accepted for backward compatibility and ignored.
            FlareSolverr returns rendered HTML, not binary data, so it cannot
            be used to download images.

    Returns:
        The ``/api/press/images/<name>`` path serving the cached file, or
        ``None`` when the URL is empty or not http(s), the download fails,
        the payload is smaller than 1000 bytes, or the payload is not an image.
    """
    if not image_url:
        return None
    import urllib.request

    # The URL originates from third-party HTML: only fetch over http(s), never
    # file://, ftp:// or scheme-less values.
    try:
        scheme = urlparse(image_url).scheme
    except ValueError:
        return None
    if scheme not in ('http', 'https'):
        return None

    # Hash the URL for the filename
    url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]

    # Check if already cached
    for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
        cached = PRESS_IMAGE_CACHE / f"{url_hash}{ext}"
        if cached.exists() and cached.stat().st_size > 0:
            return f"/api/press/images/{cached.name}"

    # Download the image
    image_data = None
    content_type = ''
    try:
        req = urllib.request.Request(image_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'image/*,*/*',
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            content_type = (resp.headers.get('Content-Type') or '').lower()
            image_data = resp.read()
    except Exception as e:
        logger.debug(f"Press image download failed for {image_url}: {e}")

    # Too small, likely an error page or tracking pixel
    if not image_data or len(image_data) < 1000:
        return None

    # Accept the payload only if it looks like an image: either its magic
    # bytes are recognised or the server labelled it image/*.
    sniffed_ext = _sniff_image_extension(image_data)
    if sniffed_ext is None and not content_type.startswith('image/'):
        return None

    # Prefer the extension from the content; fall back to hints in the URL.
    ext = sniffed_ext
    if ext is None:
        ext = '.jpg'
        url_lower = image_url.lower()
        if '.png' in url_lower:
            ext = '.png'
        elif '.webp' in url_lower:
            ext = '.webp'
        elif '.gif' in url_lower:
            ext = '.gif'

    cached_path = PRESS_IMAGE_CACHE / f"{url_hash}{ext}"
    cached_path.write_bytes(image_data)
    return f"/api/press/images/{cached_path.name}"
@router.get("/images/{filename}")
async def serve_press_image(filename: str, current_user: Dict = Depends(get_current_user)):
    """Serve a cached press article image to an authenticated user.

    Raises:
        HTTPException: 400 when the filename contains path separators or
            ``..``; 404 when no such regular file exists in the cache.
    """
    # Sanitize filename: it must be a bare name inside the cache directory.
    if '/' in filename or '\\' in filename or '..' in filename:
        raise HTTPException(status_code=400, detail="Invalid filename")
    filepath = PRESS_IMAGE_CACHE / filename
    # is_file() also rejects directories (e.g. "."), which exists() would accept.
    if not filepath.is_file():
        raise HTTPException(status_code=404, detail="Image not found")

    # Determine media type
    suffix = filepath.suffix.lower()
    media_types = {
        '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
        '.png': 'image/png', '.webp': 'image/webp', '.gif': 'image/gif',
    }
    media_type = media_types.get(suffix, 'image/jpeg')
    # The endpoint requires authentication, so only the requesting client may
    # cache the response; shared caches must not.
    return FileResponse(filepath, media_type=media_type, headers={
        'Cache-Control': 'private, max-age=86400',
    })