"""
Press Router

Monitors news feeds for articles mentioning tracked celebrities.
Stores complete articles and sends Pushover push notifications.

NOTE: several names, log messages and the notification ``source`` field in
this module still say "GDELT"; the fetch path implemented here queries the
Google News RSS search feed and emits GDELT-compatible article dicts.
"""

import asyncio
import hashlib
import json
import threading
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import Dict, List, Optional
import urllib.error
from urllib.parse import urlparse
from pathlib import Path

from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import FileResponse
from pydantic import BaseModel
from slowapi import Limiter
from slowapi.util import get_remote_address

from ..core.dependencies import get_current_user, get_app_state
from ..core.exceptions import handle_exceptions
from modules.universal_logger import get_logger

logger = get_logger('API')
router = APIRouter(prefix="/api/press", tags=["Press"])
limiter = Limiter(key_func=get_remote_address)

# Thread pool for blocking operations
_executor = ThreadPoolExecutor(max_workers=2)

# Track running fetch jobs
_fetch_lock = threading.Lock()
_fetch_running = False


# ============================================================================
# PYDANTIC MODELS
# ============================================================================

class PressConfigUpdate(BaseModel):
    # Every field is optional: only the fields present in the request body
    # are written to press_config by the update endpoint.
    enabled: Optional[bool] = None
    check_interval_hours: Optional[int] = None
    max_records_per_query: Optional[int] = None
    notify_new_articles: Optional[bool] = None
    celebrity_ids: Optional[List[int]] = None


class ReadStatusUpdate(BaseModel):
    read: bool


# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def _get_db():
    """Get database instance."""
    app_state = get_app_state()
    return app_state.db


def _get_config(db) -> Dict:
    """Get press config from database.

    Reads the row with ``id = 1`` from ``press_config``. When no such row
    exists a dict of built-in defaults is returned. ``celebrity_ids`` is
    stored as a JSON string and is decoded to a list; a missing column, an
    empty value or invalid JSON all yield an empty list.
    """
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM press_config WHERE id = 1')
        row = cursor.fetchone()

    if not row:
        return {
            'enabled': True,
            'check_interval_hours': 6,
            'max_records_per_query': 25,
            'notify_new_articles': True,
            'celebrity_ids': [],
        }

    # Parse celebrity_ids from JSON string
    celebrity_ids_raw = row['celebrity_ids'] if 'celebrity_ids' in row.keys() else None
    try:
        celebrity_ids = json.loads(celebrity_ids_raw) if celebrity_ids_raw else []
    except (json.JSONDecodeError, TypeError):
        celebrity_ids = []

    return {
        'enabled': bool(row['enabled']),
        'check_interval_hours': row['check_interval_hours'],
        'max_records_per_query': row['max_records_per_query'],
        'notify_new_articles': bool(row['notify_new_articles']),
        'celebrity_ids': celebrity_ids,
    }


def _get_enabled_celebrities(db) -> List[Dict]:
    """Get enabled celebrities from celebrity_profiles, ordered by name."""
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT id, name, slug
            FROM celebrity_profiles
            WHERE enabled = 1
            ORDER BY name
        ''')
        rows = cursor.fetchall()
    return [dict(r) for r in rows]


def _decode_google_news_url(google_url: str) -> Optional[str]:
    """Decode a Google News redirect URL to the real article URL.

    URLs that do not contain ``news.google.com`` are returned unchanged.
    Returns None when decoding fails or the decoder reports no success.
    """
    if 'news.google.com' not in google_url:
        return google_url
    try:
        from googlenewsdecoder import gnewsdecoder
        result = gnewsdecoder(google_url, interval=1)
        if result.get('status'):
            return result['decoded_url']
    except Exception as e:
        logger.debug(f"Failed to decode Google News URL: {e}")
    return None


def fetch_google_news_articles(name: str, max_records: int = 100) -> List[Dict]:
    """Query Google News RSS for articles mentioning the given name.

    The name is searched as an exact (quoted) phrase. Up to three attempts
    are made, sleeping 5 seconds between attempts; after the third failure
    the error is logged and an empty list is returned.

    Returns at most ``max_records`` dicts with the keys ``title``, ``url``
    (the Google News link, not yet decoded), ``seendate`` (publication date
    formatted as ``%Y%m%dT%H%M%SZ``, or '' when it cannot be parsed),
    ``socialimage``, ``language`` and ``sourcecountry``.
    """
    import time
    import urllib.request
    import xml.etree.ElementTree as ET
    from email.utils import parsedate_to_datetime
    from urllib.parse import quote_plus

    # quote_plus percent-encodes the quotes and any reserved or non-ASCII
    # characters in the name; for a plain ASCII name this yields the same
    # "%22First+Last%22" query the hand-built version produced.
    query = quote_plus(f'"{name}"')
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'

    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            with urllib.request.urlopen(req, timeout=30) as response:
                data = response.read().decode('utf-8')

            # NOTE(review): xml.etree is not hardened against malicious XML;
            # the feed is fetched from news.google.com only.
            root = ET.fromstring(data)
            articles = []
            for item in root.findall('.//item'):
                title_el = item.find('title')
                link_el = item.find('link')
                pub_el = item.find('pubDate')
                source_el = item.find('source')

                if title_el is None or link_el is None:
                    continue

                title = title_el.text or ''
                source_name = source_el.text if source_el is not None else ''
                # Feed titles carry a trailing " - <source>"; drop it.
                if source_name and title.endswith(f' - {source_name}'):
                    title = title[:-len(f' - {source_name}')].strip()

                # Parse pubDate (RFC 2822) to GDELT-compat format
                seendate = ''
                if pub_el is not None and pub_el.text:
                    try:
                        dt = parsedate_to_datetime(pub_el.text)
                        seendate = dt.strftime('%Y%m%dT%H%M%SZ')
                    except Exception:
                        pass

                articles.append({
                    'title': title,
                    'url': link_el.text or '',
                    'seendate': seendate,
                    'socialimage': '',
                    'language': 'en',
                    'sourcecountry': '',
                })

            logger.info(f"Google News: {len(articles)} articles for '{name}'")
            return articles[:max_records]

        except Exception as e:
            if attempt < 2:
                time.sleep(5)
                continue
            logger.error(f"Google News fetch error for '{name}': {e}")
            return []

    return []


def _parse_article_html(raw_html: str, url: str) -> tuple:
    """Parse raw HTML into article content and og:image.

    The page is run through readability, then rebuilt from its paragraph,
    heading, blockquote and list elements with only a small set of inline
    tags kept. Images are interleaved with the text. Content that is too
    short or looks like navigation/consent boilerplate is rejected.

    Returns (content_html, image_url). ``content_html`` is None when no
    usable article body was found; ``image_url`` is the og:image (or
    twitter:image) value when one was seen, else None.

    NOTE(review): the HTML tag literals and the <img> markup in this function
    were missing from the reviewed copy of the file (stripped by an HTML
    extractor) and have been reconstructed; confirm against the original
    markup, in particular any wrapper elements or CSS classes.
    """
    # Defined before the try so that a failure part-way through parsing can
    # still hand back an image that was already discovered.
    og_image = None
    try:
        from readability import Document
        from bs4 import BeautifulSoup
        import bleach

        # Extract og:image for thumbnail
        og_soup = BeautifulSoup(raw_html, 'html.parser')
        og_tag = og_soup.find('meta', property='og:image')
        if og_tag and og_tag.get('content'):
            og_image = og_tag['content']
        if not og_image:
            tw_tag = og_soup.find('meta', attrs={'name': 'twitter:image'})
            if tw_tag and tw_tag.get('content'):
                og_image = tw_tag['content']

        doc = Document(raw_html, url=url)
        content_html = doc.summary()

        if not content_html or len(content_html.strip()) < 50:
            return (None, og_image)

        reader_soup = BeautifulSoup(content_html, 'html.parser')

        junk_text_re = re.compile(
            r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
            r'share or comment|share this article|comment on this|follow us on|'
            r'sign up for|subscribe to|have you got a story|tips@|email us)',
            re.I
        )

        inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
        inline_attrs = {'a': ['href']}

        html_parts = []
        for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
            text = el.get_text(strip=True)
            if len(text) < 30:
                continue
            if junk_text_re.search(text):
                continue

            tag = el.name
            inner = bleach.clean(
                el.decode_contents(), tags=inline_tags,
                attributes=inline_attrs, strip=True, protocols=['http', 'https']
            ).strip()
            if not inner:
                continue

            if tag == 'p':
                html_parts.append(f'<p>{inner}</p>')
            elif tag in ('h2', 'h3', 'h4'):
                html_parts.append(f'<{tag}>{inner}</{tag}>')
            elif tag == 'blockquote':
                html_parts.append(f'<blockquote>{inner}</blockquote>')
            elif tag in ('ul', 'ol'):
                items = []
                for li in el.find_all('li', recursive=False):
                    li_inner = bleach.clean(
                        li.decode_contents(), tags=inline_tags,
                        attributes=inline_attrs, strip=True, protocols=['http', 'https']
                    ).strip()
                    if li_inner and len(li.get_text(strip=True)) > 10:
                        items.append(f'<li>{li_inner}</li>')
                if items:
                    html_parts.append(f'<{tag}>{"".join(items)}</{tag}>')

        junk_img_re = re.compile(r'(logo|icon|pixel|spacer|blank|1x1|svg|avatar|spinner|/ct/|cts\.businesswire)', re.I)
        seen_srcs = set()
        article_images = []
        for img in reader_soup.find_all('img'):
            src = img.get('src', '')
            if src and src.startswith(('http://', 'https://')) and src not in seen_srcs:
                if junk_img_re.search(src):
                    continue
                seen_srcs.add(src)
                alt = (img.get('alt', '') or '').strip()
                # src must stay the first, double-quoted attribute: the image
                # caching step locates tags with a `<img\s+src="..."` regex.
                article_images.append(f'<img src="{src}" alt="{bleach.clean(alt)}">')

        # If readability found no images, grab first real image from original HTML
        if not article_images:
            orig_soup = BeautifulSoup(raw_html, 'html.parser')
            for noise in orig_soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                             'aside', 'form', 'noscript', 'svg']):
                noise.decompose()
            for img in orig_soup.find_all('img'):
                # Lazy-loading attributes take precedence over src.
                src = (img.get('data-src') or img.get('data-lazy-src') or
                       img.get('data-original') or img.get('src') or '')
                if not src or not src.startswith(('http://', 'https://')):
                    continue
                src_lower = src.lower()
                if any(x in src_lower for x in ('logo', 'icon', 'pixel', 'spacer', 'blank',
                                                '1x1', 'svg', 'avatar', 'spinner', '/ct/')):
                    continue
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{src}" alt="{bleach.clean(alt)}">')
                break  # Only first real image

        if article_images and html_parts:
            # Spread images evenly through the text; leftovers go at the end.
            text_count = len(html_parts)
            img_count = len(article_images)
            interval = max(1, text_count // (img_count + 1))
            merged = []
            img_idx = 0
            for i, part in enumerate(html_parts):
                merged.append(part)
                if img_idx < img_count and (i + 1) % interval == 0:
                    merged.append(article_images[img_idx])
                    img_idx += 1
            while img_idx < img_count:
                merged.append(article_images[img_idx])
                img_idx += 1
            html_parts = merged
        elif article_images and not html_parts:
            html_parts = article_images

        if not html_parts:
            # Last resort: plain text of the readability output as paragraphs.
            text = reader_soup.get_text(separator='\n\n', strip=True)
            if text:
                for para in text.split('\n\n'):
                    para = para.strip()
                    if len(para) > 30:
                        html_parts.append(f'<p>{bleach.clean(para)}</p>')

        if not html_parts:
            return (None, og_image)

        clean_parts = []
        for part in html_parts:
            part_soup = BeautifulSoup(part, 'html.parser')
            part_text = part_soup.get_text(strip=True)
            if len(part_text) > 100:
                # Long text with very long "words" is treated as run-together
                # menu/list text rather than prose and is dropped.
                words = part_text.split()
                avg_word_len = len(part_text) / max(len(words), 1)
                if avg_word_len > 12:
                    continue
            clean_parts.append(part)

        if not clean_parts:
            return (None, og_image)

        result = '\n'.join(clean_parts)
        plain_text = BeautifulSoup(result, 'html.parser').get_text(separator=' ', strip=True)

        garbage_re = re.compile(
            r'(use (left|right|escape)|arrow keys|navigate between|'
            r'sign (in|up) with|we won.t post|social account|'
            r'accept cookies|cookie policy|privacy policy|terms of (use|service)|'
            r'AlabamaAlaska|CaliforniaColorado|United States of America)',
            re.I
        )
        if len(plain_text) < 200 or garbage_re.search(plain_text):
            return (None, og_image)

        return (result, og_image)

    except Exception as e:
        logger.debug(f"Article parsing failed for {url}: {e}")
        return (None, og_image)


def _fetch_html_flaresolverr(url: str) -> Optional[str]:
    """Fetch HTML via FlareSolverr (headless browser).

    Posts a ``request.get`` command to the FlareSolverr endpoint on
    localhost:8191. Returns the response HTML when the call reports status
    ``ok`` and the body is longer than 500 characters, otherwise None.
    """
    try:
        import requests
        resp = requests.post('http://localhost:8191/v1', json={
            'cmd': 'request.get',
            'url': url,
            'maxTimeout': 30000
        }, timeout=45)
        data = resp.json()
        if data.get('status') == 'ok':
            html = data.get('solution', {}).get('response', '')
            if len(html) > 500:
                return html
    except Exception as e:
        # Best-effort fallback: record why it failed instead of hiding it.
        logger.debug(f"FlareSolverr fetch failed for {url}: {e}")
    return None


def extract_article_content_with_image(url: str) -> tuple:
    """Extract article content and og:image.

    Tries a direct fetch first and falls back to FlareSolverr when the
    direct fetch fails or yields no usable content.

    Returns (content_html, image_url); either element may be None.
    """
    import urllib.request

    direct_image = None

    # Try direct fetch first
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        with urllib.request.urlopen(req, timeout=20) as response:
            raw_html = response.read().decode('utf-8', errors='replace')
        content, direct_image = _parse_article_html(raw_html, url)
        if content:
            return (content, direct_image)
    except Exception as e:
        logger.debug(f"Direct article fetch failed for {url}: {e}")

    # Fallback to FlareSolverr for bot-protected sites
    raw_html = _fetch_html_flaresolverr(url)
    if raw_html:
        content, image = _parse_article_html(raw_html, url)
        return (content, image or direct_image)

    # No content either way; still hand back an image found by the direct fetch.
    return (None, direct_image)


def extract_article_content(url: str) -> Optional[str]:
    """Extract article content (legacy wrapper, returns content only)."""
    content, _ = extract_article_content_with_image(url)
    return content


def process_press_articles(db, celebrity_id: Optional[int] = None, send_notifications: bool = True) -> Dict:
    """
    Fetch news articles for celebrities, deduplicate, store, and notify.

    With ``celebrity_id`` set only that celebrity is checked; otherwise the
    ``celebrity_ids`` list from the press config is used, and nothing is
    fetched when that list is empty. An article is stored only when the
    celebrity's name appears in its title and neither the (celebrity, title)
    pair nor the decoded URL's hash is already present.

    The module-level ``_fetch_running`` flag is set for the duration of the
    call and cleared in a ``finally``.

    Returns stats about the operation: ``total_fetched``, ``total_new`` and
    ``by_celebrity`` on success, or ``error`` plus zeroed totals on failure.

    NOTE(review): the statements between the inline-image fallback and the
    published-date parsing were missing from the reviewed copy of the file
    and have been reconstructed from the surrounding code; confirm them.
    """
    import time as _time
    import traceback

    global _fetch_running
    with _fetch_lock:
        _fetch_running = True

    try:
        config = _get_config(db)
        max_records = config.get('max_records_per_query', 25)
        notify_enabled = config.get('notify_new_articles', True)

        # Get celebrities to check
        if celebrity_id:
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('SELECT id, name, slug FROM celebrity_profiles WHERE id = ?', (celebrity_id,))
                row = cursor.fetchone()
                celebrities = [dict(row)] if row else []
        else:
            # Use celebrity_ids from config if set, otherwise skip
            configured_ids = config.get('celebrity_ids', [])
            if configured_ids:
                with db.get_connection() as conn:
                    cursor = conn.cursor()
                    placeholders = ','.join(['?' for _ in configured_ids])
                    cursor.execute(
                        f'SELECT id, name, slug FROM celebrity_profiles WHERE id IN ({placeholders}) ORDER BY name',
                        configured_ids
                    )
                    celebrities = [dict(r) for r in cursor.fetchall()]
            else:
                celebrities = []

        total_new = 0
        total_fetched = 0
        results_by_celebrity = {}

        for idx, celeb in enumerate(celebrities):
            celeb_id = celeb['id']
            celeb_name = celeb['name']

            # Small delay between celebrities
            if idx > 0:
                _time.sleep(2)

            # Fetch from Google News RSS
            articles = fetch_google_news_articles(celeb_name, max_records)
            total_fetched += len(articles)
            new_count = 0

            for article in articles:
                google_url = article.get('url', '')
                if not google_url:
                    continue

                article_title = article.get('title', '').strip()

                # Only keep articles where celeb name appears in the title.
                # Done before any database work because it needs no I/O.
                if not article_title or celeb_name.lower() not in article_title.lower():
                    continue

                # Check for duplicate by title (cheap check before URL decode)
                with db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute(
                        'SELECT id FROM press_articles WHERE celebrity_id = ? AND title = ?',
                        (celeb_id, article_title)
                    )
                    if cursor.fetchone():
                        continue

                # Decode Google News URL to real article URL
                article_url = _decode_google_news_url(google_url)
                if not article_url:
                    continue

                url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest()

                # Check for duplicate by URL hash
                with db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute('SELECT id FROM press_articles WHERE url_hash = ?', (url_hash,))
                    if cursor.fetchone():
                        continue

                # Parse domain from real URL; drop only a leading "www."
                parsed = urlparse(article_url)
                domain = parsed.netloc
                if domain.startswith('www.'):
                    domain = domain[len('www.'):]

                # Extract article content and og:image
                content, og_image = extract_article_content_with_image(article_url)

                # Cache all inline images in the content to local proxy
                if content:
                    content = _cache_content_images(content)

                if content:
                    snippet = re.sub(r'<[^>]+>', ' ', content)
                    snippet = ' '.join(snippet.split())[:300]
                else:
                    snippet = article_title[:300]

                # Cache the og:image locally, fall back to first inline image
                cached_image = cache_press_image(og_image) if og_image else None
                if not cached_image and content:
                    m = re.search(r'<img\s+src="([^"]+)"', content)
                    if m:
                        cached_image = m.group(1)

                # Parse published date
                published_date = article.get('seendate', '')
                if published_date:
                    # Convert GDELT format -> ISO format
                    try:
                        dt = datetime.strptime(published_date, '%Y%m%dT%H%M%SZ')
                        published_date = dt.isoformat()
                    except (ValueError, TypeError):
                        pass

                # Insert article. The stripped title is stored so that it
                # matches what the duplicate-by-title check compares against.
                with db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute('''
                        INSERT INTO press_articles
                        (celebrity_id, title, url, url_hash, domain, published_date,
                         image_url, language, country, article_content, snippet, notified, read)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0, 0)
                    ''', (
                        celeb_id,
                        article_title,
                        article_url,
                        url_hash,
                        domain,
                        published_date,
                        cached_image or '',
                        article.get('language', ''),
                        article.get('sourcecountry', ''),
                        content,
                        snippet,
                    ))
                    conn.commit()
                new_count += 1

            total_new += new_count
            if new_count > 0:
                results_by_celebrity[celeb_name] = new_count

        # Send notifications for new articles (only for scheduled fetches)
        if send_notifications and notify_enabled and total_new > 0:
            _send_press_notification(db, results_by_celebrity)

        # Mark notified
        if total_new > 0:
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('UPDATE press_articles SET notified = ? WHERE notified = ?', (1, 0))
                conn.commit()

        logger.info(f"Press monitor: fetched {total_fetched}, new {total_new}")
        return {
            'total_fetched': total_fetched,
            'total_new': total_new,
            'by_celebrity': results_by_celebrity,
        }

    except Exception as e:
        logger.error(f"Error in press article processing: {e}")
        logger.debug(f"Traceback: {traceback.format_exc()}")
        return {'error': str(e), 'total_fetched': 0, 'total_new': 0}
    finally:
        with _fetch_lock:
            _fetch_running = False


def _send_press_notification(db, results_by_celebrity: Dict):
    """Send Pushover notification about new press articles.

    ``results_by_celebrity`` maps celebrity name to the number of new
    articles. Nothing is sent when no notifier can be built from the
    settings. Any failure is logged and not re-raised.
    """
    try:
        from modules.pushover_notifier import create_notifier_from_config
        from modules.settings_manager import SettingsManager

        settings_manager = SettingsManager(str(db.db_path))
        config = settings_manager.get_all()
        notifier = create_notifier_from_config(config, unified_db=db)

        if not notifier:
            return

        total = sum(results_by_celebrity.values())
        title = f"📰 Press: {total} new article{'s' if total != 1 else ''}"

        # Build rich HTML message matching other notification formats
        message_parts = []
        for name, count in results_by_celebrity.items():
            message_parts.append(f"👤 {name}: {count} article{'s' if count != 1 else ''}")

        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        message_parts.append(f"\n⏰ Discovered: {now}")
        message = "\n".join(message_parts)

        # Set notification context so it gets recorded to the notifications table
        # and broadcast via websocket for real-time UI updates
        notifier._current_notification_context = {
            'platform': 'press',
            'source': 'GDELT',
            'content_type': 'article',
            'download_count': total,
            'metadata': {'by_celebrity': results_by_celebrity}
        }

        notifier.send_notification(
            title=title,
            message=message,
            priority=0,
            html=True,
        )
    except Exception as e:
        logger.error(f"Failed to send press notification: {e}")


# ============================================================================
# CONFIGURATION ENDPOINTS
# ============================================================================

@router.get("/config")
@limiter.limit("30/minute")
@handle_exceptions
async def get_config(
    request: Request,
    current_user: Dict = Depends(get_current_user)
):
    """Get press monitor configuration."""
    db = _get_db()
    config = _get_config(db)
    return {"success": True, "config": config}


@router.put("/config")
@limiter.limit("10/minute")
@handle_exceptions
async def update_config(
    request: Request,
    config_update: PressConfigUpdate,
    current_user: Dict = Depends(get_current_user)
):
    """Update press monitor configuration.

    Only fields present (non-None) in the request body are written.
    Booleans are stored as 0/1 and ``celebrity_ids`` as a JSON string.
    """
    db = _get_db()
    updates = config_update.model_dump(exclude_none=True)
    if not updates:
        return {"success": False, "message": "No fields to update"}

    set_parts = []
    values = []
    # Column names come from the PressConfigUpdate field names, never from
    # free-form client input, so interpolating them into the SQL is safe.
    for key, value in updates.items():
        if isinstance(value, bool):
            value = 1 if value else 0
        elif key == 'celebrity_ids' and isinstance(value, list):
            value = json.dumps(value)
        set_parts.append(f"{key} = ?")
        values.append(value)

    set_parts.append("updated_at = CURRENT_TIMESTAMP")

    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
            f"UPDATE press_config SET {', '.join(set_parts)} WHERE id = 1",
            values
        )
        conn.commit()

    return {"success": True, "message": "Configuration updated"}


# ============================================================================
# CELEBRITY SELECTION ENDPOINT
# ============================================================================

@router.get("/celebrities")
@limiter.limit("30/minute")
@handle_exceptions
async def get_press_celebrities(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Get all tracked celebrities with press_enabled flag based on config."""
    db = _get_db()
    config = _get_config(db)
    enabled_ids = set(config.get('celebrity_ids', []))

    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT id, name, slug
            FROM celebrity_profiles
            WHERE enabled = 1
            ORDER BY name
        ''')
        rows = cursor.fetchall()

    celebrities = []
    for r in rows:
        celeb = dict(r)
        celeb['press_enabled'] = celeb['id'] in enabled_ids
        celebrities.append(celeb)

    return {"success": True, "celebrities": celebrities}


# ============================================================================
# ARTICLE ENDPOINTS
# ============================================================================

@router.get("/articles")
@limiter.limit("30/minute")
@handle_exceptions
async def get_articles(
    request: Request,
    current_user: Dict = Depends(get_current_user),
    celebrity_id: Optional[int] = None,
    domain: Optional[str] = None,
    read: Optional[bool] = None,
    search: Optional[str] = None,
    page: int = 1,
    per_page: int = 50,
):
    """Get paginated list of press articles with filters.

    ``page`` and ``per_page`` are clamped to a minimum of 1. ``search`` is
    matched as a substring of the title or snippet.
    """
    db = _get_db()

    # A page below 1 would produce a negative OFFSET, and SQLite treats a
    # negative LIMIT as "no limit"; clamp both instead of passing them on.
    page = max(1, page)
    per_page = max(1, per_page)

    conditions = []
    params = []

    if celebrity_id is not None:
        conditions.append("pa.celebrity_id = ?")
        params.append(celebrity_id)
    if domain is not None:
        conditions.append("pa.domain = ?")
        params.append(domain)
    if read is not None:
        conditions.append("pa.read = ?")
        params.append(1 if read else 0)
    if search:
        conditions.append("(pa.title LIKE ? OR pa.snippet LIKE ?)")
        params.extend([f'%{search}%', f'%{search}%'])

    where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    offset = (page - 1) * per_page

    with db.get_connection() as conn:
        cursor = conn.cursor()

        # Get total count
        cursor.execute(f"SELECT COUNT(*) FROM press_articles pa {where_clause}", params)
        total = cursor.fetchone()[0]

        # Get articles
        cursor.execute(f'''
            SELECT pa.id, pa.celebrity_id, pa.title, pa.url, pa.domain,
                   pa.published_date, pa.image_url, pa.language, pa.country,
                   pa.snippet, pa.fetched_at, pa.read,
                   cp.name as celebrity_name
            FROM press_articles pa
            LEFT JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
            {where_clause}
            ORDER BY pa.published_date DESC
            LIMIT ? OFFSET ?
        ''', params + [per_page, offset])

        articles = [dict(r) for r in cursor.fetchall()]

    return {
        "success": True,
        "articles": articles,
        "total": total,
        "page": page,
        "per_page": per_page,
        "pages": (total + per_page - 1) // per_page if per_page else 1,
    }


@router.get("/articles/{article_id}")
@limiter.limit("30/minute")
@handle_exceptions
async def get_article(
    request: Request,
    article_id: int,
    current_user: Dict = Depends(get_current_user),
):
    """Get a single article with full content. Responds 404 when missing."""
    db = _get_db()
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT pa.*, cp.name as celebrity_name
            FROM press_articles pa
            LEFT JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
            WHERE pa.id = ?
        ''', (article_id,))
        row = cursor.fetchone()

    if not row:
        raise HTTPException(status_code=404, detail="Article not found")

    return {"success": True, "article": dict(row)}


@router.patch("/articles/{article_id}/read")
@limiter.limit("30/minute")
@handle_exceptions
async def update_read_status(
    request: Request,
    article_id: int,
    body: ReadStatusUpdate,
    current_user: Dict = Depends(get_current_user),
):
    """Mark an article as read or unread. Responds 404 when missing."""
    db = _get_db()
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
            'UPDATE press_articles SET read = ? WHERE id = ?',
            (1 if body.read else 0, article_id)
        )
        conn.commit()
        if cursor.rowcount == 0:
            raise HTTPException(status_code=404, detail="Article not found")

    return {"success": True, "message": f"Article marked as {'read' if body.read else 'unread'}"}


@router.post("/articles/mark-all-read")
@limiter.limit("10/minute")
@handle_exceptions
async def mark_all_read(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Mark all unread articles as read and report how many were changed."""
    db = _get_db()
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('UPDATE press_articles SET read = 1 WHERE read = 0')
        conn.commit()
        count = cursor.rowcount

    return {"success": True, "message": f"Marked {count} article{'s' if count != 1 else ''} as read", "count": count}


@router.delete("/articles/{article_id}")
@limiter.limit("10/minute")
@handle_exceptions
async def delete_article(
    request: Request,
    article_id: int,
    current_user: Dict = Depends(get_current_user),
):
    """Delete a press article. Responds 404 when missing."""
    db = _get_db()
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('DELETE FROM press_articles WHERE id = ?', (article_id,))
        conn.commit()
        if cursor.rowcount == 0:
            raise HTTPException(status_code=404, detail="Article not found")

    return {"success": True, "message": "Article deleted"}


# ============================================================================
# STATS ENDPOINT
# ============================================================================

@router.get("/stats")
@limiter.limit("30/minute")
@handle_exceptions
async def get_stats(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Get press article statistics.

    Returns the total and unread article counts, per-celebrity counts and
    the ten domains with the most articles.
    """
    db = _get_db()
    with db.get_connection() as conn:
        cursor = conn.cursor()

        # Total articles
        cursor.execute('SELECT COUNT(*) FROM press_articles')
        total = cursor.fetchone()[0]

        # Unread count
        cursor.execute('SELECT COUNT(*) FROM press_articles WHERE read = 0')
        unread = cursor.fetchone()[0]

        # By celebrity
        cursor.execute('''
            SELECT pa.celebrity_id, cp.name, COUNT(*) as count
            FROM press_articles pa
            JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
            GROUP BY pa.celebrity_id, cp.name
            ORDER BY count DESC
        ''')
        by_celebrity = [{'id': r['celebrity_id'], 'name': r['name'], 'count': r['count']}
                        for r in cursor.fetchall()]

        # By domain (top 10)
        cursor.execute('''
            SELECT domain, COUNT(*) as count
            FROM press_articles
            GROUP BY domain
            ORDER BY count DESC
            LIMIT 10
        ''')
        by_domain = [{'domain': r['domain'], 'count': r['count']}
                     for r in cursor.fetchall()]

    return {
        "success": True,
        "stats": {
            "total": total,
            "unread": unread,
            "by_celebrity": by_celebrity,
            "by_domain": by_domain,
        }
    }


# ============================================================================
# FETCH ENDPOINT
# ============================================================================

@router.post("/fetch")
@limiter.limit("5/minute")
@handle_exceptions
async def trigger_fetch(
    request: Request,
    background_tasks: BackgroundTasks,
    current_user: Dict = Depends(get_current_user),
    celebrity_id: Optional[int] = None,
):
    """Trigger a manual fetch for all or a specific celebrity.

    The fetch runs on the module's thread pool with notifications disabled.
    When a fetch is already running the request is refused with
    ``success: False``.
    """
    global _fetch_running

    db = _get_db()

    # Check and claim the flag under a single lock acquisition. Checking here
    # and only setting it once the worker starts would let two requests that
    # arrive close together both pass the check.
    with _fetch_lock:
        if _fetch_running:
            return {"success": False, "message": "Fetch already in progress"}
        _fetch_running = True

    def do_fetch():
        # process_press_articles clears _fetch_running in its finally block.
        process_press_articles(db, celebrity_id, send_notifications=False)

    try:
        # submit() returns immediately; the work happens on a pool thread.
        _executor.submit(do_fetch)
    except Exception:
        # The job never started, so nothing else will release the flag.
        with _fetch_lock:
            _fetch_running = False
        raise

    return {"success": True, "message": "Fetch started"}


@router.get("/fetch/status")
@limiter.limit("30/minute")
@handle_exceptions
async def get_fetch_status(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Check if a fetch is currently running."""
    return {"success": True, "is_running": _fetch_running}


# ============================================================================
# IMAGE PROXY / CACHE
# ============================================================================

PRESS_IMAGE_CACHE = Path("/opt/media-downloader/data/press_images")
PRESS_IMAGE_CACHE.mkdir(parents=True, exist_ok=True)


def _cache_content_images(html_content: str) -> str:
    """Cache every <img> in the HTML content and rewrite its src.

    Each image whose tag starts with ``<img src="..."`` is downloaded via
    ``cache_press_image`` and its src replaced by the local
    ``/api/press/images/...`` path. Tags already pointing at that path are
    left alone. Tags whose image cannot be cached are removed, so the
    article shows no image rather than a broken one.

    NOTE(review): the regex literal was truncated to ``]*>`` in the reviewed
    copy of the file; the pattern below is reconstructed from that remnant
    and from the use of ``match.group(1)`` as the src. Confirm it.
    """
    if not html_content:
        return html_content

    def _replace_img(match):
        full_tag = match.group(0)
        src = match.group(1)
        if not src or src.startswith('/api/press/images/'):
            return full_tag
        cached = cache_press_image(src)
        if cached:
            return full_tag.replace(src, cached)
        return ''  # Remove img if caching failed

    return re.sub(r'<img\s+src="([^"]+)"[^>]*>', _replace_img, html_content)


def _sniff_image_ext(data: bytes) -> Optional[str]:
    """Return the file extension implied by the data's magic bytes, or None."""
    if data.startswith(b'\x89PNG\r\n\x1a\n'):
        return '.png'
    if data.startswith((b'GIF87a', b'GIF89a')):
        return '.gif'
    if data[:4] == b'RIFF' and data[8:12] == b'WEBP':
        return '.webp'
    if data.startswith(b'\xff\xd8\xff'):
        return '.jpg'
    return None


def cache_press_image(image_url: str, use_flaresolverr: bool = False) -> Optional[str]:
    """Download an image and cache it locally.

    The cache filename is the first 16 hex characters of the SHA-256 of the
    URL plus an extension. Only http(s) URLs are fetched. Responses with a
    ``text/*`` content type or fewer than 1000 bytes are rejected.

    ``use_flaresolverr`` is accepted for compatibility but has no effect:
    FlareSolverr returns rendered HTML, not binary data, so it cannot
    fetch images.

    Returns the API path that serves the cached file, or None when the
    image could not be fetched or stored.
    """
    if not image_url:
        return None

    import urllib.request

    # The URL comes from untrusted page markup; urlopen would also honour
    # schemes such as file://, so restrict it to web URLs.
    if urlparse(image_url).scheme not in ('http', 'https'):
        return None

    # Hash the URL for the filename
    url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]

    # Check if already cached
    for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
        cached = PRESS_IMAGE_CACHE / f"{url_hash}{ext}"
        if cached.exists() and cached.stat().st_size > 0:
            return f"/api/press/images/{cached.name}"

    # Download the image
    image_data = None
    try:
        req = urllib.request.Request(image_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'image/*,*/*',
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            content_type = (resp.headers.get('Content-Type', '') or '').lower()
            # Servers do not always label images correctly, so anything
            # that is not explicitly text (e.g. an HTML error page) is read.
            if not content_type.startswith('text/'):
                image_data = resp.read()
    except Exception as e:
        logger.debug(f"Image download failed for {image_url}: {e}")

    # Too small, likely an error page or tracking pixel
    if not image_data or len(image_data) < 1000:
        return None

    # Determine extension from content, falling back to the URL
    ext = _sniff_image_ext(image_data)
    if ext is None:
        ext = '.jpg'
        url_lower = image_url.lower()
        if '.png' in url_lower:
            ext = '.png'
        elif '.webp' in url_lower:
            ext = '.webp'
        elif '.gif' in url_lower:
            ext = '.gif'

    cached_path = PRESS_IMAGE_CACHE / f"{url_hash}{ext}"
    try:
        cached_path.write_bytes(image_data)
    except OSError as e:
        # A full or read-only disk must not abort the caller's whole run.
        logger.error(f"Failed to write cached image {cached_path}: {e}")
        return None
    return f"/api/press/images/{cached_path.name}"


@router.get("/images/{filename}")
async def serve_press_image(filename: str, current_user: Dict = Depends(get_current_user)):
    """Serve a cached press article image.

    Responds 400 for filenames containing path separators or ``..`` and 404
    when no such file exists in the cache directory.
    """
    # Sanitize filename
    if '/' in filename or '\\' in filename or '..' in filename:
        raise HTTPException(status_code=400, detail="Invalid filename")

    filepath = PRESS_IMAGE_CACHE / filename
    # is_file() rather than exists(): a directory must not be served.
    if not filepath.is_file():
        raise HTTPException(status_code=404, detail="Image not found")

    # Determine media type
    suffix = filepath.suffix.lower()
    media_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.gif': 'image/gif',
    }
    media_type = media_types.get(suffix, 'image/jpeg')

    return FileResponse(filepath, media_type=media_type, headers={
        'Cache-Control': 'public, max-age=86400',
    })