- scheduler.py: Use full path for scheduler_state.db instead of relative name - recycle.py: Use full path for thumbnails.db instead of relative name - cloud_backup.py, maintenance.py, stats.py: Require admin for config/cleanup/settings endpoints - press.py: Add auth to press image serving endpoint - private_gallery.py: Fix _create_pg_job call and add missing secrets import - appearances.py: Use sync httpx instead of asyncio.run for background thread HTTP call Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1099 lines
38 KiB
Python
1099 lines
38 KiB
Python
"""
|
|
Press Router
|
|
|
|
Monitors Google News RSS (GDELT-compatible record format) for news articles mentioning tracked celebrities.
|
|
Stores complete articles and sends Pushover push notifications.
|
|
"""
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
import threading
|
|
import re
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from datetime import datetime
|
|
from typing import Dict, List, Optional
|
|
import urllib.error
|
|
from urllib.parse import urlparse
|
|
|
|
from pathlib import Path
|
|
|
|
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request
|
|
from fastapi.responses import FileResponse
|
|
from pydantic import BaseModel
|
|
from slowapi import Limiter
|
|
from slowapi.util import get_remote_address
|
|
|
|
from ..core.dependencies import get_current_user, get_app_state
|
|
from ..core.exceptions import handle_exceptions
|
|
from modules.universal_logger import get_logger
|
|
|
|
# Module-wide logger (shared 'API' channel).
logger = get_logger('API')

# All endpoints in this module are mounted under /api/press.
router = APIRouter(prefix="/api/press", tags=["Press"])
# Rate limiter keyed on the client's remote address.
limiter = Limiter(key_func=get_remote_address)

# Small dedicated pool for blocking fetch work started from the API.
_executor = ThreadPoolExecutor(max_workers=2)

# Flag (guarded by the lock) telling whether a fetch run is in progress.
_fetch_lock = threading.Lock()
_fetch_running = False
|
|
|
|
|
|
# ============================================================================
|
|
# PYDANTIC MODELS
|
|
# ============================================================================
|
|
|
|
class PressConfigUpdate(BaseModel):
    """Partial update payload for the press monitor configuration.

    Every field is optional; fields left as ``None`` are excluded by
    ``update_config`` and therefore keep their stored value.
    """

    # Stored as 0/1 in press_config; not consulted by the code in this module.
    enabled: Optional[bool] = None
    # Interval in hours; presumably read by an external scheduler — TODO confirm.
    check_interval_hours: Optional[int] = None
    # Upper bound on articles taken from one Google News query.
    max_records_per_query: Optional[int] = None
    # Whether scheduled fetches may send a push notification.
    notify_new_articles: Optional[bool] = None
    # celebrity_profiles ids to monitor; persisted as a JSON string.
    celebrity_ids: Optional[List[int]] = None
|
|
|
|
|
|
class ReadStatusUpdate(BaseModel):
    """Request body for toggling an article's read flag."""

    # True marks the article as read, False as unread.
    read: bool
|
|
|
|
|
|
# ============================================================================
|
|
# HELPER FUNCTIONS
|
|
# ============================================================================
|
|
|
|
def _get_db():
    """Return the database handle stored on the shared application state."""
    return get_app_state().db
|
|
|
|
|
|
def _get_config(db) -> Dict:
    """Load the press monitor settings (row ``id = 1`` of ``press_config``).

    Args:
        db: Database handle exposing ``get_connection()`` as a context manager.

    Returns:
        Dict with ``enabled``, ``check_interval_hours``,
        ``max_records_per_query``, ``notify_new_articles`` and
        ``celebrity_ids``. Built-in defaults are returned when the row does
        not exist.
    """
    with db.get_connection() as conn:
        cur = conn.cursor()
        cur.execute('SELECT * FROM press_config WHERE id = 1')
        record = cur.fetchone()

        if not record:
            # No stored configuration yet: fall back to the defaults.
            return {
                'enabled': True,
                'check_interval_hours': 6,
                'max_records_per_query': 25,
                'notify_new_articles': True,
                'celebrity_ids': [],
            }

        # celebrity_ids is stored as a JSON string; the column may be absent.
        raw_ids = record['celebrity_ids'] if 'celebrity_ids' in record.keys() else None
        if raw_ids:
            try:
                selected_ids = json.loads(raw_ids)
            except (json.JSONDecodeError, TypeError):
                selected_ids = []
        else:
            selected_ids = []

        return {
            'enabled': bool(record['enabled']),
            'check_interval_hours': record['check_interval_hours'],
            'max_records_per_query': record['max_records_per_query'],
            'notify_new_articles': bool(record['notify_new_articles']),
            'celebrity_ids': selected_ids,
        }
|
|
|
|
|
|
def _get_enabled_celebrities(db) -> List[Dict]:
    """Return id, name and slug of every enabled celebrity profile, by name."""
    query = '''
        SELECT id, name, slug FROM celebrity_profiles
        WHERE enabled = 1
        ORDER BY name
    '''
    with db.get_connection() as conn:
        cur = conn.cursor()
        cur.execute(query)
        return [dict(record) for record in cur.fetchall()]
|
|
|
|
|
|
def _decode_google_news_url(google_url: str) -> Optional[str]:
|
|
"""Decode a Google News redirect URL to the real article URL."""
|
|
if 'news.google.com' not in google_url:
|
|
return google_url
|
|
try:
|
|
from googlenewsdecoder import gnewsdecoder
|
|
result = gnewsdecoder(google_url, interval=1)
|
|
if result.get('status'):
|
|
return result['decoded_url']
|
|
except Exception as e:
|
|
logger.debug(f"Failed to decode Google News URL: {e}")
|
|
return None
|
|
|
|
|
|
def _parse_google_news_rss(data: str) -> List[Dict]:
    """Convert a Google News RSS document into GDELT-compatible article dicts."""
    # NOTE(review): ElementTree is not hardened against malicious XML; the feed
    # is fetched from news.google.com over HTTPS, so it is treated as trusted.
    import xml.etree.ElementTree as ET
    from datetime import timezone
    from email.utils import parsedate_to_datetime

    articles = []
    for item in ET.fromstring(data).findall('.//item'):
        title_el = item.find('title')
        link_el = item.find('link')
        if title_el is None or link_el is None:
            continue

        title = title_el.text or ''
        source_el = item.find('source')
        source_name = (source_el.text or '') if source_el is not None else ''
        # Google News appends " - <publisher>" to every headline; drop it.
        suffix = f' - {source_name}'
        if source_name and title.endswith(suffix):
            title = title[:-len(suffix)].strip()

        # pubDate is RFC 2822; convert to the GDELT-style UTC stamp.
        seendate = ''
        pub_el = item.find('pubDate')
        if pub_el is not None and pub_el.text:
            try:
                dt = parsedate_to_datetime(pub_el.text)
                if dt.tzinfo is not None:
                    # The stamp is labelled "Z", so normalise to UTC first.
                    dt = dt.astimezone(timezone.utc)
                seendate = dt.strftime('%Y%m%dT%H%M%SZ')
            except (TypeError, ValueError, OverflowError):
                seendate = ''  # Unparseable date: leave it blank.

        articles.append({
            'title': title,
            'url': link_el.text or '',
            'seendate': seendate,
            'socialimage': '',
            'language': 'en',
            'sourcecountry': '',
        })
    return articles


def fetch_google_news_articles(name: str, max_records: int = 100) -> List[Dict]:
    """Query Google News RSS for articles mentioning ``name`` as an exact phrase.

    Args:
        name: Person name to search for; it is quoted and URL-encoded.
        max_records: Maximum number of articles to return.

    Returns:
        List of dicts with the keys ``title``, ``url``, ``seendate``
        (``YYYYMMDDTHHMMSSZ`` or ``''``), ``socialimage``, ``language`` and
        ``sourcecountry``. An empty list is returned after three failed
        attempts (5 seconds apart).
    """
    import time
    import urllib.request
    from urllib.parse import quote_plus

    # quote_plus handles spaces, '&', '#', non-ASCII names, etc.
    query = quote_plus(f'"{name}"')
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'
    request = urllib.request.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            with urllib.request.urlopen(request, timeout=30) as response:
                data = response.read().decode('utf-8')

            articles = _parse_google_news_rss(data)
            logger.info(f"Google News: {len(articles)} articles for '{name}'")
            return articles[:max_records]
        except Exception as e:
            # Network and parse errors are both retried.
            if attempt < max_attempts:
                time.sleep(5)
                continue
            logger.error(f"Google News fetch error for '{name}': {e}")
            return []
    return []
|
|
|
|
|
|
def _parse_article_html(raw_html: str, url: str) -> tuple:
    """Parse raw HTML into sanitised article content and a thumbnail URL.

    Args:
        raw_html: Complete HTML document of the article page.
        url: URL the document was fetched from; passed to readability and
            used to resolve a relative og:image.

    Returns:
        ``(content_html, image_url)``. ``content_html`` is ``None`` when no
        usable article text could be extracted; ``image_url`` is the
        og:image / twitter:image value or ``None``.
    """
    from html import escape as _escape_attr
    from urllib.parse import urljoin

    og_image = None
    try:
        from readability import Document
        from bs4 import BeautifulSoup
        import bleach

        # --- Thumbnail: og:image first, twitter:image as fallback -----------
        meta_soup = BeautifulSoup(raw_html, 'html.parser')
        og_tag = meta_soup.find('meta', property='og:image')
        if og_tag and og_tag.get('content'):
            og_image = og_tag['content']
        if not og_image:
            tw_tag = meta_soup.find('meta', attrs={'name': 'twitter:image'})
            if tw_tag and tw_tag.get('content'):
                og_image = tw_tag['content']
        if og_image:
            # Relative image URLs cannot be downloaded later; make them absolute.
            og_image = urljoin(url, og_image.strip())

        # --- Main content via readability -----------------------------------
        content_html = Document(raw_html, url=url).summary()
        if not content_html or len(content_html.strip()) < 50:
            return (None, og_image)

        reader_soup = BeautifulSoup(content_html, 'html.parser')

        # Bylines, share prompts and similar boilerplate.
        junk_text_re = re.compile(
            r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
            r'share or comment|share this article|comment on this|follow us on|'
            r'sign up for|subscribe to|have you got a story|tips@|email us)',
            re.I
        )
        # Tracking pixels, logos and other non-article images.
        junk_img_re = re.compile(
            r'(logo|icon|pixel|spacer|blank|1x1|svg|avatar|spinner|/ct/|cts\.businesswire)',
            re.I
        )

        inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
        inline_attrs = {'a': ['href']}

        def _clean_inline(element) -> str:
            """Sanitise an element's inner HTML down to the inline allow-list."""
            return bleach.clean(
                element.decode_contents(), tags=inline_tags,
                attributes=inline_attrs, strip=True, protocols=['http', 'https']
            ).strip()

        def _img_tag(src: str, alt: str) -> str:
            """Build an <img> tag with fully escaped attribute values."""
            # bleach.clean() does not escape quotes, so it is unsafe inside an
            # attribute; html.escape(quote=True) is.
            return f'<img src="{_escape_attr(src, quote=True)}" alt="{_escape_attr(alt, quote=True)}">'

        # --- Text blocks ----------------------------------------------------
        html_parts = []
        for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
            text = el.get_text(strip=True)
            if len(text) < 30 or junk_text_re.search(text):
                continue
            tag = el.name
            inner = _clean_inline(el)
            if not inner:
                continue
            if tag == 'p':
                html_parts.append(f'<p>{inner}</p>')
            elif tag in ('h2', 'h3', 'h4'):
                html_parts.append(f'<{tag}>{inner}</{tag}>')
            elif tag == 'blockquote':
                html_parts.append(f'<blockquote><p>{inner}</p></blockquote>')
            elif tag in ('ul', 'ol'):
                items = []
                for li in el.find_all('li', recursive=False):
                    li_inner = _clean_inline(li)
                    if li_inner and len(li.get_text(strip=True)) > 10:
                        items.append(f'<li>{li_inner}</li>')
                if items:
                    html_parts.append(f'<{tag}>{"".join(items)}</{tag}>')

        # --- Images found by readability ------------------------------------
        seen_srcs = set()
        article_images = []
        for img in reader_soup.find_all('img'):
            src = img.get('src', '')
            if not src or not src.startswith(('http://', 'https://')) or src in seen_srcs:
                continue
            if junk_img_re.search(src):
                continue
            seen_srcs.add(src)
            alt = (img.get('alt', '') or '').strip()
            article_images.append(_img_tag(src, alt))

        # If readability found no images, grab the first real image from the
        # original HTML (lazy-load attributes take precedence over src).
        if not article_images:
            orig_soup = BeautifulSoup(raw_html, 'html.parser')
            for noise in orig_soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                             'aside', 'form', 'noscript', 'svg']):
                noise.decompose()
            for img in orig_soup.find_all('img'):
                src = (img.get('data-src') or img.get('data-lazy-src') or
                       img.get('data-original') or img.get('src') or '')
                if not src or not src.startswith(('http://', 'https://')):
                    continue
                if junk_img_re.search(src):
                    continue
                alt = (img.get('alt', '') or '').strip()
                article_images.append(_img_tag(src, alt))
                break  # Only the first real image

        # --- Interleave images evenly between the text blocks ---------------
        if article_images and html_parts:
            img_count = len(article_images)
            interval = max(1, len(html_parts) // (img_count + 1))
            merged = []
            img_idx = 0
            for i, part in enumerate(html_parts):
                merged.append(part)
                if img_idx < img_count and (i + 1) % interval == 0:
                    merged.append(article_images[img_idx])
                    img_idx += 1
            # Any images left over go at the end.
            merged.extend(article_images[img_idx:])
            html_parts = merged
        elif article_images:
            html_parts = article_images

        # --- Last resort: plain-text paragraphs -----------------------------
        if not html_parts:
            text = reader_soup.get_text(separator='\n\n', strip=True)
            if text:
                for para in text.split('\n\n'):
                    para = para.strip()
                    if len(para) > 30:
                        html_parts.append(f'<p>{bleach.clean(para)}</p>')

        if not html_parts:
            return (None, og_image)

        # --- Drop blocks that look like concatenated menu or option text ----
        clean_parts = []
        for part in html_parts:
            part_text = BeautifulSoup(part, 'html.parser').get_text(strip=True)
            if len(part_text) > 100:
                words = part_text.split()
                avg_word_len = len(part_text) / max(len(words), 1)
                if avg_word_len > 12:
                    continue
            clean_parts.append(part)

        if not clean_parts:
            return (None, og_image)

        result = '\n'.join(clean_parts)
        plain_text = BeautifulSoup(result, 'html.parser').get_text(separator=' ', strip=True)

        # Reject pages where the extraction captured UI chrome instead of prose.
        garbage_re = re.compile(
            r'(use (left|right|escape)|arrow keys|navigate between|'
            r'sign (in|up) with|we won.t post|social account|'
            r'accept cookies|cookie policy|privacy policy|terms of (use|service)|'
            r'AlabamaAlaska|CaliforniaColorado|United States of America)',
            re.I
        )
        if len(plain_text) < 200 or garbage_re.search(plain_text):
            return (None, og_image)

        return (result, og_image)
    except Exception as e:
        logger.debug(f"Article parsing failed for {url}: {e}")
        # Keep a thumbnail that was already extracted before the failure.
        return (None, og_image)
|
|
|
|
|
|
def _fetch_html_flaresolverr(url: str) -> Optional[str]:
    """Fetch a page through the local FlareSolverr service (headless browser).

    Args:
        url: Page to load.

    Returns:
        The rendered HTML when FlareSolverr reports ``status == 'ok'`` and the
        response is longer than 500 characters, otherwise ``None``. Failures
        are logged at debug level and never raised.
    """
    try:
        import requests

        resp = requests.post('http://localhost:8191/v1', json={
            'cmd': 'request.get',
            'url': url,
            'maxTimeout': 30000
        }, timeout=45)
        data = resp.json()
    except Exception as e:
        # Best effort: service down, timeout, invalid JSON, requests missing.
        logger.debug(f"FlareSolverr request failed for {url}: {e}")
        return None

    if not isinstance(data, dict) or data.get('status') != 'ok':
        logger.debug(f"FlareSolverr returned no usable result for {url}")
        return None

    solution = data.get('solution')
    html = solution.get('response') if isinstance(solution, dict) else None
    # Very short responses are treated as block or error pages.
    if isinstance(html, str) and len(html) > 500:
        return html
    return None
|
|
|
|
|
|
def extract_article_content_with_image(url: str) -> tuple:
    """Extract article content and thumbnail URL for ``url``.

    A direct HTTP fetch is tried first; when it yields no content the page is
    fetched again through FlareSolverr (for bot-protected sites).

    Returns:
        ``(content_html, image_url)``; either element may be ``None``. A
        thumbnail found by the direct attempt is kept even when no content
        could be extracted.
    """
    import urllib.request

    # Only web URLs are fetched; urlopen would also accept file:// or ftp://.
    try:
        scheme = urlparse(url).scheme
    except ValueError:
        return (None, None)
    if scheme not in ('http', 'https'):
        return (None, None)

    fallback_image = None

    # Try direct fetch first
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        with urllib.request.urlopen(req, timeout=20) as response:
            charset = response.headers.get_content_charset() or 'utf-8'
            body = response.read()
        try:
            raw_html = body.decode(charset, errors='replace')
        except LookupError:
            # Unknown charset name in the Content-Type header.
            raw_html = body.decode('utf-8', errors='replace')

        content, image = _parse_article_html(raw_html, url)
        if content:
            return (content, image)
        fallback_image = image
    except Exception as e:
        logger.debug(f"Direct article fetch failed for {url}: {e}")

    # Fallback to FlareSolverr for bot-protected sites
    raw_html = _fetch_html_flaresolverr(url)
    if raw_html:
        content, image = _parse_article_html(raw_html, url)
        return (content, image or fallback_image)

    return (None, fallback_image)
|
|
|
|
|
|
def extract_article_content(url: str) -> Optional[str]:
    """Legacy wrapper: return only the content part of the extraction result."""
    return extract_article_content_with_image(url)[0]
|
|
|
|
|
|
def _select_press_celebrities(db, config: Dict, celebrity_id: Optional[int]) -> List[Dict]:
    """Return the celebrity rows (id, name, slug) that a fetch run should cover."""
    if celebrity_id:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT id, name, slug FROM celebrity_profiles WHERE id = ?', (celebrity_id,))
            row = cursor.fetchone()
        return [dict(row)] if row else []

    # No explicit celebrity: use the configured ids; none configured -> nothing to do.
    configured_ids = config.get('celebrity_ids', [])
    if not configured_ids:
        return []
    placeholders = ','.join('?' for _ in configured_ids)
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
            f'SELECT id, name, slug FROM celebrity_profiles WHERE id IN ({placeholders}) ORDER BY name',
            configured_ids
        )
        return [dict(r) for r in cursor.fetchall()]


def _press_article_exists(db, sql: str, params: tuple) -> bool:
    """Return True when the given lookup query matches a stored article."""
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(sql, params)
        return cursor.fetchone() is not None


def process_press_articles(db, celebrity_id: Optional[int] = None, send_notifications: bool = True) -> Dict:
    """
    Fetch Google News articles for celebrities, deduplicate, store, and notify.

    Args:
        db: Database handle exposing ``get_connection()``.
        celebrity_id: Restrict the run to one celebrity; a falsy value means
            "all celebrities listed in the press config".
        send_notifications: When False no push notification is sent.

    Returns:
        ``{'total_fetched', 'total_new', 'by_celebrity'}`` on success, or
        ``{'error', 'total_fetched': 0, 'total_new': 0}`` on failure or when
        another run is already in progress.
    """
    global _fetch_running
    import time as _time
    import traceback

    # Check and claim the flag in one step so two runs cannot overlap. The
    # early return is outside the try/finally so it never clears the other
    # run's flag.
    with _fetch_lock:
        if _fetch_running:
            logger.info("Press monitor: fetch already in progress, skipping")
            return {'error': 'Fetch already in progress', 'total_fetched': 0, 'total_new': 0}
        _fetch_running = True

    try:
        config = _get_config(db)
        max_records = config.get('max_records_per_query', 25)
        notify_enabled = config.get('notify_new_articles', True)

        celebrities = _select_press_celebrities(db, config, celebrity_id)

        total_new = 0
        total_fetched = 0
        results_by_celebrity = {}

        for idx, celeb in enumerate(celebrities):
            celeb_id = celeb['id']
            celeb_name = celeb['name']
            name_lower = celeb_name.lower()

            # Small delay between celebrities
            if idx > 0:
                _time.sleep(2)

            articles = fetch_google_news_articles(celeb_name, max_records)
            total_fetched += len(articles)
            new_count = 0

            for article in articles:
                google_url = article.get('url', '')
                if not google_url:
                    continue

                # The stripped title is used for both dedupe and storage.
                article_title = (article.get('title') or '').strip()

                # Only keep articles where the celebrity name appears in the title.
                if not article_title or name_lower not in article_title.lower():
                    continue

                # Cheap duplicate check by title before the URL is decoded.
                if _press_article_exists(
                    db,
                    'SELECT id FROM press_articles WHERE celebrity_id = ? AND title = ?',
                    (celeb_id, article_title),
                ):
                    continue

                # Decode Google News URL to real article URL
                article_url = _decode_google_news_url(google_url)
                if not article_url:
                    continue

                url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest()
                if _press_article_exists(
                    db, 'SELECT id FROM press_articles WHERE url_hash = ?', (url_hash,)
                ):
                    continue

                # Strip only a leading "www."; replace() would also remove it
                # from the middle of a host name.
                netloc = urlparse(article_url).netloc
                domain = netloc[4:] if netloc.startswith('www.') else netloc

                content, og_image = extract_article_content_with_image(article_url)

                # Cache all inline images in the content to the local proxy
                if content:
                    content = _cache_content_images(content)

                if content:
                    snippet = re.sub(r'<[^>]+>', ' ', content)
                    snippet = ' '.join(snippet.split())[:300]
                else:
                    snippet = article_title[:300]

                # Cache the og:image locally, fall back to first inline image
                cached_image = cache_press_image(og_image) if og_image else None
                if not cached_image and content:
                    m = re.search(r'<img\s+src="(/api/press/images/[^"]+)"', content)
                    if m:
                        cached_image = m.group(1)

                # GDELT format YYYYMMDDTHHmmSSZ -> ISO format (kept as-is if unparseable)
                published_date = article.get('seendate', '')
                if published_date:
                    try:
                        published_date = datetime.strptime(published_date, '%Y%m%dT%H%M%SZ').isoformat()
                    except (ValueError, TypeError):
                        pass

                with db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute('''
                        INSERT INTO press_articles
                        (celebrity_id, title, url, url_hash, domain, published_date,
                         image_url, language, country, article_content, snippet, notified, read)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0, 0)
                    ''', (
                        celeb_id,
                        article_title,
                        article_url,
                        url_hash,
                        domain,
                        published_date,
                        cached_image or '',
                        article.get('language', ''),
                        article.get('sourcecountry', ''),
                        content,
                        snippet,
                    ))
                    conn.commit()
                new_count += 1

            total_new += new_count
            if new_count > 0:
                results_by_celebrity[celeb_name] = new_count

        # Send notifications for new articles (only for scheduled fetches)
        if send_notifications and notify_enabled and total_new > 0:
            _send_press_notification(db, results_by_celebrity)

        # Mark every pending article as notified, whether or not a push was sent.
        if total_new > 0:
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('UPDATE press_articles SET notified = ? WHERE notified = ?', (1, 0))
                conn.commit()

        logger.info(f"Press monitor: fetched {total_fetched}, new {total_new}")
        return {
            'total_fetched': total_fetched,
            'total_new': total_new,
            'by_celebrity': results_by_celebrity,
        }
    except Exception as e:
        logger.error(f"Error in press article processing: {e}")
        logger.debug(f"Traceback: {traceback.format_exc()}")
        return {'error': str(e), 'total_fetched': 0, 'total_new': 0}
    finally:
        with _fetch_lock:
            _fetch_running = False
|
|
|
|
|
|
def _send_press_notification(db, results_by_celebrity: Dict):
    """Send one Pushover notification summarising newly stored press articles.

    Args:
        db: Database handle; its ``db_path`` is used to load the settings.
        results_by_celebrity: Mapping of celebrity name to new-article count.

    Any failure is logged and swallowed; nothing is raised to the caller.
    """
    try:
        from modules.pushover_notifier import create_notifier_from_config
        from modules.settings_manager import SettingsManager

        settings = SettingsManager(str(db.db_path)).get_all()
        notifier = create_notifier_from_config(settings, unified_db=db)
        if not notifier:
            return

        total = sum(results_by_celebrity.values())
        title = f"📰 Press: {total} new article{'s' if total != 1 else ''}"

        # One HTML line per celebrity, then a timestamp line.
        lines = [
            f"<b>👤 {name}:</b> {count} article{'s' if count != 1 else ''}"
            for name, count in results_by_celebrity.items()
        ]
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        lines.append(f"\n<b>⏰ Discovered:</b> {now}")
        message = "\n".join(lines)

        # Context read by the notifier; per the original comment it records
        # the notification and broadcasts it to the UI.
        notifier._current_notification_context = {
            'platform': 'press',
            'source': 'GDELT',
            'content_type': 'article',
            'download_count': total,
            'metadata': {'by_celebrity': results_by_celebrity}
        }

        notifier.send_notification(
            title=title,
            message=message,
            priority=0,
            html=True,
        )
    except Exception as e:
        logger.error(f"Failed to send press notification: {e}")
|
|
|
|
|
|
# ============================================================================
|
|
# CONFIGURATION ENDPOINTS
|
|
# ============================================================================
|
|
|
|
@router.get("/config")
@limiter.limit("30/minute")
@handle_exceptions
async def get_config(
    request: Request,
    current_user: Dict = Depends(get_current_user)
):
    """Return the current press monitor configuration."""
    return {"success": True, "config": _get_config(_get_db())}
|
|
|
|
|
|
@router.put("/config")
@limiter.limit("10/minute")
@handle_exceptions
async def update_config(
    request: Request,
    config_update: PressConfigUpdate,
    current_user: Dict = Depends(get_current_user)
):
    """Update press monitor configuration.

    Only fields present (non-``None``) in the payload are written. Returns
    ``success: False`` when there is nothing to update or when the
    configuration row (``id = 1``) does not exist.
    """
    # Column names are interpolated into the SQL below, so they are limited
    # to this explicit allow-list (the model's own fields).
    allowed_columns = (
        'enabled', 'check_interval_hours', 'max_records_per_query',
        'notify_new_articles', 'celebrity_ids',
    )

    db = _get_db()
    updates = config_update.model_dump(exclude_none=True)

    set_parts = []
    values = []
    for key, value in updates.items():
        if key not in allowed_columns:
            continue
        if isinstance(value, bool):
            value = 1 if value else 0  # Booleans are stored as 0/1.
        elif key == 'celebrity_ids' and isinstance(value, list):
            value = json.dumps(value)  # Stored as a JSON string.
        set_parts.append(f"{key} = ?")
        values.append(value)

    if not set_parts:
        return {"success": False, "message": "No fields to update"}

    set_parts.append("updated_at = CURRENT_TIMESTAMP")

    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
            f"UPDATE press_config SET {', '.join(set_parts)} WHERE id = 1",
            values
        )
        conn.commit()
        updated_rows = cursor.rowcount

    if updated_rows == 0:
        # No row to update: report it instead of claiming success.
        return {"success": False, "message": "Press configuration row not found"}

    return {"success": True, "message": "Configuration updated"}
|
|
|
|
|
|
# ============================================================================
|
|
# CELEBRITY SELECTION ENDPOINT
|
|
# ============================================================================
|
|
|
|
@router.get("/celebrities")
@limiter.limit("30/minute")
@handle_exceptions
async def get_press_celebrities(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Get all enabled celebrities with a ``press_enabled`` flag.

    ``press_enabled`` is True for celebrities whose id is listed in the
    press config's ``celebrity_ids``.
    """
    db = _get_db()
    enabled_ids = set(_get_config(db).get('celebrity_ids', []))

    # Reuse the shared helper instead of repeating the same query here.
    celebrities = [
        {**celeb, 'press_enabled': celeb['id'] in enabled_ids}
        for celeb in _get_enabled_celebrities(db)
    ]

    return {"success": True, "celebrities": celebrities}
|
|
|
|
|
|
# ============================================================================
|
|
# ARTICLE ENDPOINTS
|
|
# ============================================================================
|
|
|
|
@router.get("/articles")
@limiter.limit("30/minute")
@handle_exceptions
async def get_articles(
    request: Request,
    current_user: Dict = Depends(get_current_user),
    celebrity_id: Optional[int] = None,
    domain: Optional[str] = None,
    read: Optional[bool] = None,
    search: Optional[str] = None,
    page: int = 1,
    per_page: int = 50,
):
    """Get a paginated list of press articles with optional filters.

    Args:
        celebrity_id: Only articles of this celebrity.
        domain: Only articles from this exact domain.
        read: Only read (True) or unread (False) articles.
        search: Substring matched against title and snippet (SQL LIKE).
        page: 1-based page number; values below 1 are treated as 1.
        per_page: Page size; values below 1 are treated as 1.

    Returns:
        Dict with ``articles``, ``total``, ``page``, ``per_page``, ``pages``.
    """
    db = _get_db()

    # Clamp paging input: a page or size below 1 would give a negative OFFSET
    # or LIMIT and inconsistent paging metadata.
    page = max(1, page)
    per_page = max(1, per_page)

    conditions = []
    params = []

    if celebrity_id is not None:
        conditions.append("pa.celebrity_id = ?")
        params.append(celebrity_id)
    if domain is not None:
        conditions.append("pa.domain = ?")
        params.append(domain)
    if read is not None:
        conditions.append("pa.read = ?")
        params.append(1 if read else 0)
    if search:
        conditions.append("(pa.title LIKE ? OR pa.snippet LIKE ?)")
        pattern = f'%{search}%'
        params.extend([pattern, pattern])

    # Only fixed condition fragments are interpolated; values are bound.
    where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    offset = (page - 1) * per_page

    with db.get_connection() as conn:
        cursor = conn.cursor()

        # Get total count
        cursor.execute(f"SELECT COUNT(*) FROM press_articles pa {where_clause}", params)
        total = cursor.fetchone()[0]

        # Get articles (list view: no full article_content)
        cursor.execute(f'''
            SELECT pa.id, pa.celebrity_id, pa.title, pa.url, pa.domain,
                   pa.published_date, pa.image_url, pa.language, pa.country,
                   pa.snippet, pa.fetched_at, pa.read,
                   cp.name as celebrity_name
            FROM press_articles pa
            LEFT JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
            {where_clause}
            ORDER BY pa.published_date DESC
            LIMIT ? OFFSET ?
        ''', params + [per_page, offset])

        articles = [dict(r) for r in cursor.fetchall()]

    return {
        "success": True,
        "articles": articles,
        "total": total,
        "page": page,
        "per_page": per_page,
        "pages": (total + per_page - 1) // per_page,
    }
|
|
|
|
|
|
@router.get("/articles/{article_id}")
@limiter.limit("30/minute")
@handle_exceptions
async def get_article(
    request: Request,
    article_id: int,
    current_user: Dict = Depends(get_current_user),
):
    """Return one article (all columns) plus the celebrity name; 404 if missing."""
    query = '''
        SELECT pa.*, cp.name as celebrity_name
        FROM press_articles pa
        LEFT JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
        WHERE pa.id = ?
    '''
    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute(query, (article_id,))
        record = cur.fetchone()

    if record:
        return {"success": True, "article": dict(record)}
    raise HTTPException(status_code=404, detail="Article not found")
|
|
|
|
|
|
@router.patch("/articles/{article_id}/read")
@limiter.limit("30/minute")
@handle_exceptions
async def update_read_status(
    request: Request,
    article_id: int,
    body: ReadStatusUpdate,
    current_user: Dict = Depends(get_current_user),
):
    """Set the read flag of one article; 404 when no row was updated."""
    flag = 1 if body.read else 0

    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute(
            'UPDATE press_articles SET read = ? WHERE id = ?',
            (flag, article_id)
        )
        conn.commit()

    # rowcount tells whether the id matched an existing article.
    if cur.rowcount == 0:
        raise HTTPException(status_code=404, detail="Article not found")

    state = 'read' if body.read else 'unread'
    return {"success": True, "message": f"Article marked as {state}"}
|
|
|
|
|
|
@router.post("/articles/mark-all-read")
@limiter.limit("10/minute")
@handle_exceptions
async def mark_all_read(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Flag every unread article as read and report how many were changed."""
    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute('UPDATE press_articles SET read = 1 WHERE read = 0')
        conn.commit()
        count = cur.rowcount

    plural = 's' if count != 1 else ''
    return {"success": True, "message": f"Marked {count} article{plural} as read", "count": count}
|
|
|
|
|
|
@router.delete("/articles/{article_id}")
@limiter.limit("10/minute")
@handle_exceptions
async def delete_article(
    request: Request,
    article_id: int,
    current_user: Dict = Depends(get_current_user),
):
    """Remove one press article by id; 404 when the id does not exist."""
    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute('DELETE FROM press_articles WHERE id = ?', (article_id,))
        conn.commit()

    deleted = cur.rowcount
    if deleted == 0:
        raise HTTPException(status_code=404, detail="Article not found")

    return {"success": True, "message": "Article deleted"}
|
|
|
|
|
|
# ============================================================================
|
|
# STATS ENDPOINT
|
|
# ============================================================================
|
|
|
|
@router.get("/stats")
@limiter.limit("30/minute")
@handle_exceptions
async def get_stats(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Return article totals, unread count and per-celebrity / per-domain counts."""
    with _get_db().get_connection() as conn:
        cur = conn.cursor()

        def scalar(sql: str):
            """Run a single-value query and return that value."""
            cur.execute(sql)
            return cur.fetchone()[0]

        total = scalar('SELECT COUNT(*) FROM press_articles')
        unread = scalar('SELECT COUNT(*) FROM press_articles WHERE read = 0')

        # Per-celebrity counts; the inner JOIN leaves out articles whose
        # celebrity profile no longer exists.
        cur.execute('''
            SELECT pa.celebrity_id, cp.name, COUNT(*) as count
            FROM press_articles pa
            JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
            GROUP BY pa.celebrity_id, cp.name
            ORDER BY count DESC
        ''')
        by_celebrity = []
        for rec in cur.fetchall():
            by_celebrity.append({'id': rec['celebrity_id'], 'name': rec['name'], 'count': rec['count']})

        # Ten most frequent source domains.
        cur.execute('''
            SELECT domain, COUNT(*) as count
            FROM press_articles
            GROUP BY domain
            ORDER BY count DESC
            LIMIT 10
        ''')
        by_domain = []
        for rec in cur.fetchall():
            by_domain.append({'domain': rec['domain'], 'count': rec['count']})

    stats = {
        "total": total,
        "unread": unread,
        "by_celebrity": by_celebrity,
        "by_domain": by_domain,
    }
    return {"success": True, "stats": stats}
|
|
|
|
|
|
# ============================================================================
|
|
# FETCH ENDPOINT
|
|
# ============================================================================
|
|
|
|
@router.post("/fetch")
@limiter.limit("5/minute")
@handle_exceptions
async def trigger_fetch(
    request: Request,
    background_tasks: BackgroundTasks,
    current_user: Dict = Depends(get_current_user),
    celebrity_id: Optional[int] = None,
):
    """Trigger a manual Google News fetch for all or a specific celebrity.

    The fetch runs on the module's thread pool after the response is sent;
    notifications are suppressed for manual fetches.

    Returns:
        ``success: False`` when a fetch is already running, otherwise
        ``success: True`` with a "Fetch started" message.
    """
    # Fast feedback only: the flag itself is set by process_press_articles
    # when the worker actually starts.
    with _fetch_lock:
        if _fetch_running:
            return {"success": False, "message": "Fetch already in progress"}

    db = _get_db()

    def do_fetch():
        process_press_articles(db, celebrity_id, send_notifications=False)

    # Submit straight to the executor. The previous form passed
    # loop.run_in_executor to BackgroundTasks, which calls sync callables from
    # a worker thread, where event-loop methods are not thread-safe.
    background_tasks.add_task(_executor.submit, do_fetch)

    return {"success": True, "message": "Fetch started"}
|
|
|
|
|
|
@router.get("/fetch/status")
@limiter.limit("30/minute")
@handle_exceptions
async def get_fetch_status(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Report whether a press fetch run is currently in progress."""
    running = _fetch_running  # Plain read of the module-level flag.
    return {"success": True, "is_running": running}
|
|
|
|
|
|
# ============================================================================
|
|
# IMAGE PROXY / CACHE
|
|
# ============================================================================
|
|
|
|
# Directory for locally cached article images, created (with parents) at
# import time if it does not exist yet.
PRESS_IMAGE_CACHE = Path("/opt/media-downloader/data/press_images")
PRESS_IMAGE_CACHE.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def _cache_content_images(html_content: str) -> str:
|
|
"""Find all <img ...> in HTML content, cache each image locally,
|
|
and rewrite src to /api/press/images/... proxy path.
|
|
Removes img tags where caching fails (broken > missing)."""
|
|
if not html_content:
|
|
return html_content
|
|
def _replace_img(match):
|
|
full_tag = match.group(0)
|
|
src = match.group(1)
|
|
if not src or src.startswith('/api/press/images/'):
|
|
return full_tag
|
|
cached = cache_press_image(src)
|
|
if cached:
|
|
return full_tag.replace(src, cached)
|
|
return '' # Remove img if caching failed
|
|
return re.sub(r'<img\s+src="([^"]+)"[^>]*>', _replace_img, html_content)
|
|
|
|
|
|
def _sniff_image_extension(data: bytes) -> Optional[str]:
    """Return the file extension matching the data's magic bytes, if known."""
    if data.startswith(b'\xff\xd8\xff'):
        return '.jpg'
    if data.startswith(b'\x89PNG\r\n\x1a\n'):
        return '.png'
    if data[:6] in (b'GIF87a', b'GIF89a'):
        return '.gif'
    if data[:4] == b'RIFF' and data[8:12] == b'WEBP':
        return '.webp'
    return None


def cache_press_image(image_url: str, use_flaresolverr: bool = False) -> Optional[str]:
    """Download an image and cache it locally.

    Args:
        image_url: Absolute http(s) URL of the image.
        use_flaresolverr: Accepted for backward compatibility and ignored.
            The original comments note FlareSolverr returns HTML, not binary
            data, so it cannot fetch images.

    Returns:
        The API path (``/api/press/images/<hash>.<ext>``) serving the cached
        file, or ``None`` when the URL is unusable, the download fails, the
        payload is smaller than 1000 bytes, or it is not an image.
    """
    if not image_url:
        return None

    import urllib.request

    # Only web URLs: urlopen would also open file:// and ftp:// targets, and
    # the URL comes from untrusted page markup.
    try:
        scheme = urlparse(image_url).scheme
    except ValueError:
        return None
    if scheme not in ('http', 'https'):
        return None

    # Hash the URL for the filename
    url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]

    # Check if already cached
    for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
        cached = PRESS_IMAGE_CACHE / f"{url_hash}{ext}"
        if cached.exists() and cached.stat().st_size > 0:
            return f"/api/press/images/{cached.name}"

    # Download the image
    try:
        req = urllib.request.Request(image_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'image/*,*/*',
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            content_type = (resp.headers.get('Content-Type') or '').lower()
            image_data = resp.read()
    except Exception as e:
        logger.debug(f"Press image download failed for {image_url}: {e}")
        return None

    # Too small: likely an error page or a tracking pixel.
    if not image_data or len(image_data) < 1000:
        return None

    # Accept only real images: a known magic number or an image/* content
    # type. A plain 200 response is not enough (HTML error pages return 200).
    sniffed_ext = _sniff_image_extension(image_data)
    if sniffed_ext is None and not content_type.startswith('image/'):
        return None

    # Prefer the extension from the file content; otherwise guess from the URL.
    ext = sniffed_ext
    if ext is None:
        ext = '.jpg'
        url_lower = image_url.lower()
        if '.png' in url_lower:
            ext = '.png'
        elif '.webp' in url_lower:
            ext = '.webp'
        elif '.gif' in url_lower:
            ext = '.gif'

    cached_path = PRESS_IMAGE_CACHE / f"{url_hash}{ext}"
    cached_path.write_bytes(image_data)
    return f"/api/press/images/{cached_path.name}"
|
|
|
|
|
|
@router.get("/images/{filename}")
async def serve_press_image(filename: str, current_user: Dict = Depends(get_current_user)):
    """Serve a cached press article image.

    Args:
        filename: Bare file name inside the press image cache directory.

    Raises:
        HTTPException: 400 for names containing path separators or ``..``;
            404 when no such regular file exists.
    """
    # Sanitize filename: no path separators or parent references.
    if '/' in filename or '\\' in filename or '..' in filename:
        raise HTTPException(status_code=400, detail="Invalid filename")

    filepath = PRESS_IMAGE_CACHE / filename
    # is_file() also rejects directories, which exists() would accept.
    if not filepath.is_file():
        raise HTTPException(status_code=404, detail="Image not found")

    # Determine media type
    media_types = {
        '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
        '.png': 'image/png', '.webp': 'image/webp', '.gif': 'image/gif',
    }
    media_type = media_types.get(filepath.suffix.lower(), 'image/jpeg')

    # The endpoint requires authentication, so only the user's own browser may
    # cache the response; "public" would allow shared caches to store it.
    return FileResponse(filepath, media_type=media_type, headers={
        'Cache-Control': 'private, max-age=86400',
    })
|