# media-downloader/web/backend/routers/press.py

"""
Press Router

Monitors news feeds (currently Google News RSS, using GDELT-compatible article
fields) for news articles mentioning tracked celebrities.
Stores complete articles and sends Pushover push notifications.
"""

import asyncio
import hashlib
import json
import threading
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import Dict, List, Optional
import urllib.error
from urllib.parse import urlparse

from pathlib import Path

from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import FileResponse
from pydantic import BaseModel
from slowapi import Limiter
from slowapi.util import get_remote_address

from ..core.dependencies import get_current_user, get_app_state
from ..core.exceptions import handle_exceptions
from modules.universal_logger import get_logger

# Module-level logger obtained from the project's universal logger under the name 'API'.
logger = get_logger('API')

# Every route declared on this router is served under the /api/press prefix.
router = APIRouter(prefix="/api/press", tags=["Press"])
# Rate limiter keyed by the caller's remote address; applied per endpoint via @limiter.limit(...).
limiter = Limiter(key_func=get_remote_address)

# Thread pool for blocking operations
_executor = ThreadPoolExecutor(max_workers=2)

# Track running fetch jobs.
# process_press_articles() sets _fetch_running to True while it runs and resets it
# in its finally block; both writes happen while holding _fetch_lock.
_fetch_lock = threading.Lock()
_fetch_running = False


# ============================================================================
# PYDANTIC MODELS
# ============================================================================

class PressConfigUpdate(BaseModel):
    """Partial update payload for the press monitor configuration.

    Every field is optional. The update endpoint dumps this model with
    ``exclude_none=True``, so only fields that are explicitly provided
    (non-None) are written to the ``press_config`` row.
    """
    # Master on/off switch stored in press_config.enabled.
    enabled: Optional[bool] = None
    # Interval between checks, in hours (consumer not visible in this section).
    check_interval_hours: Optional[int] = None
    # Upper bound on articles kept per celebrity query (passed to the fetcher as max_records).
    max_records_per_query: Optional[int] = None
    # Whether a push notification is sent when new articles are stored.
    notify_new_articles: Optional[bool] = None
    # IDs of celebrity_profiles rows to monitor; persisted as a JSON string.
    celebrity_ids: Optional[List[int]] = None


class ReadStatusUpdate(BaseModel):
    """Request body for the PATCH /articles/{article_id}/read endpoint."""
    # True marks the article as read, False marks it as unread.
    read: bool


# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def _get_db():
    """Return the database handle exposed by the shared application state."""
    return get_app_state().db


def _get_config(db) -> Dict:
    """Load the press monitor settings from the ``press_config`` row with id 1.

    Returns a dict with the keys ``enabled``, ``check_interval_hours``,
    ``max_records_per_query``, ``notify_new_articles`` and ``celebrity_ids``.
    When the row is missing, built-in defaults are returned instead.
    """
    with db.get_connection() as conn:
        cur = conn.cursor()
        cur.execute('SELECT * FROM press_config WHERE id = 1')
        record = cur.fetchone()

        # No stored configuration yet: fall back to the defaults.
        if not record:
            return {
                'enabled': True,
                'check_interval_hours': 6,
                'max_records_per_query': 25,
                'notify_new_articles': True,
                'celebrity_ids': [],
            }

        # celebrity_ids is stored as a JSON string; the column may be absent
        # or hold an unparsable value, in which case an empty list is used.
        raw_ids = record['celebrity_ids'] if 'celebrity_ids' in record.keys() else None
        selected_ids = []
        if raw_ids:
            try:
                selected_ids = json.loads(raw_ids)
            except (json.JSONDecodeError, TypeError):
                selected_ids = []

        settings = {}
        settings['enabled'] = bool(record['enabled'])
        settings['check_interval_hours'] = record['check_interval_hours']
        settings['max_records_per_query'] = record['max_records_per_query']
        settings['notify_new_articles'] = bool(record['notify_new_articles'])
        settings['celebrity_ids'] = selected_ids
        return settings


def _get_enabled_celebrities(db) -> List[Dict]:
    """Return id, name and slug of every enabled celebrity profile, sorted by name."""
    with db.get_connection() as conn:
        cur = conn.cursor()
        cur.execute('''
            SELECT id, name, slug FROM celebrity_profiles
            WHERE enabled = 1
            ORDER BY name
        ''')
        # Convert each row object into a plain dict for the callers.
        return list(map(dict, cur.fetchall()))


def _decode_google_news_url(google_url: str) -> Optional[str]:
    """Resolve a Google News redirect link to the underlying article URL.

    URLs that do not contain ``news.google.com`` are returned unchanged.
    Returns ``None`` when a Google News link cannot be decoded.
    """
    if 'news.google.com' in google_url:
        try:
            from googlenewsdecoder import gnewsdecoder
            decoded = gnewsdecoder(google_url, interval=1)
            if decoded.get('status'):
                return decoded['decoded_url']
        except Exception as e:
            logger.debug(f"Failed to decode Google News URL: {e}")
        # Decoder reported failure or raised.
        return None

    # Not a Google News link: nothing to decode.
    return google_url


def fetch_google_news_articles(name: str, max_records: int = 100) -> List[Dict]:
    """Query Google News RSS for articles mentioning the given name.

    The name is searched as an exact phrase (wrapped in double quotes). The
    request is attempted up to three times, sleeping 5 seconds between
    attempts; if every attempt fails the error is logged and an empty list
    is returned.

    Args:
        name: Name to search for.
        max_records: Maximum number of articles to return.

    Returns:
        A list of dicts using GDELT-compatible keys: ``title`` (with the
        trailing " - <source>" suffix removed), ``url`` (the Google News
        link), ``seendate`` (``YYYYMMDDTHHMMSSZ`` in UTC, or '' when the
        pubDate is missing or unparsable), ``socialimage`` (always ''),
        ``language`` (always 'en') and ``sourcecountry`` (always '').
    """
    import time
    import urllib.request
    import xml.etree.ElementTree as ET
    from datetime import timezone
    from email.utils import parsedate_to_datetime
    from urllib.parse import quote_plus

    # Percent-encode the whole quoted phrase. Replacing only spaces left
    # characters such as '&', '#', '+' or non-ASCII letters raw in the URL,
    # which either corrupted the query string or made urllib raise.
    query = quote_plus(f'"{name}"')
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'

    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            with urllib.request.urlopen(req, timeout=30) as response:
                data = response.read().decode('utf-8')

            root = ET.fromstring(data)
            articles = []
            for item in root.findall('.//item'):
                title_el = item.find('title')
                link_el = item.find('link')
                pub_el = item.find('pubDate')
                source_el = item.find('source')

                if title_el is None or link_el is None:
                    continue

                # Google News appends " - <source>" to titles; drop that suffix.
                title = title_el.text or ''
                source_name = source_el.text if source_el is not None else ''
                suffix = f' - {source_name}'
                if source_name and title.endswith(suffix):
                    title = title[:-len(suffix)].strip()

                # Parse pubDate (RFC 2822) to GDELT-compat format
                seendate = ''
                if pub_el is not None and pub_el.text:
                    try:
                        dt = parsedate_to_datetime(pub_el.text)
                        # The trailing 'Z' claims UTC, so normalise aware
                        # datetimes to UTC before formatting.
                        if dt.tzinfo is not None:
                            dt = dt.astimezone(timezone.utc)
                        seendate = dt.strftime('%Y%m%dT%H%M%SZ')
                    except (TypeError, ValueError, OverflowError):
                        # Unparsable date: keep the article, leave seendate empty.
                        pass

                articles.append({
                    'title': title,
                    'url': link_el.text or '',
                    'seendate': seendate,
                    'socialimage': '',
                    'language': 'en',
                    'sourcecountry': '',
                })

            logger.info(f"Google News: {len(articles)} articles for '{name}'")
            return articles[:max_records]
        except Exception as e:
            # Retry on any failure (network, decoding, XML parsing); give up
            # after the third attempt.
            if attempt < 2:
                time.sleep(5)
                continue
            logger.error(f"Google News fetch error for '{name}': {e}")
            return []
    return []


def _parse_article_html(raw_html: str, url: str) -> tuple:
    """Parse raw HTML into sanitized article content and a thumbnail URL.

    The article body is extracted with readability, rebuilt from a small set
    of block elements (p, h2-h4, blockquote, ul, ol) whose inner markup is
    sanitized with bleach, interleaved with the article's images, and then
    filtered for junk (bylines, share prompts, cookie banners, etc.).

    Args:
        raw_html: Full HTML document of the page.
        url: URL the document was fetched from (passed to readability and
            used in log messages).

    Returns:
        ``(content_html, image_url)``. ``content_html`` is ``None`` when no
        usable article body could be extracted. ``image_url`` is the
        og:image (or twitter:image) URL, or ``None`` if the page has none.
        If parsing raises, ``(None, None)`` is returned.
    """
    # bleach.clean() sanitizes for an HTML text context and leaves quote
    # characters alone, so it must not be used for attribute values.
    # html.escape(quote=True) is used for the img src/alt attributes instead.
    from html import escape as html_escape

    try:
        from readability import Document
        from bs4 import BeautifulSoup
        import bleach

        # Extract og:image for thumbnail
        og_soup = BeautifulSoup(raw_html, 'html.parser')
        og_image = None
        og_tag = og_soup.find('meta', property='og:image')
        if og_tag and og_tag.get('content'):
            og_image = og_tag['content']
        if not og_image:
            tw_tag = og_soup.find('meta', attrs={'name': 'twitter:image'})
            if tw_tag and tw_tag.get('content'):
                og_image = tw_tag['content']

        doc = Document(raw_html, url=url)
        content_html = doc.summary()

        # Readability produced nothing meaningful.
        if not content_html or len(content_html.strip()) < 50:
            return (None, og_image)

        reader_soup = BeautifulSoup(content_html, 'html.parser')

        # Blocks whose text matches this are bylines, timestamps, photo
        # credits or share/subscribe prompts rather than article text.
        junk_text_re = re.compile(
            r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
            r'share or comment|share this article|comment on this|follow us on|'
            r'sign up for|subscribe to|have you got a story|tips@|email us)',
            re.I
        )

        # Only these inline tags/attributes survive sanitization.
        inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
        inline_attrs = {'a': ['href']}

        html_parts = []
        for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
            text = el.get_text(strip=True)
            # Skip very short fragments and known junk.
            if len(text) < 30:
                continue
            if junk_text_re.search(text):
                continue
            tag = el.name
            inner = bleach.clean(
                el.decode_contents(), tags=inline_tags,
                attributes=inline_attrs, strip=True, protocols=['http', 'https']
            ).strip()
            if not inner:
                continue
            if tag == 'p':
                html_parts.append(f'<p>{inner}</p>')
            elif tag in ('h2', 'h3', 'h4'):
                html_parts.append(f'<{tag}>{inner}</{tag}>')
            elif tag == 'blockquote':
                html_parts.append(f'<blockquote><p>{inner}</p></blockquote>')
            elif tag in ('ul', 'ol'):
                # Rebuild the list from its direct <li> children only.
                items = []
                for li in el.find_all('li', recursive=False):
                    li_inner = bleach.clean(
                        li.decode_contents(), tags=inline_tags,
                        attributes=inline_attrs, strip=True, protocols=['http', 'https']
                    ).strip()
                    if li_inner and len(li.get_text(strip=True)) > 10:
                        items.append(f'<li>{li_inner}</li>')
                if items:
                    html_parts.append(f'<{tag}>{"".join(items)}</{tag}>')

        # Collect absolute-URL images from the readability output, skipping
        # logos, tracking pixels and similar decoration.
        junk_img_re = re.compile(r'(logo|icon|pixel|spacer|blank|1x1|svg|avatar|spinner|/ct/|cts\.businesswire)', re.I)
        seen_srcs = set()
        article_images = []
        for img in reader_soup.find_all('img'):
            src = img.get('src', '')
            if src and src.startswith(('http://', 'https://')) and src not in seen_srcs:
                if junk_img_re.search(src):
                    continue
                seen_srcs.add(src)
                alt = (img.get('alt', '') or '').strip()
                article_images.append(
                    f'<img src="{html_escape(src, quote=True)}" alt="{html_escape(alt, quote=True)}">'
                )

        # If readability found no images, grab first real image from original HTML
        if not article_images:
            orig_soup = BeautifulSoup(raw_html, 'html.parser')
            for noise in orig_soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                              'aside', 'form', 'noscript', 'svg']):
                noise.decompose()
            for img in orig_soup.find_all('img'):
                # Prefer lazy-loading attributes over src when present.
                src = (img.get('data-src') or img.get('data-lazy-src') or
                       img.get('data-original') or img.get('src') or '')
                if not src or not src.startswith(('http://', 'https://')):
                    continue
                src_lower = src.lower()
                if any(x in src_lower for x in ('logo', 'icon', 'pixel', 'spacer', 'blank',
                                                  '1x1', 'svg', 'avatar', 'spinner', '/ct/')):
                    continue
                alt = (img.get('alt', '') or '').strip()
                article_images.append(
                    f'<img src="{html_escape(src, quote=True)}" alt="{html_escape(alt, quote=True)}">'
                )
                break  # Only first real image

        if article_images and html_parts:
            # Spread the images evenly between the text blocks; any images
            # left over are appended at the end.
            text_count = len(html_parts)
            img_count = len(article_images)
            interval = max(1, text_count // (img_count + 1))
            merged = []
            img_idx = 0
            for i, part in enumerate(html_parts):
                merged.append(part)
                if img_idx < img_count and (i + 1) % interval == 0:
                    merged.append(article_images[img_idx])
                    img_idx += 1
            while img_idx < img_count:
                merged.append(article_images[img_idx])
                img_idx += 1
            html_parts = merged
        elif article_images and not html_parts:
            html_parts = article_images

        # Last resort: no structured blocks and no images, so fall back to
        # plain-text paragraphs.
        if not html_parts:
            text = reader_soup.get_text(separator='\n\n', strip=True)
            if text:
                for para in text.split('\n\n'):
                    para = para.strip()
                    if len(para) > 30:
                        html_parts.append(f'<p>{bleach.clean(para)}</p>')

        if not html_parts:
            return (None, og_image)

        # Drop long blocks with an implausibly high average word length
        # (e.g. run-together menu text or encoded blobs).
        clean_parts = []
        for part in html_parts:
            part_soup = BeautifulSoup(part, 'html.parser')
            part_text = part_soup.get_text(strip=True)
            if len(part_text) > 100:
                words = part_text.split()
                avg_word_len = len(part_text) / max(len(words), 1)
                if avg_word_len > 12:
                    continue
            clean_parts.append(part)

        if not clean_parts:
            return (None, og_image)

        result = '\n'.join(clean_parts)
        plain_text = BeautifulSoup(result, 'html.parser').get_text(separator=' ', strip=True)

        # Reject results that are too short or that look like UI chrome,
        # login walls, cookie banners or country/state dropdowns.
        garbage_re = re.compile(
            r'(use (left|right|escape)|arrow keys|navigate between|'
            r'sign (in|up) with|we won.t post|social account|'
            r'accept cookies|cookie policy|privacy policy|terms of (use|service)|'
            r'AlabamaAlaska|CaliforniaColorado|United States of America)',
            re.I
        )
        if len(plain_text) < 200 or garbage_re.search(plain_text):
            return (None, og_image)

        return (result, og_image)
    except Exception as e:
        logger.debug(f"Article parsing failed for {url}: {e}")
        return (None, None)


def _fetch_html_flaresolverr(url: str) -> Optional[str]:
    """Fetch HTML via FlareSolverr (headless browser).

    Posts a ``request.get`` command to the FlareSolverr instance at
    ``http://localhost:8191/v1``.

    Args:
        url: Page to fetch.

    Returns:
        The page HTML when FlareSolverr reports status ``ok`` and the
        response is longer than 500 characters; otherwise ``None``. Any
        failure (service unavailable, bad JSON, ...) is logged at debug
        level and also yields ``None``.
    """
    try:
        import requests
        resp = requests.post('http://localhost:8191/v1', json={
            'cmd': 'request.get',
            'url': url,
            'maxTimeout': 30000
        }, timeout=45)
        data = resp.json()
        if data.get('status') == 'ok':
            # Guard against explicit nulls in the payload.
            html = (data.get('solution') or {}).get('response') or ''
            # Very short responses are treated as failed fetches.
            if len(html) > 500:
                return html
    except Exception as e:
        # Best-effort fallback: never raise, but leave a trace for debugging
        # instead of swallowing the failure silently.
        logger.debug(f"FlareSolverr fetch failed for {url}: {e}")
    return None


def extract_article_content_with_image(url: str) -> tuple:
    """Extract article content and og:image for a URL.

    Tries a direct HTTP fetch first and falls back to FlareSolverr when the
    direct fetch fails or yields no usable content.

    Args:
        url: Article URL.

    Returns:
        ``(content_html, image_url)``; either element may be ``None``. An
        image discovered by the direct fetch is kept as a fallback when the
        FlareSolverr pass finds none or is unavailable.
    """
    import urllib.request

    direct_image = None

    # Try direct fetch first
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        with urllib.request.urlopen(req, timeout=20) as response:
            body = response.read()
            # Honour the charset declared in Content-Type; default to UTF-8.
            charset = response.headers.get_content_charset() or 'utf-8'
        try:
            raw_html = body.decode(charset, errors='replace')
        except LookupError:
            # Server declared an encoding Python does not know.
            raw_html = body.decode('utf-8', errors='replace')
        content, direct_image = _parse_article_html(raw_html, url)
        if content:
            return (content, direct_image)
    except Exception as e:
        # Best-effort: fall through to FlareSolverr, but leave a trace.
        logger.debug(f"Direct fetch failed for {url}: {e}")

    # Fallback to FlareSolverr for bot-protected sites
    raw_html = _fetch_html_flaresolverr(url)
    if raw_html:
        content, image = _parse_article_html(raw_html, url)
        return (content, image or direct_image)

    return (None, direct_image)


def extract_article_content(url: str) -> Optional[str]:
    """Legacy wrapper: return only the content part of the extraction result."""
    return extract_article_content_with_image(url)[0]


def process_press_articles(db, celebrity_id: Optional[int] = None, send_notifications: bool = True) -> Dict:
    """
    Fetch news articles for celebrities, deduplicate, store, and notify.

    Articles come from Google News RSS (the article dicts use GDELT-compatible
    keys). For each celebrity, an article is stored only if the celebrity's
    name appears in its title and it is not a duplicate by title (per
    celebrity) or by URL hash.

    Args:
        db: Database object exposing ``get_connection()``.
        celebrity_id: When given, only this celebrity is processed;
            otherwise the ``celebrity_ids`` list from the press config is
            used (nothing is fetched if that list is empty).
        send_notifications: When True and notifications are enabled in the
            config, a notification summarising the new articles is sent.

    Returns:
        ``{'total_fetched', 'total_new', 'by_celebrity'}`` on success, or
        ``{'error', 'total_fetched': 0, 'total_new': 0}`` if processing
        raised. The module-level ``_fetch_running`` flag is True for the
        duration of the call.
    """
    global _fetch_running
    import time as _time

    with _fetch_lock:
        _fetch_running = True

    try:
        config = _get_config(db)
        max_records = config.get('max_records_per_query', 25)
        notify_enabled = config.get('notify_new_articles', True)

        # Get celebrities to check
        if celebrity_id:
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('SELECT id, name, slug FROM celebrity_profiles WHERE id = ?', (celebrity_id,))
                row = cursor.fetchone()
                celebrities = [dict(row)] if row else []
        else:
            # Use celebrity_ids from config if set, otherwise skip
            configured_ids = config.get('celebrity_ids', [])
            if configured_ids:
                with db.get_connection() as conn:
                    cursor = conn.cursor()
                    placeholders = ','.join(['?' for _ in configured_ids])
                    cursor.execute(
                        f'SELECT id, name, slug FROM celebrity_profiles WHERE id IN ({placeholders}) ORDER BY name',
                        configured_ids
                    )
                    celebrities = [dict(r) for r in cursor.fetchall()]
            else:
                celebrities = []

        total_new = 0
        total_fetched = 0
        results_by_celebrity = {}

        for idx, celeb in enumerate(celebrities):
            celeb_id = celeb['id']
            celeb_name = celeb['name']
            celeb_name_lower = celeb_name.lower()

            # Small delay between celebrities
            if idx > 0:
                _time.sleep(2)

            # Fetch from Google News RSS
            articles = fetch_google_news_articles(celeb_name, max_records)
            total_fetched += len(articles)
            new_count = 0

            for article in articles:
                google_url = article.get('url', '')
                if not google_url:
                    continue

                # The stripped title is used for filtering, deduplication AND
                # storage, so the title lookup below matches what was stored.
                article_title = article.get('title', '').strip()

                # Only keep articles where celeb name appears in the title.
                # Pure string test, so it runs before any database access.
                if not article_title or celeb_name_lower not in article_title.lower():
                    continue

                # Check for duplicate by title first (cheap check before URL decode)
                with db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute(
                        'SELECT id FROM press_articles WHERE celebrity_id = ? AND title = ?',
                        (celeb_id, article_title)
                    )
                    if cursor.fetchone():
                        continue

                # Decode Google News URL to real article URL
                article_url = _decode_google_news_url(google_url)
                if not article_url:
                    continue

                url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest()

                # Check for duplicate by URL hash
                with db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute('SELECT id FROM press_articles WHERE url_hash = ?', (url_hash,))
                    if cursor.fetchone():
                        continue

                # Parse domain from real URL. Strip only a leading "www."
                # (str.replace would also remove it from the middle of a host).
                netloc = urlparse(article_url).netloc
                domain = netloc[4:] if netloc.startswith('www.') else netloc

                # Extract article content and og:image
                content, og_image = extract_article_content_with_image(article_url)

                # Cache all inline images in the content to local proxy
                if content:
                    content = _cache_content_images(content)

                if content:
                    # Plain-text snippet: strip tags, collapse whitespace.
                    snippet = re.sub(r'<[^>]+>', ' ', content)
                    snippet = ' '.join(snippet.split())[:300]
                else:
                    snippet = article_title[:300]

                # Cache the og:image locally, fall back to first inline image
                cached_image = cache_press_image(og_image) if og_image else None
                if not cached_image and content:
                    m = re.search(r'<img\s+src="(/api/press/images/[^"]+)"', content)
                    if m:
                        cached_image = m.group(1)

                # Parse published date
                published_date = article.get('seendate', '')
                if published_date:
                    # GDELT format: YYYYMMDDTHHmmSSZ -> ISO format
                    try:
                        dt = datetime.strptime(published_date, '%Y%m%dT%H%M%SZ')
                        published_date = dt.isoformat()
                    except (ValueError, TypeError):
                        # Keep the raw value if it is not in the expected format.
                        pass

                # Insert article
                with db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute('''
                        INSERT INTO press_articles
                        (celebrity_id, title, url, url_hash, domain, published_date,
                         image_url, language, country, article_content, snippet, notified, read)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0, 0)
                    ''', (
                        celeb_id,
                        article_title,
                        article_url,
                        url_hash,
                        domain,
                        published_date,
                        cached_image or '',
                        article.get('language', ''),
                        article.get('sourcecountry', ''),
                        content,
                        snippet,
                    ))
                    conn.commit()
                    new_count += 1

            total_new += new_count
            if new_count > 0:
                results_by_celebrity[celeb_name] = new_count

        # Send notifications for new articles (only for scheduled fetches)
        if send_notifications and notify_enabled and total_new > 0:
            _send_press_notification(db, results_by_celebrity)

        # Mark notified (runs whenever new articles were stored, whether or
        # not a notification was actually sent).
        if total_new > 0:
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('UPDATE press_articles SET notified = ? WHERE notified = ?', (1, 0))
                conn.commit()

        logger.info(f"Press monitor: fetched {total_fetched}, new {total_new}")
        return {
            'total_fetched': total_fetched,
            'total_new': total_new,
            'by_celebrity': results_by_celebrity,
        }
    except Exception as e:
        logger.error(f"Error in press article processing: {e}")
        import traceback
        logger.debug(f"Traceback: {traceback.format_exc()}")
        return {'error': str(e), 'total_fetched': 0, 'total_new': 0}
    finally:
        # Always clear the running flag, even when processing failed.
        with _fetch_lock:
            _fetch_running = False


def _send_press_notification(db, results_by_celebrity: Dict):
    """Push a summary of newly stored press articles through the notifier.

    ``results_by_celebrity`` maps a celebrity name to the number of new
    articles. Nothing is sent when no notifier can be created from the
    stored settings. Failures are logged and never raised.
    """
    try:
        from modules.pushover_notifier import create_notifier_from_config
        from modules.settings_manager import SettingsManager

        all_settings = SettingsManager(str(db.db_path)).get_all()
        notifier = create_notifier_from_config(all_settings, unified_db=db)
        if not notifier:
            return

        total = sum(results_by_celebrity.values())
        title = f"📰 Press: {total} new article{'s' if total != 1 else ''}"

        # Build rich HTML message matching other notification formats:
        # one line per celebrity followed by the discovery timestamp.
        lines = [
            f"<b>👤 {name}:</b> {count} article{'s' if count != 1 else ''}"
            for name, count in results_by_celebrity.items()
        ]
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        lines.append(f"\n<b>⏰ Discovered:</b> {now}")
        message = "\n".join(lines)

        # Set notification context so it gets recorded to the notifications table
        # and broadcast via websocket for real-time UI updates
        context = {
            'platform': 'press',
            'source': 'GDELT',
            'content_type': 'article',
            'download_count': total,
            'metadata': {'by_celebrity': results_by_celebrity}
        }
        notifier._current_notification_context = context

        notifier.send_notification(title=title, message=message, priority=0, html=True)
    except Exception as e:
        logger.error(f"Failed to send press notification: {e}")


# ============================================================================
# CONFIGURATION ENDPOINTS
# ============================================================================

@router.get("/config")
@limiter.limit("30/minute")
@handle_exceptions
async def get_config(
    request: Request,
    current_user: Dict = Depends(get_current_user)
):
    """Return the current press monitor configuration."""
    current = _get_config(_get_db())
    return {"success": True, "config": current}


@router.put("/config")
@limiter.limit("10/minute")
@handle_exceptions
async def update_config(
    request: Request,
    config_update: PressConfigUpdate,
    current_user: Dict = Depends(get_current_user)
):
    """Update press monitor configuration.

    Only fields present (non-None) in the request body are written to the
    ``press_config`` row with id 1. Booleans are stored as 0/1 and
    ``celebrity_ids`` as a JSON string.

    Returns:
        ``{"success": True, ...}`` when the row was updated, or
        ``{"success": False, ...}`` when no fields were supplied or the
        configuration row does not exist.
    """
    db = _get_db()
    updates = config_update.model_dump(exclude_none=True)
    if not updates:
        return {"success": False, "message": "No fields to update"}

    # Column names come from the pydantic model's field names, never from
    # raw user input, so building the SET clause by name is safe here.
    set_parts = []
    values = []
    for key, value in updates.items():
        if isinstance(value, bool):
            value = 1 if value else 0
        elif key == 'celebrity_ids' and isinstance(value, list):
            value = json.dumps(value)
        set_parts.append(f"{key} = ?")
        values.append(value)

    set_parts.append("updated_at = CURRENT_TIMESTAMP")

    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
            f"UPDATE press_config SET {', '.join(set_parts)} WHERE id = 1",
            values
        )
        conn.commit()
        updated_rows = cursor.rowcount

    # The config reader tolerates a missing row, so the row may not exist;
    # do not claim success when the UPDATE matched nothing.
    if updated_rows == 0:
        return {"success": False, "message": "Press configuration row not found; nothing was updated"}

    return {"success": True, "message": "Configuration updated"}


# ============================================================================
# CELEBRITY SELECTION ENDPOINT
# ============================================================================

@router.get("/celebrities")
@limiter.limit("30/minute")
@handle_exceptions
async def get_press_celebrities(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Get all tracked celebrities with press_enabled flag based on config.

    Each enabled celebrity profile (id, name, slug) is returned with an
    extra ``press_enabled`` boolean that is True when its id is listed in
    the press config's ``celebrity_ids``.
    """
    db = _get_db()
    enabled_ids = set(_get_config(db).get('celebrity_ids', []))

    # Reuse the shared helper instead of repeating its query inline.
    celebrities = [
        {**celeb, 'press_enabled': celeb['id'] in enabled_ids}
        for celeb in _get_enabled_celebrities(db)
    ]

    return {"success": True, "celebrities": celebrities}


# ============================================================================
# ARTICLE ENDPOINTS
# ============================================================================

@router.get("/articles")
@limiter.limit("30/minute")
@handle_exceptions
async def get_articles(
    request: Request,
    current_user: Dict = Depends(get_current_user),
    celebrity_id: Optional[int] = None,
    domain: Optional[str] = None,
    read: Optional[bool] = None,
    search: Optional[str] = None,
    page: int = 1,
    per_page: int = 50,
):
    """Get paginated list of press articles with filters.

    Args:
        celebrity_id: Restrict to one celebrity.
        domain: Restrict to an exact source domain.
        read: Restrict to read (True) or unread (False) articles.
        search: Substring matched against title and snippet (SQL LIKE).
        page: 1-based page number; values below 1 are treated as 1.
        per_page: Page size; values below 1 are treated as 1.

    Returns:
        Dict with ``articles`` (newest published first), ``total``,
        ``page``, ``per_page`` and ``pages``.
    """
    db = _get_db()

    # Clamp paging inputs. SQLite treats a negative LIMIT as "no limit", so a
    # negative per_page would return every row and yield a nonsensical page
    # count; a page below 1 would produce a negative OFFSET.
    page = max(page, 1)
    per_page = max(per_page, 1)

    conditions = []
    params = []

    if celebrity_id is not None:
        conditions.append("pa.celebrity_id = ?")
        params.append(celebrity_id)
    if domain is not None:
        conditions.append("pa.domain = ?")
        params.append(domain)
    if read is not None:
        conditions.append("pa.read = ?")
        params.append(1 if read else 0)
    if search:
        conditions.append("(pa.title LIKE ? OR pa.snippet LIKE ?)")
        params.extend([f'%{search}%', f'%{search}%'])

    # Only fixed condition strings are interpolated; all values are bound.
    where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    offset = (page - 1) * per_page

    with db.get_connection() as conn:
        cursor = conn.cursor()

        # Get total count
        cursor.execute(f"SELECT COUNT(*) FROM press_articles pa {where_clause}", params)
        total = cursor.fetchone()[0]

        # Get articles
        cursor.execute(f'''
            SELECT pa.id, pa.celebrity_id, pa.title, pa.url, pa.domain,
                   pa.published_date, pa.image_url, pa.language, pa.country,
                   pa.snippet, pa.fetched_at, pa.read,
                   cp.name as celebrity_name
            FROM press_articles pa
            LEFT JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
            {where_clause}
            ORDER BY pa.published_date DESC
            LIMIT ? OFFSET ?
        ''', params + [per_page, offset])

        articles = [dict(r) for r in cursor.fetchall()]

    return {
        "success": True,
        "articles": articles,
        "total": total,
        "page": page,
        "per_page": per_page,
        # Ceiling division; per_page is guaranteed >= 1 here.
        "pages": (total + per_page - 1) // per_page,
    }


@router.get("/articles/{article_id}")
@limiter.limit("30/minute")
@handle_exceptions
async def get_article(
    request: Request,
    article_id: int,
    current_user: Dict = Depends(get_current_user),
):
    """Return one article (all columns plus the celebrity name); 404 if it does not exist."""
    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute('''
            SELECT pa.*, cp.name as celebrity_name
            FROM press_articles pa
            LEFT JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
            WHERE pa.id = ?
        ''', (article_id,))
        record = cur.fetchone()

    if record:
        return {"success": True, "article": dict(record)}

    raise HTTPException(status_code=404, detail="Article not found")


@router.patch("/articles/{article_id}/read")
@limiter.limit("30/minute")
@handle_exceptions
async def update_read_status(
    request: Request,
    article_id: int,
    body: ReadStatusUpdate,
    current_user: Dict = Depends(get_current_user),
):
    """Set the read flag of a single article; 404 if no such article exists."""
    read_flag = 1 if body.read else 0

    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute(
            'UPDATE press_articles SET read = ? WHERE id = ?',
            (read_flag, article_id)
        )
        conn.commit()

        # No row touched means the id is unknown.
        if cur.rowcount == 0:
            raise HTTPException(status_code=404, detail="Article not found")

    return {"success": True, "message": f"Article marked as {'read' if body.read else 'unread'}"}


@router.post("/articles/mark-all-read")
@limiter.limit("10/minute")
@handle_exceptions
async def mark_all_read(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Flag every unread article as read and report how many were changed."""
    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute('UPDATE press_articles SET read = 1 WHERE read = 0')
        conn.commit()
        count = cur.rowcount

    return {
        "success": True,
        "message": f"Marked {count} article{'s' if count != 1 else ''} as read",
        "count": count,
    }


@router.delete("/articles/{article_id}")
@limiter.limit("10/minute")
@handle_exceptions
async def delete_article(
    request: Request,
    article_id: int,
    current_user: Dict = Depends(get_current_user),
):
    """Remove one press article by id; 404 if no such article exists."""
    with _get_db().get_connection() as conn:
        cur = conn.cursor()
        cur.execute('DELETE FROM press_articles WHERE id = ?', (article_id,))
        conn.commit()

        # Nothing deleted means the id is unknown.
        if cur.rowcount == 0:
            raise HTTPException(status_code=404, detail="Article not found")

    return {"success": True, "message": "Article deleted"}


# ============================================================================
# STATS ENDPOINT
# ============================================================================

@router.get("/stats")
@limiter.limit("30/minute")
@handle_exceptions
async def get_stats(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Return aggregate counts over the ``press_articles`` table.

    The ``stats`` object in the response holds:
      * ``total``        - number of stored articles
      * ``unread``       - number of articles with ``read = 0``
      * ``by_celebrity`` - article count per celebrity (joined with
                           ``celebrity_profiles``), highest count first
      * ``by_domain``    - the ten domains with the most articles
    """
    database = _get_db()

    with database.get_connection() as conn:
        cur = conn.cursor()

        # Overall number of stored articles.
        cur.execute('SELECT COUNT(*) FROM press_articles')
        total_row = cur.fetchone()
        total = total_row[0]

        # Articles not yet marked as read.
        cur.execute('SELECT COUNT(*) FROM press_articles WHERE read = 0')
        unread_row = cur.fetchone()
        unread = unread_row[0]

        # Article count per celebrity, most-covered first.
        cur.execute('''
            SELECT pa.celebrity_id, cp.name, COUNT(*) as count
            FROM press_articles pa
            JOIN celebrity_profiles cp ON pa.celebrity_id = cp.id
            GROUP BY pa.celebrity_id, cp.name
            ORDER BY count DESC
        ''')
        by_celebrity = []
        for row in cur.fetchall():
            by_celebrity.append({
                'id': row['celebrity_id'],
                'name': row['name'],
                'count': row['count'],
            })

        # Ten most frequent source domains.
        cur.execute('''
            SELECT domain, COUNT(*) as count
            FROM press_articles
            GROUP BY domain
            ORDER BY count DESC
            LIMIT 10
        ''')
        by_domain = []
        for row in cur.fetchall():
            by_domain.append({'domain': row['domain'], 'count': row['count']})

    stats = {
        "total": total,
        "unread": unread,
        "by_celebrity": by_celebrity,
        "by_domain": by_domain,
    }
    return {"success": True, "stats": stats}


# ============================================================================
# FETCH ENDPOINT
# ============================================================================

@router.post("/fetch")
@limiter.limit("5/minute")
@handle_exceptions
async def trigger_fetch(
    request: Request,
    background_tasks: BackgroundTasks,
    current_user: Dict = Depends(get_current_user),
    celebrity_id: Optional[int] = None,
):
    """Trigger a manual GDELT fetch for all or a specific celebrity.

    Returns ``{"success": False, ...}`` without scheduling anything when the
    module-level ``_fetch_running`` flag is already set. Otherwise it queues
    ``process_press_articles(db, celebrity_id, send_notifications=False)`` on
    the module's thread pool and returns ``{"success": True, ...}`` right away
    without waiting for the fetch to finish.
    """
    global _fetch_running

    # The flag is only read here, under the lock; this function never sets it.
    # NOTE(review): presumably process_press_articles sets and clears
    # _fetch_running itself - confirm. Until the worker does so, a second
    # request arriving in between would also pass this check.
    with _fetch_lock:
        if _fetch_running:
            return {"success": False, "message": "Fetch already in progress"}

    db = _get_db()

    # Closure binding the db handle and requested celebrity; notifications are
    # explicitly disabled for manually triggered fetches.
    def do_fetch():
        process_press_articles(db, celebrity_id, send_notifications=False)

    # The background task itself is the call loop.run_in_executor(_executor,
    # do_fetch), i.e. the blocking fetch is handed to the _executor pool.
    loop = asyncio.get_event_loop()
    background_tasks.add_task(loop.run_in_executor, _executor, do_fetch)

    return {"success": True, "message": "Fetch started"}


@router.get("/fetch/status")
@limiter.limit("30/minute")
@handle_exceptions
async def get_fetch_status(
    request: Request,
    current_user: Dict = Depends(get_current_user),
):
    """Report the current value of the module-level ``_fetch_running`` flag."""
    is_running = _fetch_running
    return {"success": True, "is_running": is_running}


# ============================================================================
# IMAGE PROXY / CACHE
# ============================================================================

# Directory holding locally cached copies of press article images.
PRESS_IMAGE_CACHE = Path("/opt/media-downloader/data/press_images")
# Created at import time (including missing parents) so later reads and writes
# can assume the directory exists.
PRESS_IMAGE_CACHE.mkdir(parents=True, exist_ok=True)


def _cache_content_images(html_content: str) -> str:
    """Rewrite article HTML so its images are served from the local cache.

    Only tags of the form ``<img src="..." ...>`` are processed, i.e. ``src``
    must be the first attribute and double-quoted. For each such tag the URL
    is handed to ``cache_press_image``:

      * on success, every occurrence of the original URL inside the tag is
        replaced by the returned ``/api/press/images/...`` path;
      * on failure, the whole tag is dropped from the output.

    Tags whose ``src`` already starts with ``/api/press/images/`` are kept
    unchanged. Empty or ``None`` input is returned as-is.
    """
    if not html_content:
        return html_content

    img_pattern = re.compile(r'<img\s+src="([^"]+)"[^>]*>')
    pieces = []
    position = 0

    for found in img_pattern.finditer(html_content):
        tag, url = found.group(0), found.group(1)

        # Copy the text between the previous tag and this one verbatim.
        pieces.append(html_content[position:found.start()])
        position = found.end()

        if not url or url.startswith('/api/press/images/'):
            pieces.append(tag)
            continue

        local_path = cache_press_image(url)
        if local_path:
            pieces.append(tag.replace(url, local_path))
        # Otherwise caching failed: the tag is intentionally left out.

    pieces.append(html_content[position:])
    return ''.join(pieces)


# Payloads shorter than this are treated as error pages or tracking pixels.
_MIN_IMAGE_BYTES = 1000

# URL schemes the downloader is willing to fetch. ``file:`` is deliberately
# excluded: image URLs come from scraped article HTML and must not be able to
# read files from the local disk.
_ALLOWED_IMAGE_SCHEMES = ('http', 'https', 'data')


def _sniff_image_extension(data: bytes) -> Optional[str]:
    """Return the extension matching *data*'s magic bytes, or None if unknown."""
    if data.startswith(b'\xff\xd8\xff'):
        return '.jpg'
    if data.startswith(b'\x89PNG\r\n\x1a\n'):
        return '.png'
    if data.startswith((b'GIF87a', b'GIF89a')):
        return '.gif'
    if data[:4] == b'RIFF' and data[8:12] == b'WEBP':
        return '.webp'
    return None


def cache_press_image(image_url: str, use_flaresolverr: bool = False) -> Optional[str]:
    """Download an image and cache it locally.

    The cache file name is the first 16 hex characters of the SHA-256 of
    ``image_url`` plus an extension, so repeated calls for the same URL reuse
    the existing file without downloading again.

    Args:
        image_url: Absolute URL of the image. Only ``http``, ``https`` and
            ``data`` URLs are fetched; anything else yields ``None``.
        use_flaresolverr: Accepted for backward compatibility. It currently
            has no effect: FlareSolverr returns rendered HTML, not binary
            data, so it cannot supply image bytes.

    Returns:
        The ``/api/press/images/<name>`` path that serves the cached file, or
        ``None`` when the URL is empty or unsupported, the download fails, the
        payload is smaller than 1000 bytes, or the payload is neither
        recognisable as an image by its magic bytes nor declared as an image
        by the server's Content-Type.
    """
    if not image_url:
        return None

    # Reject unsupported schemes up front, before touching cache or network.
    try:
        scheme = urlparse(image_url.strip()).scheme.lower()
    except ValueError:
        return None
    if scheme not in _ALLOWED_IMAGE_SCHEMES:
        return None

    import urllib.request

    # Hash the URL for the filename
    url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]

    # Check if already cached (the extension is not known in advance)
    for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
        cached = PRESS_IMAGE_CACHE / f"{url_hash}{ext}"
        if cached.exists() and cached.stat().st_size > 0:
            return f"/api/press/images/{cached.name}"

    # Download the image
    try:
        req = urllib.request.Request(image_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'image/*,*/*',
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            content_type = resp.headers.get('Content-Type', '') or ''
            declared_image = 'image' in content_type.lower()
            image_data = resp.read()
    except Exception:
        # Best-effort: any network, HTTP or URL error simply means the image
        # is not cached; callers treat None as "unavailable".
        return None

    if len(image_data) < _MIN_IMAGE_BYTES:
        return None  # Too small, likely an error page

    # A successful response is not enough: HTML error/captcha pages also come
    # back with status 200. Require real image bytes, or at least an image
    # Content-Type for formats the sniffer does not know.
    sniffed_ext = _sniff_image_extension(image_data)
    if sniffed_ext is None and not declared_image:
        return None

    # Prefer the extension proven by the content; fall back to URL hints.
    ext = sniffed_ext
    if ext is None:
        url_lower = image_url.lower()
        if '.png' in url_lower:
            ext = '.png'
        elif '.webp' in url_lower:
            ext = '.webp'
        elif '.gif' in url_lower:
            ext = '.gif'
        else:
            ext = '.jpg'

    cached_path = PRESS_IMAGE_CACHE / f"{url_hash}{ext}"
    cached_path.write_bytes(image_data)
    return f"/api/press/images/{cached_path.name}"


@router.get("/images/{filename}")
async def serve_press_image(filename: str, current_user: Dict = Depends(get_current_user)):
    """Serve a cached press article image.

    Args:
        filename: Name of a file directly inside ``PRESS_IMAGE_CACHE``.

    Returns:
        A ``FileResponse`` whose media type is derived from the file suffix
        (defaulting to ``image/jpeg``) and which is cacheable for one day.

    Raises:
        HTTPException: 400 when the name contains ``/`` or ``..``; 404 when no
            regular file with that name exists in the cache directory.
    """
    # Sanitize filename: no path separators or parent references
    if '/' in filename or '..' in filename:
        raise HTTPException(status_code=400, detail="Invalid filename")

    filepath = PRESS_IMAGE_CACHE / filename
    # is_file() rather than exists(): a name such as "." resolves to the cache
    # directory itself, which exists but cannot be served as a file.
    if not filepath.is_file():
        raise HTTPException(status_code=404, detail="Image not found")

    # Determine media type
    suffix = filepath.suffix.lower()
    media_types = {
        '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
        '.png': 'image/png', '.webp': 'image/webp', '.gif': 'image/gif',
    }
    media_type = media_types.get(suffix, 'image/jpeg')

    return FileResponse(filepath, media_type=media_type, headers={
        'Cache-Control': 'public, max-age=86400',
    })