Files
media-downloader/scripts/backfill_press.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

595 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Backfill press articles from Google News RSS for the last year.
Google News RSS:
- 100 articles per query (cap)
- No rate limiting, no API key needed
- ~12 months of history
- Strategy: 1-week windows to stay under the 100 cap
"""
import hashlib
import json
import os
import subprocess
import sys
import time
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from urllib.parse import urlparse
# Bootstrap database
sys.path.insert(0, '/opt/media-downloader')
import modules.db_bootstrap # noqa: E402,F401
from modules.universal_logger import get_logger
logger = get_logger('PressBackfill')
# WARNING(security): database credential hardcoded in source; should be
# loaded from an environment variable or secrets store instead.
DB_PASSWORD = "PNsihOXvvuPwWiIvGlsc9Fh2YmMmB"
# Number of 1-week query windows to walk backwards (Google News RSS keeps
# roughly 12 months of history — see module docstring).
WEEKS_BACK = 52
# Domains that return no content even with FlareSolverr
SKIP_DOMAINS = {
    'msn.com',
    'news.google.com',
    'imdb.com',
    'st-aug.edu',
}
def fetch_google_news_window(name: str, start_date: str, end_date: str) -> list:
    """Fetch Google News RSS articles for a specific time window.

    Args:
        name: Person name; wrapped in quotes so Google matches the exact phrase.
        start_date: Window start, 'YYYY-MM-DD' (Google's ``after:`` operator).
        end_date: Window end, 'YYYY-MM-DD' (Google's ``before:`` operator).

    Returns:
        List of dicts with keys: title, url, published_date, source.
        Empty list if all 3 fetch attempts fail.
    """
    from email.utils import parsedate_to_datetime  # hoisted out of the item loop
    from urllib.parse import quote_plus

    # quote_plus() encodes spaces as '+' exactly like the old manual replace,
    # but also correctly escapes '&', '#', accents, etc. in names.
    query = f'%22{quote_plus(name)}%22+after:{start_date}+before:{end_date}'
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'
    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            with urllib.request.urlopen(req, timeout=30) as response:
                data = response.read().decode('utf-8')
            root = ET.fromstring(data)
            articles = []
            for item in root.findall('.//item'):
                title_el = item.find('title')
                link_el = item.find('link')
                pub_el = item.find('pubDate')
                source_el = item.find('source')
                if title_el is None or link_el is None:
                    continue
                title = title_el.text or ''
                # Google News titles often end with " - Source Name", strip it
                source_name = source_el.text if source_el is not None else ''
                suffix = f' - {source_name}'
                if source_name and title.endswith(suffix):
                    title = title[:-len(suffix)].strip()
                # Parse pubDate (RFC 2822 format) to ISO 8601; keep raw text on failure
                published_date = ''
                if pub_el is not None and pub_el.text:
                    try:
                        published_date = parsedate_to_datetime(pub_el.text).isoformat()
                    except Exception:
                        published_date = pub_el.text
                articles.append({
                    'title': title,
                    'url': link_el.text or '',
                    'published_date': published_date,
                    'source': source_name,
                })
            return articles
        except Exception as e:
            if attempt < 2:
                time.sleep(5)  # brief backoff before retrying
                continue
            print(f" Error fetching Google News: {e}")
            return []
    return []  # unreachable; keeps the return type explicit
# Local directory where downloaded press images are cached; files here are
# referenced back to clients via /api/press/images/<hash><ext> paths.
PRESS_IMAGE_CACHE = '/opt/media-downloader/data/press_images'
os.makedirs(PRESS_IMAGE_CACHE, exist_ok=True)
def cache_press_image(image_url: str) -> str | None:
"""Download and cache an image locally. Returns API path."""
if not image_url:
return None
url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]
# Check if already cached
for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
cached = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
if os.path.exists(cached) and os.path.getsize(cached) > 0:
return f"/api/press/images/{url_hash}{ext}"
# Download
try:
req = urllib.request.Request(image_url, headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'image/*,*/*',
})
with urllib.request.urlopen(req, timeout=15) as resp:
image_data = resp.read()
if len(image_data) < 1000:
return None
except Exception:
# Try via FlareSolverr — but it can't fetch binary, so try fetching
# the page and extracting the image URL that works
return None
ext = '.jpg'
url_lower = image_url.lower()
if '.png' in url_lower:
ext = '.png'
elif '.webp' in url_lower:
ext = '.webp'
elif '.gif' in url_lower:
ext = '.gif'
cached_path = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
with open(cached_path, 'wb') as f:
f.write(image_data)
return f"/api/press/images/{url_hash}{ext}"
def cache_content_images(html_content: str) -> str:
    """Find all <img ...> tags in HTML content, cache each image locally,
    and rewrite src to the /api/press/images/... proxy path.

    Img tags whose image cannot be cached are removed entirely
    (a missing image beats a broken one). Empty input is returned as-is.
    """
    if not html_content:
        return html_content
    import re as _re

    def _replace_img(match):
        full_tag = match.group(0)
        src = match.group(1)
        # Already proxied (or empty src) — leave the tag untouched
        if not src or src.startswith('/api/press/images/'):
            return full_tag
        cached = cache_press_image(src)
        if cached:
            # Replace only the first occurrence: the src value itself
            return full_tag.replace(src, cached, 1)
        return ''  # Remove img if caching failed

    # \b[^>]*?\bsrc= matches src anywhere in the tag; the old pattern
    # required src to be the first attribute and missed e.g.
    # <img class="x" src="...">.
    return _re.sub(r'<img\b[^>]*?\bsrc="([^"]+)"[^>]*>', _replace_img, html_content)
def decode_google_news_url(google_url: str) -> str | None:
"""Decode a Google News redirect URL to the real article URL."""
if 'news.google.com' not in google_url:
return google_url
try:
from googlenewsdecoder import gnewsdecoder
result = gnewsdecoder(google_url, interval=1)
if result.get('status'):
return result['decoded_url']
except Exception:
pass
return None
def extract_content(article_url: str) -> tuple[str | None, str | None]:
    """Extract article content and og:image from the real article URL.

    Attempts a plain HTTP fetch first; when that yields no content, the
    URL is retried through FlareSolverr (for bot-protected sites).
    Returns (content_html, image_url) — either element may be None.
    """
    direct_content, direct_image = _extract_content_direct(article_url)
    if direct_content:
        return (direct_content, direct_image)
    # FlareSolverr fallback; keep the direct og:image if it found none
    fs_content, fs_image = _extract_content_flaresolverr(article_url)
    return (fs_content, fs_image or direct_image)
def _fetch_html_flaresolverr(url: str) -> str | None:
"""Fetch HTML via FlareSolverr (headless browser)."""
try:
import requests
resp = requests.post('http://localhost:8191/v1', json={
'cmd': 'request.get',
'url': url,
'maxTimeout': 30000
}, timeout=45)
data = resp.json()
if data.get('status') == 'ok':
html = data.get('solution', {}).get('response', '')
if len(html) > 500:
return html
except Exception:
pass
return None
def _extract_content_flaresolverr(url: str) -> tuple[str | None, str | None]:
    """Extract article content by fetching the page through FlareSolverr.

    Returns (content_html, image_url); (None, None) when the fetch fails.
    """
    html = _fetch_html_flaresolverr(url)
    return _parse_article_html(html, url) if html else (None, None)
def _extract_content_direct(url: str) -> tuple[str | None, str | None]:
    """Self-contained article extraction via a plain HTTP fetch.

    Returns (content_html, image_url); (None, None) on any failure.
    """
    import urllib.request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=20) as resp:
            page = resp.read().decode('utf-8', errors='replace')
        return _parse_article_html(page, url)
    except Exception:
        return (None, None)
def _parse_article_html(raw_html: str, url: str) -> tuple[str | None, str | None]:
    """Parse raw HTML into article content and og:image. Returns (content_html, image_url).

    Pipeline:
      1. Pull og:image (fallback: twitter:image) for the thumbnail.
      2. Run readability to isolate the article body.
      3. Rebuild clean HTML from p/h2-h4/blockquote/ul/ol elements,
         sanitized with bleach and filtered against boilerplate phrases.
      4. Collect inline images (fallback: first real image from the raw page).
      5. Interleave images into the text, then run quality checks; on any
         failure return (None, og_image) so callers still get a thumbnail.

    Any exception anywhere yields (None, None) — extraction is best-effort.
    """
    import re
    from urllib.parse import urljoin  # NOTE(review): imported but unused here
    try:
        from readability import Document
        from bs4 import BeautifulSoup
        # Extract og:image for thumbnail
        og_soup = BeautifulSoup(raw_html, 'html.parser')
        og_image = None
        og_tag = og_soup.find('meta', property='og:image')
        if og_tag and og_tag.get('content'):
            og_image = og_tag['content']
        if not og_image:
            # Fall back to Twitter card image
            tw_tag = og_soup.find('meta', attrs={'name': 'twitter:image'})
            if tw_tag and tw_tag.get('content'):
                og_image = tw_tag['content']
        import bleach
        # readability isolates the main article body from the page chrome
        doc = Document(raw_html, url=url)
        content_html = doc.summary()
        if not content_html or len(content_html.strip()) < 50:
            return (None, og_image)
        reader_soup = BeautifulSoup(content_html, 'html.parser')
        # Common boilerplate/byline/CTA phrases to drop from the body
        junk_text_re = re.compile(
            r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
            r'share or comment|share this article|comment on this|follow us on|'
            r'sign up for|subscribe to|have you got a story|tips@|email us)',
            re.I
        )
        # Only these inline tags (and a[href]) survive bleach sanitization
        inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
        inline_attrs = {'a': ['href']}
        html_parts = []
        for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
            text = el.get_text(strip=True)
            # Skip fragments too short to be real prose
            if len(text) < 30:
                continue
            if junk_text_re.search(text):
                continue
            tag = el.name
            inner = bleach.clean(
                el.decode_contents(), tags=inline_tags,
                attributes=inline_attrs, strip=True, protocols=['http', 'https']
            ).strip()
            if not inner:
                continue
            # Rebuild each element as minimal clean HTML
            if tag == 'p':
                html_parts.append(f'<p>{inner}</p>')
            elif tag in ('h2', 'h3', 'h4'):
                html_parts.append(f'<{tag}>{inner}</{tag}>')
            elif tag == 'blockquote':
                html_parts.append(f'<blockquote><p>{inner}</p></blockquote>')
            elif tag in ('ul', 'ol'):
                items = []
                # recursive=False: only direct <li> children of this list
                for li in el.find_all('li', recursive=False):
                    li_inner = bleach.clean(
                        li.decode_contents(), tags=inline_tags,
                        attributes=inline_attrs, strip=True, protocols=['http', 'https']
                    ).strip()
                    if li_inner and len(li.get_text(strip=True)) > 10:
                        items.append(f'<li>{li_inner}</li>')
                if items:
                    html_parts.append(f'<{tag}>{"".join(items)}</{tag}>')
        # Images from readability
        junk_img_re = re.compile(r'(logo|icon|pixel|spacer|blank|1x1|svg|avatar|spinner|/ct/|cts\.businesswire)', re.I)
        seen_srcs = set()  # dedupe image URLs
        article_images = []
        for img in reader_soup.find_all('img'):
            src = img.get('src', '')
            if src and src.startswith(('http://', 'https://')) and src not in seen_srcs:
                if junk_img_re.search(src):
                    continue
                seen_srcs.add(src)
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')
        # If readability found no images, grab first real image from original HTML
        if not article_images:
            orig_soup = BeautifulSoup(raw_html, 'html.parser')
            # Remove non-content areas so we don't pick up chrome images
            for noise in orig_soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                             'aside', 'form', 'noscript', 'svg']):
                noise.decompose()
            for img in orig_soup.find_all('img'):
                # Lazy-load attributes take priority over plain src
                src = (img.get('data-src') or img.get('data-lazy-src') or
                       img.get('data-original') or img.get('src') or '')
                if not src or not src.startswith(('http://', 'https://')):
                    continue
                src_lower = src.lower()
                if any(x in src_lower for x in ('logo', 'icon', 'pixel', 'spacer', 'blank',
                                                '1x1', 'svg', 'avatar', 'spinner', '/ct/')):
                    continue
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')
                break  # Only first real image
        # Merge text + images: spread images evenly through the paragraphs
        if article_images and html_parts:
            text_count = len(html_parts)
            img_count = len(article_images)
            interval = max(1, text_count // (img_count + 1))
            merged = []
            img_idx = 0
            for i, part in enumerate(html_parts):
                merged.append(part)
                if img_idx < img_count and (i + 1) % interval == 0:
                    merged.append(article_images[img_idx])
                    img_idx += 1
            # Append any images that didn't fit the interval pattern
            while img_idx < img_count:
                merged.append(article_images[img_idx])
                img_idx += 1
            html_parts = merged
        elif article_images and not html_parts:
            html_parts = article_images
        # Last resort: split the plain text into paragraphs
        if not html_parts:
            text = reader_soup.get_text(separator='\n\n', strip=True)
            if text:
                for para in text.split('\n\n'):
                    para = para.strip()
                    if len(para) > 30:
                        html_parts.append(f'<p>{bleach.clean(para)}</p>')
        if not html_parts:
            return (None, og_image)
        # Quality check: drop parts whose average "word" length suggests
        # minified JS/CSS or base64 junk rather than prose
        from bs4 import BeautifulSoup as BS
        clean_parts = []
        for part in html_parts:
            part_soup = BS(part, 'html.parser')
            part_text = part_soup.get_text(strip=True)
            if len(part_text) > 100:
                words = part_text.split()
                avg_word_len = len(part_text) / max(len(words), 1)
                if avg_word_len > 12:
                    continue
            clean_parts.append(part)
        if not clean_parts:
            return (None, og_image)
        result = '\n'.join(clean_parts)
        plain_text = BS(result, 'html.parser').get_text(separator=' ', strip=True)
        # Whole-article sanity check: cookie banners, login widgets,
        # US-state dropdown lists, etc. indicate a failed extraction
        garbage_re = re.compile(
            r'(use (left|right|escape)|arrow keys|navigate between|'
            r'sign (in|up) with|we won.t post|social account|'
            r'accept cookies|cookie policy|privacy policy|terms of (use|service)|'
            r'AlabamaAlaska|CaliforniaColorado|United States of America)',
            re.I
        )
        if len(plain_text) < 200 or garbage_re.search(plain_text):
            return (None, og_image)
        return (result, og_image)
    except Exception:
        return (None, None)
def main():
    """Backfill press articles for every configured celebrity.

    For each celebrity, queries Google News RSS in 1-week windows going
    back WEEKS_BACK weeks, resolves redirect URLs to real article URLs,
    extracts and caches content/images, and inserts new rows into
    press_articles. Deduplicates on URL hash and (celebrity_id, title).
    """
    # Hoisted out of the per-article inner loop
    import re
    import psycopg2

    # Compiled once instead of per article
    strip_tags_re = re.compile(r'<[^>]+>')
    proxied_img_re = re.compile(r'<img\s+src="(/api/press/images/[^"]+)"')

    # Get configured celebrities
    env = os.environ.copy()
    env['PGPASSWORD'] = DB_PASSWORD
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAc', "SELECT celebrity_ids FROM press_config WHERE id = 1"],
        capture_output=True, text=True, env=env
    )
    celebrity_ids = json.loads(result.stdout.strip()) if result.stdout.strip() else []
    if not celebrity_ids:
        print("No celebrities configured in press_config")
        return
    # Get celebrity names. int() on every id also hardens the interpolated
    # IN (...) clause against non-numeric values in press_config.
    placeholders = ','.join(str(int(i)) for i in celebrity_ids)
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAF', '|', '-c', f"SELECT id, name FROM celebrity_profiles WHERE id IN ({placeholders})"],
        capture_output=True, text=True, env=env
    )
    celebrities = []
    for line in result.stdout.strip().splitlines():
        if '|' in line:
            # maxsplit=1 so a name containing '|' is not truncated
            id_part, name_part = line.split('|', 1)
            celebrities.append({'id': int(id_part), 'name': name_part.strip()})
    if not celebrities:
        print("No celebrities found")
        return
    # Get existing URL hashes for dedup
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAc', "SELECT url_hash FROM press_articles"],
        capture_output=True, text=True, env=env
    )
    existing_hashes = set(line.strip() for line in result.stdout.strip().splitlines() if line.strip())
    print(f"Existing articles: {len(existing_hashes)}")
    # Also get existing titles per celebrity for dedup
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAF', '|', '-c', "SELECT celebrity_id, title FROM press_articles"],
        capture_output=True, text=True, env=env
    )
    existing_titles = set()
    for line in result.stdout.strip().splitlines():
        if '|' in line:
            parts = line.split('|', 1)
            existing_titles.add((parts[0].strip(), parts[1].strip()))
    now = datetime.now()
    total_new = 0
    total_fetched = 0
    for celeb in celebrities:
        celeb_id = celeb['id']
        celeb_name = celeb['name']
        print(f"\n{'='*60}")
        print(f"Backfilling: {celeb_name} (id={celeb_id})")
        print(f"{'='*60}")
        celeb_new = 0
        # Query in 1-week windows going back
        for week in range(WEEKS_BACK):
            end_dt = now - timedelta(weeks=week)
            start_dt = now - timedelta(weeks=week + 1)
            start_str = start_dt.strftime('%Y-%m-%d')
            end_str = end_dt.strftime('%Y-%m-%d')
            week_label = f"Week -{week+1} ({start_dt.strftime('%b %d')} - {end_dt.strftime('%b %d')})"
            print(f"\n {week_label}...", end='', flush=True)
            articles = fetch_google_news_window(celeb_name, start_str, end_str)
            total_fetched += len(articles)
            if not articles:
                print(" no articles")
                continue
            # Warn if we hit the 100 cap (may be missing articles)
            cap_warning = " [HIT 100 CAP]" if len(articles) >= 100 else ""
            print(f" {len(articles)} found{cap_warning}", flush=True)
            week_new = 0
            for article in articles:
                google_url = article.get('url', '')
                if not google_url:
                    continue
                title = article.get('title', '').strip()
                if title and (str(celeb_id), title) in existing_titles:
                    continue
                # Only keep articles where celeb name appears in the title
                if not title or celeb_name.lower() not in title.lower():
                    continue
                # Decode Google News URL to real article URL
                article_url = decode_google_news_url(google_url)
                if not article_url:
                    continue
                # Parse the real URL once; used for both skip check and domain
                parsed = urlparse(article_url)
                host = parsed.netloc.lower()
                # Skip domains (or their subdomains) that are JS-rendered
                # or block scrapers
                if any(host == d or host.endswith('.' + d) for d in SKIP_DOMAINS):
                    continue
                url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest()
                if url_hash in existing_hashes:
                    continue
                domain = parsed.netloc.replace('www.', '')
                published_date = article.get('published_date', '')
                # Extract content and og:image (with rate limiting to be polite)
                content, og_image = extract_content(article_url)
                # Cache all inline images in the content to local proxy
                if content:
                    content = cache_content_images(content)
                if content:
                    # Plain-text snippet: strip tags, collapse whitespace
                    snippet = strip_tags_re.sub(' ', content)
                    snippet = ' '.join(snippet.split())[:300]
                else:
                    snippet = title[:300] if title else ''
                # Cache the og:image locally, fall back to first inline image
                image_url = cache_press_image(og_image) if og_image else None
                if not image_url and content:
                    m = proxied_img_re.search(content)
                    if m:
                        image_url = m.group(1)
                time.sleep(0.5)
                # Insert using parameterized query via psycopg2
                try:
                    pg_conn = psycopg2.connect(
                        host='localhost', user='media_downloader',
                        password=env.get('PGPASSWORD', ''), dbname='media_downloader'
                    )
                    try:
                        pg_cur = pg_conn.cursor()
                        pg_cur.execute("""INSERT INTO press_articles
                            (celebrity_id, title, url, url_hash, domain, published_date,
                             image_url, language, country, article_content, snippet, notified, read)
                            VALUES (%s, %s, %s, %s, %s, %s, %s, 'en', '', %s, %s, 1, 0)
                            ON CONFLICT DO NOTHING""",
                            (celeb_id, title, article_url, url_hash, domain,
                             published_date, image_url or '', content, snippet))
                        inserted = pg_cur.rowcount > 0
                        pg_conn.commit()
                        pg_cur.close()
                    finally:
                        # Always release the connection, even if execute raises
                        pg_conn.close()
                except Exception as db_err:
                    print(f" DB error: {db_err}")
                    inserted = False
                if inserted:
                    week_new += 1
                    existing_hashes.add(url_hash)
                    existing_titles.add((str(celeb_id), title))
            if week_new > 0:
                print(f" Added {week_new} new articles")
            celeb_new += week_new
            # Small delay between queries to be polite
            time.sleep(1)
        total_new += celeb_new
        print(f"\n {celeb_name}: {celeb_new} new articles added")
    print(f"\n{'='*60}")
    print(f"DONE: Fetched {total_fetched} total, added {total_new} new articles")
    print(f"{'='*60}")


if __name__ == '__main__':
    main()