594
scripts/backfill_press.py
Normal file
594
scripts/backfill_press.py
Normal file
@@ -0,0 +1,594 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Backfill press articles from Google News RSS for the last year.
|
||||
|
||||
Google News RSS:
|
||||
- 100 articles per query (cap)
|
||||
- No rate limiting, no API key needed
|
||||
- ~12 months of history
|
||||
- Strategy: 1-week windows to stay under the 100 cap
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
import xml.etree.ElementTree as ET
|
||||
from datetime import datetime, timedelta
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Bootstrap database
|
||||
sys.path.insert(0, '/opt/media-downloader')
|
||||
import modules.db_bootstrap # noqa: E402,F401
|
||||
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('PressBackfill')
|
||||
|
||||
# NOTE(review): hardcoded database credential — should be loaded from the
# environment or a secrets store instead of living in the script. TODO confirm
# with deploy setup before changing.
DB_PASSWORD = "PNsihOXvvuPwWiIvGlsc9Fh2YmMmB"
# How far back to backfill, in 1-week query windows (~12 months of history,
# matching what Google News RSS retains).
WEEKS_BACK = 52

# Domains that return no content even with FlareSolverr
SKIP_DOMAINS = {
    'msn.com',
    'news.google.com',
    'imdb.com',
    'st-aug.edu',
}
|
||||
|
||||
|
||||
def fetch_google_news_window(name: str, start_date: str, end_date: str) -> list:
    """Fetch Google News RSS articles for a specific time window.

    Args:
        name: Celebrity name; sent as an exact-phrase (%22-quoted) query.
        start_date: Window start, YYYY-MM-DD (Google `after:` operator).
        end_date: Window end, YYYY-MM-DD (Google `before:` operator).

    Returns:
        List of dicts with keys: title, url, published_date, source.
        Empty list after 3 failed attempts (5 s backoff between retries).
    """
    # Hoisted out of the per-item loop: was previously re-imported for
    # every <item> element in the feed.
    from email.utils import parsedate_to_datetime

    query = f'%22{name.replace(" ", "+")}%22+after:{start_date}+before:{end_date}'
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'

    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            with urllib.request.urlopen(req, timeout=30) as response:
                data = response.read().decode('utf-8')

            root = ET.fromstring(data)
            articles = []
            for item in root.findall('.//item'):
                title_el = item.find('title')
                link_el = item.find('link')
                pub_el = item.find('pubDate')
                source_el = item.find('source')

                if title_el is None or link_el is None:
                    continue

                title = title_el.text or ''
                # Google News titles often end with " - Source Name", strip it
                source_name = source_el.text if source_el is not None else ''
                if source_name and title.endswith(f' - {source_name}'):
                    title = title[:-len(f' - {source_name}')].strip()

                # Parse pubDate (RFC 2822 format) into ISO 8601; fall back to
                # the raw string if it doesn't parse.
                published_date = ''
                if pub_el is not None and pub_el.text:
                    try:
                        dt = parsedate_to_datetime(pub_el.text)
                        published_date = dt.isoformat()
                    except Exception:
                        published_date = pub_el.text

                articles.append({
                    'title': title,
                    'url': link_el.text or '',
                    'published_date': published_date,
                    'source': source_name,
                })
            return articles
        except Exception as e:
            if attempt < 2:
                time.sleep(5)  # brief backoff before the next attempt
                continue
            print(f" Error fetching Google News: {e}")
            return []
    return []
|
||||
|
||||
|
||||
# Local cache directory for downloaded press images; files here are served
# back to clients via the /api/press/images/ proxy paths built below.
PRESS_IMAGE_CACHE = '/opt/media-downloader/data/press_images'
os.makedirs(PRESS_IMAGE_CACHE, exist_ok=True)
|
||||
|
||||
|
||||
def cache_press_image(image_url: str) -> str | None:
|
||||
"""Download and cache an image locally. Returns API path."""
|
||||
if not image_url:
|
||||
return None
|
||||
|
||||
url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]
|
||||
|
||||
# Check if already cached
|
||||
for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
|
||||
cached = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
|
||||
if os.path.exists(cached) and os.path.getsize(cached) > 0:
|
||||
return f"/api/press/images/{url_hash}{ext}"
|
||||
|
||||
# Download
|
||||
try:
|
||||
req = urllib.request.Request(image_url, headers={
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
||||
'Accept': 'image/*,*/*',
|
||||
})
|
||||
with urllib.request.urlopen(req, timeout=15) as resp:
|
||||
image_data = resp.read()
|
||||
if len(image_data) < 1000:
|
||||
return None
|
||||
except Exception:
|
||||
# Try via FlareSolverr — but it can't fetch binary, so try fetching
|
||||
# the page and extracting the image URL that works
|
||||
return None
|
||||
|
||||
ext = '.jpg'
|
||||
url_lower = image_url.lower()
|
||||
if '.png' in url_lower:
|
||||
ext = '.png'
|
||||
elif '.webp' in url_lower:
|
||||
ext = '.webp'
|
||||
elif '.gif' in url_lower:
|
||||
ext = '.gif'
|
||||
|
||||
cached_path = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
|
||||
with open(cached_path, 'wb') as f:
|
||||
f.write(image_data)
|
||||
return f"/api/press/images/{url_hash}{ext}"
|
||||
|
||||
|
||||
def cache_content_images(html_content: str) -> str:
    """Find all <img ...> tags in HTML content, cache each image locally,
    and rewrite src to the /api/press/images/... proxy path.

    Img tags whose image cannot be cached are removed entirely (a missing
    image beats a broken one); tags already pointing at the proxy are left
    untouched.
    """
    if not html_content:
        return html_content
    import re as _re

    def _replace_img(match):
        full_tag = match.group(0)
        src = match.group(1)
        if not src or src.startswith('/api/press/images/'):
            return full_tag
        cached = cache_press_image(src)
        if cached:
            return full_tag.replace(src, cached)
        return ''  # Remove img if caching failed

    # Fix: the previous pattern (<img\s+src=") only matched tags where src was
    # the FIRST attribute; <img class="x" src="..."> was silently skipped and
    # its remote src leaked through uncached. [^>]*? before \bsrc= handles
    # attributes in any order.
    return _re.sub(r'<img\b[^>]*?\bsrc="([^"]+)"[^>]*>', _replace_img, html_content)
|
||||
|
||||
|
||||
def decode_google_news_url(google_url: str) -> str | None:
|
||||
"""Decode a Google News redirect URL to the real article URL."""
|
||||
if 'news.google.com' not in google_url:
|
||||
return google_url
|
||||
try:
|
||||
from googlenewsdecoder import gnewsdecoder
|
||||
result = gnewsdecoder(google_url, interval=1)
|
||||
if result.get('status'):
|
||||
return result['decoded_url']
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def extract_content(article_url: str) -> tuple[str | None, str | None]:
    """Extract article content and og:image for *article_url*.

    A plain HTTP fetch is tried first; bot-protected sites fall back to
    FlareSolverr. If the fallback finds no image, the og:image from the
    direct attempt (if any) is kept.

    Returns (content_html, image_url).
    """
    direct_html, direct_image = _extract_content_direct(article_url)
    if direct_html:
        return (direct_html, direct_image)
    fallback_html, fallback_image = _extract_content_flaresolverr(article_url)
    return (fallback_html, fallback_image or direct_image)
|
||||
|
||||
|
||||
def _fetch_html_flaresolverr(url: str) -> str | None:
    """Fetch a page through the local FlareSolverr headless-browser service.

    Returns the rendered HTML, or None when the service is unreachable,
    reports a non-ok status, or returns a suspiciously short page
    (<= 500 chars, typically a challenge/error page).
    """
    try:
        import requests
        reply = requests.post(
            'http://localhost:8191/v1',
            json={'cmd': 'request.get', 'url': url, 'maxTimeout': 30000},
            timeout=45,
        ).json()
        if reply.get('status') == 'ok':
            html = reply.get('solution', {}).get('response', '')
            if len(html) > 500:
                return html
    except Exception:
        pass
    return None
|
||||
|
||||
|
||||
def _extract_content_flaresolverr(url: str) -> tuple[str | None, str | None]:
    """Extract article content using FlareSolverr as the HTML fetcher.

    Returns (content_html, image_url); (None, None) if the fetch failed.
    """
    rendered = _fetch_html_flaresolverr(url)
    if not rendered:
        return (None, None)
    return _parse_article_html(rendered, url)
|
||||
|
||||
|
||||
def _extract_content_direct(url: str) -> tuple[str | None, str | None]:
    """Fetch *url* with a plain HTTP request and parse the article.

    Returns (content_html, image_url); (None, None) on any network or
    decode failure. _parse_article_html handles its own errors, so only
    the fetch needs guarding here.
    """
    import urllib.request

    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        request = urllib.request.Request(url, headers=browser_headers)
        with urllib.request.urlopen(request, timeout=20) as reply:
            page = reply.read().decode('utf-8', errors='replace')
    except Exception:
        return (None, None)
    return _parse_article_html(page, url)
|
||||
|
||||
|
||||
def _parse_article_html(raw_html: str, url: str) -> tuple[str | None, str | None]:
    """Parse raw HTML into article content and og:image. Returns (content_html, image_url).

    Pipeline: pull og:image/twitter:image meta -> run readability to isolate
    the article body -> rebuild a sanitized subset of tags (bleach) while
    dropping boilerplate -> interleave any article images -> reject output
    that is too short or looks like navigation/consent garbage.

    Any exception anywhere in the pipeline yields (None, None).
    """
    import re
    # NOTE(review): urljoin is imported but never used in this function.
    from urllib.parse import urljoin

    try:

        from readability import Document
        from bs4 import BeautifulSoup

        # Extract og:image (or twitter:image fallback) for the thumbnail.
        og_soup = BeautifulSoup(raw_html, 'html.parser')
        og_image = None
        og_tag = og_soup.find('meta', property='og:image')
        if og_tag and og_tag.get('content'):
            og_image = og_tag['content']
        if not og_image:
            tw_tag = og_soup.find('meta', attrs={'name': 'twitter:image'})
            if tw_tag and tw_tag.get('content'):
                og_image = tw_tag['content']
        import bleach

        # Readability isolates the main article body from page chrome.
        doc = Document(raw_html, url=url)
        content_html = doc.summary()

        # Too little content: return the image anyway so the caller can use it.
        if not content_html or len(content_html.strip()) < 50:
            return (None, og_image)

        reader_soup = BeautifulSoup(content_html, 'html.parser')

        # Boilerplate lines to drop: bylines, photo credits, share/subscribe
        # prompts, tip-line calls to action.
        junk_text_re = re.compile(
            r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
            r'share or comment|share this article|comment on this|follow us on|'
            r'sign up for|subscribe to|have you got a story|tips@|email us)',
            re.I
        )

        # Only these inline tags (and only href on <a>) survive sanitization.
        inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
        inline_attrs = {'a': ['href']}

        # Rebuild the article block-by-block with sanitized inner HTML.
        html_parts = []
        for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
            text = el.get_text(strip=True)
            if len(text) < 30:  # skip stubs (captions, stray labels)
                continue
            if junk_text_re.search(text):
                continue
            tag = el.name
            inner = bleach.clean(
                el.decode_contents(), tags=inline_tags,
                attributes=inline_attrs, strip=True, protocols=['http', 'https']
            ).strip()
            if not inner:
                continue
            if tag == 'p':
                html_parts.append(f'<p>{inner}</p>')
            elif tag in ('h2', 'h3', 'h4'):
                html_parts.append(f'<{tag}>{inner}</{tag}>')
            elif tag == 'blockquote':
                html_parts.append(f'<blockquote><p>{inner}</p></blockquote>')
            elif tag in ('ul', 'ol'):
                # Sanitize each direct <li> separately; keep the list only if
                # at least one item survives.
                items = []
                for li in el.find_all('li', recursive=False):
                    li_inner = bleach.clean(
                        li.decode_contents(), tags=inline_tags,
                        attributes=inline_attrs, strip=True, protocols=['http', 'https']
                    ).strip()
                    if li_inner and len(li.get_text(strip=True)) > 10:
                        items.append(f'<li>{li_inner}</li>')
                if items:
                    html_parts.append(f'<{tag}>{"".join(items)}</{tag}>')

        # Images from readability output (deduped, junk/tracker srcs dropped).
        junk_img_re = re.compile(r'(logo|icon|pixel|spacer|blank|1x1|svg|avatar|spinner|/ct/|cts\.businesswire)', re.I)
        seen_srcs = set()
        article_images = []
        for img in reader_soup.find_all('img'):
            src = img.get('src', '')
            if src and src.startswith(('http://', 'https://')) and src not in seen_srcs:
                if junk_img_re.search(src):
                    continue
                seen_srcs.add(src)
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')

        # If readability found no images, grab first real image from original HTML
        # (checking lazy-load attributes before plain src).
        if not article_images:
            orig_soup = BeautifulSoup(raw_html, 'html.parser')
            for noise in orig_soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                             'aside', 'form', 'noscript', 'svg']):
                noise.decompose()
            for img in orig_soup.find_all('img'):
                src = (img.get('data-src') or img.get('data-lazy-src') or
                       img.get('data-original') or img.get('src') or '')
                if not src or not src.startswith(('http://', 'https://')):
                    continue
                src_lower = src.lower()
                if any(x in src_lower for x in ('logo', 'icon', 'pixel', 'spacer', 'blank',
                                                '1x1', 'svg', 'avatar', 'spinner', '/ct/')):
                    continue
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')
                break  # Only first real image

        # Merge text + images: spread images roughly evenly between text
        # blocks, appending any leftovers at the end.
        if article_images and html_parts:
            text_count = len(html_parts)
            img_count = len(article_images)
            interval = max(1, text_count // (img_count + 1))
            merged = []
            img_idx = 0
            for i, part in enumerate(html_parts):
                merged.append(part)
                if img_idx < img_count and (i + 1) % interval == 0:
                    merged.append(article_images[img_idx])
                    img_idx += 1
            while img_idx < img_count:
                merged.append(article_images[img_idx])
                img_idx += 1
            html_parts = merged
        elif article_images and not html_parts:
            html_parts = article_images

        # Last resort: fall back to plain-text paragraphs from the
        # readability output.
        if not html_parts:
            text = reader_soup.get_text(separator='\n\n', strip=True)
            if text:
                for para in text.split('\n\n'):
                    para = para.strip()
                    if len(para) > 30:
                        html_parts.append(f'<p>{bleach.clean(para)}</p>')

        if not html_parts:
            return (None, og_image)

        # Quality check: drop long blocks whose average "word" length is
        # implausibly high (usually minified JS or concatenated link lists).
        from bs4 import BeautifulSoup as BS
        clean_parts = []
        for part in html_parts:
            part_soup = BS(part, 'html.parser')
            part_text = part_soup.get_text(strip=True)
            if len(part_text) > 100:
                words = part_text.split()
                avg_word_len = len(part_text) / max(len(words), 1)
                if avg_word_len > 12:
                    continue
            clean_parts.append(part)

        if not clean_parts:
            return (None, og_image)

        result = '\n'.join(clean_parts)
        plain_text = BS(result, 'html.parser').get_text(separator=' ', strip=True)

        # Reject whole-article garbage: carousel controls, login/consent
        # prompts, and US-state link lists that readability sometimes keeps.
        garbage_re = re.compile(
            r'(use (left|right|escape)|arrow keys|navigate between|'
            r'sign (in|up) with|we won.t post|social account|'
            r'accept cookies|cookie policy|privacy policy|terms of (use|service)|'
            r'AlabamaAlaska|CaliforniaColorado|United States of America)',
            re.I
        )
        if len(plain_text) < 200 or garbage_re.search(plain_text):
            return (None, og_image)

        return (result, og_image)
    except Exception:
        return (None, None)
|
||||
|
||||
|
||||
def main():
    """Backfill press_articles from Google News for all configured celebrities.

    Flow: read config and dedup state via psql, then for each celebrity query
    Google News RSS in 1-week windows (WEEKS_BACK weeks back), extract and
    cache article content/images, and insert new rows through one shared
    psycopg2 connection.
    """
    # Hoisted to function scope: these were previously re-imported inside the
    # per-article hot loop (re three times, psycopg2 once per article).
    import re
    import psycopg2

    env = os.environ.copy()
    env['PGPASSWORD'] = DB_PASSWORD

    def _psql(args: list) -> str:
        """Run a psql command against media_downloader and return its stdout."""
        return subprocess.run(
            ['psql', '-h', 'localhost', '-U', 'media_downloader',
             '-d', 'media_downloader'] + args,
            capture_output=True, text=True, env=env
        ).stdout

    def _connect():
        """Open a psycopg2 connection to the media_downloader database."""
        return psycopg2.connect(
            host='localhost', user='media_downloader',
            password=env.get('PGPASSWORD', ''), dbname='media_downloader'
        )

    # Configured celebrity ids (stored as a JSON array in press_config row 1).
    raw = _psql(['-tAc', "SELECT celebrity_ids FROM press_config WHERE id = 1"]).strip()
    celebrity_ids = json.loads(raw) if raw else []
    if not celebrity_ids:
        print("No celebrities configured in press_config")
        return

    # Resolve ids to names. Ids are ints parsed from our own DB's JSON, so
    # interpolating them into the IN (...) clause is not attacker-controlled.
    placeholders = ','.join(str(i) for i in celebrity_ids)
    out = _psql(['-tAF', '|', '-c',
                 f"SELECT id, name FROM celebrity_profiles WHERE id IN ({placeholders})"])
    celebrities = []
    for line in out.strip().splitlines():
        if '|' in line:
            parts = line.split('|')
            celebrities.append({'id': int(parts[0]), 'name': parts[1].strip()})

    if not celebrities:
        print("No celebrities found")
        return

    # Existing URL hashes for dedup.
    out = _psql(['-tAc', "SELECT url_hash FROM press_articles"])
    existing_hashes = set(line.strip() for line in out.strip().splitlines() if line.strip())
    print(f"Existing articles: {len(existing_hashes)}")

    # Existing (celebrity_id, title) pairs — catches the same story
    # re-published under a different URL.
    out = _psql(['-tAF', '|', '-c', "SELECT celebrity_id, title FROM press_articles"])
    existing_titles = set()
    for line in out.strip().splitlines():
        if '|' in line:
            parts = line.split('|', 1)
            existing_titles.add((parts[0].strip(), parts[1].strip()))

    now = datetime.now()
    total_new = 0
    total_fetched = 0

    # One shared connection for the whole run (was: a fresh connection per
    # article, i.e. thousands of connect/close cycles).
    pg_conn = _connect()
    try:
        for celeb in celebrities:
            celeb_id = celeb['id']
            celeb_name = celeb['name']
            print(f"\n{'='*60}")
            print(f"Backfilling: {celeb_name} (id={celeb_id})")
            print(f"{'='*60}")

            celeb_new = 0

            # Query in 1-week windows going back
            for week in range(WEEKS_BACK):
                end_dt = now - timedelta(weeks=week)
                start_dt = now - timedelta(weeks=week + 1)

                start_str = start_dt.strftime('%Y-%m-%d')
                end_str = end_dt.strftime('%Y-%m-%d')
                week_label = f"Week -{week+1} ({start_dt.strftime('%b %d')} - {end_dt.strftime('%b %d')})"
                print(f"\n {week_label}...", end='', flush=True)

                articles = fetch_google_news_window(celeb_name, start_str, end_str)
                total_fetched += len(articles)

                if not articles:
                    print(" no articles")
                    continue

                # Warn if we hit the 100 cap (window results may be truncated)
                cap_warning = " [HIT 100 CAP]" if len(articles) >= 100 else ""
                print(f" {len(articles)} found{cap_warning}", flush=True)
                week_new = 0

                for article in articles:
                    google_url = article.get('url', '')
                    if not google_url:
                        continue

                    title = article.get('title', '').strip()
                    if title and (str(celeb_id), title) in existing_titles:
                        continue

                    # Only keep articles where celeb name appears in the title
                    if not title or celeb_name.lower() not in title.lower():
                        continue

                    # Decode Google News URL to real article URL
                    article_url = decode_google_news_url(google_url)
                    if not article_url:
                        continue

                    # Skip domains (and their subdomains) that are JS-rendered
                    # or block scrapers
                    host = urlparse(article_url).netloc.lower()
                    if any(host == d or host.endswith('.' + d) for d in SKIP_DOMAINS):
                        continue

                    url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest()
                    if url_hash in existing_hashes:
                        continue

                    # Parse domain from real URL
                    domain = urlparse(article_url).netloc.replace('www.', '')
                    published_date = article.get('published_date', '')

                    # Extract content and og:image
                    content, og_image = extract_content(article_url)

                    # Cache all inline images in the content to local proxy
                    if content:
                        content = cache_content_images(content)

                    # Snippet: first 300 chars of tag-stripped content, or the
                    # title when extraction failed.
                    if content:
                        snippet = re.sub(r'<[^>]+>', ' ', content)
                        snippet = ' '.join(snippet.split())[:300]
                    else:
                        snippet = title[:300] if title else ''

                    # Cache the og:image locally, fall back to first inline image
                    image_url = cache_press_image(og_image) if og_image else None
                    if not image_url and content:
                        m = re.search(r'<img\s+src="(/api/press/images/[^"]+)"', content)
                        if m:
                            image_url = m.group(1)
                    time.sleep(0.5)  # rate limit: be polite to article hosts

                    inserted = False
                    try:
                        with pg_conn.cursor() as pg_cur:
                            pg_cur.execute("""INSERT INTO press_articles
                                (celebrity_id, title, url, url_hash, domain, published_date,
                                image_url, language, country, article_content, snippet, notified, read)
                                VALUES (%s, %s, %s, %s, %s, %s, %s, 'en', '', %s, %s, 1, 0)
                                ON CONFLICT DO NOTHING""",
                                (celeb_id, title, article_url, url_hash, domain,
                                 published_date, image_url or '', content, snippet))
                            inserted = pg_cur.rowcount > 0
                        pg_conn.commit()
                    except Exception as db_err:
                        print(f" DB error: {db_err}")
                        inserted = False
                        # Keep the shared connection usable for later inserts;
                        # reconnect if the connection itself died.
                        try:
                            pg_conn.rollback()
                        except Exception:
                            pg_conn = _connect()

                    if inserted:
                        week_new += 1
                        existing_hashes.add(url_hash)
                        existing_titles.add((str(celeb_id), title))

                if week_new > 0:
                    print(f" Added {week_new} new articles")
                    celeb_new += week_new

                # Small delay between queries to be polite
                time.sleep(1)

            total_new += celeb_new
            print(f"\n {celeb_name}: {celeb_new} new articles added")
    finally:
        pg_conn.close()

    print(f"\n{'='*60}")
    print(f"DONE: Fetched {total_fetched} total, added {total_new} new articles")
    print(f"{'='*60}")
|
||||
|
||||
|
||||
# Script entry point — run the full backfill when executed directly.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user