#!/usr/bin/env python3
"""
Backfill press articles from Google News RSS for the last year.
Google News RSS:
- 100 articles per query (cap)
- No rate limiting, no API key needed
- ~12 months of history
- Strategy: 1-week windows to stay under the 100 cap
"""
import hashlib
import json
import os
import subprocess
import sys
import time
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from urllib.parse import urlparse
# Bootstrap database
sys.path.insert(0, '/opt/media-downloader')
import modules.db_bootstrap # noqa: E402,F401
from modules.universal_logger import get_logger
logger = get_logger('PressBackfill')
# NOTE(review): hardcoded DB credential committed in source — should come from an
# env var / secret store; rotate this password. TODO confirm with ops.
DB_PASSWORD = "PNsihOXvvuPwWiIvGlsc9Fh2YmMmB"
# Number of 1-week backfill windows to walk backwards (~12 months, the
# approximate history limit of Google News RSS per the module docstring).
WEEKS_BACK = 52
# Domains that return no content even with FlareSolverr
SKIP_DOMAINS = {
'msn.com',
'news.google.com',
'imdb.com',
'st-aug.edu',
}
def fetch_google_news_window(name: str, start_date: str, end_date: str) -> list:
    """Fetch Google News RSS articles for a specific time window.

    Args:
        name: Subject name; embedded as an exact-phrase (%22-quoted) query.
        start_date: Window start, YYYY-MM-DD (Google 'after:' operator).
        end_date: Window end, YYYY-MM-DD (Google 'before:' operator).

    Returns:
        List of dicts with keys: title, url, published_date, source.
        Empty list if all 3 fetch attempts fail.
    """
    # Hoisted out of the per-item loop (original re-imported it per article).
    from email.utils import parsedate_to_datetime

    query = f'%22{name.replace(" ", "+")}%22+after:{start_date}+before:{end_date}'
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'
    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            with urllib.request.urlopen(req, timeout=30) as response:
                data = response.read().decode('utf-8')
            root = ET.fromstring(data)
            articles = []
            for item in root.findall('.//item'):
                title_el = item.find('title')
                link_el = item.find('link')
                pub_el = item.find('pubDate')
                source_el = item.find('source')
                if title_el is None or link_el is None:
                    continue
                title = title_el.text or ''
                # Google News titles often end with " - Source Name", strip it
                source_name = source_el.text if source_el is not None else ''
                if source_name and title.endswith(f' - {source_name}'):
                    title = title[:-len(f' - {source_name}')].strip()
                # Parse pubDate (RFC 2822 format); keep the raw string on failure
                published_date = ''
                if pub_el is not None and pub_el.text:
                    try:
                        published_date = parsedate_to_datetime(pub_el.text).isoformat()
                    except Exception:
                        published_date = pub_el.text
                articles.append({
                    'title': title,
                    'url': link_el.text or '',
                    'published_date': published_date,
                    'source': source_name,
                })
            return articles
        except Exception as e:
            if attempt < 2:
                time.sleep(5)  # brief backoff before retrying
                continue
            print(f" Error fetching Google News: {e}")
            return []
    return []  # defensive: the loop always returns, but keep a safe fallback
# Local cache directory for article images, served back via /api/press/images/<hash><ext>
PRESS_IMAGE_CACHE = '/opt/media-downloader/data/press_images'
os.makedirs(PRESS_IMAGE_CACHE, exist_ok=True)  # import-time side effect: ensure dir exists before any caching
def cache_press_image(image_url: str) -> str | None:
"""Download and cache an image locally. Returns API path."""
if not image_url:
return None
url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]
# Check if already cached
for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
cached = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
if os.path.exists(cached) and os.path.getsize(cached) > 0:
return f"/api/press/images/{url_hash}{ext}"
# Download
try:
req = urllib.request.Request(image_url, headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'image/*,*/*',
})
with urllib.request.urlopen(req, timeout=15) as resp:
image_data = resp.read()
if len(image_data) < 1000:
return None
except Exception:
# Try via FlareSolverr — but it can't fetch binary, so try fetching
# the page and extracting the image URL that works
return None
ext = '.jpg'
url_lower = image_url.lower()
if '.png' in url_lower:
ext = '.png'
elif '.webp' in url_lower:
ext = '.webp'
elif '.gif' in url_lower:
ext = '.gif'
cached_path = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
with open(cached_path, 'wb') as f:
f.write(image_data)
return f"/api/press/images/{url_hash}{ext}"
def cache_content_images(html_content: str) -> str:
    """Find all <img> tags in HTML content, cache each image locally,
    and rewrite src to the /api/press/images/... proxy path.

    Img tags whose image cannot be cached are removed entirely
    (a missing image is better than a broken one).

    NOTE(review): the original regex literal here was corrupted in the
    source; reconstructed to match <img ... src="..."> with either quote
    style — verify against VCS history.
    """
    if not html_content:
        return html_content
    import re as _re

    def _replace_img(match):
        full_tag = match.group(0)
        src = match.group(1)
        # Leave empty srcs and already-proxied images untouched.
        if not src or src.startswith('/api/press/images/'):
            return full_tag
        cached = cache_press_image(src)
        if cached:
            return full_tag.replace(src, cached)
        return ''  # Remove img if caching failed

    return _re.sub(
        r'<img\b[^>]*?\bsrc\s*=\s*["\']([^"\']*)["\'][^>]*>',
        _replace_img,
        html_content,
    )
def decode_google_news_url(google_url: str) -> str | None:
"""Decode a Google News redirect URL to the real article URL."""
if 'news.google.com' not in google_url:
return google_url
try:
from googlenewsdecoder import gnewsdecoder
result = gnewsdecoder(google_url, interval=1)
if result.get('status'):
return result['decoded_url']
except Exception:
pass
return None
def extract_content(article_url: str) -> tuple[str | None, str | None]:
    """Extract article body HTML and og:image URL from the real article URL.

    Tries a plain HTTP fetch first; when that yields no content, retries via
    FlareSolverr for bot-protected sites. Returns (content_html, image_url).
    """
    direct_content, direct_image = _extract_content_direct(article_url)
    if direct_content:
        return (direct_content, direct_image)
    # Fallback path; keep the direct fetch's image if FlareSolverr found none.
    fs_content, fs_image = _extract_content_flaresolverr(article_url)
    return (fs_content, fs_image or direct_image)
def _fetch_html_flaresolverr(url: str) -> str | None:
    """Fetch a page's HTML through the local FlareSolverr headless browser.

    Returns the HTML string, or None on any failure or a trivially short
    (< ~500 chars) response.
    """
    try:
        import requests
        reply = requests.post(
            'http://localhost:8191/v1',
            json={'cmd': 'request.get', 'url': url, 'maxTimeout': 30000},
            timeout=45,
        )
        payload = reply.json()
        if payload.get('status') == 'ok':
            html = payload.get('solution', {}).get('response', '')
            if len(html) > 500:
                return html
    except Exception:
        pass
    return None
def _extract_content_flaresolverr(url: str) -> tuple[str | None, str | None]:
    """Extract article content using FlareSolverr as the page fetcher."""
    html = _fetch_html_flaresolverr(url)
    return _parse_article_html(html, url) if html else (None, None)
def _extract_content_direct(url: str) -> tuple[str | None, str | None]:
    """Fetch a URL directly and extract (content_html, image_url).

    Returns (None, None) on any network or decode failure.
    """
    import urllib.request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=20) as reply:
            page = reply.read().decode('utf-8', errors='replace')
    except Exception:
        return (None, None)
    # _parse_article_html never raises (it catches internally), so calling it
    # outside the try preserves the original behavior.
    return _parse_article_html(page, url)
def _parse_article_html(raw_html: str, url: str) -> tuple[str | None, str | None]:
"""Parse raw HTML into article content and og:image. Returns (content_html, image_url)."""
import re
from urllib.parse import urljoin
try:
from readability import Document
from bs4 import BeautifulSoup
# Extract og:image for thumbnail
og_soup = BeautifulSoup(raw_html, 'html.parser')
og_image = None
og_tag = og_soup.find('meta', property='og:image')
if og_tag and og_tag.get('content'):
og_image = og_tag['content']
if not og_image:
tw_tag = og_soup.find('meta', attrs={'name': 'twitter:image'})
if tw_tag and tw_tag.get('content'):
og_image = tw_tag['content']
import bleach
doc = Document(raw_html, url=url)
content_html = doc.summary()
if not content_html or len(content_html.strip()) < 50:
return (None, og_image)
reader_soup = BeautifulSoup(content_html, 'html.parser')
junk_text_re = re.compile(
r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
r'share or comment|share this article|comment on this|follow us on|'
r'sign up for|subscribe to|have you got a story|tips@|email us)',
re.I
)
inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
inline_attrs = {'a': ['href']}
html_parts = []
for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
text = el.get_text(strip=True)
if len(text) < 30:
continue
if junk_text_re.search(text):
continue
tag = el.name
inner = bleach.clean(
el.decode_contents(), tags=inline_tags,
attributes=inline_attrs, strip=True, protocols=['http', 'https']
).strip()
if not inner:
continue
if tag == 'p':
html_parts.append(f'
{inner}
') elif tag in ('h2', 'h3', 'h4'): html_parts.append(f'<{tag}>{inner}{tag}>') elif tag == 'blockquote': html_parts.append(f'') elif tag in ('ul', 'ol'): items = [] for li in el.find_all('li', recursive=False): li_inner = bleach.clean( li.decode_contents(), tags=inline_tags, attributes=inline_attrs, strip=True, protocols=['http', 'https'] ).strip() if li_inner and len(li.get_text(strip=True)) > 10: items.append(f'{inner}
{bleach.clean(para)}
') if not html_parts: return (None, og_image) # Quality check from bs4 import BeautifulSoup as BS clean_parts = [] for part in html_parts: part_soup = BS(part, 'html.parser') part_text = part_soup.get_text(strip=True) if len(part_text) > 100: words = part_text.split() avg_word_len = len(part_text) / max(len(words), 1) if avg_word_len > 12: continue clean_parts.append(part) if not clean_parts: return (None, og_image) result = '\n'.join(clean_parts) plain_text = BS(result, 'html.parser').get_text(separator=' ', strip=True) garbage_re = re.compile( r'(use (left|right|escape)|arrow keys|navigate between|' r'sign (in|up) with|we won.t post|social account|' r'accept cookies|cookie policy|privacy policy|terms of (use|service)|' r'AlabamaAlaska|CaliforniaColorado|United States of America)', re.I ) if len(plain_text) < 200 or garbage_re.search(plain_text): return (None, og_image) return (result, og_image) except Exception: return (None, None) def main(): # Get configured celebrities env = os.environ.copy() env['PGPASSWORD'] = DB_PASSWORD result = subprocess.run( ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader', '-tAc', "SELECT celebrity_ids FROM press_config WHERE id = 1"], capture_output=True, text=True, env=env ) celebrity_ids = json.loads(result.stdout.strip()) if result.stdout.strip() else [] if not celebrity_ids: print("No celebrities configured in press_config") return # Get celebrity names placeholders = ','.join(str(i) for i in celebrity_ids) result = subprocess.run( ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader', '-tAF', '|', '-c', f"SELECT id, name FROM celebrity_profiles WHERE id IN ({placeholders})"], capture_output=True, text=True, env=env ) celebrities = [] for line in result.stdout.strip().splitlines(): if '|' in line: parts = line.split('|') celebrities.append({'id': int(parts[0]), 'name': parts[1].strip()}) if not celebrities: print("No celebrities found") return # Get existing URL hashes for 
dedup result = subprocess.run( ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader', '-tAc', "SELECT url_hash FROM press_articles"], capture_output=True, text=True, env=env ) existing_hashes = set(line.strip() for line in result.stdout.strip().splitlines() if line.strip()) print(f"Existing articles: {len(existing_hashes)}") # Also get existing titles per celebrity for dedup result = subprocess.run( ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader', '-tAF', '|', '-c', "SELECT celebrity_id, title FROM press_articles"], capture_output=True, text=True, env=env ) existing_titles = set() for line in result.stdout.strip().splitlines(): if '|' in line: parts = line.split('|', 1) existing_titles.add((parts[0].strip(), parts[1].strip())) now = datetime.now() total_new = 0 total_fetched = 0 for celeb in celebrities: celeb_id = celeb['id'] celeb_name = celeb['name'] print(f"\n{'='*60}") print(f"Backfilling: {celeb_name} (id={celeb_id})") print(f"{'='*60}") celeb_new = 0 # Query in 1-week windows going back for week in range(WEEKS_BACK): end_dt = now - timedelta(weeks=week) start_dt = now - timedelta(weeks=week + 1) start_str = start_dt.strftime('%Y-%m-%d') end_str = end_dt.strftime('%Y-%m-%d') week_label = f"Week -{week+1} ({start_dt.strftime('%b %d')} - {end_dt.strftime('%b %d')})" print(f"\n {week_label}...", end='', flush=True) articles = fetch_google_news_window(celeb_name, start_str, end_str) total_fetched += len(articles) if not articles: print(f" no articles") continue # Warn if we hit the 100 cap (may be missing articles) cap_warning = " [HIT 100 CAP]" if len(articles) >= 100 else "" print(f" {len(articles)} found{cap_warning}", flush=True) week_new = 0 for article in articles: google_url = article.get('url', '') if not google_url: continue title = article.get('title', '').strip() if title and (str(celeb_id), title) in existing_titles: continue # Only keep articles where celeb name appears in the title if not title 
or celeb_name.lower() not in title.lower(): continue # Decode Google News URL to real article URL article_url = decode_google_news_url(google_url) if not article_url: continue # Skip domains that are JS-rendered or block scrapers parsed_check = urlparse(article_url) host = parsed_check.netloc.lower() # Check if host or any parent domain is in SKIP_DOMAINS if any(host == d or host.endswith('.' + d) for d in SKIP_DOMAINS): continue url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest() if url_hash in existing_hashes: continue # Parse domain from real URL parsed = urlparse(article_url) domain = parsed.netloc.replace('www.', '') published_date = article.get('published_date', '') source = article.get('source', '') # Extract content and og:image (with rate limiting to be polite) content, og_image = extract_content(article_url) # Cache all inline images in the content to local proxy if content: content = cache_content_images(content) if content: import re as _re3 snippet = _re3.sub(r'<[^>]+>', ' ', content) snippet = ' '.join(snippet.split())[:300] else: snippet = title[:300] if title else '' # Cache the og:image locally, fall back to first inline image image_url = cache_press_image(og_image) if og_image else None if not image_url and content: import re as _re2 m = _re2.search(r'