#!/usr/bin/env python3
"""
Backfill press articles from Google News RSS for the last year.

Google News RSS:
- 100 articles per query (cap)
- No rate limiting, no API key needed
- ~12 months of history
- Strategy: 1-week windows to stay under the 100 cap
"""
import hashlib
import json
import os
import subprocess
import sys
import time
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from urllib.parse import urlparse

# Bootstrap database (project-local side-effect import)
sys.path.insert(0, '/opt/media-downloader')
import modules.db_bootstrap  # noqa: E402,F401
from modules.universal_logger import get_logger  # noqa: E402

logger = get_logger('PressBackfill')

# SECURITY NOTE(review): credentials hardcoded in source — should be moved to
# an environment variable or secrets store.
DB_PASSWORD = "PNsihOXvvuPwWiIvGlsc9Fh2YmMmB"
WEEKS_BACK = 52

# Domains that return no content even with FlareSolverr
SKIP_DOMAINS = {
    'msn.com',
    'news.google.com',
    'imdb.com',
    'st-aug.edu',
}


def fetch_google_news_window(name: str, start_date: str, end_date: str) -> list:
    """Fetch Google News RSS articles for a specific time window.

    Args:
        name: Celebrity/person name; quoted (%22) in the query so only
            exact-phrase matches are returned.
        start_date: Window start, YYYY-MM-DD (used as ``after:``).
        end_date: Window end, YYYY-MM-DD (used as ``before:``).

    Returns:
        List of dicts with: title, url, published_date, source.
        Empty list on repeated fetch failure.
    """
    # Manual encoding: %22 = double quote, '+' joins terms. Google News
    # accepts this form directly in the q= parameter.
    query = f'%22{name.replace(" ", "+")}%22+after:{start_date}+before:{end_date}'
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'

    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            with urllib.request.urlopen(req, timeout=30) as response:
                data = response.read().decode('utf-8')

            root = ET.fromstring(data)
            articles = []
            for item in root.findall('.//item'):
                title_el = item.find('title')
                link_el = item.find('link')
                pub_el = item.find('pubDate')
                source_el = item.find('source')
                if title_el is None or link_el is None:
                    continue

                title = title_el.text or ''
                # Google News titles often end with " - Source Name", strip it
                source_name = source_el.text if source_el is not None else ''
                if source_name and title.endswith(f' - {source_name}'):
                    title = title[:-len(f' - {source_name}')].strip()

                # Parse pubDate (RFC 2822 format); fall back to the raw
                # string if parsing fails so the value is never lost.
                published_date = ''
                if pub_el is not None and pub_el.text:
                    try:
                        from email.utils import parsedate_to_datetime
                        dt = parsedate_to_datetime(pub_el.text)
                        published_date = dt.isoformat()
                    except Exception:
                        published_date = pub_el.text

                articles.append({
                    'title': title,
                    'url': link_el.text or '',
                    'published_date': published_date,
                    'source': source_name,
                })
            return articles
        except Exception as e:
            # Transient network/parse errors: back off and retry twice.
            if attempt < 2:
                time.sleep(5)
                continue
            print(f"  Error fetching Google News: {e}")
            return []
    return []


PRESS_IMAGE_CACHE = '/opt/media-downloader/data/press_images'
os.makedirs(PRESS_IMAGE_CACHE, exist_ok=True)


def cache_press_image(image_url: str) -> str | None:
    """Download and cache an image locally.

    The cache key is the first 16 hex chars of the SHA-256 of the URL, so
    repeated calls for the same URL are served from disk.

    Returns:
        API proxy path (/api/press/images/<hash><ext>) or None on failure.
    """
    if not image_url:
        return None

    url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]

    # Check if already cached (any of the known extensions)
    for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
        cached = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
        if os.path.exists(cached) and os.path.getsize(cached) > 0:
            return f"/api/press/images/{url_hash}{ext}"

    # Download
    try:
        req = urllib.request.Request(image_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'image/*,*/*',
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            image_data = resp.read()
        # Anything under 1 KB is almost certainly a tracking pixel or an
        # error page, not a real image.
        if len(image_data) < 1000:
            return None
    except Exception:
        # Try via FlareSolverr — but it can't fetch binary, so try fetching
        # the page and extracting the image URL that works
        return None

    # Guess extension from the URL; default to .jpg.
    ext = '.jpg'
    url_lower = image_url.lower()
    if '.png' in url_lower:
        ext = '.png'
    elif '.webp' in url_lower:
        ext = '.webp'
    elif '.gif' in url_lower:
        ext = '.gif'

    cached_path = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
    with open(cached_path, 'wb') as f:
        f.write(image_data)
    return f"/api/press/images/{url_hash}{ext}"


def cache_content_images(html_content: str) -> str:
    """Find all <img> tags in HTML content, cache each image locally, and
    rewrite src to the /api/press/images/... proxy path.

    Removes img tags where caching fails (broken > missing).
    """
    if not html_content:
        return html_content

    import re as _re

    def _replace_img(match):
        full_tag = match.group(0)
        src = match.group(1)
        # Already proxied (or empty): leave untouched.
        if not src or src.startswith('/api/press/images/'):
            return full_tag
        cached = cache_press_image(src)
        if cached:
            return full_tag.replace(src, cached)
        return ''  # Remove img if caching failed

    # NOTE(review): original regex was truncated in transit; reconstructed to
    # match an <img> tag and capture its src attribute — verify against the
    # markup emitted by _parse_article_html.
    return _re.sub(r'<img\b[^>]*?\bsrc=["\']([^"\']+)["\'][^>]*>',
                   _replace_img, html_content)


def decode_google_news_url(google_url: str) -> str | None:
    """Decode a Google News redirect URL to the real article URL.

    Non-Google URLs are returned unchanged; decode failures return None.
    """
    if 'news.google.com' not in google_url:
        return google_url
    try:
        from googlenewsdecoder import gnewsdecoder
        result = gnewsdecoder(google_url, interval=1)
        if result.get('status'):
            return result['decoded_url']
    except Exception:
        pass
    return None


def extract_content(article_url: str) -> tuple[str | None, str | None]:
    """Extract article content and og:image from the real article URL.

    Tries direct fetch first, falls back to FlareSolverr for bot-protected
    sites.

    Returns:
        (content_html, image_url) — either element may be None.
    """
    content, image = _extract_content_direct(article_url)
    if content:
        return (content, image)
    # Fallback to FlareSolverr for bot-protected sites; keep the direct
    # fetch's og:image if the fallback found none.
    content2, image2 = _extract_content_flaresolverr(article_url)
    return (content2, image2 or image)


def _fetch_html_flaresolverr(url: str) -> str | None:
    """Fetch HTML via FlareSolverr (headless browser). None on failure."""
    try:
        import requests
        resp = requests.post('http://localhost:8191/v1', json={
            'cmd': 'request.get',
            'url': url,
            'maxTimeout': 30000
        }, timeout=45)
        data = resp.json()
        if data.get('status') == 'ok':
            html = data.get('solution', {}).get('response', '')
            # Tiny responses are challenge/error pages, not articles.
            if len(html) > 500:
                return html
    except Exception:
        pass
    return None


def _extract_content_flaresolverr(url: str) -> tuple[str | None, str | None]:
    """Extract content using FlareSolverr as fetcher."""
    raw_html = _fetch_html_flaresolverr(url)
    if not raw_html:
        return (None, None)
    return _parse_article_html(raw_html, url)


def _extract_content_direct(url: str) -> tuple[str | None, str | None]:
    """Self-contained article extraction. Returns (content_html, image_url)."""
    import urllib.request
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        with urllib.request.urlopen(req, timeout=20) as response:
            raw_html = response.read().decode('utf-8', errors='replace')
        return _parse_article_html(raw_html, url)
    except Exception:
        return (None, None)


def _parse_article_html(raw_html: str, url: str) -> tuple[str | None, str | None]:
    """Parse raw HTML into article content and og:image.

    Pipeline: og:image/twitter:image lookup -> readability summary ->
    per-element bleach sanitization -> image merge -> quality checks.

    Returns:
        (content_html, image_url) — (None, og_image) when no usable text.
    """
    import re
    from urllib.parse import urljoin  # noqa: F401 (kept from original)
    try:
        from readability import Document
        from bs4 import BeautifulSoup

        # Extract og:image for thumbnail
        og_soup = BeautifulSoup(raw_html, 'html.parser')
        og_image = None
        og_tag = og_soup.find('meta', property='og:image')
        if og_tag and og_tag.get('content'):
            og_image = og_tag['content']
        if not og_image:
            tw_tag = og_soup.find('meta', attrs={'name': 'twitter:image'})
            if tw_tag and tw_tag.get('content'):
                og_image = tw_tag['content']

        import bleach
        doc = Document(raw_html, url=url)
        content_html = doc.summary()
        if not content_html or len(content_html.strip()) < 50:
            return (None, og_image)

        reader_soup = BeautifulSoup(content_html, 'html.parser')

        # Boilerplate lines to drop (bylines, share prompts, sign-up nags).
        junk_text_re = re.compile(
            r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
            r'share or comment|share this article|comment on this|follow us on|'
            r'sign up for|subscribe to|have you got a story|tips@|email us)',
            re.I
        )
        inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
        inline_attrs = {'a': ['href']}

        html_parts = []
        for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
            text = el.get_text(strip=True)
            if len(text) < 30:
                continue
            if junk_text_re.search(text):
                continue
            tag = el.name
            inner = bleach.clean(
                el.decode_contents(),
                tags=inline_tags,
                attributes=inline_attrs,
                strip=True,
                protocols=['http', 'https']
            ).strip()
            if not inner:
                continue
            # NOTE(review): markup below reconstructed — the original
            # f-strings lost their HTML tags in transit.
            if tag == 'p':
                html_parts.append(f'<p>{inner}</p>')
            elif tag in ('h2', 'h3', 'h4'):
                html_parts.append(f'<{tag}>{inner}</{tag}>')
            elif tag == 'blockquote':
                html_parts.append(f'<blockquote>{inner}</blockquote>')
            elif tag in ('ul', 'ol'):
                items = []
                for li in el.find_all('li', recursive=False):
                    li_inner = bleach.clean(
                        li.decode_contents(),
                        tags=inline_tags,
                        attributes=inline_attrs,
                        strip=True,
                        protocols=['http', 'https']
                    ).strip()
                    if li_inner and len(li.get_text(strip=True)) > 10:
                        items.append(f'<li>{li_inner}</li>')
                if items:
                    html_parts.append(f'<{tag}>{"".join(items)}</{tag}>')

        # Images from readability
        junk_img_re = re.compile(
            r'(logo|icon|pixel|spacer|blank|1x1|svg|avatar|spinner|/ct/|cts\.businesswire)',
            re.I
        )
        seen_srcs = set()
        article_images = []
        for img in reader_soup.find_all('img'):
            src = img.get('src', '')
            if src and src.startswith(('http://', 'https://')) and src not in seen_srcs:
                if junk_img_re.search(src):
                    continue
                seen_srcs.add(src)
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{src}" alt="{bleach.clean(alt)}">')

        # If readability found no images, grab first real image from original HTML
        if not article_images:
            orig_soup = BeautifulSoup(raw_html, 'html.parser')
            for noise in orig_soup.find_all(['script', 'style', 'nav', 'footer',
                                             'header', 'aside', 'form', 'noscript', 'svg']):
                noise.decompose()
            for img in orig_soup.find_all('img'):
                # Lazy-load attributes take precedence over src.
                src = (img.get('data-src') or img.get('data-lazy-src')
                       or img.get('data-original') or img.get('src') or '')
                if not src or not src.startswith(('http://', 'https://')):
                    continue
                src_lower = src.lower()
                if any(x in src_lower for x in ('logo', 'icon', 'pixel', 'spacer',
                                                'blank', '1x1', 'svg', 'avatar',
                                                'spinner', '/ct/')):
                    continue
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{src}" alt="{bleach.clean(alt)}">')
                break  # Only first real image

        # Merge text + images: distribute images evenly through the text.
        if article_images and html_parts:
            text_count = len(html_parts)
            img_count = len(article_images)
            interval = max(1, text_count // (img_count + 1))
            merged = []
            img_idx = 0
            for i, part in enumerate(html_parts):
                merged.append(part)
                if img_idx < img_count and (i + 1) % interval == 0:
                    merged.append(article_images[img_idx])
                    img_idx += 1
            while img_idx < img_count:
                merged.append(article_images[img_idx])
                img_idx += 1
            html_parts = merged
        elif article_images and not html_parts:
            html_parts = article_images

        # Last resort: plain-text paragraphs from the readability summary.
        if not html_parts:
            text = reader_soup.get_text(separator='\n\n', strip=True)
            if text:
                for para in text.split('\n\n'):
                    para = para.strip()
                    if len(para) > 30:
                        html_parts.append(f'<p>{bleach.clean(para)}</p>')

        if not html_parts:
            return (None, og_image)

        # Quality check: drop parts whose average word length suggests
        # minified JS / base64 garbage rather than prose.
        from bs4 import BeautifulSoup as BS
        clean_parts = []
        for part in html_parts:
            part_soup = BS(part, 'html.parser')
            part_text = part_soup.get_text(strip=True)
            if len(part_text) > 100:
                words = part_text.split()
                avg_word_len = len(part_text) / max(len(words), 1)
                if avg_word_len > 12:
                    continue
            clean_parts.append(part)

        if not clean_parts:
            return (None, og_image)

        result = '\n'.join(clean_parts)
        plain_text = BS(result, 'html.parser').get_text(separator=' ', strip=True)
        # Whole-article garbage detectors: carousel hints, consent banners,
        # US-state dropdown lists that readability sometimes captures.
        garbage_re = re.compile(
            r'(use (left|right|escape)|arrow keys|navigate between|'
            r'sign (in|up) with|we won.t post|social account|'
            r'accept cookies|cookie policy|privacy policy|terms of (use|service)|'
            r'AlabamaAlaska|CaliforniaColorado|United States of America)',
            re.I
        )
        if len(plain_text) < 200 or garbage_re.search(plain_text):
            return (None, og_image)

        return (result, og_image)
    except Exception:
        return (None, None)


def main():
    """Backfill loop: for each configured celebrity, walk back WEEKS_BACK
    one-week windows, fetch/decode/extract each article, and insert new rows
    into press_articles."""
    # Get configured celebrities
    env = os.environ.copy()
    env['PGPASSWORD'] = DB_PASSWORD
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAc', "SELECT celebrity_ids FROM press_config WHERE id = 1"],
        capture_output=True, text=True, env=env
    )
    celebrity_ids = json.loads(result.stdout.strip()) if result.stdout.strip() else []
    if not celebrity_ids:
        print("No celebrities configured in press_config")
        return

    # Get celebrity names
    placeholders = ','.join(str(i) for i in celebrity_ids)
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAF', '|', '-c',
         f"SELECT id, name FROM celebrity_profiles WHERE id IN ({placeholders})"],
        capture_output=True, text=True, env=env
    )
    celebrities = []
    for line in result.stdout.strip().splitlines():
        if '|' in line:
            parts = line.split('|')
            celebrities.append({'id': int(parts[0]), 'name': parts[1].strip()})
    if not celebrities:
        print("No celebrities found")
        return

    # Get existing URL hashes for dedup
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAc', "SELECT url_hash FROM press_articles"],
        capture_output=True, text=True, env=env
    )
    existing_hashes = set(line.strip() for line in result.stdout.strip().splitlines()
                          if line.strip())
    print(f"Existing articles: {len(existing_hashes)}")

    # Also get existing titles per celebrity for dedup
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAF', '|', '-c', "SELECT celebrity_id, title FROM press_articles"],
        capture_output=True, text=True, env=env
    )
    existing_titles = set()
    for line in result.stdout.strip().splitlines():
        if '|' in line:
            parts = line.split('|', 1)
            existing_titles.add((parts[0].strip(), parts[1].strip()))

    now = datetime.now()
    total_new = 0
    total_fetched = 0

    for celeb in celebrities:
        celeb_id = celeb['id']
        celeb_name = celeb['name']
        print(f"\n{'='*60}")
        print(f"Backfilling: {celeb_name} (id={celeb_id})")
        print(f"{'='*60}")
        celeb_new = 0

        # Query in 1-week windows going back
        for week in range(WEEKS_BACK):
            end_dt = now - timedelta(weeks=week)
            start_dt = now - timedelta(weeks=week + 1)
            start_str = start_dt.strftime('%Y-%m-%d')
            end_str = end_dt.strftime('%Y-%m-%d')
            week_label = (f"Week -{week+1} ({start_dt.strftime('%b %d')} - "
                          f"{end_dt.strftime('%b %d')})")
            print(f"\n  {week_label}...", end='', flush=True)

            articles = fetch_google_news_window(celeb_name, start_str, end_str)
            total_fetched += len(articles)
            if not articles:
                print(" no articles")
                continue

            # Warn if we hit the 100 cap (may be missing articles)
            cap_warning = " [HIT 100 CAP]" if len(articles) >= 100 else ""
            print(f" {len(articles)} found{cap_warning}", flush=True)

            week_new = 0
            for article in articles:
                google_url = article.get('url', '')
                if not google_url:
                    continue
                title = article.get('title', '').strip()
                if title and (str(celeb_id), title) in existing_titles:
                    continue
                # Only keep articles where celeb name appears in the title
                if not title or celeb_name.lower() not in title.lower():
                    continue

                # Decode Google News URL to real article URL
                article_url = decode_google_news_url(google_url)
                if not article_url:
                    continue

                # Skip domains that are JS-rendered or block scrapers.
                # Check if host or any parent domain is in SKIP_DOMAINS.
                parsed_check = urlparse(article_url)
                host = parsed_check.netloc.lower()
                if any(host == d or host.endswith('.' + d) for d in SKIP_DOMAINS):
                    continue

                url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest()
                if url_hash in existing_hashes:
                    continue

                # Parse domain from real URL
                parsed = urlparse(article_url)
                domain = parsed.netloc.replace('www.', '')
                published_date = article.get('published_date', '')
                source = article.get('source', '')

                # Extract content and og:image (with rate limiting to be polite)
                content, og_image = extract_content(article_url)

                # Cache all inline images in the content to local proxy
                if content:
                    content = cache_content_images(content)

                if content:
                    import re as _re3
                    snippet = _re3.sub(r'<[^>]+>', ' ', content)
                    snippet = ' '.join(snippet.split())[:300]
                else:
                    snippet = title[:300] if title else ''

                # Cache the og:image locally, fall back to first inline image
                image_url = cache_press_image(og_image) if og_image else None
                if not image_url and content:
                    import re as _re2
                    # NOTE(review): this regex was lost in transit;
                    # reconstructed to pick the first already-proxied inline
                    # image out of the cached content — confirm against the
                    # original.
                    m = _re2.search(r'<img[^>]*src="(/api/press/images/[^"]+)"',
                                    content)
                    if m:
                        image_url = m.group(1)

                # Insert the article row.
                # NOTE(review): the original INSERT block was lost in
                # transit; reconstructed from the surviving fragment
                # ("... > 0 / pg_conn.commit() / pg_cur.close()").
                # TODO: verify column list against the press_articles schema.
                inserted = False
                try:
                    import psycopg2
                    pg_conn = psycopg2.connect(
                        host='localhost', user='media_downloader',
                        password=DB_PASSWORD, dbname='media_downloader'
                    )
                    pg_cur = pg_conn.cursor()
                    pg_cur.execute(
                        """INSERT INTO press_articles
                               (celebrity_id, title, url, url_hash, source,
                                domain, published_date, snippet, content,
                                image_url)
                           VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                           ON CONFLICT (url_hash) DO NOTHING""",
                        (celeb_id, title, article_url, url_hash, source,
                         domain, published_date, snippet, content, image_url)
                    )
                    inserted = pg_cur.rowcount > 0
                    pg_conn.commit()
                    pg_cur.close()
                    pg_conn.close()
                except Exception as db_err:
                    print(f"    DB error: {db_err}")
                    inserted = False

                if inserted:
                    week_new += 1
                    existing_hashes.add(url_hash)
                    existing_titles.add((str(celeb_id), title))

            if week_new > 0:
                print(f"    Added {week_new} new articles")
                celeb_new += week_new

            # Small delay between queries to be polite
            time.sleep(1)

        total_new += celeb_new
        print(f"\n  {celeb_name}: {celeb_new} new articles added")

    print(f"\n{'='*60}")
    print(f"DONE: Fetched {total_fetched} total, added {total_new} new articles")
    print(f"{'='*60}")


if __name__ == '__main__':
    main()