Files
media-downloader/scripts/backfill_press.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

595 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Backfill press articles from Google News RSS for the last year.
Google News RSS:
- 100 articles per query (cap)
- No rate limiting, no API key needed
- ~12 months of history
- Strategy: 1-week windows to stay under the 100 cap
"""
import hashlib
import json
import os
import subprocess
import sys
import time
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from urllib.parse import urlparse
# Bootstrap database
sys.path.insert(0, '/opt/media-downloader')
import modules.db_bootstrap # noqa: E402,F401
from modules.universal_logger import get_logger
logger = get_logger('PressBackfill')
# WARNING(security): database credential hardcoded in source; should be
# loaded from an environment variable or secrets store instead.
DB_PASSWORD = "PNsihOXvvuPwWiIvGlsc9Fh2YmMmB"
# Number of 1-week query windows to walk backwards (Google News RSS keeps
# roughly 12 months of history — see module docstring).
WEEKS_BACK = 52
# Domains that return no content even with FlareSolverr
SKIP_DOMAINS = {
    'msn.com',
    'news.google.com',
    'imdb.com',
    'st-aug.edu',
}
def fetch_google_news_window(name: str, start_date: str, end_date: str) -> list:
    """Fetch Google News RSS articles for a specific time window.

    Args:
        name: Person name; wrapped in quotes so Google matches the exact phrase.
        start_date: Window start, 'YYYY-MM-DD' (Google's ``after:`` operator).
        end_date: Window end, 'YYYY-MM-DD' (Google's ``before:`` operator).

    Returns:
        List of dicts with keys: title, url, published_date, source.
        Empty list if all 3 fetch attempts fail.
    """
    from email.utils import parsedate_to_datetime  # hoisted out of the item loop
    from urllib.parse import quote_plus

    # quote_plus() encodes spaces as '+' exactly like the old manual replace,
    # but also correctly escapes '&', '#', accents, etc. in names.
    query = f'%22{quote_plus(name)}%22+after:{start_date}+before:{end_date}'
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'
    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            with urllib.request.urlopen(req, timeout=30) as response:
                data = response.read().decode('utf-8')
            root = ET.fromstring(data)
            articles = []
            for item in root.findall('.//item'):
                title_el = item.find('title')
                link_el = item.find('link')
                pub_el = item.find('pubDate')
                source_el = item.find('source')
                if title_el is None or link_el is None:
                    continue
                title = title_el.text or ''
                # Google News titles often end with " - Source Name", strip it
                source_name = source_el.text if source_el is not None else ''
                suffix = f' - {source_name}'
                if source_name and title.endswith(suffix):
                    title = title[:-len(suffix)].strip()
                # Parse pubDate (RFC 2822 format) to ISO 8601; keep raw text on failure
                published_date = ''
                if pub_el is not None and pub_el.text:
                    try:
                        published_date = parsedate_to_datetime(pub_el.text).isoformat()
                    except Exception:
                        published_date = pub_el.text
                articles.append({
                    'title': title,
                    'url': link_el.text or '',
                    'published_date': published_date,
                    'source': source_name,
                })
            return articles
        except Exception as e:
            if attempt < 2:
                time.sleep(5)  # brief backoff before retrying
                continue
            print(f" Error fetching Google News: {e}")
            return []
    return []  # unreachable; keeps the return type explicit
# Local directory where downloaded press images are cached; files here are
# referenced back to clients via /api/press/images/<hash><ext> paths.
PRESS_IMAGE_CACHE = '/opt/media-downloader/data/press_images'
os.makedirs(PRESS_IMAGE_CACHE, exist_ok=True)
def cache_press_image(image_url: str) -> str | None:
"""Download and cache an image locally. Returns API path."""
if not image_url:
return None
url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]
# Check if already cached
for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
cached = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
if os.path.exists(cached) and os.path.getsize(cached) > 0:
return f"/api/press/images/{url_hash}{ext}"
# Download
try:
req = urllib.request.Request(image_url, headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'image/*,*/*',
})
with urllib.request.urlopen(req, timeout=15) as resp:
image_data = resp.read()
if len(image_data) < 1000:
return None
except Exception:
# Try via FlareSolverr — but it can't fetch binary, so try fetching
# the page and extracting the image URL that works
return None
ext = '.jpg'
url_lower = image_url.lower()
if '.png' in url_lower:
ext = '.png'
elif '.webp' in url_lower:
ext = '.webp'
elif '.gif' in url_lower:
ext = '.gif'
cached_path = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
with open(cached_path, 'wb') as f:
f.write(image_data)
return f"/api/press/images/{url_hash}{ext}"
def cache_content_images(html_content: str) -> str:
    """Find all <img ...> tags in HTML content, cache each image locally,
    and rewrite src to the /api/press/images/... proxy path.

    Img tags whose image cannot be cached are removed entirely
    (a missing image beats a broken one). Empty input is returned as-is.
    """
    if not html_content:
        return html_content
    import re as _re

    def _replace_img(match):
        full_tag = match.group(0)
        src = match.group(1)
        # Already proxied (or empty src) — leave the tag untouched
        if not src or src.startswith('/api/press/images/'):
            return full_tag
        cached = cache_press_image(src)
        if cached:
            # Replace only the first occurrence: the src value itself
            return full_tag.replace(src, cached, 1)
        return ''  # Remove img if caching failed

    # \b[^>]*?\bsrc= matches src anywhere in the tag; the old pattern
    # required src to be the first attribute and missed e.g.
    # <img class="x" src="...">.
    return _re.sub(r'<img\b[^>]*?\bsrc="([^"]+)"[^>]*>', _replace_img, html_content)
def decode_google_news_url(google_url: str) -> str | None:
"""Decode a Google News redirect URL to the real article URL."""
if 'news.google.com' not in google_url:
return google_url
try:
from googlenewsdecoder import gnewsdecoder
result = gnewsdecoder(google_url, interval=1)
if result.get('status'):
return result['decoded_url']
except Exception:
pass
return None
def extract_content(article_url: str) -> tuple[str | None, str | None]:
    """Extract article content and og:image from the real article URL.

    Attempts a plain HTTP fetch first; when that yields no content, the
    URL is retried through FlareSolverr (for bot-protected sites).
    Returns (content_html, image_url) — either element may be None.
    """
    direct_content, direct_image = _extract_content_direct(article_url)
    if direct_content:
        return (direct_content, direct_image)
    # FlareSolverr fallback; keep the direct og:image if it found none
    fs_content, fs_image = _extract_content_flaresolverr(article_url)
    return (fs_content, fs_image or direct_image)
def _fetch_html_flaresolverr(url: str) -> str | None:
"""Fetch HTML via FlareSolverr (headless browser)."""
try:
import requests
resp = requests.post('http://localhost:8191/v1', json={
'cmd': 'request.get',
'url': url,
'maxTimeout': 30000
}, timeout=45)
data = resp.json()
if data.get('status') == 'ok':
html = data.get('solution', {}).get('response', '')
if len(html) > 500:
return html
except Exception:
pass
return None
def _extract_content_flaresolverr(url: str) -> tuple[str | None, str | None]:
    """Extract article content by fetching the page through FlareSolverr.

    Returns (content_html, image_url); (None, None) when the fetch fails.
    """
    html = _fetch_html_flaresolverr(url)
    return _parse_article_html(html, url) if html else (None, None)
def _extract_content_direct(url: str) -> tuple[str | None, str | None]:
    """Self-contained article extraction via a plain HTTP fetch.

    Returns (content_html, image_url); (None, None) on any failure.
    """
    import urllib.request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=20) as resp:
            page = resp.read().decode('utf-8', errors='replace')
        return _parse_article_html(page, url)
    except Exception:
        return (None, None)
def _parse_article_html(raw_html: str, url: str) -> tuple[str | None, str | None]:
    """Parse raw HTML into article content and og:image. Returns (content_html, image_url).

    Pipeline:
      1. Pull og:image (fallback: twitter:image) for the thumbnail.
      2. Run readability to isolate the article body.
      3. Rebuild clean HTML from p/h2-h4/blockquote/ul/ol elements,
         sanitized with bleach and filtered against boilerplate phrases.
      4. Collect inline images (fallback: first real image from the raw page).
      5. Interleave images into the text, then run quality checks; on any
         failure return (None, og_image) so callers still get a thumbnail.

    Any exception anywhere yields (None, None) — extraction is best-effort.
    """
    import re
    from urllib.parse import urljoin  # NOTE(review): imported but unused here
    try:
        from readability import Document
        from bs4 import BeautifulSoup
        # Extract og:image for thumbnail
        og_soup = BeautifulSoup(raw_html, 'html.parser')
        og_image = None
        og_tag = og_soup.find('meta', property='og:image')
        if og_tag and og_tag.get('content'):
            og_image = og_tag['content']
        if not og_image:
            # Fall back to Twitter card image
            tw_tag = og_soup.find('meta', attrs={'name': 'twitter:image'})
            if tw_tag and tw_tag.get('content'):
                og_image = tw_tag['content']
        import bleach
        # readability isolates the main article body from the page chrome
        doc = Document(raw_html, url=url)
        content_html = doc.summary()
        if not content_html or len(content_html.strip()) < 50:
            return (None, og_image)
        reader_soup = BeautifulSoup(content_html, 'html.parser')
        # Common boilerplate/byline/CTA phrases to drop from the body
        junk_text_re = re.compile(
            r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
            r'share or comment|share this article|comment on this|follow us on|'
            r'sign up for|subscribe to|have you got a story|tips@|email us)',
            re.I
        )
        # Only these inline tags (and a[href]) survive bleach sanitization
        inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
        inline_attrs = {'a': ['href']}
        html_parts = []
        for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
            text = el.get_text(strip=True)
            # Skip fragments too short to be real prose
            if len(text) < 30:
                continue
            if junk_text_re.search(text):
                continue
            tag = el.name
            inner = bleach.clean(
                el.decode_contents(), tags=inline_tags,
                attributes=inline_attrs, strip=True, protocols=['http', 'https']
            ).strip()
            if not inner:
                continue
            # Rebuild each element as minimal clean HTML
            if tag == 'p':
                html_parts.append(f'<p>{inner}</p>')
            elif tag in ('h2', 'h3', 'h4'):
                html_parts.append(f'<{tag}>{inner}</{tag}>')
            elif tag == 'blockquote':
                html_parts.append(f'<blockquote><p>{inner}</p></blockquote>')
            elif tag in ('ul', 'ol'):
                items = []
                # recursive=False: only direct <li> children of this list
                for li in el.find_all('li', recursive=False):
                    li_inner = bleach.clean(
                        li.decode_contents(), tags=inline_tags,
                        attributes=inline_attrs, strip=True, protocols=['http', 'https']
                    ).strip()
                    if li_inner and len(li.get_text(strip=True)) > 10:
                        items.append(f'<li>{li_inner}</li>')
                if items:
                    html_parts.append(f'<{tag}>{"".join(items)}</{tag}>')
        # Images from readability
        junk_img_re = re.compile(r'(logo|icon|pixel|spacer|blank|1x1|svg|avatar|spinner|/ct/|cts\.businesswire)', re.I)
        seen_srcs = set()  # dedupe image URLs
        article_images = []
        for img in reader_soup.find_all('img'):
            src = img.get('src', '')
            if src and src.startswith(('http://', 'https://')) and src not in seen_srcs:
                if junk_img_re.search(src):
                    continue
                seen_srcs.add(src)
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')
        # If readability found no images, grab first real image from original HTML
        if not article_images:
            orig_soup = BeautifulSoup(raw_html, 'html.parser')
            # Remove non-content areas so we don't pick up chrome images
            for noise in orig_soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                             'aside', 'form', 'noscript', 'svg']):
                noise.decompose()
            for img in orig_soup.find_all('img'):
                # Lazy-load attributes take priority over plain src
                src = (img.get('data-src') or img.get('data-lazy-src') or
                       img.get('data-original') or img.get('src') or '')
                if not src or not src.startswith(('http://', 'https://')):
                    continue
                src_lower = src.lower()
                if any(x in src_lower for x in ('logo', 'icon', 'pixel', 'spacer', 'blank',
                                                '1x1', 'svg', 'avatar', 'spinner', '/ct/')):
                    continue
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')
                break  # Only first real image
        # Merge text + images: spread images evenly through the paragraphs
        if article_images and html_parts:
            text_count = len(html_parts)
            img_count = len(article_images)
            interval = max(1, text_count // (img_count + 1))
            merged = []
            img_idx = 0
            for i, part in enumerate(html_parts):
                merged.append(part)
                if img_idx < img_count and (i + 1) % interval == 0:
                    merged.append(article_images[img_idx])
                    img_idx += 1
            # Append any images that didn't fit the interval pattern
            while img_idx < img_count:
                merged.append(article_images[img_idx])
                img_idx += 1
            html_parts = merged
        elif article_images and not html_parts:
            html_parts = article_images
        # Last resort: split the plain text into paragraphs
        if not html_parts:
            text = reader_soup.get_text(separator='\n\n', strip=True)
            if text:
                for para in text.split('\n\n'):
                    para = para.strip()
                    if len(para) > 30:
                        html_parts.append(f'<p>{bleach.clean(para)}</p>')
        if not html_parts:
            return (None, og_image)
        # Quality check: drop parts whose average "word" length suggests
        # minified JS/CSS or base64 junk rather than prose
        from bs4 import BeautifulSoup as BS
        clean_parts = []
        for part in html_parts:
            part_soup = BS(part, 'html.parser')
            part_text = part_soup.get_text(strip=True)
            if len(part_text) > 100:
                words = part_text.split()
                avg_word_len = len(part_text) / max(len(words), 1)
                if avg_word_len > 12:
                    continue
            clean_parts.append(part)
        if not clean_parts:
            return (None, og_image)
        result = '\n'.join(clean_parts)
        plain_text = BS(result, 'html.parser').get_text(separator=' ', strip=True)
        # Whole-article sanity check: cookie banners, login widgets,
        # US-state dropdown lists, etc. indicate a failed extraction
        garbage_re = re.compile(
            r'(use (left|right|escape)|arrow keys|navigate between|'
            r'sign (in|up) with|we won.t post|social account|'
            r'accept cookies|cookie policy|privacy policy|terms of (use|service)|'
            r'AlabamaAlaska|CaliforniaColorado|United States of America)',
            re.I
        )
        if len(plain_text) < 200 or garbage_re.search(plain_text):
            return (None, og_image)
        return (result, og_image)
    except Exception:
        return (None, None)
def main():
    """Backfill press articles for every configured celebrity.

    For each celebrity, queries Google News RSS in 1-week windows going
    back WEEKS_BACK weeks, resolves redirect URLs to real article URLs,
    extracts and caches content/images, and inserts new rows into
    press_articles. Deduplicates on URL hash and (celebrity_id, title).
    """
    # Hoisted out of the per-article inner loop
    import re
    import psycopg2

    # Compiled once instead of per article
    strip_tags_re = re.compile(r'<[^>]+>')
    proxied_img_re = re.compile(r'<img\s+src="(/api/press/images/[^"]+)"')

    # Get configured celebrities
    env = os.environ.copy()
    env['PGPASSWORD'] = DB_PASSWORD
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAc', "SELECT celebrity_ids FROM press_config WHERE id = 1"],
        capture_output=True, text=True, env=env
    )
    celebrity_ids = json.loads(result.stdout.strip()) if result.stdout.strip() else []
    if not celebrity_ids:
        print("No celebrities configured in press_config")
        return
    # Get celebrity names. int() on every id also hardens the interpolated
    # IN (...) clause against non-numeric values in press_config.
    placeholders = ','.join(str(int(i)) for i in celebrity_ids)
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAF', '|', '-c', f"SELECT id, name FROM celebrity_profiles WHERE id IN ({placeholders})"],
        capture_output=True, text=True, env=env
    )
    celebrities = []
    for line in result.stdout.strip().splitlines():
        if '|' in line:
            # maxsplit=1 so a name containing '|' is not truncated
            id_part, name_part = line.split('|', 1)
            celebrities.append({'id': int(id_part), 'name': name_part.strip()})
    if not celebrities:
        print("No celebrities found")
        return
    # Get existing URL hashes for dedup
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAc', "SELECT url_hash FROM press_articles"],
        capture_output=True, text=True, env=env
    )
    existing_hashes = set(line.strip() for line in result.stdout.strip().splitlines() if line.strip())
    print(f"Existing articles: {len(existing_hashes)}")
    # Also get existing titles per celebrity for dedup
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAF', '|', '-c', "SELECT celebrity_id, title FROM press_articles"],
        capture_output=True, text=True, env=env
    )
    existing_titles = set()
    for line in result.stdout.strip().splitlines():
        if '|' in line:
            parts = line.split('|', 1)
            existing_titles.add((parts[0].strip(), parts[1].strip()))
    now = datetime.now()
    total_new = 0
    total_fetched = 0
    for celeb in celebrities:
        celeb_id = celeb['id']
        celeb_name = celeb['name']
        print(f"\n{'='*60}")
        print(f"Backfilling: {celeb_name} (id={celeb_id})")
        print(f"{'='*60}")
        celeb_new = 0
        # Query in 1-week windows going back
        for week in range(WEEKS_BACK):
            end_dt = now - timedelta(weeks=week)
            start_dt = now - timedelta(weeks=week + 1)
            start_str = start_dt.strftime('%Y-%m-%d')
            end_str = end_dt.strftime('%Y-%m-%d')
            week_label = f"Week -{week+1} ({start_dt.strftime('%b %d')} - {end_dt.strftime('%b %d')})"
            print(f"\n {week_label}...", end='', flush=True)
            articles = fetch_google_news_window(celeb_name, start_str, end_str)
            total_fetched += len(articles)
            if not articles:
                print(" no articles")
                continue
            # Warn if we hit the 100 cap (may be missing articles)
            cap_warning = " [HIT 100 CAP]" if len(articles) >= 100 else ""
            print(f" {len(articles)} found{cap_warning}", flush=True)
            week_new = 0
            for article in articles:
                google_url = article.get('url', '')
                if not google_url:
                    continue
                title = article.get('title', '').strip()
                if title and (str(celeb_id), title) in existing_titles:
                    continue
                # Only keep articles where celeb name appears in the title
                if not title or celeb_name.lower() not in title.lower():
                    continue
                # Decode Google News URL to real article URL
                article_url = decode_google_news_url(google_url)
                if not article_url:
                    continue
                # Parse the real URL once; used for both skip check and domain
                parsed = urlparse(article_url)
                host = parsed.netloc.lower()
                # Skip domains (or their subdomains) that are JS-rendered
                # or block scrapers
                if any(host == d or host.endswith('.' + d) for d in SKIP_DOMAINS):
                    continue
                url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest()
                if url_hash in existing_hashes:
                    continue
                domain = parsed.netloc.replace('www.', '')
                published_date = article.get('published_date', '')
                # Extract content and og:image (with rate limiting to be polite)
                content, og_image = extract_content(article_url)
                # Cache all inline images in the content to local proxy
                if content:
                    content = cache_content_images(content)
                if content:
                    # Plain-text snippet: strip tags, collapse whitespace
                    snippet = strip_tags_re.sub(' ', content)
                    snippet = ' '.join(snippet.split())[:300]
                else:
                    snippet = title[:300] if title else ''
                # Cache the og:image locally, fall back to first inline image
                image_url = cache_press_image(og_image) if og_image else None
                if not image_url and content:
                    m = proxied_img_re.search(content)
                    if m:
                        image_url = m.group(1)
                time.sleep(0.5)
                # Insert using parameterized query via psycopg2
                try:
                    pg_conn = psycopg2.connect(
                        host='localhost', user='media_downloader',
                        password=env.get('PGPASSWORD', ''), dbname='media_downloader'
                    )
                    try:
                        pg_cur = pg_conn.cursor()
                        pg_cur.execute("""INSERT INTO press_articles
                            (celebrity_id, title, url, url_hash, domain, published_date,
                             image_url, language, country, article_content, snippet, notified, read)
                            VALUES (%s, %s, %s, %s, %s, %s, %s, 'en', '', %s, %s, 1, 0)
                            ON CONFLICT DO NOTHING""",
                            (celeb_id, title, article_url, url_hash, domain,
                             published_date, image_url or '', content, snippet))
                        inserted = pg_cur.rowcount > 0
                        pg_conn.commit()
                        pg_cur.close()
                    finally:
                        # Always release the connection, even if execute raises
                        pg_conn.close()
                except Exception as db_err:
                    print(f" DB error: {db_err}")
                    inserted = False
                if inserted:
                    week_new += 1
                    existing_hashes.add(url_hash)
                    existing_titles.add((str(celeb_id), title))
            if week_new > 0:
                print(f" Added {week_new} new articles")
            celeb_new += week_new
            # Small delay between queries to be polite
            time.sleep(1)
        total_new += celeb_new
        print(f"\n {celeb_name}: {celeb_new} new articles added")
    print(f"\n{'='*60}")
    print(f"DONE: Fetched {total_fetched} total, added {total_new} new articles")
    print(f"{'='*60}")


if __name__ == '__main__':
    main()