#!/usr/bin/env python3
|
|
"""
|
|
Backfill press articles from Google News RSS for the last year.
|
|
|
|
Google News RSS:
|
|
- 100 articles per query (cap)
|
|
- No rate limiting, no API key needed
|
|
- ~12 months of history
|
|
- Strategy: 1-week windows to stay under the 100 cap
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import urllib.error
|
|
import urllib.request
|
|
import xml.etree.ElementTree as ET
|
|
from datetime import datetime, timedelta
|
|
from urllib.parse import urlparse
|
|
|
|
# Bootstrap database
|
|
sys.path.insert(0, '/opt/media-downloader')
|
|
import modules.db_bootstrap # noqa: E402,F401
|
|
|
|
from modules.universal_logger import get_logger
|
|
|
|
logger = get_logger('PressBackfill')
|
|
|
|
# Database password: prefer the environment so the secret is not baked into
# source control; the literal fallback keeps existing deployments working.
DB_PASSWORD = os.environ.get('PRESS_DB_PASSWORD', "PNsihOXvvuPwWiIvGlsc9Fh2YmMmB")

# How many 1-week windows to walk back (~1 year, the Google News RSS history limit).
WEEKS_BACK = 52

# Domains that return no content even with FlareSolverr
SKIP_DOMAINS = {
    'msn.com',
    'news.google.com',
    'imdb.com',
    'st-aug.edu',
}
|
|
|
|
|
|
def fetch_google_news_window(name: str, start_date: str, end_date: str) -> list:
    """Fetch Google News RSS articles for a specific time window.

    Args:
        name: Exact-phrase search term (celebrity name).
        start_date: Window start, 'YYYY-MM-DD' (Google 'after:' operator).
        end_date: Window end, 'YYYY-MM-DD' (Google 'before:' operator).

    Returns:
        List of dicts with keys: title, url, published_date, source.
        Empty list after three failed attempts.
    """
    # Hoisted out of the per-item loop (was imported once per article).
    from email.utils import parsedate_to_datetime

    # %22 quotes force an exact-phrase match; after:/before: bound the window.
    query = f'%22{name.replace(" ", "+")}%22+after:{start_date}+before:{end_date}'
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'

    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            with urllib.request.urlopen(req, timeout=30) as response:
                data = response.read().decode('utf-8')

            root = ET.fromstring(data)
            articles = []
            for item in root.findall('.//item'):
                title_el = item.find('title')
                link_el = item.find('link')
                pub_el = item.find('pubDate')
                source_el = item.find('source')

                if title_el is None or link_el is None:
                    continue

                title = title_el.text or ''
                # Google News titles often end with " - Source Name", strip it
                source_name = source_el.text if source_el is not None else ''
                if source_name:
                    suffix = f' - {source_name}'
                    if title.endswith(suffix):
                        title = title[:-len(suffix)].strip()

                # Parse pubDate (RFC 2822 format) to ISO 8601; keep the raw
                # text when parsing fails so the value is never lost.
                published_date = ''
                if pub_el is not None and pub_el.text:
                    try:
                        published_date = parsedate_to_datetime(pub_el.text).isoformat()
                    except Exception:
                        published_date = pub_el.text

                articles.append({
                    'title': title,
                    'url': link_el.text or '',
                    'published_date': published_date,
                    'source': source_name,
                })
            return articles
        except Exception as e:
            # Transient network/XML failure: back off and retry (2 retries).
            if attempt < 2:
                time.sleep(5)
                continue
            print(f" Error fetching Google News: {e}")
            return []
    return []
|
|
|
|
|
|
# Local cache directory for downloaded press images. The cache_* functions
# below return /api/press/images/<hash><ext> paths pointing at files here.
PRESS_IMAGE_CACHE = '/opt/media-downloader/data/press_images'
os.makedirs(PRESS_IMAGE_CACHE, exist_ok=True)
|
|
|
|
|
|
def cache_press_image(image_url: str) -> str | None:
|
|
"""Download and cache an image locally. Returns API path."""
|
|
if not image_url:
|
|
return None
|
|
|
|
url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]
|
|
|
|
# Check if already cached
|
|
for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
|
|
cached = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
|
|
if os.path.exists(cached) and os.path.getsize(cached) > 0:
|
|
return f"/api/press/images/{url_hash}{ext}"
|
|
|
|
# Download
|
|
try:
|
|
req = urllib.request.Request(image_url, headers={
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
|
'Accept': 'image/*,*/*',
|
|
})
|
|
with urllib.request.urlopen(req, timeout=15) as resp:
|
|
image_data = resp.read()
|
|
if len(image_data) < 1000:
|
|
return None
|
|
except Exception:
|
|
# Try via FlareSolverr — but it can't fetch binary, so try fetching
|
|
# the page and extracting the image URL that works
|
|
return None
|
|
|
|
ext = '.jpg'
|
|
url_lower = image_url.lower()
|
|
if '.png' in url_lower:
|
|
ext = '.png'
|
|
elif '.webp' in url_lower:
|
|
ext = '.webp'
|
|
elif '.gif' in url_lower:
|
|
ext = '.gif'
|
|
|
|
cached_path = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
|
|
with open(cached_path, 'wb') as f:
|
|
f.write(image_data)
|
|
return f"/api/press/images/{url_hash}{ext}"
|
|
|
|
|
|
def cache_content_images(html_content: str) -> str:
    """Cache every <img> in *html_content* locally and rewrite its src.

    Each remote src is downloaded via cache_press_image() and the tag's src
    rewritten to the /api/press/images/... proxy path. Img tags whose
    download fails are removed (broken > missing). Already-proxied tags
    are left untouched.
    """
    if not html_content:
        return html_content
    import re as _re

    def _replace_img(match):
        full_tag = match.group(0)
        src = match.group(1)
        if not src or src.startswith('/api/press/images/'):
            return full_tag
        cached = cache_press_image(src)
        if cached:
            return full_tag.replace(src, cached)
        return ''  # Remove img if caching failed

    # Match src anywhere inside the tag (not only as the first attribute);
    # the required whitespace before src avoids matching data-src etc.
    return _re.sub(r'<img\b[^>]*\ssrc="([^"]+)"[^>]*>', _replace_img, html_content)
|
|
|
|
|
|
def decode_google_news_url(google_url: str) -> str | None:
|
|
"""Decode a Google News redirect URL to the real article URL."""
|
|
if 'news.google.com' not in google_url:
|
|
return google_url
|
|
try:
|
|
from googlenewsdecoder import gnewsdecoder
|
|
result = gnewsdecoder(google_url, interval=1)
|
|
if result.get('status'):
|
|
return result['decoded_url']
|
|
except Exception:
|
|
pass
|
|
return None
|
|
|
|
|
|
def extract_content(article_url: str) -> tuple[str | None, str | None]:
    """Extract article body HTML and og:image from the real article URL.

    A plain HTTP fetch is tried first; bot-protected sites fall back to
    FlareSolverr. Returns (content_html, image_url); the direct fetch's
    image is kept when the fallback finds none.
    """
    direct_content, direct_image = _extract_content_direct(article_url)
    if direct_content:
        return (direct_content, direct_image)
    # Fallback to FlareSolverr for bot-protected sites
    fallback_content, fallback_image = _extract_content_flaresolverr(article_url)
    return (fallback_content, fallback_image or direct_image)
|
|
|
|
|
|
def _fetch_html_flaresolverr(url: str) -> str | None:
    """Fetch a page's rendered HTML through the local FlareSolverr service.

    Returns the HTML, or None when the service is unreachable, reports an
    error, or returns a suspiciously short page (<= 500 chars).
    """
    try:
        import requests
        reply = requests.post('http://localhost:8191/v1', json={
            'cmd': 'request.get',
            'url': url,
            'maxTimeout': 30000
        }, timeout=45)
        payload = reply.json()
        if payload.get('status') == 'ok':
            rendered = payload.get('solution', {}).get('response', '')
            if len(rendered) > 500:
                return rendered
    except Exception:
        pass
    return None
|
|
|
|
|
|
def _extract_content_flaresolverr(url: str) -> tuple[str | None, str | None]:
    """Extract article content from *url* using FlareSolverr as the fetcher."""
    page_html = _fetch_html_flaresolverr(url)
    if page_html:
        return _parse_article_html(page_html, url)
    return (None, None)
|
|
|
|
|
|
def _extract_content_direct(url: str) -> tuple[str | None, str | None]:
|
|
"""Self-contained article extraction. Returns (content_html, image_url)."""
|
|
import urllib.request
|
|
|
|
try:
|
|
req = urllib.request.Request(url, headers={
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
|
})
|
|
with urllib.request.urlopen(req, timeout=20) as response:
|
|
raw_html = response.read().decode('utf-8', errors='replace')
|
|
return _parse_article_html(raw_html, url)
|
|
except Exception:
|
|
return (None, None)
|
|
|
|
|
|
def _parse_article_html(raw_html: str, url: str) -> tuple[str | None, str | None]:
    """Parse raw HTML into article content and og:image. Returns (content_html, image_url).

    Pipeline: pull og:image/twitter:image metadata, run readability to get
    the article body, keep only substantial text elements (sanitized with
    bleach to a small inline-tag whitelist), collect non-junk images,
    interleave images through the text, then apply quality checks that
    reject boilerplate/garbage pages. Any exception yields (None, None).
    """
    import re
    from urllib.parse import urljoin  # NOTE(review): imported but unused — candidate for cleanup

    try:

        from readability import Document
        from bs4 import BeautifulSoup

        # Extract og:image (falling back to twitter:image) for the thumbnail.
        og_soup = BeautifulSoup(raw_html, 'html.parser')
        og_image = None
        og_tag = og_soup.find('meta', property='og:image')
        if og_tag and og_tag.get('content'):
            og_image = og_tag['content']
        if not og_image:
            tw_tag = og_soup.find('meta', attrs={'name': 'twitter:image'})
            if tw_tag and tw_tag.get('content'):
                og_image = tw_tag['content']
        import bleach

        # readability's Document isolates the main article body from chrome.
        doc = Document(raw_html, url=url)
        content_html = doc.summary()

        # Too little body text: give up but keep the thumbnail we found.
        if not content_html or len(content_html.strip()) < 50:
            return (None, og_image)

        reader_soup = BeautifulSoup(content_html, 'html.parser')

        # Boilerplate phrases (bylines, share prompts, newsletter CTAs) to drop.
        junk_text_re = re.compile(
            r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
            r'share or comment|share this article|comment on this|follow us on|'
            r'sign up for|subscribe to|have you got a story|tips@|email us)',
            re.I
        )

        # Only these inline tags (and a[href]) survive bleach sanitization.
        inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
        inline_attrs = {'a': ['href']}

        html_parts = []
        for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
            text = el.get_text(strip=True)
            if len(text) < 30:  # skip stubs/captions
                continue
            if junk_text_re.search(text):
                continue
            tag = el.name
            inner = bleach.clean(
                el.decode_contents(), tags=inline_tags,
                attributes=inline_attrs, strip=True, protocols=['http', 'https']
            ).strip()
            if not inner:
                continue
            # Re-emit each element as clean minimal HTML.
            if tag == 'p':
                html_parts.append(f'<p>{inner}</p>')
            elif tag in ('h2', 'h3', 'h4'):
                html_parts.append(f'<{tag}>{inner}</{tag}>')
            elif tag == 'blockquote':
                html_parts.append(f'<blockquote><p>{inner}</p></blockquote>')
            elif tag in ('ul', 'ol'):
                items = []
                for li in el.find_all('li', recursive=False):
                    li_inner = bleach.clean(
                        li.decode_contents(), tags=inline_tags,
                        attributes=inline_attrs, strip=True, protocols=['http', 'https']
                    ).strip()
                    if li_inner and len(li.get_text(strip=True)) > 10:
                        items.append(f'<li>{li_inner}</li>')
                if items:
                    html_parts.append(f'<{tag}>{"".join(items)}</{tag}>')

        # Images from readability output (absolute http(s) src only, deduped,
        # junk patterns like logos/pixels filtered out).
        junk_img_re = re.compile(r'(logo|icon|pixel|spacer|blank|1x1|svg|avatar|spinner|/ct/|cts\.businesswire)', re.I)
        seen_srcs = set()
        article_images = []
        for img in reader_soup.find_all('img'):
            src = img.get('src', '')
            if src and src.startswith(('http://', 'https://')) and src not in seen_srcs:
                if junk_img_re.search(src):
                    continue
                seen_srcs.add(src)
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')

        # If readability found no images, grab first real image from original
        # HTML (checking lazy-load attributes first), after stripping chrome.
        if not article_images:
            orig_soup = BeautifulSoup(raw_html, 'html.parser')
            for noise in orig_soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                             'aside', 'form', 'noscript', 'svg']):
                noise.decompose()
            for img in orig_soup.find_all('img'):
                src = (img.get('data-src') or img.get('data-lazy-src') or
                       img.get('data-original') or img.get('src') or '')
                if not src or not src.startswith(('http://', 'https://')):
                    continue
                src_lower = src.lower()
                if any(x in src_lower for x in ('logo', 'icon', 'pixel', 'spacer', 'blank',
                                                '1x1', 'svg', 'avatar', 'spinner', '/ct/')):
                    continue
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')
                break  # Only first real image

        # Merge text + images: spread images evenly through the text parts.
        if article_images and html_parts:
            text_count = len(html_parts)
            img_count = len(article_images)
            interval = max(1, text_count // (img_count + 1))
            merged = []
            img_idx = 0
            for i, part in enumerate(html_parts):
                merged.append(part)
                if img_idx < img_count and (i + 1) % interval == 0:
                    merged.append(article_images[img_idx])
                    img_idx += 1
            # Any images not yet placed go at the end.
            while img_idx < img_count:
                merged.append(article_images[img_idx])
                img_idx += 1
            html_parts = merged
        elif article_images and not html_parts:
            html_parts = article_images

        # Last resort: fall back to plain-text paragraphs from the body.
        if not html_parts:
            text = reader_soup.get_text(separator='\n\n', strip=True)
            if text:
                for para in text.split('\n\n'):
                    para = para.strip()
                    if len(para) > 30:
                        html_parts.append(f'<p>{bleach.clean(para)}</p>')

        if not html_parts:
            return (None, og_image)

        # Quality check: drop parts whose average "word" length is implausibly
        # long (minified JS / base64 blobs masquerading as text).
        from bs4 import BeautifulSoup as BS
        clean_parts = []
        for part in html_parts:
            part_soup = BS(part, 'html.parser')
            part_text = part_soup.get_text(strip=True)
            if len(part_text) > 100:
                words = part_text.split()
                avg_word_len = len(part_text) / max(len(words), 1)
                if avg_word_len > 12:
                    continue
            clean_parts.append(part)

        if not clean_parts:
            return (None, og_image)

        result = '\n'.join(clean_parts)
        plain_text = BS(result, 'html.parser').get_text(separator=' ', strip=True)

        # Whole-article rejection: UI instructions, consent banners, or state
        # lists (e.g. "AlabamaAlaska") signal we scraped chrome, not content.
        garbage_re = re.compile(
            r'(use (left|right|escape)|arrow keys|navigate between|'
            r'sign (in|up) with|we won.t post|social account|'
            r'accept cookies|cookie policy|privacy policy|terms of (use|service)|'
            r'AlabamaAlaska|CaliforniaColorado|United States of America)',
            re.I
        )
        if len(plain_text) < 200 or garbage_re.search(plain_text):
            return (None, og_image)

        return (result, og_image)
    except Exception:
        return (None, None)
|
|
|
|
|
|
def _psql(env: dict, *args: str) -> str:
    """Run one psql command against the media_downloader DB; return stripped stdout."""
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         *args],
        capture_output=True, text=True, env=env
    )
    return result.stdout.strip()


def _load_celebrities(env: dict) -> list:
    """Return [{'id': int, 'name': str}, ...] for the configured celebrities.

    Prints a diagnostic and returns [] when nothing is configured or found.
    """
    raw = _psql(env, '-tAc', "SELECT celebrity_ids FROM press_config WHERE id = 1")
    celebrity_ids = json.loads(raw) if raw else []
    if not celebrity_ids:
        print("No celebrities configured in press_config")
        return []

    # ids come from our own DB as a JSON array; psql -c has no parameter
    # binding, so they are interpolated as a literal IN (...) list.
    placeholders = ','.join(str(i) for i in celebrity_ids)
    rows = _psql(env, '-tAF', '|', '-c',
                 f"SELECT id, name FROM celebrity_profiles WHERE id IN ({placeholders})")
    celebrities = []
    for line in rows.splitlines():
        if '|' in line:
            # maxsplit=1 so a name containing '|' is not truncated.
            ident, name = line.split('|', 1)
            celebrities.append({'id': int(ident), 'name': name.strip()})
    if not celebrities:
        print("No celebrities found")
    return celebrities


def _load_existing(env: dict) -> tuple:
    """Return (url_hash set, (celebrity_id, title) set) of stored articles for dedup."""
    hash_rows = _psql(env, '-tAc', "SELECT url_hash FROM press_articles")
    existing_hashes = {line.strip() for line in hash_rows.splitlines() if line.strip()}

    title_rows = _psql(env, '-tAF', '|', '-c',
                       "SELECT celebrity_id, title FROM press_articles")
    existing_titles = set()
    for line in title_rows.splitlines():
        if '|' in line:
            cid, title = line.split('|', 1)
            existing_titles.add((cid.strip(), title.strip()))
    return existing_hashes, existing_titles


def _insert_article(env: dict, row: tuple) -> bool:
    """Insert one article row via a parameterized psycopg2 query.

    Returns True when a new row was inserted (False on conflict or error).
    """
    import psycopg2
    try:
        pg_conn = psycopg2.connect(
            host='localhost', user='media_downloader',
            password=env.get('PGPASSWORD', ''), dbname='media_downloader'
        )
        try:
            pg_cur = pg_conn.cursor()
            pg_cur.execute("""INSERT INTO press_articles
                (celebrity_id, title, url, url_hash, domain, published_date,
                 image_url, language, country, article_content, snippet, notified, read)
                VALUES (%s, %s, %s, %s, %s, %s, %s, 'en', '', %s, %s, 1, 0)
                ON CONFLICT DO NOTHING""", row)
            inserted = pg_cur.rowcount > 0
            pg_conn.commit()
            pg_cur.close()
        finally:
            pg_conn.close()
        return inserted
    except Exception as db_err:
        print(f" DB error: {db_err}")
        return False


def _process_article(env: dict, celeb_id: int, celeb_name: str, article: dict,
                     existing_hashes: set, existing_titles: set) -> bool:
    """Dedup, decode, extract and store one Google News article.

    Returns True when a new row was inserted; updates the dedup sets in place.
    """
    import re as _re

    google_url = article.get('url', '')
    if not google_url:
        return False

    title = article.get('title', '').strip()
    if title and (str(celeb_id), title) in existing_titles:
        return False

    # Only keep articles where celeb name appears in the title
    if not title or celeb_name.lower() not in title.lower():
        return False

    # Decode Google News URL to real article URL
    article_url = decode_google_news_url(google_url)
    if not article_url:
        return False

    # Skip domains that are JS-rendered or block scrapers; the host matches
    # when it equals a skip domain or is any subdomain of one.
    host = urlparse(article_url).netloc.lower()
    if any(host == d or host.endswith('.' + d) for d in SKIP_DOMAINS):
        return False

    url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest()
    if url_hash in existing_hashes:
        return False

    # Parse domain from real URL
    domain = urlparse(article_url).netloc.replace('www.', '')
    published_date = article.get('published_date', '')

    # Extract content and og:image, then proxy all inline images locally.
    content, og_image = extract_content(article_url)
    if content:
        content = cache_content_images(content)

    # Snippet: plain text of the content (or the title as fallback).
    if content:
        snippet = _re.sub(r'<[^>]+>', ' ', content)
        snippet = ' '.join(snippet.split())[:300]
    else:
        snippet = title[:300] if title else ''

    # Cache the og:image locally, fall back to first inline image
    image_url = cache_press_image(og_image) if og_image else None
    if not image_url and content:
        m = _re.search(r'<img\s+src="(/api/press/images/[^"]+)"', content)
        if m:
            image_url = m.group(1)
    time.sleep(0.5)  # be polite to the article hosts

    inserted = _insert_article(env, (celeb_id, title, article_url, url_hash, domain,
                                     published_date, image_url or '', content, snippet))
    if inserted:
        existing_hashes.add(url_hash)
        existing_titles.add((str(celeb_id), title))
    return inserted


def main():
    """Backfill press articles for every configured celebrity.

    Walks WEEKS_BACK one-week Google News windows per celebrity (the 1-week
    window keeps each query under the 100-article RSS cap), extracting and
    storing any article not already present (dedup by URL hash and by
    (celebrity_id, title)).
    """
    env = os.environ.copy()
    env['PGPASSWORD'] = DB_PASSWORD

    celebrities = _load_celebrities(env)
    if not celebrities:
        return

    existing_hashes, existing_titles = _load_existing(env)
    print(f"Existing articles: {len(existing_hashes)}")

    now = datetime.now()
    total_new = 0
    total_fetched = 0

    for celeb in celebrities:
        celeb_id = celeb['id']
        celeb_name = celeb['name']
        print(f"\n{'='*60}")
        print(f"Backfilling: {celeb_name} (id={celeb_id})")
        print(f"{'='*60}")

        celeb_new = 0

        # Query in 1-week windows going back
        for week in range(WEEKS_BACK):
            end_dt = now - timedelta(weeks=week)
            start_dt = now - timedelta(weeks=week + 1)

            start_str = start_dt.strftime('%Y-%m-%d')
            end_str = end_dt.strftime('%Y-%m-%d')
            week_label = f"Week -{week+1} ({start_dt.strftime('%b %d')} - {end_dt.strftime('%b %d')})"
            print(f"\n {week_label}...", end='', flush=True)

            articles = fetch_google_news_window(celeb_name, start_str, end_str)
            total_fetched += len(articles)

            if not articles:
                print(" no articles")
                continue

            # Warn if we hit the 100 cap (may be missing articles)
            cap_warning = " [HIT 100 CAP]" if len(articles) >= 100 else ""
            print(f" {len(articles)} found{cap_warning}", flush=True)

            week_new = 0
            for article in articles:
                if _process_article(env, celeb_id, celeb_name, article,
                                    existing_hashes, existing_titles):
                    week_new += 1

            if week_new > 0:
                print(f" Added {week_new} new articles")
                celeb_new += week_new

            # Small delay between queries to be polite
            time.sleep(1)

        total_new += celeb_new
        print(f"\n {celeb_name}: {celeb_new} new articles added")

    print(f"\n{'='*60}")
    print(f"DONE: Fetched {total_fetched} total, added {total_new} new articles")
    print(f"{'='*60}")
|
|
|
|
|
|
# Script entry point: run the backfill when executed directly.
if __name__ == '__main__':
    main()
|