594
scripts/backfill_press.py
Normal file
594
scripts/backfill_press.py
Normal file
@@ -0,0 +1,594 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Backfill press articles from Google News RSS for the last year.
|
||||
|
||||
Google News RSS:
|
||||
- 100 articles per query (cap)
|
||||
- No rate limiting, no API key needed
|
||||
- ~12 months of history
|
||||
- Strategy: 1-week windows to stay under the 100 cap
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
import xml.etree.ElementTree as ET
|
||||
from datetime import datetime, timedelta
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Bootstrap database
|
||||
sys.path.insert(0, '/opt/media-downloader')
|
||||
import modules.db_bootstrap # noqa: E402,F401
|
||||
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('PressBackfill')
|
||||
|
||||
# NOTE(review): hardcoded database credential — should be loaded from the
# environment or a secrets store instead of living in the script. TODO confirm
# with deploy setup before changing.
DB_PASSWORD = "PNsihOXvvuPwWiIvGlsc9Fh2YmMmB"
# How far back to backfill, in 1-week query windows (~12 months of history,
# matching what Google News RSS retains).
WEEKS_BACK = 52

# Domains that return no content even with FlareSolverr
SKIP_DOMAINS = {
    'msn.com',
    'news.google.com',
    'imdb.com',
    'st-aug.edu',
}
|
||||
|
||||
|
||||
def fetch_google_news_window(name: str, start_date: str, end_date: str) -> list:
    """Fetch Google News RSS articles for a specific time window.

    Args:
        name: Celebrity name; sent as an exact-phrase (%22-quoted) query.
        start_date: Window start, YYYY-MM-DD (Google `after:` operator).
        end_date: Window end, YYYY-MM-DD (Google `before:` operator).

    Returns:
        List of dicts with keys: title, url, published_date, source.
        Empty list after 3 failed attempts (5 s backoff between retries).
    """
    # Hoisted out of the per-item loop: was previously re-imported for
    # every <item> element in the feed.
    from email.utils import parsedate_to_datetime

    query = f'%22{name.replace(" ", "+")}%22+after:{start_date}+before:{end_date}'
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'

    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            with urllib.request.urlopen(req, timeout=30) as response:
                data = response.read().decode('utf-8')

            root = ET.fromstring(data)
            articles = []
            for item in root.findall('.//item'):
                title_el = item.find('title')
                link_el = item.find('link')
                pub_el = item.find('pubDate')
                source_el = item.find('source')

                if title_el is None or link_el is None:
                    continue

                title = title_el.text or ''
                # Google News titles often end with " - Source Name", strip it
                source_name = source_el.text if source_el is not None else ''
                if source_name and title.endswith(f' - {source_name}'):
                    title = title[:-len(f' - {source_name}')].strip()

                # Parse pubDate (RFC 2822 format) into ISO 8601; fall back to
                # the raw string if it doesn't parse.
                published_date = ''
                if pub_el is not None and pub_el.text:
                    try:
                        dt = parsedate_to_datetime(pub_el.text)
                        published_date = dt.isoformat()
                    except Exception:
                        published_date = pub_el.text

                articles.append({
                    'title': title,
                    'url': link_el.text or '',
                    'published_date': published_date,
                    'source': source_name,
                })
            return articles
        except Exception as e:
            if attempt < 2:
                time.sleep(5)  # brief backoff before the next attempt
                continue
            print(f" Error fetching Google News: {e}")
            return []
    return []
|
||||
|
||||
|
||||
# Local cache directory for downloaded press images; files here are served
# back to clients via the /api/press/images/ proxy paths built below.
PRESS_IMAGE_CACHE = '/opt/media-downloader/data/press_images'
os.makedirs(PRESS_IMAGE_CACHE, exist_ok=True)
|
||||
|
||||
|
||||
def cache_press_image(image_url: str) -> str | None:
|
||||
"""Download and cache an image locally. Returns API path."""
|
||||
if not image_url:
|
||||
return None
|
||||
|
||||
url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]
|
||||
|
||||
# Check if already cached
|
||||
for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
|
||||
cached = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
|
||||
if os.path.exists(cached) and os.path.getsize(cached) > 0:
|
||||
return f"/api/press/images/{url_hash}{ext}"
|
||||
|
||||
# Download
|
||||
try:
|
||||
req = urllib.request.Request(image_url, headers={
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
||||
'Accept': 'image/*,*/*',
|
||||
})
|
||||
with urllib.request.urlopen(req, timeout=15) as resp:
|
||||
image_data = resp.read()
|
||||
if len(image_data) < 1000:
|
||||
return None
|
||||
except Exception:
|
||||
# Try via FlareSolverr — but it can't fetch binary, so try fetching
|
||||
# the page and extracting the image URL that works
|
||||
return None
|
||||
|
||||
ext = '.jpg'
|
||||
url_lower = image_url.lower()
|
||||
if '.png' in url_lower:
|
||||
ext = '.png'
|
||||
elif '.webp' in url_lower:
|
||||
ext = '.webp'
|
||||
elif '.gif' in url_lower:
|
||||
ext = '.gif'
|
||||
|
||||
cached_path = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
|
||||
with open(cached_path, 'wb') as f:
|
||||
f.write(image_data)
|
||||
return f"/api/press/images/{url_hash}{ext}"
|
||||
|
||||
|
||||
def cache_content_images(html_content: str) -> str:
    """Find all <img ...> tags in HTML content, cache each image locally,
    and rewrite src to the /api/press/images/... proxy path.

    Img tags whose image cannot be cached are removed entirely (a missing
    image beats a broken one); tags already pointing at the proxy are left
    untouched.
    """
    if not html_content:
        return html_content
    import re as _re

    def _replace_img(match):
        full_tag = match.group(0)
        src = match.group(1)
        if not src or src.startswith('/api/press/images/'):
            return full_tag
        cached = cache_press_image(src)
        if cached:
            return full_tag.replace(src, cached)
        return ''  # Remove img if caching failed

    # Fix: the previous pattern (<img\s+src=") only matched tags where src was
    # the FIRST attribute; <img class="x" src="..."> was silently skipped and
    # its remote src leaked through uncached. [^>]*? before \bsrc= handles
    # attributes in any order.
    return _re.sub(r'<img\b[^>]*?\bsrc="([^"]+)"[^>]*>', _replace_img, html_content)
|
||||
|
||||
|
||||
def decode_google_news_url(google_url: str) -> str | None:
|
||||
"""Decode a Google News redirect URL to the real article URL."""
|
||||
if 'news.google.com' not in google_url:
|
||||
return google_url
|
||||
try:
|
||||
from googlenewsdecoder import gnewsdecoder
|
||||
result = gnewsdecoder(google_url, interval=1)
|
||||
if result.get('status'):
|
||||
return result['decoded_url']
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def extract_content(article_url: str) -> tuple[str | None, str | None]:
    """Extract article content and og:image for *article_url*.

    A plain HTTP fetch is tried first; bot-protected sites fall back to
    FlareSolverr. If the fallback finds no image, the og:image from the
    direct attempt (if any) is kept.

    Returns (content_html, image_url).
    """
    direct_html, direct_image = _extract_content_direct(article_url)
    if direct_html:
        return (direct_html, direct_image)
    fallback_html, fallback_image = _extract_content_flaresolverr(article_url)
    return (fallback_html, fallback_image or direct_image)
|
||||
|
||||
|
||||
def _fetch_html_flaresolverr(url: str) -> str | None:
    """Fetch a page through the local FlareSolverr headless-browser service.

    Returns the rendered HTML, or None when the service is unreachable,
    reports a non-ok status, or returns a suspiciously short page
    (<= 500 chars, typically a challenge/error page).
    """
    try:
        import requests
        reply = requests.post(
            'http://localhost:8191/v1',
            json={'cmd': 'request.get', 'url': url, 'maxTimeout': 30000},
            timeout=45,
        ).json()
        if reply.get('status') == 'ok':
            html = reply.get('solution', {}).get('response', '')
            if len(html) > 500:
                return html
    except Exception:
        pass
    return None
|
||||
|
||||
|
||||
def _extract_content_flaresolverr(url: str) -> tuple[str | None, str | None]:
    """Extract article content using FlareSolverr as the HTML fetcher.

    Returns (content_html, image_url); (None, None) if the fetch failed.
    """
    rendered = _fetch_html_flaresolverr(url)
    if not rendered:
        return (None, None)
    return _parse_article_html(rendered, url)
|
||||
|
||||
|
||||
def _extract_content_direct(url: str) -> tuple[str | None, str | None]:
    """Fetch *url* with a plain HTTP request and parse the article.

    Returns (content_html, image_url); (None, None) on any network or
    decode failure. _parse_article_html handles its own errors, so only
    the fetch needs guarding here.
    """
    import urllib.request

    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        request = urllib.request.Request(url, headers=browser_headers)
        with urllib.request.urlopen(request, timeout=20) as reply:
            page = reply.read().decode('utf-8', errors='replace')
    except Exception:
        return (None, None)
    return _parse_article_html(page, url)
|
||||
|
||||
|
||||
def _parse_article_html(raw_html: str, url: str) -> tuple[str | None, str | None]:
    """Parse raw HTML into article content and og:image. Returns (content_html, image_url).

    Pipeline: pull og:image/twitter:image meta -> run readability to isolate
    the article body -> rebuild a sanitized subset of tags (bleach) while
    dropping boilerplate -> interleave any article images -> reject output
    that is too short or looks like navigation/consent garbage.

    Any exception anywhere in the pipeline yields (None, None).
    """
    import re
    # NOTE(review): urljoin is imported but never used in this function.
    from urllib.parse import urljoin

    try:

        from readability import Document
        from bs4 import BeautifulSoup

        # Extract og:image (or twitter:image fallback) for the thumbnail.
        og_soup = BeautifulSoup(raw_html, 'html.parser')
        og_image = None
        og_tag = og_soup.find('meta', property='og:image')
        if og_tag and og_tag.get('content'):
            og_image = og_tag['content']
        if not og_image:
            tw_tag = og_soup.find('meta', attrs={'name': 'twitter:image'})
            if tw_tag and tw_tag.get('content'):
                og_image = tw_tag['content']
        import bleach

        # Readability isolates the main article body from page chrome.
        doc = Document(raw_html, url=url)
        content_html = doc.summary()

        # Too little content: return the image anyway so the caller can use it.
        if not content_html or len(content_html.strip()) < 50:
            return (None, og_image)

        reader_soup = BeautifulSoup(content_html, 'html.parser')

        # Boilerplate lines to drop: bylines, photo credits, share/subscribe
        # prompts, tip-line calls to action.
        junk_text_re = re.compile(
            r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
            r'share or comment|share this article|comment on this|follow us on|'
            r'sign up for|subscribe to|have you got a story|tips@|email us)',
            re.I
        )

        # Only these inline tags (and only href on <a>) survive sanitization.
        inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
        inline_attrs = {'a': ['href']}

        # Rebuild the article block-by-block with sanitized inner HTML.
        html_parts = []
        for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
            text = el.get_text(strip=True)
            if len(text) < 30:  # skip stubs (captions, stray labels)
                continue
            if junk_text_re.search(text):
                continue
            tag = el.name
            inner = bleach.clean(
                el.decode_contents(), tags=inline_tags,
                attributes=inline_attrs, strip=True, protocols=['http', 'https']
            ).strip()
            if not inner:
                continue
            if tag == 'p':
                html_parts.append(f'<p>{inner}</p>')
            elif tag in ('h2', 'h3', 'h4'):
                html_parts.append(f'<{tag}>{inner}</{tag}>')
            elif tag == 'blockquote':
                html_parts.append(f'<blockquote><p>{inner}</p></blockquote>')
            elif tag in ('ul', 'ol'):
                # Sanitize each direct <li> separately; keep the list only if
                # at least one item survives.
                items = []
                for li in el.find_all('li', recursive=False):
                    li_inner = bleach.clean(
                        li.decode_contents(), tags=inline_tags,
                        attributes=inline_attrs, strip=True, protocols=['http', 'https']
                    ).strip()
                    if li_inner and len(li.get_text(strip=True)) > 10:
                        items.append(f'<li>{li_inner}</li>')
                if items:
                    html_parts.append(f'<{tag}>{"".join(items)}</{tag}>')

        # Images from readability output (deduped, junk/tracker srcs dropped).
        junk_img_re = re.compile(r'(logo|icon|pixel|spacer|blank|1x1|svg|avatar|spinner|/ct/|cts\.businesswire)', re.I)
        seen_srcs = set()
        article_images = []
        for img in reader_soup.find_all('img'):
            src = img.get('src', '')
            if src and src.startswith(('http://', 'https://')) and src not in seen_srcs:
                if junk_img_re.search(src):
                    continue
                seen_srcs.add(src)
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')

        # If readability found no images, grab first real image from original HTML
        # (checking lazy-load attributes before plain src).
        if not article_images:
            orig_soup = BeautifulSoup(raw_html, 'html.parser')
            for noise in orig_soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                             'aside', 'form', 'noscript', 'svg']):
                noise.decompose()
            for img in orig_soup.find_all('img'):
                src = (img.get('data-src') or img.get('data-lazy-src') or
                       img.get('data-original') or img.get('src') or '')
                if not src or not src.startswith(('http://', 'https://')):
                    continue
                src_lower = src.lower()
                if any(x in src_lower for x in ('logo', 'icon', 'pixel', 'spacer', 'blank',
                                                '1x1', 'svg', 'avatar', 'spinner', '/ct/')):
                    continue
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')
                break  # Only first real image

        # Merge text + images: spread images roughly evenly between text
        # blocks, appending any leftovers at the end.
        if article_images and html_parts:
            text_count = len(html_parts)
            img_count = len(article_images)
            interval = max(1, text_count // (img_count + 1))
            merged = []
            img_idx = 0
            for i, part in enumerate(html_parts):
                merged.append(part)
                if img_idx < img_count and (i + 1) % interval == 0:
                    merged.append(article_images[img_idx])
                    img_idx += 1
            while img_idx < img_count:
                merged.append(article_images[img_idx])
                img_idx += 1
            html_parts = merged
        elif article_images and not html_parts:
            html_parts = article_images

        # Last resort: fall back to plain-text paragraphs from the
        # readability output.
        if not html_parts:
            text = reader_soup.get_text(separator='\n\n', strip=True)
            if text:
                for para in text.split('\n\n'):
                    para = para.strip()
                    if len(para) > 30:
                        html_parts.append(f'<p>{bleach.clean(para)}</p>')

        if not html_parts:
            return (None, og_image)

        # Quality check: drop long blocks whose average "word" length is
        # implausibly high (usually minified JS or concatenated link lists).
        from bs4 import BeautifulSoup as BS
        clean_parts = []
        for part in html_parts:
            part_soup = BS(part, 'html.parser')
            part_text = part_soup.get_text(strip=True)
            if len(part_text) > 100:
                words = part_text.split()
                avg_word_len = len(part_text) / max(len(words), 1)
                if avg_word_len > 12:
                    continue
            clean_parts.append(part)

        if not clean_parts:
            return (None, og_image)

        result = '\n'.join(clean_parts)
        plain_text = BS(result, 'html.parser').get_text(separator=' ', strip=True)

        # Reject whole-article garbage: carousel controls, login/consent
        # prompts, and US-state link lists that readability sometimes keeps.
        garbage_re = re.compile(
            r'(use (left|right|escape)|arrow keys|navigate between|'
            r'sign (in|up) with|we won.t post|social account|'
            r'accept cookies|cookie policy|privacy policy|terms of (use|service)|'
            r'AlabamaAlaska|CaliforniaColorado|United States of America)',
            re.I
        )
        if len(plain_text) < 200 or garbage_re.search(plain_text):
            return (None, og_image)

        return (result, og_image)
    except Exception:
        return (None, None)
|
||||
|
||||
|
||||
def main():
    """Backfill press_articles from Google News for all configured celebrities.

    Flow: read config and dedup state via psql, then for each celebrity query
    Google News RSS in 1-week windows (WEEKS_BACK weeks back), extract and
    cache article content/images, and insert new rows through one shared
    psycopg2 connection.
    """
    # Hoisted to function scope: these were previously re-imported inside the
    # per-article hot loop (re three times, psycopg2 once per article).
    import re
    import psycopg2

    env = os.environ.copy()
    env['PGPASSWORD'] = DB_PASSWORD

    def _psql(args: list) -> str:
        """Run a psql command against media_downloader and return its stdout."""
        return subprocess.run(
            ['psql', '-h', 'localhost', '-U', 'media_downloader',
             '-d', 'media_downloader'] + args,
            capture_output=True, text=True, env=env
        ).stdout

    def _connect():
        """Open a psycopg2 connection to the media_downloader database."""
        return psycopg2.connect(
            host='localhost', user='media_downloader',
            password=env.get('PGPASSWORD', ''), dbname='media_downloader'
        )

    # Configured celebrity ids (stored as a JSON array in press_config row 1).
    raw = _psql(['-tAc', "SELECT celebrity_ids FROM press_config WHERE id = 1"]).strip()
    celebrity_ids = json.loads(raw) if raw else []
    if not celebrity_ids:
        print("No celebrities configured in press_config")
        return

    # Resolve ids to names. Ids are ints parsed from our own DB's JSON, so
    # interpolating them into the IN (...) clause is not attacker-controlled.
    placeholders = ','.join(str(i) for i in celebrity_ids)
    out = _psql(['-tAF', '|', '-c',
                 f"SELECT id, name FROM celebrity_profiles WHERE id IN ({placeholders})"])
    celebrities = []
    for line in out.strip().splitlines():
        if '|' in line:
            parts = line.split('|')
            celebrities.append({'id': int(parts[0]), 'name': parts[1].strip()})

    if not celebrities:
        print("No celebrities found")
        return

    # Existing URL hashes for dedup.
    out = _psql(['-tAc', "SELECT url_hash FROM press_articles"])
    existing_hashes = set(line.strip() for line in out.strip().splitlines() if line.strip())
    print(f"Existing articles: {len(existing_hashes)}")

    # Existing (celebrity_id, title) pairs — catches the same story
    # re-published under a different URL.
    out = _psql(['-tAF', '|', '-c', "SELECT celebrity_id, title FROM press_articles"])
    existing_titles = set()
    for line in out.strip().splitlines():
        if '|' in line:
            parts = line.split('|', 1)
            existing_titles.add((parts[0].strip(), parts[1].strip()))

    now = datetime.now()
    total_new = 0
    total_fetched = 0

    # One shared connection for the whole run (was: a fresh connection per
    # article, i.e. thousands of connect/close cycles).
    pg_conn = _connect()
    try:
        for celeb in celebrities:
            celeb_id = celeb['id']
            celeb_name = celeb['name']
            print(f"\n{'='*60}")
            print(f"Backfilling: {celeb_name} (id={celeb_id})")
            print(f"{'='*60}")

            celeb_new = 0

            # Query in 1-week windows going back
            for week in range(WEEKS_BACK):
                end_dt = now - timedelta(weeks=week)
                start_dt = now - timedelta(weeks=week + 1)

                start_str = start_dt.strftime('%Y-%m-%d')
                end_str = end_dt.strftime('%Y-%m-%d')
                week_label = f"Week -{week+1} ({start_dt.strftime('%b %d')} - {end_dt.strftime('%b %d')})"
                print(f"\n {week_label}...", end='', flush=True)

                articles = fetch_google_news_window(celeb_name, start_str, end_str)
                total_fetched += len(articles)

                if not articles:
                    print(" no articles")
                    continue

                # Warn if we hit the 100 cap (window results may be truncated)
                cap_warning = " [HIT 100 CAP]" if len(articles) >= 100 else ""
                print(f" {len(articles)} found{cap_warning}", flush=True)
                week_new = 0

                for article in articles:
                    google_url = article.get('url', '')
                    if not google_url:
                        continue

                    title = article.get('title', '').strip()
                    if title and (str(celeb_id), title) in existing_titles:
                        continue

                    # Only keep articles where celeb name appears in the title
                    if not title or celeb_name.lower() not in title.lower():
                        continue

                    # Decode Google News URL to real article URL
                    article_url = decode_google_news_url(google_url)
                    if not article_url:
                        continue

                    # Skip domains (and their subdomains) that are JS-rendered
                    # or block scrapers
                    host = urlparse(article_url).netloc.lower()
                    if any(host == d or host.endswith('.' + d) for d in SKIP_DOMAINS):
                        continue

                    url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest()
                    if url_hash in existing_hashes:
                        continue

                    # Parse domain from real URL
                    domain = urlparse(article_url).netloc.replace('www.', '')
                    published_date = article.get('published_date', '')

                    # Extract content and og:image
                    content, og_image = extract_content(article_url)

                    # Cache all inline images in the content to local proxy
                    if content:
                        content = cache_content_images(content)

                    # Snippet: first 300 chars of tag-stripped content, or the
                    # title when extraction failed.
                    if content:
                        snippet = re.sub(r'<[^>]+>', ' ', content)
                        snippet = ' '.join(snippet.split())[:300]
                    else:
                        snippet = title[:300] if title else ''

                    # Cache the og:image locally, fall back to first inline image
                    image_url = cache_press_image(og_image) if og_image else None
                    if not image_url and content:
                        m = re.search(r'<img\s+src="(/api/press/images/[^"]+)"', content)
                        if m:
                            image_url = m.group(1)
                    time.sleep(0.5)  # rate limit: be polite to article hosts

                    inserted = False
                    try:
                        with pg_conn.cursor() as pg_cur:
                            pg_cur.execute("""INSERT INTO press_articles
                                (celebrity_id, title, url, url_hash, domain, published_date,
                                image_url, language, country, article_content, snippet, notified, read)
                                VALUES (%s, %s, %s, %s, %s, %s, %s, 'en', '', %s, %s, 1, 0)
                                ON CONFLICT DO NOTHING""",
                                (celeb_id, title, article_url, url_hash, domain,
                                 published_date, image_url or '', content, snippet))
                            inserted = pg_cur.rowcount > 0
                        pg_conn.commit()
                    except Exception as db_err:
                        print(f" DB error: {db_err}")
                        inserted = False
                        # Keep the shared connection usable for later inserts;
                        # reconnect if the connection itself died.
                        try:
                            pg_conn.rollback()
                        except Exception:
                            pg_conn = _connect()

                    if inserted:
                        week_new += 1
                        existing_hashes.add(url_hash)
                        existing_titles.add((str(celeb_id), title))

                if week_new > 0:
                    print(f" Added {week_new} new articles")
                    celeb_new += week_new

                # Small delay between queries to be polite
                time.sleep(1)

            total_new += celeb_new
            print(f"\n {celeb_name}: {celeb_new} new articles added")
    finally:
        pg_conn.close()

    print(f"\n{'='*60}")
    print(f"DONE: Fetched {total_fetched} total, added {total_new} new articles")
    print(f"{'='*60}")
|
||||
|
||||
|
||||
# Script entry point — run the full backfill when executed directly.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user