Files
media-downloader/modules/paid_content/reddit_client.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

679 lines
26 KiB
Python

"""
Reddit Client for Paid Content - Uses gallery-dl to fetch subreddit posts and download media.
Adapts the gallery-dl + metadata parsing pattern from reddit_community_monitor.py
to produce Post/Attachment objects for the paid content system.
"""
import asyncio
import json
import os
import shutil
import subprocess
import tempfile
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, List, Optional
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class RedditClient(LoggingMixin):
    """
    Client for fetching Reddit subreddit content via gallery-dl.
    gallery-dl downloads files during fetch, so attachments come with local_path
    already set. The sync handler moves files to their final location.
    """
    # Identifiers stamped onto every Post produced by this client.
    SERVICE_ID = 'reddit'
    PLATFORM = 'reddit'

    def __init__(self, unified_db=None, log_callback=None):
        """Initialize the client.

        Args:
            unified_db: Optional DB handle used to look up stored Reddit
                cookies (see _get_cookies_file); None disables cookie lookup.
            log_callback: Optional log sink forwarded to the logging mixin.
        """
        self._init_logger('PaidContent', log_callback, default_module='Reddit')
        self.unified_db = unified_db
        # Prefer gallery-dl found on PATH; fall back to the venv install path.
        self.gallery_dl_path = shutil.which('gallery-dl') or '/opt/media-downloader/venv/bin/gallery-dl'
def get_subreddit_info(self, subreddit: str) -> Optional[Dict]:
    """Get basic subreddit info by checking the Reddit JSON API.

    Args:
        subreddit: Subreddit name (without the r/ prefix).

    Returns:
        Dict with creator_id and creator_name (plus display_name, bio,
        joined_date and image URLs when the lookup succeeds); a minimal
        dict for private/quarantined or transient-error cases; None when
        the subreddit does not exist or the request fails outright.
    """
    import urllib.request
    import urllib.error
    try:
        # Quick check via Reddit's public JSON endpoint
        url = f'https://www.reddit.com/r/{subreddit}/about.json'
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read().decode())
        sub_data = data.get('data', {})
        display_name = sub_data.get('display_name', subreddit)
        title = sub_data.get('title', '')
        # Extract icon — community_icon is higher res, icon_img is fallback
        icon_url = (sub_data.get('community_icon') or sub_data.get('icon_img') or '').split('?')[0]
        # Reddit HTML-escapes ampersands in these URLs
        icon_url = icon_url.replace('&amp;', '&') if icon_url else None
        # Extract banner — banner_background_image is the main one
        banner_url = sub_data.get('banner_background_image') or sub_data.get('mobile_banner_image') or ''
        banner_url = banner_url.split('?')[0] if banner_url else None
        if banner_url:
            banner_url = banner_url.replace('&amp;', '&')
        # Build bio from title + public description
        public_desc = sub_data.get('public_description', '')
        bio_parts = []
        if title:
            bio_parts.append(title)
        if public_desc and public_desc != title:
            bio_parts.append(public_desc)
        subscribers = sub_data.get('subscribers')
        if subscribers:
            bio_parts.append(f"{subscribers:,} subscribers")
        # BUGFIX: parts were joined with '' which mashed the title,
        # description and subscriber count into one unbroken string.
        bio = '\n'.join(bio_parts) if bio_parts else None
        # Subreddit creation date
        created_utc = sub_data.get('created_utc')
        joined_date = None
        if created_utc:
            try:
                joined_date = datetime.fromtimestamp(created_utc, tz=timezone.utc).strftime('%Y-%m-%d')
            except (ValueError, OSError):
                pass
        # Use the subreddit title as display name (e.g. "Reddit Pics")
        # Fall back to r/name format if no title
        friendly_name = title if title else f'r/{display_name}'
        return {
            'creator_id': display_name.lower(),
            'creator_name': f'r/{display_name}',
            'display_name': friendly_name,
            'bio': bio,
            'joined_date': joined_date,
            'profile_image_url': icon_url or None,
            'banner_image_url': banner_url or None,
        }
    except urllib.error.HTTPError as e:
        if e.code == 404:
            self.log(f"Subreddit r/{subreddit} not found (404)", 'warning')
            return None
        elif e.code == 403:
            # Private/quarantined — still exists, return basic info
            self.log(f"Subreddit r/{subreddit} is private/quarantined", 'warning')
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
        else:
            self.log(f"HTTP {e.code} checking r/{subreddit}", 'warning')
            # Return basic info and let sync verify
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
    except Exception as e:
        self.log(f"Error getting subreddit info for r/{subreddit}: {e}", 'error')
        return None
def get_posts(self, subreddit: str, since_date: str = None, max_posts: int = 0,
              progress_callback=None) -> tuple:
    """Fetch posts and download media from a subreddit using gallery-dl.

    Args:
        subreddit: Subreddit name (without r/)
        since_date: ISO date string; skip posts older than this
        max_posts: Maximum posts to fetch (0 = unlimited)
        progress_callback: Optional callable(downloaded_count, skipped_count, latest_file)
            for live progress updates

    Returns:
        Tuple of (List[Post], temp_dir_path) — caller must clean up temp_dir
        when done moving files. Returns ([], None) on failure.
    """
    temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{subreddit}_')
    try:
        # BUGFIX: run_gallery_dl() returns a stats dict, not the downloaded
        # file paths. The dict is always truthy (so the empty-result branch
        # never fired) and cannot be grouped by _group_files_by_post().
        # Collect the actual Path objects via batch_callback instead.
        downloaded_files: List[Path] = []
        self.run_gallery_dl(subreddit, temp_dir, since_date, max_posts,
                            progress_callback=progress_callback,
                            batch_callback=downloaded_files.extend)
        if not downloaded_files:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None
        # Group files by post using metadata sidecars
        grouped = self._group_files_by_post(downloaded_files, temp_dir, subreddit)
        if not grouped:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None
        posts = []
        for post_id, post_data in grouped.items():
            attachments = []
            for file_path in post_data['files']:
                ext = file_path.suffix.lower()
                file_type = self._detect_file_type(ext)
                attachments.append(Attachment(
                    name=file_path.name,
                    file_type=file_type,
                    extension=ext,
                    server_path=str(file_path),  # temp path, will be moved
                    download_url=None,  # Already downloaded
                    file_size=file_path.stat().st_size if file_path.exists() else None,
                ))
            if not attachments:
                continue
            post = Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=subreddit.lower(),
                title=post_data.get('title'),
                content=post_data.get('title'),
                published_at=post_data.get('date'),
                attachments=attachments,
            )
            posts.append(post)
        self.log(f"Parsed {len(posts)} posts with {sum(len(p.attachments) for p in posts)} attachments from r/{subreddit}", 'info')
        return posts, temp_dir
    except Exception as e:
        self.log(f"Error fetching posts from r/{subreddit}: {e}", 'error')
        shutil.rmtree(temp_dir, ignore_errors=True)
        return [], None
def run_gallery_dl(self, subreddit: str, temp_dir: str,
                   since_date: str = None, max_posts: int = 0,
                   progress_callback=None, batch_callback=None,
                   batch_size: int = 50) -> dict:
    """Run gallery-dl to download media from a subreddit.

    Streams stdout line-by-line. Calls progress_callback for status updates
    and batch_callback with lists of new file paths for incremental processing.

    Args:
        subreddit: Subreddit name (without r/)
        temp_dir: Directory gallery-dl downloads into
        since_date: ISO date string; used to build a gallery-dl --filter
        max_posts: Limit passed to gallery-dl --range (0 = unlimited);
            NOTE(review): --range counts files, not posts — confirm intent
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time
    # Use a separate download archive for paid content reddit
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')
    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
    ]
    # REST API mode to avoid shared OAuth rate limits
    cmd.extend(['-o', 'extractor.reddit.api=rest'])
    # Limit posts (0 = unlimited)
    if max_posts > 0:
        cmd.extend(['--range', f'1-{max_posts}'])
    # Date filtering
    if since_date:
        try:
            cutoff = since_date[:10]  # YYYY-MM-DD
            cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])
        except (ValueError, IndexError):
            pass
    cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')
    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])
    self.log(f"Running gallery-dl for r/{subreddit}", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')
    dl_count = 0
    skip_count = 0
    pending_files = []
    try:
        # BUGFIX: stderr was previously a PIPE that nothing drained while
        # stdout streamed; if gallery-dl is chatty on stderr the OS pipe
        # buffer fills and the child deadlocks. Spool stderr to a file in
        # temp_dir and read it back after the process exits.
        stderr_path = os.path.join(temp_dir, '.gallery_dl_stderr.log')
        with open(stderr_path, 'w+', encoding='utf-8', errors='replace') as stderr_f:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_f, text=True
            )
            start_time = time.time()
            timeout_secs = 7200  # 2 hours
            while True:
                # NOTE(review): readline() blocks, so this deadline is only
                # checked when gallery-dl emits output.
                if time.time() - start_time > timeout_secs:
                    proc.kill()
                    self.log(f"gallery-dl timed out for r/{subreddit}", 'error')
                    break
                line = proc.stdout.readline()
                if not line and proc.poll() is not None:
                    break
                if not line:
                    continue
                line = line.strip()
                if not line:
                    continue
                if line.startswith('# '):
                    # Skipped file (already in archive)
                    skip_count += 1
                else:
                    # Downloaded file — gallery-dl prints the full path
                    dl_count += 1
                    file_path = Path(line)
                    if file_path.exists() and not file_path.name.endswith('.json'):
                        pending_files.append(file_path)
                total = dl_count + skip_count
                if progress_callback and total % 5 == 0:
                    progress_callback(dl_count, skip_count, total)
                # Flush batch for processing
                if batch_callback and len(pending_files) >= batch_size:
                    batch_callback(list(pending_files))
                    pending_files.clear()
            proc.wait()
            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()
            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)
            returncode = proc.returncode
            # 1/4/5 are treated as partial-failure codes that still yield
            # usable downloads — TODO confirm against gallery-dl docs
            if returncode not in (None, 0, 1, 4, 5):
                stderr_f.flush()
                stderr_f.seek(0)
                stderr = stderr_f.read()
                self.log(f"gallery-dl returned code {returncode} for r/{subreddit}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')
    except Exception as e:
        self.log(f"gallery-dl failed for r/{subreddit}: {e}", 'error')
    self.log(f"gallery-dl done for r/{subreddit}: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
def _group_files_by_post(self, files: List[Path], temp_dir: str,
subreddit: str) -> Dict[str, Dict]:
"""Group downloaded files by Reddit post ID using metadata JSON sidecars.
Adapted from reddit_community_monitor.py:_group_files_by_post
Returns:
Dict mapping reddit_post_id -> {
'files': [Path],
'title': str,
'date': str,
'source_url': str
}
"""
posts: Dict[str, Dict] = {}
for file_path in files:
# Look for matching metadata JSON sidecar
json_path = file_path.with_suffix(file_path.suffix + '.json')
if not json_path.exists():
json_path = file_path.with_suffix('.json')
metadata = {}
if json_path.exists():
try:
with open(json_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
except (json.JSONDecodeError, Exception) as e:
self.log(f"Failed to parse metadata for {file_path.name}: {e}", 'debug')
# Extract Reddit post ID
reddit_post_id = None
for key in ('id', 'reddit_id', 'parent_id'):
if key in metadata:
reddit_post_id = str(metadata[key])
break
if not reddit_post_id:
# Filename-based fallback: subreddit_postid_num.ext
parts = file_path.stem.split('_')
if len(parts) >= 2:
reddit_post_id = parts[-2] if len(parts) >= 3 else parts[-1]
else:
reddit_post_id = file_path.stem
# Extract post date
post_date = None
if 'date' in metadata:
date_val = metadata['date']
if isinstance(date_val, str):
for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
try:
utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=timezone.utc)
post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
break
except ValueError:
continue
if not post_date:
post_date = date_val
elif isinstance(date_val, (int, float)):
try:
post_date = datetime.fromtimestamp(date_val, tz=timezone.utc).isoformat()
except (ValueError, OSError):
pass
if not post_date and 'created_utc' in metadata:
try:
post_date = datetime.fromtimestamp(metadata['created_utc'], tz=timezone.utc).isoformat()
except (ValueError, OSError):
pass
if not post_date:
post_date = datetime.now().isoformat()
title = metadata.get('title', metadata.get('description', ''))
sub = metadata.get('subreddit', subreddit)
source_url = f"https://www.reddit.com/r/{sub}/comments/{reddit_post_id}" if sub else ''
if reddit_post_id not in posts:
posts[reddit_post_id] = {
'files': [],
'title': title,
'date': post_date,
'source_url': source_url,
}
posts[reddit_post_id]['files'].append(file_path)
return posts
def _get_cookies_file(self) -> Optional[str]:
"""Get Reddit cookies JSON from the scrapers table if configured."""
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT cookies FROM scrapers WHERE name = 'reddit' AND cookies IS NOT NULL"
)
row = cursor.fetchone()
if row and row[0]:
return row[0]
except Exception as e:
self.log(f"Could not load Reddit cookies: {e}", 'debug')
return None
def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
"""Convert JSON cookies array to Netscape cookie file format."""
try:
cookies = json.loads(cookies_json)
if not isinstance(cookies, list):
return False
with open(output_path, 'w') as f:
f.write("# Netscape HTTP Cookie File\n")
f.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")
for cookie in cookies:
domain = cookie.get('domain', '')
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
path = cookie.get('path', '/')
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
expires = cookie.get('expirationDate', cookie.get('expiry', cookie.get('expires', 0)))
if expires is None:
expires = 0
expires = str(int(float(expires)))
name = cookie.get('name', '')
value = cookie.get('value', '')
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n")
return True
except Exception as e:
self.log(f"Failed to write Netscape cookie file: {e}", 'error')
return False
def get_pullpush_post_ids(self, subreddit: str, after_ts: int = 0,
                          before_ts: int = None,
                          progress_callback=None) -> List[Dict]:
    """Fetch all historical post IDs for a subreddit from the Pullpush (Pushshift) API.

    Paginates through the full archive using created_utc ascending order.
    Rate-limited to ~1 request per 2 seconds.

    Args:
        subreddit: Subreddit name (without r/)
        after_ts: Unix timestamp to start from (0 = beginning of time)
        before_ts: Unix timestamp to stop at (None = no upper limit)
        progress_callback: Optional callable(fetched_count, message)

    Returns:
        List of dicts: [{id, title, created_utc, url, is_gallery, selftext}, ...]
    """
    import time
    import urllib.request
    import urllib.error
    base_url = 'https://api.pullpush.io/reddit/search/submission/'
    all_posts = []
    current_after = after_ts
    page = 0
    # ROBUSTNESS: a persistent 429 previously retried forever; cap the
    # number of consecutive rate-limit retries.
    retries_429 = 0
    max_429_retries = 10
    while True:
        params = (
            f'subreddit={subreddit}'
            f'&size=100'
            f'&sort=asc'
            f'&sort_type=created_utc'
            f'&after={current_after}'
        )
        if before_ts is not None:
            params += f'&before={before_ts}'
        url = f'{base_url}?{params}'
        page += 1
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
            })
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.loads(resp.read().decode())
        except urllib.error.HTTPError as e:
            if e.code == 429:
                retries_429 += 1
                if retries_429 > max_429_retries:
                    self.log(f"Pullpush rate limited {max_429_retries} times in a row for r/{subreddit}, giving up", 'error')
                    break
                self.log("Pullpush rate limited, waiting 5s...", 'warning')
                time.sleep(5)
                continue
            self.log(f"Pullpush HTTP {e.code} for r/{subreddit}: {e}", 'error')
            break
        except Exception as e:
            self.log(f"Pullpush request failed for r/{subreddit}: {e}", 'error')
            break
        # A successful request resets the rate-limit retry budget
        retries_429 = 0
        posts = data.get('data', [])
        if not posts:
            break
        for post in posts:
            all_posts.append({
                'id': post.get('id', ''),
                'title': post.get('title', ''),
                'created_utc': post.get('created_utc', 0),
                'url': post.get('url', ''),
                'is_gallery': post.get('is_gallery', False),
                'selftext': post.get('selftext', ''),
            })
        last_ts = posts[-1].get('created_utc', 0)
        if progress_callback:
            progress_callback(len(all_posts),
                              f"Fetched {len(all_posts)} post IDs (page {page})")
        # Handle stuck pagination — same timestamp repeating
        if last_ts <= current_after:
            current_after = last_ts + 1
        else:
            current_after = last_ts
        # If we got fewer than 100, we've reached the end
        if len(posts) < 100:
            break
        # Rate limit: 2s between requests
        time.sleep(2)
    self.log(f"Pullpush: fetched {len(all_posts)} total post IDs for r/{subreddit}", 'info')
    return all_posts
def run_gallery_dl_urls(self, urls_file: str, temp_dir: str,
                        progress_callback=None, batch_callback=None,
                        batch_size: int = 50) -> dict:
    """Run gallery-dl with --input-file to download specific Reddit post URLs.

    Same streaming/batch pattern as run_gallery_dl() but reads URLs from a file
    instead of scraping a subreddit listing.

    Args:
        urls_file: Path to file containing one URL per line
        temp_dir: Directory for gallery-dl to download into
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time
    # Same archive as normal Reddit paid content sync
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')
    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
        '-o', 'extractor.reddit.api=rest',
        '--input-file', urls_file,
    ]
    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])
    self.log(f"Running gallery-dl with input file ({urls_file})", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')
    dl_count = 0
    skip_count = 0
    pending_files = []
    try:
        # BUGFIX: stderr was an unread PIPE — a chatty child could fill the
        # OS pipe buffer and deadlock. Spool stderr to a file in temp_dir
        # instead and read it back after exit (same fix as run_gallery_dl).
        stderr_path = os.path.join(temp_dir, '.gallery_dl_stderr.log')
        with open(stderr_path, 'w+', encoding='utf-8', errors='replace') as stderr_f:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_f, text=True
            )
            start_time = time.time()
            timeout_secs = 14400  # 4 hours for backfill (can be large)
            while True:
                # NOTE(review): readline() blocks, so the deadline is only
                # checked when gallery-dl emits output.
                if time.time() - start_time > timeout_secs:
                    proc.kill()
                    self.log("gallery-dl backfill timed out", 'error')
                    break
                line = proc.stdout.readline()
                if not line and proc.poll() is not None:
                    break
                if not line:
                    continue
                line = line.strip()
                if not line:
                    continue
                if line.startswith('# '):
                    # Skipped file (already in archive)
                    skip_count += 1
                else:
                    # Downloaded file — gallery-dl prints the full path
                    dl_count += 1
                    file_path = Path(line)
                    if file_path.exists() and not file_path.name.endswith('.json'):
                        pending_files.append(file_path)
                total = dl_count + skip_count
                if progress_callback:
                    progress_callback(dl_count, skip_count, total)
                if batch_callback and len(pending_files) >= batch_size:
                    batch_callback(list(pending_files))
                    pending_files.clear()
            proc.wait()
            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()
            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)
            returncode = proc.returncode
            # 1/4/5 are treated as partial-failure codes that still yield
            # usable downloads — TODO confirm against gallery-dl docs
            if returncode not in (None, 0, 1, 4, 5):
                stderr_f.flush()
                stderr_f.seek(0)
                stderr = stderr_f.read()
                self.log(f"gallery-dl backfill returned code {returncode}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')
    except Exception as e:
        self.log(f"gallery-dl backfill failed: {e}", 'error')
    self.log(f"gallery-dl backfill done: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
@staticmethod
def _detect_file_type(ext: str) -> str:
"""Detect file type from extension."""
ext = ext.lower().lstrip('.')
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg'}
if ext in image_exts:
return 'image'
elif ext in video_exts:
return 'video'
return 'unknown'