"""
Reddit Client for Paid Content - Uses gallery-dl to fetch subreddit posts and download media.

Adapts the gallery-dl + metadata parsing pattern from reddit_community_monitor.py
to produce Post/Attachment objects for the paid content system.
"""

import asyncio
import json
import os
import shutil
import subprocess
import tempfile
import threading
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, List, Optional

from modules.base_module import LoggingMixin
from .models import Post, Attachment


class RedditClient(LoggingMixin):
    """
    Client for fetching Reddit subreddit content via gallery-dl.

    gallery-dl downloads files during fetch, so every Attachment produced here
    already points at a file on disk (its temp path is stored in
    ``server_path``). The sync handler moves files to their final location.
    """

    SERVICE_ID = 'reddit'
    PLATFORM = 'reddit'

    # Download archive shared by the listing sync and the URL backfill so a
    # file fetched by one is reported as skipped by the other.
    ARCHIVE_DIR = '/opt/media-downloader/data/cache'
    ARCHIVE_FILENAME = 'reddit_paid_gallery_dl_archive.db'

    USER_AGENT = 'Mozilla/5.0 (compatible; media-downloader/1.0)'

    # gallery-dl exit statuses that are not reported as warnings.
    # NOTE(review): presumably "ok / partial download errors" - confirm
    # against the gallery-dl exit-status documentation.
    QUIET_RETURN_CODES = (0, 1, 4, 5)

    # Consecutive HTTP 429 responses tolerated before Pullpush paging gives up.
    PULLPUSH_MAX_RATE_LIMIT_RETRIES = 5

    _IMAGE_EXTS = frozenset({'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'})
    _VIDEO_EXTS = frozenset({'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg'})

    def __init__(self, unified_db=None, log_callback=None):
        """Set up logging, the optional DB handle and the gallery-dl binary path.

        Args:
            unified_db: Optional database wrapper exposing ``get_connection()``;
                only used to look up stored Reddit cookies.
            log_callback: Forwarded to the logging mixin.
        """
        self._init_logger('PaidContent', log_callback, default_module='Reddit')
        self.unified_db = unified_db
        # Prefer whatever is on PATH, fall back to the bundled virtualenv.
        self.gallery_dl_path = shutil.which('gallery-dl') or '/opt/media-downloader/venv/bin/gallery-dl'

    # ------------------------------------------------------------------
    # Subreddit metadata
    # ------------------------------------------------------------------

    @staticmethod
    def _clean_image_url(raw_url) -> Optional[str]:
        """Strip the query string from an image URL and decode ``&amp;``.

        Returns None when the input is empty or nothing is left after cleaning.
        """
        if not raw_url:
            return None
        cleaned = raw_url.split('?')[0].replace('&amp;', '&')
        return cleaned or None

    def get_subreddit_info(self, subreddit: str) -> Optional[Dict]:
        """Get basic subreddit info by checking the Reddit JSON API.

        Args:
            subreddit: Subreddit name (without r/)

        Returns:
            On success a dict with creator_id, creator_name, display_name, bio,
            joined_date, profile_image_url and banner_image_url.
            On an HTTP error other than 404 (e.g. 403 for private/quarantined
            subreddits) a minimal dict with only creator_id and creator_name.
            None when the subreddit is not found (404) or any other error occurs.
        """
        try:
            # Quick check via Reddit's public JSON endpoint
            url = f'https://www.reddit.com/r/{subreddit}/about.json'
            req = urllib.request.Request(url, headers={'User-Agent': self.USER_AGENT})
            with urllib.request.urlopen(req, timeout=15) as resp:
                data = json.loads(resp.read().decode())

            sub_data = data.get('data', {})
            display_name = sub_data.get('display_name', subreddit)
            title = sub_data.get('title', '')

            # Icon - community_icon is higher res, icon_img is the fallback
            icon_url = self._clean_image_url(
                sub_data.get('community_icon') or sub_data.get('icon_img')
            )
            # Banner - banner_background_image is the main one
            banner_url = self._clean_image_url(
                sub_data.get('banner_background_image') or sub_data.get('mobile_banner_image')
            )

            # Build bio from title + public description + subscriber count
            public_desc = sub_data.get('public_description', '')
            bio_parts = []
            if title:
                bio_parts.append(title)
            if public_desc and public_desc != title:
                bio_parts.append(public_desc)
            subscribers = sub_data.get('subscribers')
            if subscribers:
                bio_parts.append(f"{subscribers:,} subscribers")
            bio = ' — '.join(bio_parts) if bio_parts else None

            # Subreddit creation date
            joined_date = None
            created_utc = sub_data.get('created_utc')
            if created_utc:
                try:
                    joined_date = datetime.fromtimestamp(created_utc, tz=timezone.utc).strftime('%Y-%m-%d')
                except (ValueError, OSError):
                    pass

            # Use the subreddit title as display name (e.g. "Reddit Pics"),
            # falling back to the r/name format if there is no title
            friendly_name = title if title else f'r/{display_name}'

            return {
                'creator_id': display_name.lower(),
                'creator_name': f'r/{display_name}',
                'display_name': friendly_name,
                'bio': bio,
                'joined_date': joined_date,
                'profile_image_url': icon_url,
                'banner_image_url': banner_url,
            }

        except urllib.error.HTTPError as e:
            if e.code == 404:
                self.log(f"Subreddit r/{subreddit} not found (404)", 'warning')
                return None
            if e.code == 403:
                # Private/quarantined - still exists, return basic info
                self.log(f"Subreddit r/{subreddit} is private/quarantined", 'warning')
            else:
                # Return basic info and let sync verify
                self.log(f"HTTP {e.code} checking r/{subreddit}", 'warning')
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
        except Exception as e:
            self.log(f"Error getting subreddit info for r/{subreddit}: {e}", 'error')
            return None

    # ------------------------------------------------------------------
    # Listing sync
    # ------------------------------------------------------------------

    def get_posts(self, subreddit: str, since_date: str = None, max_posts: int = 0,
                  progress_callback=None) -> tuple:
        """Fetch posts and download media from a subreddit using gallery-dl.

        Args:
            subreddit: Subreddit name (without r/)
            since_date: ISO date string; skip posts older than this
            max_posts: Maximum posts to fetch (0 = unlimited)
            progress_callback: Optional callable(dl_count, skip_count, total_seen)
                for live progress updates

        Returns:
            Tuple of (List[Post], temp_dir_path) - caller must clean up
            temp_dir when done moving files. Returns ([], None) when nothing
            was downloaded or on failure (the temp dir is removed in that case).
        """
        # Keep path separators and other odd characters out of the dir name.
        safe_name = ''.join(ch if ch.isalnum() or ch in '-_' else '_' for ch in subreddit)
        temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{safe_name}_')

        try:
            # run_gallery_dl only returns counters; the downloaded paths are
            # delivered through batch_callback, so collect them from there.
            downloaded: List[Path] = []
            self.run_gallery_dl(
                subreddit, temp_dir, since_date, max_posts,
                progress_callback=progress_callback,
                batch_callback=downloaded.extend,
            )
            if not downloaded:
                shutil.rmtree(temp_dir, ignore_errors=True)
                return [], None

            # Group files by post using metadata sidecars
            grouped = self._group_files_by_post(downloaded, temp_dir, subreddit)
            if not grouped:
                shutil.rmtree(temp_dir, ignore_errors=True)
                return [], None

            posts = []
            for post_id, post_data in grouped.items():
                attachments = []
                for file_path in post_data['files']:
                    ext = file_path.suffix.lower()
                    attachments.append(Attachment(
                        name=file_path.name,
                        file_type=self._detect_file_type(ext),
                        extension=ext,
                        server_path=str(file_path),  # temp path, will be moved
                        download_url=None,  # Already downloaded
                        file_size=file_path.stat().st_size if file_path.exists() else None,
                    ))

                if not attachments:
                    continue

                posts.append(Post(
                    post_id=post_id,
                    service_id=self.SERVICE_ID,
                    platform=self.PLATFORM,
                    creator_id=subreddit.lower(),
                    title=post_data.get('title'),
                    content=post_data.get('title'),
                    published_at=post_data.get('date'),
                    attachments=attachments,
                ))

            self.log(f"Parsed {len(posts)} posts with {sum(len(p.attachments) for p in posts)} attachments from r/{subreddit}", 'info')
            return posts, temp_dir

        except Exception as e:
            self.log(f"Error fetching posts from r/{subreddit}: {e}", 'error')
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None

    def run_gallery_dl(self, subreddit: str, temp_dir: str, since_date: str = None,
                       max_posts: int = 0, progress_callback=None,
                       batch_callback=None, batch_size: int = 50) -> dict:
        """Run gallery-dl to download media from a subreddit.

        Streams stdout line-by-line. Calls progress_callback for status updates
        and batch_callback with lists of new file paths for incremental processing.

        Args:
            subreddit: Subreddit name (without r/)
            temp_dir: Directory for gallery-dl to download into
            since_date: ISO date string; only its first 10 characters
                (YYYY-MM-DD) are used. Invalid values are ignored with a warning.
            max_posts: Maximum posts to fetch (0 = unlimited)
            progress_callback: Called with (dl_count, skip_count, total_seen)
                on every 5th output line and once at the end
            batch_callback: Called with (new_files: List[Path]) every batch_size files
            batch_size: How many files to accumulate before calling batch_callback

        Returns:
            Dict with dl_count, skip_count, total.
        """
        # Base command already selects REST API mode to avoid shared OAuth rate limits
        cmd = self._base_gallery_dl_cmd(temp_dir)

        # Limit posts (0 = unlimited)
        if max_posts > 0:
            cmd.extend(['--range', f'1-{max_posts}'])

        # Date filtering. The value ends up inside a gallery-dl filter
        # expression, so only a successfully parsed date is ever interpolated.
        if since_date:
            try:
                cutoff = datetime.strptime(str(since_date)[:10], '%Y-%m-%d').strftime('%Y-%m-%d')
            except ValueError:
                self.log(f"Ignoring invalid since_date {since_date!r} for r/{subreddit}", 'warning')
            else:
                cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])

        self._add_cookie_args(cmd, temp_dir)
        cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')

        self.log(f"Running gallery-dl for r/{subreddit}", 'info')
        self.log(f"Command: {' '.join(cmd)}", 'debug')

        return self._stream_gallery_dl(
            cmd,
            timeout_secs=7200,  # 2 hours
            progress_every=5,
            prefix='gallery-dl',
            suffix=f' for r/{subreddit}',
            progress_callback=progress_callback,
            batch_callback=batch_callback,
            batch_size=batch_size,
        )

    # ------------------------------------------------------------------
    # gallery-dl plumbing shared by the listing sync and the URL backfill
    # ------------------------------------------------------------------

    def _base_gallery_dl_cmd(self, temp_dir: str) -> List[str]:
        """Return the gallery-dl arguments common to every invocation.

        Creates the archive directory if needed.
        """
        os.makedirs(self.ARCHIVE_DIR, exist_ok=True)
        archive_path = os.path.join(self.ARCHIVE_DIR, self.ARCHIVE_FILENAME)
        return [
            self.gallery_dl_path,
            '--write-metadata',
            '--download-archive', archive_path,
            '-d', temp_dir,
            '-o', 'extractor.reddit.api=rest',
        ]

    def _add_cookie_args(self, cmd: List[str], temp_dir: str) -> None:
        """Append ``--cookies <file>`` to cmd when Reddit cookies are configured.

        The cookie file is written to ``<temp_dir>/.cookies.txt``.
        """
        cookies_json = self._get_cookies_file()
        if not cookies_json:
            return
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_json, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])

    def _stream_gallery_dl(self, cmd: List[str], *, timeout_secs: float,
                           progress_every: int, prefix: str, suffix: str,
                           progress_callback=None, batch_callback=None,
                           batch_size: int = 50) -> dict:
        """Run a gallery-dl command and turn its stdout into counters and batches.

        Each non-empty stdout line starting with ``# `` counts as a skipped
        file; every other line counts as a download and is treated as a file
        path. Paths that exist and do not end in ``.json`` are queued for
        batch_callback.

        Args:
            cmd: Full argument list to execute.
            timeout_secs: The process is killed after this many seconds.
            progress_every: progress_callback fires when the running total is
                a multiple of this value (1 = every line).
            prefix: Start of every log message ('gallery-dl' / 'gallery-dl backfill').
            suffix: Inserted after the verb in every log message (may be empty).
            progress_callback: Called with (dl_count, skip_count, total_seen).
            batch_callback: Called with a list of new file paths.
            batch_size: Queue length that triggers batch_callback.

        Returns:
            Dict with dl_count, skip_count, total. Errors are logged, never raised.
        """
        dl_count = 0
        skip_count = 0
        pending_files: List[Path] = []
        proc = None
        watchdog = None
        timed_out = threading.Event()

        try:
            # stderr goes to a temp file rather than a pipe: nothing reads it
            # until the process has exited, and an unread pipe would block
            # gallery-dl once the OS pipe buffer fills up.
            with tempfile.TemporaryFile(mode='w+', encoding='utf-8', errors='replace') as stderr_file:
                proc = subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=stderr_file,
                    text=True,
                )

                def _kill_on_timeout():
                    timed_out.set()
                    proc.kill()

                # A timer enforces the deadline even while the loop below is
                # blocked waiting for the next line of output.
                watchdog = threading.Timer(timeout_secs, _kill_on_timeout)
                watchdog.daemon = True
                watchdog.start()

                for raw_line in proc.stdout:
                    line = raw_line.strip()
                    if not line:
                        continue

                    if line.startswith('# '):
                        # Skipped file (already in archive)
                        skip_count += 1
                    else:
                        # Downloaded file - gallery-dl prints the full path
                        dl_count += 1
                        file_path = Path(line)
                        if file_path.exists() and not file_path.name.endswith('.json'):
                            pending_files.append(file_path)

                    total = dl_count + skip_count
                    if progress_callback and total % progress_every == 0:
                        progress_callback(dl_count, skip_count, total)

                    # Flush batch for processing
                    if batch_callback and len(pending_files) >= batch_size:
                        batch_callback(list(pending_files))
                        pending_files.clear()

                proc.wait()
                watchdog.cancel()

                if timed_out.is_set():
                    self.log(f"{prefix} timed out{suffix}", 'error')

                # Final batch
                if batch_callback and pending_files:
                    batch_callback(list(pending_files))
                    pending_files.clear()

                if progress_callback:
                    progress_callback(dl_count, skip_count, dl_count + skip_count)

                returncode = proc.returncode
                if returncode not in self.QUIET_RETURN_CODES:
                    stderr_file.seek(0)
                    stderr = stderr_file.read(500)
                    self.log(f"{prefix} returned code {returncode}{suffix}", 'warning')
                    if stderr:
                        self.log(f"gallery-dl stderr: {stderr}", 'debug')

        except Exception as e:
            self.log(f"{prefix} failed{suffix}: {e}", 'error')
        finally:
            if watchdog is not None:
                watchdog.cancel()
            if proc is not None:
                # Do not leave gallery-dl running if a callback raised.
                if proc.poll() is None:
                    proc.kill()
                    proc.wait()
                if proc.stdout is not None:
                    proc.stdout.close()

        self.log(f"{prefix} done{suffix}: {dl_count} downloaded, {skip_count} skipped", 'info')
        return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}

    # ------------------------------------------------------------------
    # Metadata sidecar parsing
    # ------------------------------------------------------------------

    def _load_sidecar_metadata(self, file_path: Path) -> Dict:
        """Load the JSON sidecar for a media file, or {} if missing/unreadable.

        Looks for ``<name>.<ext>.json`` first, then ``<name>.json``.
        """
        json_path = file_path.with_suffix(file_path.suffix + '.json')
        if not json_path.exists():
            json_path = file_path.with_suffix('.json')
        if not json_path.exists():
            return {}

        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError
            self.log(f"Failed to parse metadata for {file_path.name}: {e}", 'debug')
            return {}

        # The lookups in the caller assume a JSON object
        return metadata if isinstance(metadata, dict) else {}

    @staticmethod
    def _extract_post_id(metadata: Dict, file_path: Path) -> str:
        """Return the Reddit post ID from metadata, else guess it from the filename."""
        for key in ('id', 'reddit_id', 'parent_id'):
            value = metadata.get(key)
            if value:
                return str(value)

        # Filename-based fallback: subreddit_postid_num.ext
        parts = file_path.stem.split('_')
        if len(parts) >= 3:
            return parts[-2]
        if len(parts) == 2:
            return parts[-1]
        return file_path.stem

    @staticmethod
    def _extract_post_date(metadata: Dict) -> str:
        """Return the post date as a string, falling back to the current time.

        String dates in a known format are read as UTC and rendered as naive
        local time (``%Y-%m-%dT%H:%M:%S``); string dates in an unknown format
        are returned unchanged. Numeric ``date`` / ``created_utc`` values are
        rendered as UTC ISO-8601 with offset.
        """
        post_date = None
        date_val = metadata.get('date')

        if isinstance(date_val, str):
            for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
                try:
                    utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=timezone.utc)
                except ValueError:
                    continue
                post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
                break
            if not post_date:
                post_date = date_val
        elif isinstance(date_val, (int, float)):
            try:
                post_date = datetime.fromtimestamp(date_val, tz=timezone.utc).isoformat()
            except (ValueError, OSError, OverflowError):
                pass

        if not post_date and 'created_utc' in metadata:
            try:
                post_date = datetime.fromtimestamp(metadata['created_utc'], tz=timezone.utc).isoformat()
            except (TypeError, ValueError, OSError, OverflowError):
                pass

        return post_date or datetime.now().isoformat()

    def _group_files_by_post(self, files: List[Path], temp_dir: str,
                             subreddit: str) -> Dict[str, Dict]:
        """Group downloaded files by Reddit post ID using metadata JSON sidecars.

        Adapted from reddit_community_monitor.py:_group_files_by_post

        Args:
            files: Downloaded media files.
            temp_dir: Unused; kept for interface compatibility.
            subreddit: Fallback subreddit name for the source URL when the
                metadata does not carry one.

        Returns:
            Dict mapping reddit_post_id -> {
                'files': [Path],
                'title': str,
                'date': str,
                'source_url': str
            }
            Title, date and source_url come from the first file seen for a post.
        """
        posts: Dict[str, Dict] = {}

        for file_path in files:
            metadata = self._load_sidecar_metadata(file_path)
            reddit_post_id = self._extract_post_id(metadata, file_path)

            if reddit_post_id not in posts:
                sub = metadata.get('subreddit', subreddit)
                posts[reddit_post_id] = {
                    'files': [],
                    'title': metadata.get('title', metadata.get('description', '')),
                    'date': self._extract_post_date(metadata),
                    'source_url': f"https://www.reddit.com/r/{sub}/comments/{reddit_post_id}" if sub else '',
                }

            posts[reddit_post_id]['files'].append(file_path)

        return posts

    # ------------------------------------------------------------------
    # Cookies
    # ------------------------------------------------------------------

    def _get_cookies_file(self) -> Optional[str]:
        """Get Reddit cookies JSON from the scrapers table if configured.

        Despite the name this returns the stored cookie JSON text, not a file
        path. Returns None when no DB is configured, no row matches, or the
        lookup fails.
        """
        if not self.unified_db:
            return None

        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT cookies FROM scrapers WHERE name = 'reddit' AND cookies IS NOT NULL"
                )
                row = cursor.fetchone()
                if row and row[0]:
                    return row[0]
        except Exception as e:
            self.log(f"Could not load Reddit cookies: {e}", 'debug')

        return None

    def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
        """Convert JSON cookies array to Netscape cookie file format.

        Args:
            cookies_json: JSON text holding a list of cookie objects.
            output_path: Where to write the cookie file.

        Returns:
            True when the file was written; False when the JSON is not a list
            or anything fails (the error is logged).
        """
        try:
            cookies = json.loads(cookies_json)
            if not isinstance(cookies, list):
                return False

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write("# Netscape HTTP Cookie File\n")
                f.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")
                for cookie in cookies:
                    if not isinstance(cookie, dict):
                        continue  # ignore malformed entries instead of aborting
                    domain = cookie.get('domain', '')
                    include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                    path = cookie.get('path', '/')
                    secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                    # Different exporters use different keys for the expiry
                    expires = cookie.get('expirationDate', cookie.get('expiry', cookie.get('expires', 0)))
                    if expires is None:
                        expires = 0
                    expires = str(int(float(expires)))
                    name = cookie.get('name', '')
                    value = cookie.get('value', '')
                    f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n")
            return True
        except Exception as e:
            self.log(f"Failed to write Netscape cookie file: {e}", 'error')
            return False

    # ------------------------------------------------------------------
    # Historical backfill
    # ------------------------------------------------------------------

    def get_pullpush_post_ids(self, subreddit: str, after_ts: int = 0, before_ts: int = None,
                              progress_callback=None) -> List[Dict]:
        """Fetch all historical post IDs for a subreddit from the Pullpush (Pushshift) API.

        Paginates through the full archive using created_utc ascending order.
        Rate-limited to ~1 request per 2 seconds.

        Args:
            subreddit: Subreddit name (without r/)
            after_ts: Unix timestamp to start from (0 = beginning of time)
            before_ts: Unix timestamp to stop at (None = no upper limit)
            progress_callback: Optional callable(fetched_count, message)

        Returns:
            List of dicts: [{id, title, created_utc, url, is_gallery, selftext}, ...]
            On a request error the posts collected so far are returned.
        """
        base_url = 'https://api.pullpush.io/reddit/search/submission/'
        all_posts = []
        current_after = after_ts
        page = 0
        rate_limit_hits = 0

        while True:
            query = [
                ('subreddit', subreddit),
                ('size', 100),
                ('sort', 'asc'),
                ('sort_type', 'created_utc'),
                ('after', current_after),
            ]
            if before_ts is not None:
                query.append(('before', before_ts))
            url = f'{base_url}?{urllib.parse.urlencode(query)}'

            try:
                req = urllib.request.Request(url, headers={'User-Agent': self.USER_AGENT})
                with urllib.request.urlopen(req, timeout=30) as resp:
                    data = json.loads(resp.read().decode())
            except urllib.error.HTTPError as e:
                if e.code == 429:
                    rate_limit_hits += 1
                    if rate_limit_hits > self.PULLPUSH_MAX_RATE_LIMIT_RETRIES:
                        # Bounded so a permanently throttled client cannot loop forever
                        self.log(f"Pullpush still rate limited after {self.PULLPUSH_MAX_RATE_LIMIT_RETRIES} retries for r/{subreddit}, giving up", 'error')
                        break
                    self.log("Pullpush rate limited, waiting 5s...", 'warning')
                    time.sleep(5)
                    continue
                self.log(f"Pullpush HTTP {e.code} for r/{subreddit}: {e}", 'error')
                break
            except Exception as e:
                self.log(f"Pullpush request failed for r/{subreddit}: {e}", 'error')
                break

            rate_limit_hits = 0
            page += 1  # counts successfully fetched pages only

            posts = data.get('data', [])
            if not posts:
                break

            for post in posts:
                all_posts.append({
                    'id': post.get('id', ''),
                    'title': post.get('title', ''),
                    'created_utc': post.get('created_utc', 0),
                    'url': post.get('url', ''),
                    'is_gallery': post.get('is_gallery', False),
                    'selftext': post.get('selftext', ''),
                })

            last_ts = posts[-1].get('created_utc', 0)

            if progress_callback:
                progress_callback(len(all_posts), f"Fetched {len(all_posts)} post IDs (page {page})")

            # Handle stuck pagination - same timestamp repeating. Always step
            # forward from the current cursor so a missing/old created_utc can
            # never rewind the paging.
            if last_ts <= current_after:
                current_after += 1
            else:
                current_after = last_ts

            # If we got fewer than 100, we've reached the end
            if len(posts) < 100:
                break

            # Rate limit: 2s between requests
            time.sleep(2)

        self.log(f"Pullpush: fetched {len(all_posts)} total post IDs for r/{subreddit}", 'info')
        return all_posts

    def run_gallery_dl_urls(self, urls_file: str, temp_dir: str, progress_callback=None,
                            batch_callback=None, batch_size: int = 50) -> dict:
        """Run gallery-dl with --input-file to download specific Reddit post URLs.

        Same streaming/batch pattern as run_gallery_dl() but reads URLs from a
        file instead of scraping a subreddit listing.

        Args:
            urls_file: Path to file containing one URL per line
            temp_dir: Directory for gallery-dl to download into
            progress_callback: Called with (dl_count, skip_count, total_seen)
                on every output line and once at the end
            batch_callback: Called with (new_files: List[Path]) every batch_size files
            batch_size: How many files to accumulate before calling batch_callback

        Returns:
            Dict with dl_count, skip_count, total.
        """
        # Same archive as normal Reddit paid content sync
        cmd = self._base_gallery_dl_cmd(temp_dir)
        cmd.extend(['--input-file', urls_file])
        self._add_cookie_args(cmd, temp_dir)

        self.log(f"Running gallery-dl with input file ({urls_file})", 'info')
        self.log(f"Command: {' '.join(cmd)}", 'debug')

        return self._stream_gallery_dl(
            cmd,
            timeout_secs=14400,  # 4 hours for backfill (can be large)
            progress_every=1,
            prefix='gallery-dl backfill',
            suffix='',
            progress_callback=progress_callback,
            batch_callback=batch_callback,
            batch_size=batch_size,
        )

    @staticmethod
    def _detect_file_type(ext: str) -> str:
        """Detect file type from extension.

        Args:
            ext: File extension, with or without the leading dot, any case.

        Returns:
            'image', 'video' or 'unknown'.
        """
        ext = ext.lower().lstrip('.')
        if ext in RedditClient._IMAGE_EXTS:
            return 'image'
        if ext in RedditClient._VIDEO_EXTS:
            return 'video'
        return 'unknown'