678
modules/paid_content/reddit_client.py
Normal file
678
modules/paid_content/reddit_client.py
Normal file
@@ -0,0 +1,678 @@
|
||||
"""
|
||||
Reddit Client for Paid Content - Uses gallery-dl to fetch subreddit posts and download media.
|
||||
|
||||
Adapts the gallery-dl + metadata parsing pattern from reddit_community_monitor.py
|
||||
to produce Post/Attachment objects for the paid content system.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class RedditClient(LoggingMixin):
    """
    Client for fetching Reddit subreddit content via gallery-dl.

    gallery-dl downloads files during fetch, so attachments come with local_path
    already set. The sync handler moves files to their final location.
    """

    # Identifiers written into Post.service_id / Post.platform for every
    # post produced by this client.
    SERVICE_ID = 'reddit'
    PLATFORM = 'reddit'
|
||||
|
||||
def __init__(self, unified_db=None, log_callback=None):
    """Set up logging, keep the DB handle, and locate the gallery-dl binary.

    Args:
        unified_db: Optional database wrapper exposing get_connection();
            used to read stored Reddit cookies.
        log_callback: Optional callable forwarded to the logging mixin.
    """
    self._init_logger('PaidContent', log_callback, default_module='Reddit')
    self.unified_db = unified_db
    # Prefer whatever gallery-dl is on PATH; otherwise fall back to the
    # binary bundled with the media-downloader virtualenv.
    found = shutil.which('gallery-dl')
    self.gallery_dl_path = found if found else '/opt/media-downloader/venv/bin/gallery-dl'
|
||||
|
||||
def get_subreddit_info(self, subreddit: str) -> Optional[Dict]:
    """Get basic subreddit info by checking the Reddit JSON API.

    Args:
        subreddit: Subreddit name (without the r/ prefix).

    Returns:
        Dict with at least creator_id and creator_name. When the subreddit
        is public, also includes display_name, bio, joined_date,
        profile_image_url and banner_image_url. None if the subreddit does
        not exist (404) or the lookup fails entirely.
    """
    import html
    import urllib.request
    import urllib.error

    try:
        # Quick check via Reddit's public JSON endpoint
        url = f'https://www.reddit.com/r/{subreddit}/about.json'
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read().decode())

        sub_data = data.get('data', {})
        display_name = sub_data.get('display_name', subreddit)
        title = sub_data.get('title', '')

        # Extract icon — community_icon is higher res, icon_img is fallback.
        # Reddit HTML-escapes these URLs (e.g. '&amp;'), so decode entities.
        # BUG FIX: the previous replace call was a no-op and left URLs escaped.
        icon_url = (sub_data.get('community_icon') or sub_data.get('icon_img') or '').split('?')[0]
        icon_url = html.unescape(icon_url) if icon_url else None

        # Extract banner — banner_background_image is the main one
        banner_url = sub_data.get('banner_background_image') or sub_data.get('mobile_banner_image') or ''
        banner_url = banner_url.split('?')[0] if banner_url else None
        if banner_url:
            banner_url = html.unescape(banner_url)

        # Build bio from title + public description (+ subscriber count)
        public_desc = sub_data.get('public_description', '')
        bio_parts = []
        if title:
            bio_parts.append(title)
        if public_desc and public_desc != title:
            bio_parts.append(public_desc)
        subscribers = sub_data.get('subscribers')
        if subscribers:
            bio_parts.append(f"{subscribers:,} subscribers")
        bio = ' — '.join(bio_parts) if bio_parts else None

        # Subreddit creation date
        created_utc = sub_data.get('created_utc')
        joined_date = None
        if created_utc:
            try:
                joined_date = datetime.fromtimestamp(created_utc, tz=timezone.utc).strftime('%Y-%m-%d')
            except (ValueError, OSError):
                pass

        # Use the subreddit title as display name (e.g. "Reddit Pics")
        # Fall back to r/name format if no title
        friendly_name = title if title else f'r/{display_name}'

        return {
            'creator_id': display_name.lower(),
            'creator_name': f'r/{display_name}',
            'display_name': friendly_name,
            'bio': bio,
            'joined_date': joined_date,
            'profile_image_url': icon_url or None,
            'banner_image_url': banner_url or None,
        }

    except urllib.error.HTTPError as e:
        if e.code == 404:
            self.log(f"Subreddit r/{subreddit} not found (404)", 'warning')
            return None
        elif e.code == 403:
            # Private/quarantined — still exists, return basic info
            self.log(f"Subreddit r/{subreddit} is private/quarantined", 'warning')
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
        else:
            self.log(f"HTTP {e.code} checking r/{subreddit}", 'warning')
            # Return basic info and let sync verify
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
    except Exception as e:
        self.log(f"Error getting subreddit info for r/{subreddit}: {e}", 'error')
        return None
|
||||
|
||||
def get_posts(self, subreddit: str, since_date: str = None, max_posts: int = 0,
|
||||
progress_callback=None) -> tuple:
|
||||
"""Fetch posts and download media from a subreddit using gallery-dl.
|
||||
|
||||
Args:
|
||||
subreddit: Subreddit name (without r/)
|
||||
since_date: ISO date string; skip posts older than this
|
||||
max_posts: Maximum posts to fetch (0 = unlimited)
|
||||
progress_callback: Optional callable(downloaded_count, skipped_count, latest_file)
|
||||
for live progress updates
|
||||
|
||||
Returns:
|
||||
Tuple of (List[Post], temp_dir_path) — caller must clean up temp_dir
|
||||
when done moving files. Returns ([], None) on failure.
|
||||
"""
|
||||
temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{subreddit}_')
|
||||
|
||||
try:
|
||||
downloaded = self.run_gallery_dl(subreddit, temp_dir, since_date, max_posts,
|
||||
progress_callback=progress_callback)
|
||||
|
||||
if not downloaded:
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
return [], None
|
||||
|
||||
# Group files by post using metadata sidecars
|
||||
grouped = self._group_files_by_post(downloaded, temp_dir, subreddit)
|
||||
|
||||
if not grouped:
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
return [], None
|
||||
|
||||
posts = []
|
||||
for post_id, post_data in grouped.items():
|
||||
attachments = []
|
||||
for file_path in post_data['files']:
|
||||
ext = file_path.suffix.lower()
|
||||
file_type = self._detect_file_type(ext)
|
||||
|
||||
attachments.append(Attachment(
|
||||
name=file_path.name,
|
||||
file_type=file_type,
|
||||
extension=ext,
|
||||
server_path=str(file_path), # temp path, will be moved
|
||||
download_url=None, # Already downloaded
|
||||
file_size=file_path.stat().st_size if file_path.exists() else None,
|
||||
))
|
||||
|
||||
if not attachments:
|
||||
continue
|
||||
|
||||
post = Post(
|
||||
post_id=post_id,
|
||||
service_id=self.SERVICE_ID,
|
||||
platform=self.PLATFORM,
|
||||
creator_id=subreddit.lower(),
|
||||
title=post_data.get('title'),
|
||||
content=post_data.get('title'),
|
||||
published_at=post_data.get('date'),
|
||||
attachments=attachments,
|
||||
)
|
||||
posts.append(post)
|
||||
|
||||
self.log(f"Parsed {len(posts)} posts with {sum(len(p.attachments) for p in posts)} attachments from r/{subreddit}", 'info')
|
||||
return posts, temp_dir
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error fetching posts from r/{subreddit}: {e}", 'error')
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
return [], None
|
||||
|
||||
def run_gallery_dl(self, subreddit: str, temp_dir: str,
                   since_date: str = None, max_posts: int = 0,
                   progress_callback=None, batch_callback=None,
                   batch_size: int = 50) -> dict:
    """Run gallery-dl to download media from a subreddit.

    Streams stdout line-by-line. Calls progress_callback for status updates
    and batch_callback with lists of new file paths for incremental processing.

    Args:
        subreddit: Subreddit name (without r/)
        temp_dir: Directory gallery-dl downloads into
        since_date: Optional ISO date string; posts before it are filtered out
        max_posts: Cap via gallery-dl --range (0 = unlimited)
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time

    # Use a separate download archive for paid content reddit so this
    # feature's dedup state doesn't mix with other gallery-dl users.
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
    ]

    # REST API mode to avoid shared OAuth rate limits
    cmd.extend(['-o', 'extractor.reddit.api=rest'])

    # Limit posts (0 = unlimited)
    if max_posts > 0:
        cmd.extend(['--range', f'1-{max_posts}'])

    # Date filtering — gallery-dl evaluates the --filter expression per post,
    # with `date` available as a datetime in its filter namespace.
    if since_date:
        try:
            cutoff = since_date[:10]  # YYYY-MM-DD
            cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])
        except (ValueError, IndexError):
            pass

    cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')

    # Check for Reddit cookies file; written as a Netscape-format file
    # inside temp_dir so it's cleaned up with everything else.
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])

    self.log(f"Running gallery-dl for r/{subreddit}", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')

    dl_count = 0
    skip_count = 0
    pending_files = []  # files awaiting the next batch_callback flush

    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        # NOTE(review): stderr is piped but only read after exit; a very
        # chatty stderr could fill the pipe and stall gallery-dl — confirm
        # output volume stays small in practice.

        start_time = time.time()
        timeout_secs = 7200  # 2 hours

        while True:
            # Hard wall-clock timeout; kill the child rather than hang forever.
            if time.time() - start_time > timeout_secs:
                proc.kill()
                self.log(f"gallery-dl timed out for r/{subreddit}", 'error')
                break

            line = proc.stdout.readline()
            # Empty read + exited process means stdout is drained.
            if not line and proc.poll() is not None:
                break
            if not line:
                continue

            line = line.strip()
            if not line:
                continue

            if line.startswith('# '):
                # Skipped file (already in archive)
                skip_count += 1
            else:
                # Downloaded file — gallery-dl prints the full path
                dl_count += 1
                file_path = Path(line)
                # Exclude metadata sidecars from the batch stream.
                if file_path.exists() and not file_path.name.endswith('.json'):
                    pending_files.append(file_path)

            # Throttled progress updates: every 5th file seen.
            total = dl_count + skip_count
            if progress_callback and total % 5 == 0:
                progress_callback(dl_count, skip_count, total)

            # Flush batch for processing
            if batch_callback and len(pending_files) >= batch_size:
                batch_callback(list(pending_files))
                pending_files.clear()

        proc.wait()

        # Final batch
        if batch_callback and pending_files:
            batch_callback(list(pending_files))
            pending_files.clear()

        # Final (unthrottled) progress update with the end totals.
        if progress_callback:
            progress_callback(dl_count, skip_count, dl_count + skip_count)

        # gallery-dl exit codes: treat partial-failure codes (1, 4, 5)
        # as acceptable since some posts may simply be unavailable.
        returncode = proc.returncode
        if returncode not in (None, 0, 1, 4, 5):
            stderr = proc.stderr.read()
            self.log(f"gallery-dl returned code {returncode} for r/{subreddit}", 'warning')
            if stderr:
                self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')

    except Exception as e:
        # Best-effort: report the failure but still return whatever counts
        # were accumulated before the error.
        self.log(f"gallery-dl failed for r/{subreddit}: {e}", 'error')

    self.log(f"gallery-dl done for r/{subreddit}: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
|
||||
|
||||
def _group_files_by_post(self, files: List[Path], temp_dir: str,
                         subreddit: str) -> Dict[str, Dict]:
    """Group downloaded files by Reddit post ID using metadata JSON sidecars.

    Adapted from reddit_community_monitor.py:_group_files_by_post

    Args:
        files: Downloaded media file paths.
        temp_dir: Download directory (currently unused; kept for signature
            compatibility with callers).
        subreddit: Fallback subreddit name when metadata lacks one.

    Returns:
        Dict mapping reddit_post_id -> {
            'files': [Path],
            'title': str,
            'date': str,
            'source_url': str
        }
    """
    posts: Dict[str, Dict] = {}

    for file_path in files:
        # Look for matching metadata JSON sidecar — gallery-dl may write
        # either name.ext.json or name.json depending on configuration.
        json_path = file_path.with_suffix(file_path.suffix + '.json')
        if not json_path.exists():
            json_path = file_path.with_suffix('.json')

        metadata = {}
        if json_path.exists():
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)
            # BUG FIX: previously `except (json.JSONDecodeError, Exception)`
            # — the tuple was redundant/misleading since Exception already
            # covers JSONDecodeError (and OSError from open()).
            except Exception as e:
                self.log(f"Failed to parse metadata for {file_path.name}: {e}", 'debug')

        # Extract Reddit post ID from metadata, preferring the direct id.
        reddit_post_id = None
        for key in ('id', 'reddit_id', 'parent_id'):
            if key in metadata:
                reddit_post_id = str(metadata[key])
                break

        if not reddit_post_id:
            # Filename-based fallback: subreddit_postid_num.ext
            # NOTE(review): assumes gallery-dl's default reddit filename
            # layout — verify against the configured filename template.
            parts = file_path.stem.split('_')
            if len(parts) >= 2:
                reddit_post_id = parts[-2] if len(parts) >= 3 else parts[-1]
            else:
                reddit_post_id = file_path.stem

        # Extract post date: try several string formats (interpreted as UTC,
        # converted to local time), then numeric timestamps.
        post_date = None
        if 'date' in metadata:
            date_val = metadata['date']
            if isinstance(date_val, str):
                for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
                    try:
                        utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=timezone.utc)
                        post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
                        break
                    except ValueError:
                        continue
                if not post_date:
                    # Unknown format — keep the raw string rather than drop it.
                    post_date = date_val
            elif isinstance(date_val, (int, float)):
                try:
                    post_date = datetime.fromtimestamp(date_val, tz=timezone.utc).isoformat()
                except (ValueError, OSError):
                    pass

        if not post_date and 'created_utc' in metadata:
            try:
                post_date = datetime.fromtimestamp(metadata['created_utc'], tz=timezone.utc).isoformat()
            except (ValueError, OSError):
                pass

        if not post_date:
            # Last resort: current local time (naive) so the post still sorts.
            post_date = datetime.now().isoformat()

        title = metadata.get('title', metadata.get('description', ''))
        sub = metadata.get('subreddit', subreddit)
        source_url = f"https://www.reddit.com/r/{sub}/comments/{reddit_post_id}" if sub else ''

        # First file for a post establishes its title/date/source_url.
        if reddit_post_id not in posts:
            posts[reddit_post_id] = {
                'files': [],
                'title': title,
                'date': post_date,
                'source_url': source_url,
            }

        posts[reddit_post_id]['files'].append(file_path)

    return posts
|
||||
|
||||
def _get_cookies_file(self) -> Optional[str]:
    """Get Reddit cookies JSON from the scrapers table if configured."""
    if self.unified_db is None:
        return None

    query = ("SELECT cookies FROM scrapers "
             "WHERE name = 'reddit' AND cookies IS NOT NULL")
    try:
        with self.unified_db.get_connection() as conn:
            cur = conn.cursor()
            cur.execute(query)
            result = cur.fetchone()
        if result and result[0]:
            return result[0]
    except Exception as exc:
        # Cookies are optional — a failed lookup just means no cookies.
        self.log(f"Could not load Reddit cookies: {exc}", 'debug')

    return None
|
||||
|
||||
def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
    """Convert JSON cookies array to Netscape cookie file format.

    Args:
        cookies_json: JSON string holding a list of cookie dicts
            (browser-export style keys).
        output_path: Destination path for the Netscape-format file.

    Returns:
        True on success, False if the JSON is not a list or writing fails.
    """
    try:
        parsed = json.loads(cookies_json)
        if not isinstance(parsed, list):
            return False

        with open(output_path, 'w') as fh:
            fh.write("# Netscape HTTP Cookie File\n")
            fh.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")
            for entry in parsed:
                dom = entry.get('domain', '')
                # Leading dot on the domain means the cookie covers subdomains.
                subdomains = 'TRUE' if dom.startswith('.') else 'FALSE'
                cookie_path = entry.get('path', '/')
                secure_flag = 'TRUE' if entry.get('secure', False) else 'FALSE'
                # Browser exports vary in which expiry key they use.
                raw_expiry = entry.get('expirationDate', entry.get('expiry', entry.get('expires', 0)))
                if raw_expiry is None:
                    raw_expiry = 0
                expiry = str(int(float(raw_expiry)))
                fields = [dom, subdomains, cookie_path, secure_flag, expiry,
                          entry.get('name', ''), entry.get('value', '')]
                fh.write("\t".join(fields) + "\n")

        return True
    except Exception as exc:
        self.log(f"Failed to write Netscape cookie file: {exc}", 'error')
        return False
|
||||
|
||||
def get_pullpush_post_ids(self, subreddit: str, after_ts: int = 0,
                          before_ts: int = None,
                          progress_callback=None) -> List[Dict]:
    """Fetch all historical post IDs for a subreddit from the Pullpush (Pushshift) API.

    Paginates through the full archive using created_utc ascending order.
    Rate-limited to ~1 request per 2 seconds.

    Args:
        subreddit: Subreddit name (without r/)
        after_ts: Unix timestamp to start from (0 = beginning of time)
        before_ts: Unix timestamp to stop at (None = no upper limit)
        progress_callback: Optional callable(fetched_count, message)

    Returns:
        List of dicts: [{id, title, created_utc, url, is_gallery, selftext}, ...]
    """
    import time
    import urllib.request
    import urllib.error

    base_url = 'https://api.pullpush.io/reddit/search/submission/'
    all_posts = []
    current_after = after_ts
    page = 0
    # ROBUSTNESS FIX: the previous 429 handling retried forever; cap
    # consecutive rate-limit retries so a blocked client cannot spin
    # indefinitely. The counter resets on any successful request.
    rate_limit_strikes = 0
    max_rate_limit_strikes = 10

    while True:
        params = (
            f'subreddit={subreddit}'
            f'&size=100'
            f'&sort=asc'
            f'&sort_type=created_utc'
            f'&after={current_after}'
        )
        if before_ts is not None:
            params += f'&before={before_ts}'

        url = f'{base_url}?{params}'
        page += 1

        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
            })
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.loads(resp.read().decode())
        except urllib.error.HTTPError as e:
            if e.code == 429:
                rate_limit_strikes += 1
                if rate_limit_strikes > max_rate_limit_strikes:
                    self.log(f"Pullpush rate limited {rate_limit_strikes} times in a row for r/{subreddit}; giving up", 'error')
                    break
                self.log("Pullpush rate limited, waiting 5s...", 'warning')
                time.sleep(5)
                continue
            self.log(f"Pullpush HTTP {e.code} for r/{subreddit}: {e}", 'error')
            break
        except Exception as e:
            self.log(f"Pullpush request failed for r/{subreddit}: {e}", 'error')
            break

        rate_limit_strikes = 0  # successful request resets the strike count

        posts = data.get('data', [])
        if not posts:
            break

        for post in posts:
            all_posts.append({
                'id': post.get('id', ''),
                'title': post.get('title', ''),
                'created_utc': post.get('created_utc', 0),
                'url': post.get('url', ''),
                'is_gallery': post.get('is_gallery', False),
                'selftext': post.get('selftext', ''),
            })

        last_ts = posts[-1].get('created_utc', 0)

        if progress_callback:
            progress_callback(len(all_posts),
                              f"Fetched {len(all_posts)} post IDs (page {page})")

        # Handle stuck pagination — same timestamp repeating
        if last_ts <= current_after:
            current_after = last_ts + 1
        else:
            current_after = last_ts

        # If we got fewer than 100, we've reached the end
        if len(posts) < 100:
            break

        # Rate limit: 2s between requests
        time.sleep(2)

    self.log(f"Pullpush: fetched {len(all_posts)} total post IDs for r/{subreddit}", 'info')
    return all_posts
|
||||
|
||||
def run_gallery_dl_urls(self, urls_file: str, temp_dir: str,
                        progress_callback=None, batch_callback=None,
                        batch_size: int = 50) -> dict:
    """Run gallery-dl with --input-file to download specific Reddit post URLs.

    Same streaming/batch pattern as run_gallery_dl() but reads URLs from a file
    instead of scraping a subreddit listing.

    Args:
        urls_file: Path to file containing one URL per line
        temp_dir: Directory for gallery-dl to download into
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time

    # Same archive as normal Reddit paid content sync, so backfill and
    # regular syncs never re-download each other's files.
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
        '-o', 'extractor.reddit.api=rest',
        '--input-file', urls_file,
    ]

    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])

    self.log(f"Running gallery-dl with input file ({urls_file})", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')

    dl_count = 0
    skip_count = 0
    pending_files = []  # files awaiting the next batch_callback flush

    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        # NOTE(review): as in run_gallery_dl, stderr is only drained after
        # exit; a very chatty stderr could fill the pipe — confirm.

        start_time = time.time()
        timeout_secs = 14400  # 4 hours for backfill (can be large)

        while True:
            # Hard wall-clock timeout; kill the child rather than hang forever.
            if time.time() - start_time > timeout_secs:
                proc.kill()
                self.log("gallery-dl backfill timed out", 'error')
                break

            line = proc.stdout.readline()
            # Empty read + exited process means stdout is drained.
            if not line and proc.poll() is not None:
                break
            if not line:
                continue

            line = line.strip()
            if not line:
                continue

            if line.startswith('# '):
                # Skipped file (already in the shared archive)
                skip_count += 1
            else:
                # Downloaded file — gallery-dl prints the full path;
                # exclude metadata sidecars from the batch stream.
                dl_count += 1
                file_path = Path(line)
                if file_path.exists() and not file_path.name.endswith('.json'):
                    pending_files.append(file_path)

            # Backfill reports progress on every file (no throttling).
            total = dl_count + skip_count
            if progress_callback:
                progress_callback(dl_count, skip_count, total)

            if batch_callback and len(pending_files) >= batch_size:
                batch_callback(list(pending_files))
                pending_files.clear()

        proc.wait()

        # Final batch
        if batch_callback and pending_files:
            batch_callback(list(pending_files))
            pending_files.clear()

        if progress_callback:
            progress_callback(dl_count, skip_count, dl_count + skip_count)

        # gallery-dl partial-failure codes (1, 4, 5) are acceptable here.
        returncode = proc.returncode
        if returncode not in (None, 0, 1, 4, 5):
            stderr = proc.stderr.read()
            self.log(f"gallery-dl backfill returned code {returncode}", 'warning')
            if stderr:
                self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')

    except Exception as e:
        # Best-effort: report but still return accumulated counts.
        self.log(f"gallery-dl backfill failed: {e}", 'error')

    self.log(f"gallery-dl backfill done: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
|
||||
|
||||
@staticmethod
|
||||
def _detect_file_type(ext: str) -> str:
|
||||
"""Detect file type from extension."""
|
||||
ext = ext.lower().lstrip('.')
|
||||
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
|
||||
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg'}
|
||||
|
||||
if ext in image_exts:
|
||||
return 'image'
|
||||
elif ext in video_exts:
|
||||
return 'video'
|
||||
return 'unknown'
|
||||
Reference in New Issue
Block a user