Files
media-downloader/modules/paid_content/reddit_client.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

679 lines
26 KiB
Python

"""
Reddit Client for Paid Content - Uses gallery-dl to fetch subreddit posts and download media.
Adapts the gallery-dl + metadata parsing pattern from reddit_community_monitor.py
to produce Post/Attachment objects for the paid content system.
"""
import asyncio
import json
import os
import shutil
import subprocess
import tempfile
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, List, Optional
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class RedditClient(LoggingMixin):
    """
    Client for fetching Reddit subreddit content via gallery-dl.
    gallery-dl downloads files during fetch, so attachments come with local_path
    already set. The sync handler moves files to their final location.
    """
    # Identifiers stamped onto every Post produced by this client.
    SERVICE_ID = 'reddit'
    PLATFORM = 'reddit'

    def __init__(self, unified_db=None, log_callback=None):
        """Initialize the client.

        Args:
            unified_db: Optional DB handle used to look up stored Reddit
                cookies (see _get_cookies_file); None disables cookie lookup.
            log_callback: Optional log sink forwarded to the logging mixin.
        """
        self._init_logger('PaidContent', log_callback, default_module='Reddit')
        self.unified_db = unified_db
        # Prefer gallery-dl found on PATH; fall back to the venv install path.
        self.gallery_dl_path = shutil.which('gallery-dl') or '/opt/media-downloader/venv/bin/gallery-dl'
def get_subreddit_info(self, subreddit: str) -> Optional[Dict]:
    """Get basic subreddit info by checking the Reddit JSON API.

    Args:
        subreddit: Subreddit name (without the r/ prefix).

    Returns:
        Dict with creator_id and creator_name (plus display_name, bio,
        joined_date and image URLs when the lookup succeeds); a minimal
        dict for private/quarantined or transient-error cases; None when
        the subreddit does not exist or the request fails outright.
    """
    import urllib.request
    import urllib.error
    try:
        # Quick check via Reddit's public JSON endpoint
        url = f'https://www.reddit.com/r/{subreddit}/about.json'
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read().decode())
        sub_data = data.get('data', {})
        display_name = sub_data.get('display_name', subreddit)
        title = sub_data.get('title', '')
        # Extract icon — community_icon is higher res, icon_img is fallback
        icon_url = (sub_data.get('community_icon') or sub_data.get('icon_img') or '').split('?')[0]
        # Reddit HTML-escapes ampersands in these URLs
        icon_url = icon_url.replace('&amp;', '&') if icon_url else None
        # Extract banner — banner_background_image is the main one
        banner_url = sub_data.get('banner_background_image') or sub_data.get('mobile_banner_image') or ''
        banner_url = banner_url.split('?')[0] if banner_url else None
        if banner_url:
            banner_url = banner_url.replace('&amp;', '&')
        # Build bio from title + public description
        public_desc = sub_data.get('public_description', '')
        bio_parts = []
        if title:
            bio_parts.append(title)
        if public_desc and public_desc != title:
            bio_parts.append(public_desc)
        subscribers = sub_data.get('subscribers')
        if subscribers:
            bio_parts.append(f"{subscribers:,} subscribers")
        # BUGFIX: parts were joined with '' which mashed the title,
        # description and subscriber count into one unbroken string.
        bio = '\n'.join(bio_parts) if bio_parts else None
        # Subreddit creation date
        created_utc = sub_data.get('created_utc')
        joined_date = None
        if created_utc:
            try:
                joined_date = datetime.fromtimestamp(created_utc, tz=timezone.utc).strftime('%Y-%m-%d')
            except (ValueError, OSError):
                pass
        # Use the subreddit title as display name (e.g. "Reddit Pics")
        # Fall back to r/name format if no title
        friendly_name = title if title else f'r/{display_name}'
        return {
            'creator_id': display_name.lower(),
            'creator_name': f'r/{display_name}',
            'display_name': friendly_name,
            'bio': bio,
            'joined_date': joined_date,
            'profile_image_url': icon_url or None,
            'banner_image_url': banner_url or None,
        }
    except urllib.error.HTTPError as e:
        if e.code == 404:
            self.log(f"Subreddit r/{subreddit} not found (404)", 'warning')
            return None
        elif e.code == 403:
            # Private/quarantined — still exists, return basic info
            self.log(f"Subreddit r/{subreddit} is private/quarantined", 'warning')
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
        else:
            self.log(f"HTTP {e.code} checking r/{subreddit}", 'warning')
            # Return basic info and let sync verify
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
    except Exception as e:
        self.log(f"Error getting subreddit info for r/{subreddit}: {e}", 'error')
        return None
def get_posts(self, subreddit: str, since_date: str = None, max_posts: int = 0,
              progress_callback=None) -> tuple:
    """Fetch posts and download media from a subreddit using gallery-dl.

    Args:
        subreddit: Subreddit name (without r/)
        since_date: ISO date string; skip posts older than this
        max_posts: Maximum posts to fetch (0 = unlimited)
        progress_callback: Optional callable(downloaded_count, skipped_count, latest_file)
            for live progress updates

    Returns:
        Tuple of (List[Post], temp_dir_path) — caller must clean up temp_dir
        when done moving files. Returns ([], None) on failure.
    """
    temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{subreddit}_')
    try:
        # BUGFIX: run_gallery_dl() returns a stats dict, not the downloaded
        # file paths. The dict is always truthy (so the empty-result branch
        # never fired) and cannot be grouped by _group_files_by_post().
        # Collect the actual Path objects via batch_callback instead.
        downloaded_files: List[Path] = []
        self.run_gallery_dl(subreddit, temp_dir, since_date, max_posts,
                            progress_callback=progress_callback,
                            batch_callback=downloaded_files.extend)
        if not downloaded_files:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None
        # Group files by post using metadata sidecars
        grouped = self._group_files_by_post(downloaded_files, temp_dir, subreddit)
        if not grouped:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None
        posts = []
        for post_id, post_data in grouped.items():
            attachments = []
            for file_path in post_data['files']:
                ext = file_path.suffix.lower()
                file_type = self._detect_file_type(ext)
                attachments.append(Attachment(
                    name=file_path.name,
                    file_type=file_type,
                    extension=ext,
                    server_path=str(file_path),  # temp path, will be moved
                    download_url=None,  # Already downloaded
                    file_size=file_path.stat().st_size if file_path.exists() else None,
                ))
            if not attachments:
                continue
            post = Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=subreddit.lower(),
                title=post_data.get('title'),
                content=post_data.get('title'),
                published_at=post_data.get('date'),
                attachments=attachments,
            )
            posts.append(post)
        self.log(f"Parsed {len(posts)} posts with {sum(len(p.attachments) for p in posts)} attachments from r/{subreddit}", 'info')
        return posts, temp_dir
    except Exception as e:
        self.log(f"Error fetching posts from r/{subreddit}: {e}", 'error')
        shutil.rmtree(temp_dir, ignore_errors=True)
        return [], None
def run_gallery_dl(self, subreddit: str, temp_dir: str,
                   since_date: str = None, max_posts: int = 0,
                   progress_callback=None, batch_callback=None,
                   batch_size: int = 50) -> dict:
    """Run gallery-dl to download media from a subreddit.

    Streams stdout line-by-line. Calls progress_callback for status updates
    and batch_callback with lists of new file paths for incremental processing.

    Args:
        subreddit: Subreddit name (without r/)
        temp_dir: Directory gallery-dl downloads into
        since_date: ISO date string; used to build a gallery-dl --filter
        max_posts: Limit passed to gallery-dl --range (0 = unlimited);
            NOTE(review): --range counts files, not posts — confirm intent
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time
    # Use a separate download archive for paid content reddit
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')
    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
    ]
    # REST API mode to avoid shared OAuth rate limits
    cmd.extend(['-o', 'extractor.reddit.api=rest'])
    # Limit posts (0 = unlimited)
    if max_posts > 0:
        cmd.extend(['--range', f'1-{max_posts}'])
    # Date filtering
    if since_date:
        try:
            cutoff = since_date[:10]  # YYYY-MM-DD
            cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])
        except (ValueError, IndexError):
            pass
    cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')
    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])
    self.log(f"Running gallery-dl for r/{subreddit}", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')
    dl_count = 0
    skip_count = 0
    pending_files = []
    try:
        # BUGFIX: stderr was previously a PIPE that nothing drained while
        # stdout streamed; if gallery-dl is chatty on stderr the OS pipe
        # buffer fills and the child deadlocks. Spool stderr to a file in
        # temp_dir and read it back after the process exits.
        stderr_path = os.path.join(temp_dir, '.gallery_dl_stderr.log')
        with open(stderr_path, 'w+', encoding='utf-8', errors='replace') as stderr_f:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_f, text=True
            )
            start_time = time.time()
            timeout_secs = 7200  # 2 hours
            while True:
                # NOTE(review): readline() blocks, so this deadline is only
                # checked when gallery-dl emits output.
                if time.time() - start_time > timeout_secs:
                    proc.kill()
                    self.log(f"gallery-dl timed out for r/{subreddit}", 'error')
                    break
                line = proc.stdout.readline()
                if not line and proc.poll() is not None:
                    break
                if not line:
                    continue
                line = line.strip()
                if not line:
                    continue
                if line.startswith('# '):
                    # Skipped file (already in archive)
                    skip_count += 1
                else:
                    # Downloaded file — gallery-dl prints the full path
                    dl_count += 1
                    file_path = Path(line)
                    if file_path.exists() and not file_path.name.endswith('.json'):
                        pending_files.append(file_path)
                total = dl_count + skip_count
                if progress_callback and total % 5 == 0:
                    progress_callback(dl_count, skip_count, total)
                # Flush batch for processing
                if batch_callback and len(pending_files) >= batch_size:
                    batch_callback(list(pending_files))
                    pending_files.clear()
            proc.wait()
            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()
            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)
            returncode = proc.returncode
            # 1/4/5 are treated as partial-failure codes that still yield
            # usable downloads — TODO confirm against gallery-dl docs
            if returncode not in (None, 0, 1, 4, 5):
                stderr_f.flush()
                stderr_f.seek(0)
                stderr = stderr_f.read()
                self.log(f"gallery-dl returned code {returncode} for r/{subreddit}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')
    except Exception as e:
        self.log(f"gallery-dl failed for r/{subreddit}: {e}", 'error')
    self.log(f"gallery-dl done for r/{subreddit}: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
def _group_files_by_post(self, files: List[Path], temp_dir: str,
subreddit: str) -> Dict[str, Dict]:
"""Group downloaded files by Reddit post ID using metadata JSON sidecars.
Adapted from reddit_community_monitor.py:_group_files_by_post
Returns:
Dict mapping reddit_post_id -> {
'files': [Path],
'title': str,
'date': str,
'source_url': str
}
"""
posts: Dict[str, Dict] = {}
for file_path in files:
# Look for matching metadata JSON sidecar
json_path = file_path.with_suffix(file_path.suffix + '.json')
if not json_path.exists():
json_path = file_path.with_suffix('.json')
metadata = {}
if json_path.exists():
try:
with open(json_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
except (json.JSONDecodeError, Exception) as e:
self.log(f"Failed to parse metadata for {file_path.name}: {e}", 'debug')
# Extract Reddit post ID
reddit_post_id = None
for key in ('id', 'reddit_id', 'parent_id'):
if key in metadata:
reddit_post_id = str(metadata[key])
break
if not reddit_post_id:
# Filename-based fallback: subreddit_postid_num.ext
parts = file_path.stem.split('_')
if len(parts) >= 2:
reddit_post_id = parts[-2] if len(parts) >= 3 else parts[-1]
else:
reddit_post_id = file_path.stem
# Extract post date
post_date = None
if 'date' in metadata:
date_val = metadata['date']
if isinstance(date_val, str):
for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
try:
utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=timezone.utc)
post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
break
except ValueError:
continue
if not post_date:
post_date = date_val
elif isinstance(date_val, (int, float)):
try:
post_date = datetime.fromtimestamp(date_val, tz=timezone.utc).isoformat()
except (ValueError, OSError):
pass
if not post_date and 'created_utc' in metadata:
try:
post_date = datetime.fromtimestamp(metadata['created_utc'], tz=timezone.utc).isoformat()
except (ValueError, OSError):
pass
if not post_date:
post_date = datetime.now().isoformat()
title = metadata.get('title', metadata.get('description', ''))
sub = metadata.get('subreddit', subreddit)
source_url = f"https://www.reddit.com/r/{sub}/comments/{reddit_post_id}" if sub else ''
if reddit_post_id not in posts:
posts[reddit_post_id] = {
'files': [],
'title': title,
'date': post_date,
'source_url': source_url,
}
posts[reddit_post_id]['files'].append(file_path)
return posts
def _get_cookies_file(self) -> Optional[str]:
"""Get Reddit cookies JSON from the scrapers table if configured."""
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT cookies FROM scrapers WHERE name = 'reddit' AND cookies IS NOT NULL"
)
row = cursor.fetchone()
if row and row[0]:
return row[0]
except Exception as e:
self.log(f"Could not load Reddit cookies: {e}", 'debug')
return None
def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
"""Convert JSON cookies array to Netscape cookie file format."""
try:
cookies = json.loads(cookies_json)
if not isinstance(cookies, list):
return False
with open(output_path, 'w') as f:
f.write("# Netscape HTTP Cookie File\n")
f.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")
for cookie in cookies:
domain = cookie.get('domain', '')
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
path = cookie.get('path', '/')
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
expires = cookie.get('expirationDate', cookie.get('expiry', cookie.get('expires', 0)))
if expires is None:
expires = 0
expires = str(int(float(expires)))
name = cookie.get('name', '')
value = cookie.get('value', '')
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n")
return True
except Exception as e:
self.log(f"Failed to write Netscape cookie file: {e}", 'error')
return False
def get_pullpush_post_ids(self, subreddit: str, after_ts: int = 0,
                          before_ts: int = None,
                          progress_callback=None) -> List[Dict]:
    """Fetch all historical post IDs for a subreddit from the Pullpush (Pushshift) API.

    Paginates through the full archive using created_utc ascending order.
    Rate-limited to ~1 request per 2 seconds.

    Args:
        subreddit: Subreddit name (without r/)
        after_ts: Unix timestamp to start from (0 = beginning of time)
        before_ts: Unix timestamp to stop at (None = no upper limit)
        progress_callback: Optional callable(fetched_count, message)

    Returns:
        List of dicts: [{id, title, created_utc, url, is_gallery, selftext}, ...]
    """
    import time
    import urllib.request
    import urllib.error
    base_url = 'https://api.pullpush.io/reddit/search/submission/'
    all_posts = []
    current_after = after_ts
    page = 0
    # ROBUSTNESS: a persistent 429 previously retried forever; cap the
    # number of consecutive rate-limit retries.
    retries_429 = 0
    max_429_retries = 10
    while True:
        params = (
            f'subreddit={subreddit}'
            f'&size=100'
            f'&sort=asc'
            f'&sort_type=created_utc'
            f'&after={current_after}'
        )
        if before_ts is not None:
            params += f'&before={before_ts}'
        url = f'{base_url}?{params}'
        page += 1
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
            })
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.loads(resp.read().decode())
        except urllib.error.HTTPError as e:
            if e.code == 429:
                retries_429 += 1
                if retries_429 > max_429_retries:
                    self.log(f"Pullpush rate limited {max_429_retries} times in a row for r/{subreddit}, giving up", 'error')
                    break
                self.log("Pullpush rate limited, waiting 5s...", 'warning')
                time.sleep(5)
                continue
            self.log(f"Pullpush HTTP {e.code} for r/{subreddit}: {e}", 'error')
            break
        except Exception as e:
            self.log(f"Pullpush request failed for r/{subreddit}: {e}", 'error')
            break
        # A successful request resets the rate-limit retry budget
        retries_429 = 0
        posts = data.get('data', [])
        if not posts:
            break
        for post in posts:
            all_posts.append({
                'id': post.get('id', ''),
                'title': post.get('title', ''),
                'created_utc': post.get('created_utc', 0),
                'url': post.get('url', ''),
                'is_gallery': post.get('is_gallery', False),
                'selftext': post.get('selftext', ''),
            })
        last_ts = posts[-1].get('created_utc', 0)
        if progress_callback:
            progress_callback(len(all_posts),
                              f"Fetched {len(all_posts)} post IDs (page {page})")
        # Handle stuck pagination — same timestamp repeating
        if last_ts <= current_after:
            current_after = last_ts + 1
        else:
            current_after = last_ts
        # If we got fewer than 100, we've reached the end
        if len(posts) < 100:
            break
        # Rate limit: 2s between requests
        time.sleep(2)
    self.log(f"Pullpush: fetched {len(all_posts)} total post IDs for r/{subreddit}", 'info')
    return all_posts
def run_gallery_dl_urls(self, urls_file: str, temp_dir: str,
                        progress_callback=None, batch_callback=None,
                        batch_size: int = 50) -> dict:
    """Run gallery-dl with --input-file to download specific Reddit post URLs.

    Same streaming/batch pattern as run_gallery_dl() but reads URLs from a file
    instead of scraping a subreddit listing.

    Args:
        urls_file: Path to file containing one URL per line
        temp_dir: Directory for gallery-dl to download into
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time
    # Same archive as normal Reddit paid content sync
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')
    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
        '-o', 'extractor.reddit.api=rest',
        '--input-file', urls_file,
    ]
    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])
    self.log(f"Running gallery-dl with input file ({urls_file})", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')
    dl_count = 0
    skip_count = 0
    pending_files = []
    try:
        # BUGFIX: stderr was an unread PIPE — a chatty child could fill the
        # OS pipe buffer and deadlock. Spool stderr to a file in temp_dir
        # instead and read it back after exit (same fix as run_gallery_dl).
        stderr_path = os.path.join(temp_dir, '.gallery_dl_stderr.log')
        with open(stderr_path, 'w+', encoding='utf-8', errors='replace') as stderr_f:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_f, text=True
            )
            start_time = time.time()
            timeout_secs = 14400  # 4 hours for backfill (can be large)
            while True:
                # NOTE(review): readline() blocks, so the deadline is only
                # checked when gallery-dl emits output.
                if time.time() - start_time > timeout_secs:
                    proc.kill()
                    self.log("gallery-dl backfill timed out", 'error')
                    break
                line = proc.stdout.readline()
                if not line and proc.poll() is not None:
                    break
                if not line:
                    continue
                line = line.strip()
                if not line:
                    continue
                if line.startswith('# '):
                    # Skipped file (already in archive)
                    skip_count += 1
                else:
                    # Downloaded file — gallery-dl prints the full path
                    dl_count += 1
                    file_path = Path(line)
                    if file_path.exists() and not file_path.name.endswith('.json'):
                        pending_files.append(file_path)
                total = dl_count + skip_count
                if progress_callback:
                    progress_callback(dl_count, skip_count, total)
                if batch_callback and len(pending_files) >= batch_size:
                    batch_callback(list(pending_files))
                    pending_files.clear()
            proc.wait()
            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()
            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)
            returncode = proc.returncode
            # 1/4/5 are treated as partial-failure codes that still yield
            # usable downloads — TODO confirm against gallery-dl docs
            if returncode not in (None, 0, 1, 4, 5):
                stderr_f.flush()
                stderr_f.seek(0)
                stderr = stderr_f.read()
                self.log(f"gallery-dl backfill returned code {returncode}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')
    except Exception as e:
        self.log(f"gallery-dl backfill failed: {e}", 'error')
    self.log(f"gallery-dl backfill done: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
@staticmethod
def _detect_file_type(ext: str) -> str:
"""Detect file type from extension."""
ext = ext.lower().lstrip('.')
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg'}
if ext in image_exts:
return 'image'
elif ext in video_exts:
return 'video'
return 'unknown'