media-downloader/modules/paid_content/pornhub_client.py

"""
Pornhub Client - Fetches creator info and videos using yt-dlp
"""

import asyncio
import html as html_module
import json
import os
import re
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment


class PornhubClient(LoggingMixin):
    """
    Client for fetching Pornhub creator information and videos using yt-dlp

    Supports:
    - Pornstar pages (pornhub.com/pornstar/name)
    - Channel pages (pornhub.com/channels/name)
    - User pages (pornhub.com/users/name)
    - Model pages (pornhub.com/model/name)
    """

    SERVICE_ID = 'pornhub'
    PLATFORM = 'pornhub'

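    # Quality presets for yt-dlp
    # Pornhub serves single combined streams with IDs like '1080p', '720p', etc.,
    # NOT separate video+audio streams like YouTube. The 'bestvideo+bestaudio'
    # alternative is kept first for safety; when no separate streams exist the
    # selector falls through to the combined 'best' alternatives after each '/'.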
    QUALITY_PRESETS = {
        'best': 'bestvideo+bestaudio/best',
        '1080p': 'bestvideo[height<=1080]+bestaudio/best[height<=1080]/best',
        '720p': 'bestvideo[height<=720]+bestaudio/best[height<=720]/best',
        '480p': 'bestvideo[height<=480]+bestaudio/best[height<=480]/best',
    }

    def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None):
        """Set up logging, resolve the yt-dlp executable and initialise state.

        Args:
            ytdlp_path: Explicit path to the yt-dlp executable. When falsy,
                the executable is searched for with ``_find_ytdlp``.
            unified_db: Optional database handle, kept so cookies can be
                loaded from it later.
            log_callback: Passed through to the logger initialisation.
        """
        self._init_logger('PaidContent', log_callback, default_module='Pornhub')

        # An explicitly supplied path wins over auto-discovery.
        resolved_path = ytdlp_path if ytdlp_path else self._find_ytdlp()
        self.ytdlp_path = resolved_path
        if not resolved_path:
            self.log("yt-dlp not found, Pornhub support will be disabled", 'warning')

        # Cookie state: the temp file is only created on first use.
        self._cookies_file = None
        self.unified_db = unified_db

        # Profile page HTML keyed by URL, so avatar/banner/bio lookups
        # share a single fetch.
        self._profile_page_cache: Dict[str, Optional[str]] = {}

    def _find_ytdlp(self) -> Optional[str]:
        """Find yt-dlp executable"""
        common_paths = [
            '/opt/media-downloader/venv/bin/yt-dlp',
            '/usr/local/bin/yt-dlp',
            '/usr/bin/yt-dlp',
            '/opt/homebrew/bin/yt-dlp',
            os.path.expanduser('~/.local/bin/yt-dlp'),
        ]

        for path in common_paths:
            if os.path.isfile(path) and os.access(path, os.X_OK):
                return path

        try:
            result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
            if result.returncode == 0:
                return result.stdout.strip()
        except Exception:
            pass

        return None

    def is_available(self) -> bool:
        """Check if yt-dlp is available"""
        return self.ytdlp_path is not None

    def _get_cookies_file(self) -> Optional[str]:
        """Get path to cookies file, creating it from database if needed"""
        if self._cookies_file and os.path.exists(self._cookies_file):
            return self._cookies_file

        if not self.unified_db:
            return None

        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
                row = cursor.fetchone()
                if row and row[0]:
                    data = json.loads(row[0])
                    # Support both {"cookies": [...]} and [...] formats
                    if isinstance(data, dict) and 'cookies' in data:
                        cookies_list = data['cookies']
                    elif isinstance(data, list):
                        cookies_list = data
                    else:
                        cookies_list = []

                    if cookies_list:
                        # Write cookies to temp file in Netscape format
                        fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='pornhub_cookies_')
                        with os.fdopen(fd, 'w') as f:
                            f.write("# Netscape HTTP Cookie File\n")
                            for cookie in cookies_list:
                                domain = cookie.get('domain', '')
                                include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                                path = cookie.get('path', '/')
                                secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                                expiry = str(int(cookie.get('expirationDate', 0)))
                                name = cookie.get('name', '')
                                value = cookie.get('value', '')
                                f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
                        self.log(f"Loaded {len(cookies_list)} cookies from pornhub scraper", 'debug')
                        return self._cookies_file
        except Exception as e:
            self.log(f"Could not load cookies: {e}", 'debug')

        return None

    def _get_cookies_list(self) -> Optional[list]:
        """Get cookies as a list of dicts for aiohttp requests"""
        if not self.unified_db:
            return None

        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
                row = cursor.fetchone()
                if row and row[0]:
                    data = json.loads(row[0])
                    if isinstance(data, dict) and 'cookies' in data:
                        return data['cookies']
                    elif isinstance(data, list):
                        return data
        except Exception as e:
            self.log(f"Could not load cookies list: {e}", 'debug')

        return None

    def _get_base_cmd(self) -> List[str]:
        """Get base yt-dlp command with cookies if available"""
        cmd = [self.ytdlp_path]
        cookies_file = self._get_cookies_file()
        if cookies_file:
            cmd.extend(['--cookies', cookies_file])
        return cmd

    def cleanup(self):
        """Clean up temporary files"""
        if self._cookies_file and os.path.exists(self._cookies_file):
            try:
                os.unlink(self._cookies_file)
            except Exception:
                pass
            self._cookies_file = None
        self._profile_page_cache.clear()

    @staticmethod
    def extract_creator_id(url: str) -> Optional[Tuple[str, str]]:
        """
        Extract creator type and identifier from Pornhub URL

        Returns:
            Tuple of (type, id) where type is 'pornstar', 'channels', 'users', or 'model'
            or None if not a valid Pornhub creator URL
        """
        patterns = [
            (r'pornhub\.com/pornstar/([a-zA-Z0-9_-]+)', 'pornstar'),
            (r'pornhub\.com/channels/([a-zA-Z0-9_-]+)', 'channels'),
            (r'pornhub\.com/users/([a-zA-Z0-9_-]+)', 'users'),
            (r'pornhub\.com/model/([a-zA-Z0-9_-]+)', 'model'),
        ]

        for pattern, creator_type in patterns:
            match = re.search(pattern, url)
            if match:
                return (creator_type, match.group(1))

        return None

    @staticmethod
    def normalize_creator_url(creator_id: str, creator_type: str = 'pornstar') -> str:
        """Convert creator ID to a consistent URL format

        Args:
            creator_id: Creator name/identifier (may be 'type/name' format)
            creator_type: Default type if not embedded in creator_id
        """
        # Already a full URL
        if creator_id.startswith('http://') or creator_id.startswith('https://'):
            return creator_id

        # Handle 'type/name' format from URL parser
        if '/' in creator_id:
            parts = creator_id.split('/', 1)
            creator_type = parts[0]
            creator_id = parts[1]

        return f"https://www.pornhub.com/{creator_type}/{creator_id}"

    def _get_listing_url(self, url: str) -> str:
        """Get the URL to use for listing videos from a creator page.

        For pornstars and models, append /videos to get the video listing.
        For channels and users, the base URL already lists videos.
        """
        # Parse out the type
        parsed = self.extract_creator_id(url)
        if parsed:
            creator_type, _ = parsed
            if creator_type in ('pornstar', 'model'):
                # Strip any trailing slash and append /videos
                url = url.rstrip('/')
                if not url.endswith('/videos'):
                    url = f"{url}/videos"
        return url

    async def get_creator_info(self, url: str) -> Optional[Dict]:
        """
        Get creator information using yt-dlp + profile page scraping

        Returns dict with creator metadata or None if not found
        """
        if not self.is_available():
            return None

        creator_type_id = self.extract_creator_id(url)
        creator_type = creator_type_id[0] if creator_type_id else 'pornstar'

        # Try to scrape the display name from the profile page first
        creator_name = None
        try:
            page_html = await self.get_profile_page(url)
            if page_html:
                # Look for <h1 itemprop="name">Name</h1> inside nameSubscribe div
                name_match = re.search(r'<div class="nameSubscribe">.*?<h1[^>]*>\s*(.+?)\s*</h1>', page_html, re.DOTALL)
                if name_match:
                    creator_name = html_module.unescape(name_match.group(1).strip())
                    self.log(f"Found creator name from profile page: {creator_name}", 'debug')
        except Exception as e:
            self.log(f"Could not scrape creator name: {e}", 'debug')

        # If page scraping didn't find a name, try yt-dlp
        if not creator_name:
            try:
                listing_url = self._get_listing_url(url)

                cmd = self._get_base_cmd() + [
                    '--no-warnings',
                    '--flat-playlist',
                    '-j',
                    '--playlist-items', '1',
                    listing_url
                ]

                result = await asyncio.create_subprocess_exec(
                    *cmd,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE
                )

                stdout, stderr = await result.communicate()

                if result.returncode == 0:
                    for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                        if not line:
                            continue
                        try:
                            data = json.loads(line)
                            playlist_title = data.get('playlist_title') or ''
                            creator_name = (data.get('channel') or data.get('uploader')
                                            or playlist_title.replace(' - Videos', '') or None)
                            if creator_name:
                                creator_name = html_module.unescape(creator_name)
                                break
                        except json.JSONDecodeError:
                            continue
            except Exception as e:
                self.log(f"yt-dlp creator info failed: {e}", 'debug')

        # Fall back to deriving name from URL slug
        if not creator_name and creator_type_id:
            creator_name = creator_type_id[1].replace('-', ' ').title()

        if creator_name:
            return {
                'creator_id': creator_type_id[1] if creator_type_id else None,
                'creator_name': creator_name,
                'creator_url': url,
                'creator_type': creator_type,
            }

        return None

    async def get_creator_videos(self, url: str, since_date: str = None,
                                  max_videos: int = None,
                                  progress_callback=None) -> List[Dict]:
        """
        Get all videos from a creator page using --flat-playlist for speed.

        Args:
            url: Pornhub creator URL
            since_date: Only fetch videos published after this date (ISO format)
            max_videos: Maximum number of videos to fetch
            progress_callback: Callback function(count) for progress updates

        Returns:
            List of video metadata dicts
        """
        if not self.is_available():
            return []

        try:
            listing_url = self._get_listing_url(url)

            # Use --flat-playlist for fast listing (avoids per-video HTTP requests)
            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '--flat-playlist',
                '-j',
                '--socket-timeout', '30',
                '--retries', '3',
                listing_url
            ]

            if max_videos:
                cmd.extend(['--playlist-items', f'1:{max_videos}'])

            self.log(f"Fetching videos from: {url}", 'info')

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            if result.returncode != 0:
                error = stderr.decode('utf-8', errors='replace')
                self.log(f"Failed to get creator videos: {error}", 'warning')
                return []

            videos = []
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                if not line:
                    continue
                try:
                    data = json.loads(line)

                    # Skip non-video entries
                    if data.get('_type') == 'playlist':
                        continue

                    video_id = data.get('id')
                    if not video_id:
                        continue

                    # Flat-playlist doesn't provide upload_date for Pornhub, but check anyway
                    upload_date = data.get('upload_date')
                    if upload_date:
                        try:
                            upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
                        except ValueError:
                            pass

                    # Decode HTML entities in title (flat-playlist returns them encoded)
                    title = html_module.unescape(data.get('title', f'Video {video_id}'))

                    # Build video URL
                    video_url = (data.get('webpage_url') or data.get('url')
                                 or f"https://www.pornhub.com/view_video.php?viewkey={video_id}")

                    videos.append({
                        'video_id': video_id,
                        'title': title,
                        'description': data.get('description', ''),
                        'upload_date': upload_date,
                        'duration': data.get('duration'),
                        'view_count': data.get('view_count'),
                        'thumbnail': data.get('thumbnail'),
                        'url': video_url,
                    })

                    if progress_callback:
                        progress_callback(len(videos))

                    if max_videos and len(videos) >= max_videos:
                        break

                except json.JSONDecodeError:
                    continue

            self.log(f"Found {len(videos)} videos", 'info')
            return videos

        except Exception as e:
            self.log(f"Error getting creator videos: {e}", 'error')
            return []

    async def download_video(self, video_url: str, output_dir: Path, quality: str = 'best',
                            progress_callback=None) -> Dict:
        """
        Download a video

        Args:
            video_url: Pornhub video URL
            output_dir: Directory to save the video
            quality: Quality preset
            progress_callback: Callback for download progress

        Returns:
            Dict with success status and file info
        """
        if not self.is_available():
            return {'success': False, 'error': 'yt-dlp not available'}

        try:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

            output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s')

            format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])

            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '-f', format_str,
                '-o', output_template,
                '--print-json',
                '--no-playlist',
                '--user-agent', 'Mozilla/5.0',
                '--referer', 'https://www.pornhub.com/',
                '--merge-output-format', 'mp4',
                '--concurrent-fragments', '4',
                '--no-part',
                '--retries', '20',
                video_url
            ]

            self.log(f"Downloading video: {video_url}", 'debug')

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            if result.returncode != 0:
                error_msg = stderr.decode('utf-8', errors='replace').strip()
                if 'Video unavailable' in error_msg or 'not available' in error_msg:
                    error_msg = 'Video unavailable or private'
                elif 'premium' in error_msg.lower():
                    error_msg = 'Video requires premium access'
                elif len(error_msg) > 200:
                    error_msg = error_msg[:200] + '...'

                return {'success': False, 'error': error_msg}

            # Parse output JSON
            video_info = None
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                try:
                    video_info = json.loads(line)
                    break
                except json.JSONDecodeError:
                    continue

            if not video_info:
                # Try to find downloaded file
                files = list(output_dir.glob('*.mp4'))
                if files:
                    file_path = max(files, key=lambda f: f.stat().st_mtime)
                    return {
                        'success': True,
                        'file_path': str(file_path),
                        'filename': file_path.name,
                        'file_size': file_path.stat().st_size
                    }
                return {'success': False, 'error': 'Could not find downloaded file'}

            file_path = video_info.get('_filename') or video_info.get('filename')
            if file_path:
                file_path = Path(file_path)

            return {
                'success': True,
                'file_path': str(file_path) if file_path else None,
                'filename': file_path.name if file_path else None,
                'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'),
                'title': video_info.get('title'),
                'duration': video_info.get('duration'),
                'video_id': video_info.get('id'),
                'upload_date': video_info.get('upload_date'),
                'timestamp': video_info.get('timestamp'),
                'thumbnail': video_info.get('thumbnail'),
            }

        except Exception as e:
            self.log(f"Error downloading video: {e}", 'error')
            return {'success': False, 'error': str(e)}

    async def get_profile_page(self, url: str) -> Optional[str]:
        """Fetch profile page HTML via aiohttp (with cookies if available).
        Results are cached to avoid re-fetching for avatar/banner/bio."""
        # Strip /videos suffix for profile page
        base_url = re.sub(r'/videos/?$', '', url)

        if base_url in self._profile_page_cache:
            return self._profile_page_cache[base_url]

        try:
            import aiohttp

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
            }

            # Build simple cookies dict for the session
            cookies_dict = {}
            cookies_list = self._get_cookies_list()
            if cookies_list:
                for cookie in cookies_list:
                    name = cookie.get('name', '')
                    value = cookie.get('value', '')
                    if name:
                        cookies_dict[name] = value

            async with aiohttp.ClientSession(cookies=cookies_dict) as session:
                async with session.get(
                    base_url,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=15)
                ) as resp:
                    if resp.status == 200:
                        text = await resp.text()
                        self._profile_page_cache[base_url] = text
                        return text

        except Exception as e:
            self.log(f"Could not fetch profile page: {e}", 'debug')

        self._profile_page_cache[base_url] = None
        return None

    async def get_profile_image(self, url: str) -> Optional[str]:
        """Scrape profile page for avatar/photo URL"""
        try:
            page_html = await self.get_profile_page(url)
            if not page_html:
                return None

            # Look for avatar image: <img id="getAvatar" src="...">
            avatar_match = re.search(r'<img[^>]*id=["\']getAvatar["\'][^>]*src=["\']([^"\']+)["\']', page_html)
            if avatar_match:
                self.log("Found Pornhub profile avatar", 'debug')
                return avatar_match.group(1)

            # Try og:image meta tag
            og_match = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_html)
            if not og_match:
                og_match = re.search(r'<meta\s+content="([^"]+)"\s+property="og:image"', page_html)
            if og_match:
                return og_match.group(1)

        except Exception as e:
            self.log(f"Could not fetch profile image: {e}", 'debug')

        return None

    async def get_profile_bio(self, url: str) -> Optional[str]:
        """Scrape bio/about section from profile page"""
        try:
            page_html = await self.get_profile_page(url)
            if not page_html:
                return None

            # Look for aboutMeSection -> div with the actual text
            # Structure: <section class="aboutMeSection ..."><div class="title">About Name</div><div>Bio text</div></section>
            about_match = re.search(
                r'<section\s+class="aboutMeSection[^"]*"[^>]*>.*?<div class="title">[^<]*</div>\s*<div>\s*(.*?)\s*</div>',
                page_html, re.DOTALL
            )
            if about_match:
                bio_text = re.sub(r'<[^>]+>', '', about_match.group(1)).strip()
                if bio_text:
                    self.log("Found Pornhub profile bio", 'debug')
                    return html_module.unescape(bio_text)

            # Fallback: look for biographyAbout section
            bio_match = re.search(
                r'class="biographyAbout[^"]*"[^>]*>.*?<div class="content[^"]*">(.*?)</div>',
                page_html, re.DOTALL
            )
            if bio_match:
                bio_text = re.sub(r'<[^>]+>', '', bio_match.group(1)).strip()
                if bio_text:
                    self.log("Found Pornhub profile bio (fallback)", 'debug')
                    return html_module.unescape(bio_text)

        except Exception as e:
            self.log(f"Could not fetch profile bio: {e}", 'debug')

        return None

    async def get_profile_banner(self, url: str) -> Optional[str]:
        """Scrape banner/cover image if available"""
        try:
            page_html = await self.get_profile_page(url)
            if not page_html:
                return None

            # Look for cover image: <img id="coverPictureDefault" src="...">
            cover_match = re.search(
                r'<img[^>]*id=["\']coverPictureDefault["\'][^>]*src=["\']([^"\']+)["\']',
                page_html
            )
            if cover_match:
                self.log("Found Pornhub profile banner", 'debug')
                return cover_match.group(1)

            # Fallback: any img inside coverImage div
            cover_match = re.search(
                r'<div class="coverImage">\s*<img[^>]*src=["\']([^"\']+)["\']',
                page_html, re.DOTALL
            )
            if cover_match:
                self.log("Found Pornhub profile banner (div)", 'debug')
                return cover_match.group(1)

        except Exception as e:
            self.log(f"Could not fetch profile banner: {e}", 'debug')

        return None

    async def get_profile_info(self, url: str) -> Optional[Dict]:
        """Scrape all profile info from the page in one pass"""
        page_html = await self.get_profile_page(url)
        if not page_html:
            return None

        info = {}

        # Extract infoPiece data (Gender, Birth Place, Height, etc.)
        info_pieces = re.findall(
            r'<div class="infoPiece">\s*<span>\s*(.*?)\s*</span>\s*(.*?)\s*</div>',
            page_html, re.DOTALL
        )
        for label, value in info_pieces:
            label = re.sub(r'<[^>]+>', '', label).strip().rstrip(':')
            value = re.sub(r'<[^>]+>', '', value).strip()
            if label and value:
                info[label.lower().replace(' ', '_')] = value

        return info if info else None

    async def get_joined_date(self, url: str) -> Optional[str]:
        """Extract a joined/career start date from profile info"""
        try:
            profile_info = await self.get_profile_info(url)
            if not profile_info:
                return None

            # Pornstar pages have "Career Start and End: 2011 to Present"
            career = profile_info.get('career_start_and_end')
            if career:
                # Extract start year: "2011 to Present" -> "2011"
                match = re.match(r'(\d{4})', career)
                if match:
                    return match.group(1)

            # User/model pages might not have career info but could have other dates
            return None
        except Exception as e:
            self.log(f"Could not get joined date: {e}", 'debug')
            return None

    async def get_creator(self, url: str) -> Optional[Creator]:
        """
        Get Creator object from creator URL
        """
        info = await self.get_creator_info(url)
        if not info:
            return None

        # Build creator_id as 'type/name' format
        creator_type_id = self.extract_creator_id(url)
        if creator_type_id:
            creator_id = f"{creator_type_id[0]}/{creator_type_id[1]}"
        else:
            creator_id = info.get('creator_id', '')

        # Profile image is already fetched during get_creator_info (page was cached)
        profile_image = await self.get_profile_image(url)

        return Creator(
            creator_id=creator_id,
            service_id='pornhub',
            platform='pornhub',
            username=info.get('creator_name', 'Unknown'),
            display_name=info.get('creator_name'),
            profile_image_url=profile_image,
        )

    async def get_posts(self, url: str, since_date: str = None,
                        max_videos: int = None, progress_callback=None) -> List[Post]:
        """
        Get videos as Post objects
        """
        videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)

        # Get creator_id from URL
        creator_type_id = self.extract_creator_id(url)
        creator_id = f"{creator_type_id[0]}/{creator_type_id[1]}" if creator_type_id else ''

        posts = []
        for video in videos:
            # Create attachment for the video
            attachment = Attachment(
                name=f"{video['title']}.mp4",
                file_type='video',
                extension='.mp4',
                server_path=video['url'],
                download_url=video['url'],
                duration=video.get('duration'),
            )

            post = Post(
                post_id=video['video_id'],
                service_id='pornhub',
                platform='pornhub',
                creator_id=creator_id,
                title=video['title'],
                content=video.get('description') or video['title'],
                published_at=video.get('upload_date'),
                attachments=[attachment],
            )
            posts.append(post)

        return posts