"""
Twitch Clips Client - Fetches channel clips using yt-dlp
"""
|
|
|
|
import asyncio
import hashlib
import json
import os
import re
import shutil
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import aiohttp

from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
|
|
|
|
|
|
class TwitchThumbnailCache:
    """Cache for Twitch clip thumbnails.

    Thumbnails are stored on disk under ``cache_dir``, keyed by an MD5 hash
    of the source URL, so repeated fetches of the same clip are served from
    disk instead of being re-downloaded.
    """

    # Recognized thumbnail extensions; anything else falls back to .jpg.
    _KNOWN_EXTENSIONS = ('.png', '.webp')

    def __init__(self, cache_dir: Optional[str] = None):
        self.cache_dir = Path(cache_dir or '/opt/media-downloader/data/cache/twitch_thumbnails')
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_path(self, thumbnail_url: str) -> Path:
        """Return the deterministic local cache path for a thumbnail URL."""
        # MD5 is used purely as a filename hash, not for security.
        url_hash = hashlib.md5(thumbnail_url.encode()).hexdigest()
        # Extract extension from URL or default to jpg.
        lowered = thumbnail_url.lower()
        ext = next((e for e in self._KNOWN_EXTENSIONS if e in lowered), '.jpg')
        return self.cache_dir / f"{url_hash}{ext}"

    def get_cached(self, thumbnail_url: str) -> Optional[str]:
        """Get cached thumbnail path if it exists, else None."""
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)
        return None

    async def cache_thumbnail(self, thumbnail_url: str,
                              session: Optional["aiohttp.ClientSession"] = None) -> Optional[str]:
        """Download and cache a thumbnail, return local path.

        Best-effort: any network or filesystem error yields ``None`` rather
        than an exception, since thumbnails are non-critical.
        """
        if not thumbnail_url:
            return None

        # Check if already cached
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)

        # Download thumbnail
        try:
            close_session = False
            if session is None:
                session = aiohttp.ClientSession()
                close_session = True

            try:
                async with session.get(thumbnail_url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
                    if resp.status == 200:
                        content = await resp.read()
                        # Write to a temp file then rename so a failed or
                        # partial download never leaves a corrupt cache entry.
                        tmp_path = cache_path.with_suffix(cache_path.suffix + '.part')
                        with open(tmp_path, 'wb') as f:
                            f.write(content)
                        os.replace(tmp_path, cache_path)
                        return str(cache_path)
            finally:
                if close_session:
                    await session.close()
        except Exception:
            # Thumbnail caching is opportunistic; swallow errors and let the
            # caller fall back to the remote URL.
            pass

        return None

    async def cache_thumbnails_batch(self, thumbnail_urls: List[str], max_concurrent: int = 5) -> Dict[str, str]:
        """Cache multiple thumbnails in parallel, return url->local_path mapping."""
        result: Dict[str, str] = {}

        # Serve already-cached thumbnails immediately; only download the rest.
        to_download = []
        for url in thumbnail_urls:
            if not url:
                continue
            cached = self.get_cached(url)
            if cached:
                result[url] = cached
            else:
                to_download.append(url)

        if not to_download:
            return result

        # Share one session for all downloads, bounded by a semaphore.
        async with aiohttp.ClientSession() as session:
            semaphore = asyncio.Semaphore(max_concurrent)

            async def download_one(url: str):
                async with semaphore:
                    path = await self.cache_thumbnail(url, session)
                    if path:
                        result[url] = path

            await asyncio.gather(*[download_one(url) for url in to_download])

        return result
|
|
|
|
|
|
class TwitchClient(LoggingMixin):
    """
    Client for fetching Twitch channel clips using yt-dlp

    Supports:
    - Channel clips URLs (twitch.tv/username/clips)
    - Fetching channel metadata
    - Listing all clips from a channel
    - Downloading clips
    """

    # Quality presets for yt-dlp
    # Maps a user-facing preset name to a yt-dlp --format selector string;
    # download_clip() falls back to 'best' for unknown preset names.
    QUALITY_PRESETS = {
        'best': 'best',
        '1080p': 'best[height<=1080]',
        '720p': 'best[height<=720]',
        '480p': 'best[height<=480]',
    }
|
def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None, cache_dir: str = None):
|
|
self._init_logger('PaidContent', log_callback, default_module='Twitch')
|
|
|
|
# Find yt-dlp executable
|
|
self.ytdlp_path = ytdlp_path or self._find_ytdlp()
|
|
if not self.ytdlp_path:
|
|
self.log("yt-dlp not found, Twitch support will be disabled", 'warning')
|
|
|
|
# Store database reference for cookie access
|
|
self.unified_db = unified_db
|
|
self._cookies_file = None
|
|
|
|
# Initialize thumbnail cache
|
|
self.thumbnail_cache = TwitchThumbnailCache(cache_dir)
|
|
|
|
def _find_ytdlp(self) -> Optional[str]:
|
|
"""Find yt-dlp executable"""
|
|
common_paths = [
|
|
'/opt/media-downloader/venv/bin/yt-dlp', # Prefer venv version (kept up to date)
|
|
'/usr/local/bin/yt-dlp',
|
|
'/usr/bin/yt-dlp',
|
|
'/opt/homebrew/bin/yt-dlp',
|
|
os.path.expanduser('~/.local/bin/yt-dlp'),
|
|
]
|
|
|
|
for path in common_paths:
|
|
if os.path.isfile(path) and os.access(path, os.X_OK):
|
|
return path
|
|
|
|
try:
|
|
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
|
|
if result.returncode == 0:
|
|
return result.stdout.strip()
|
|
except Exception:
|
|
pass
|
|
|
|
return None
|
|
|
|
def is_available(self) -> bool:
|
|
"""Check if yt-dlp is available"""
|
|
return self.ytdlp_path is not None
|
|
|
|
    def _get_cookies_file(self) -> Optional[str]:
        """Get path to cookies file, creating it from database if needed.

        Looks up stored cookies in the unified database (trying the 'twitch'
        scraper entry first, then 'ytdlp') and materializes them as a
        Netscape-format cookie file that yt-dlp can consume via --cookies.
        The temp-file path is cached on the instance; cleanup() removes it.

        Returns:
            Path to the cookies file, or None if no cookies are available.
        """
        # Reuse the previously written temp file if it still exists.
        if self._cookies_file and os.path.exists(self._cookies_file):
            return self._cookies_file

        if not self.unified_db:
            return None

        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                # Try twitch-specific cookies first, then fall back to ytdlp
                for scraper_id in ['twitch', 'ytdlp']:
                    cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
                    row = cursor.fetchone()
                    if row and row[0]:
                        data = json.loads(row[0])
                        # Support both {"cookies": [...]} and [...] formats
                        if isinstance(data, dict) and 'cookies' in data:
                            cookies_list = data['cookies']
                        elif isinstance(data, list):
                            cookies_list = data
                        else:
                            cookies_list = []

                        if cookies_list:
                            # Write cookies to temp file in Netscape format
                            fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='twitch_cookies_')
                            with os.fdopen(fd, 'w') as f:
                                f.write("# Netscape HTTP Cookie File\n")
                                for cookie in cookies_list:
                                    domain = cookie.get('domain', '')
                                    # Netscape format: a leading dot on the domain
                                    # means the cookie applies to subdomains too.
                                    include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                                    path = cookie.get('path', '/')
                                    secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                                    # Missing expiry becomes 0 (treated as a session cookie).
                                    expiry = str(int(cookie.get('expirationDate', 0)))
                                    name = cookie.get('name', '')
                                    value = cookie.get('value', '')
                                    f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
                            self.log(f"Loaded {len(cookies_list)} cookies from {scraper_id} scraper", 'debug')
                            return self._cookies_file
        except Exception as e:
            # Cookies are optional; most public-clip operations work without them.
            self.log(f"Could not load cookies: {e}", 'debug')

        return None
|
|
|
|
def _get_base_cmd(self) -> List[str]:
|
|
"""Get base yt-dlp command with cookies if available"""
|
|
cmd = [self.ytdlp_path]
|
|
cookies_file = self._get_cookies_file()
|
|
if cookies_file:
|
|
cmd.extend(['--cookies', cookies_file])
|
|
return cmd
|
|
|
|
def cleanup(self):
|
|
"""Clean up temporary files"""
|
|
if self._cookies_file and os.path.exists(self._cookies_file):
|
|
try:
|
|
os.unlink(self._cookies_file)
|
|
except Exception:
|
|
pass
|
|
self._cookies_file = None
|
|
|
|
@staticmethod
|
|
def extract_channel_name(url: str) -> Optional[str]:
|
|
"""
|
|
Extract channel name from Twitch URL
|
|
|
|
Supports:
|
|
- twitch.tv/username
|
|
- twitch.tv/username/clips
|
|
- m.twitch.tv/username/clips
|
|
"""
|
|
patterns = [
|
|
r'twitch\.tv/([a-zA-Z0-9_]+)(?:/clips)?',
|
|
]
|
|
|
|
for pattern in patterns:
|
|
match = re.search(pattern, url)
|
|
if match:
|
|
return match.group(1).lower()
|
|
|
|
return None
|
|
|
|
@staticmethod
|
|
def normalize_clips_url(channel_name: str) -> str:
|
|
"""Convert channel name to clips URL with all-time filter"""
|
|
return f"https://www.twitch.tv/{channel_name}/clips?filter=clips&range=all"
|
|
|
|
    async def get_channel_info(self, channel_url: str, count_clips: bool = True) -> Optional[Dict]:
        """
        Get channel information and optionally count all clips

        Args:
            channel_url: Any twitch.tv URL identifying the channel.
            count_clips: When True, enumerate every clip id for an exact
                count (can take a while for channels with many clips).

        Returns:
            Dict with channel_id, channel_name, channel_url, clips_url,
            thumbnail (taken from the first clip, not the channel avatar)
            and clip_count, or None on any failure.
        """
        if not self.is_available():
            return None

        channel_name = self.extract_channel_name(channel_url)
        if not channel_name:
            return None

        try:
            clips_url = self.normalize_clips_url(channel_name)

            # First get basic info from first clip.
            # --flat-playlist avoids resolving every clip; --playlist-items 1
            # limits extraction to a single entry.
            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '--flat-playlist',
                '-j',
                '--playlist-items', '1',
                clips_url
            ]

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            if result.returncode != 0:
                self.log(f"Failed to get channel info: {stderr.decode()}", 'warning')
                return None

            # Parse the first valid JSON line yt-dlp emitted.
            first_clip_data = None
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                if not line:
                    continue
                try:
                    first_clip_data = json.loads(line)
                    break
                except json.JSONDecodeError:
                    continue

            if not first_clip_data:
                return None

            # Count all clips if requested (this can take a while for channels with many clips)
            clip_count = 0
            if count_clips:
                self.log(f"Counting clips for {channel_name}...", 'debug')
                # Print only ids so output stays small; count non-empty lines.
                count_cmd = self._get_base_cmd() + [
                    '--no-warnings',
                    '--flat-playlist',
                    '--print', 'id',
                    clips_url
                ]

                count_result = await asyncio.create_subprocess_exec(
                    *count_cmd,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE
                )

                count_stdout, _ = await count_result.communicate()
                if count_result.returncode == 0:
                    clip_count = len([l for l in count_stdout.decode('utf-8', errors='replace').strip().split('\n') if l])
                    self.log(f"Found {clip_count} clips for {channel_name}", 'info')

            return {
                'channel_id': channel_name,
                'channel_name': channel_name,
                'channel_url': f"https://www.twitch.tv/{channel_name}",
                'clips_url': clips_url,
                'thumbnail': first_clip_data.get('thumbnail'),
                'clip_count': clip_count,
            }

        except Exception as e:
            self.log(f"Error getting channel info: {e}", 'error')
            return None
|
|
|
|
    async def get_channel_clips(self, channel_url: str, since_date: str = None,
                                max_clips: int = None, progress_callback=None,
                                cache_thumbnails: bool = True) -> List[Dict]:
        """
        Get all clips from a channel

        Args:
            channel_url: Twitch channel URL
            since_date: Only fetch clips created after this date (ISO format)
            max_clips: Maximum number of clips to fetch
            progress_callback: Callback function(count) for progress updates
            cache_thumbnails: Whether to download and cache thumbnails locally

        Returns:
            List of clip metadata dicts with cached thumbnail paths
        """
        if not self.is_available():
            return []

        channel_name = self.extract_channel_name(channel_url)
        if not channel_name:
            self.log(f"Could not extract channel name from URL: {channel_url}", 'error')
            return []

        try:
            clips_url = self.normalize_clips_url(channel_name)

            # Use flat-playlist for faster extraction (full metadata available in flat mode for Twitch clips)
            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '--flat-playlist',
                '-j',
                clips_url
            ]

            # Add date filter at yt-dlp level for efficiency
            if since_date:
                try:
                    from datetime import datetime
                    # Convert ISO date to YYYYMMDD format for yt-dlp
                    date_obj = datetime.fromisoformat(since_date.replace('Z', '+00:00'))
                    dateafter = date_obj.strftime('%Y%m%d')
                    cmd.extend(['--dateafter', dateafter])
                    self.log(f"Filtering clips after {dateafter}", 'debug')
                except (ValueError, AttributeError):
                    # Unparseable since_date: rely on the post-hoc string
                    # comparison below instead of a yt-dlp-level filter.
                    pass

            if max_clips:
                cmd.extend(['--playlist-items', f'1:{max_clips}'])

            self.log(f"Fetching clips from channel: {channel_name}", 'info')

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            if result.returncode != 0:
                error = stderr.decode('utf-8', errors='replace')
                self.log(f"Failed to get channel clips: {error}", 'warning')
                return []

            clips = []
            # yt-dlp emits one JSON object per line in -j mode.
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                if not line:
                    continue
                try:
                    data = json.loads(line)

                    clip_id = data.get('id')
                    if not clip_id:
                        continue

                    # Parse timestamp to ISO format
                    timestamp = data.get('timestamp')
                    upload_date = data.get('upload_date')
                    if timestamp:
                        try:
                            # NOTE(review): fromtimestamp() uses the local
                            # timezone -- presumably acceptable here, but
                            # confirm if dates must be UTC-consistent.
                            upload_date = datetime.fromtimestamp(timestamp).isoformat()
                        except (ValueError, OSError):
                            pass
                    elif upload_date:
                        # Convert YYYYMMDD to ISO format
                        try:
                            upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
                        except ValueError:
                            pass

                    # Check if clip is newer than since_date
                    # (lexicographic compare of ISO-format strings; assumes
                    # clips arrive newest-first -- TODO confirm ordering)
                    if since_date and upload_date and upload_date <= since_date:
                        self.log(f"Reached clip from {upload_date}, stopping", 'debug')
                        break

                    # Extract clip slug from URL
                    clip_url = data.get('url') or data.get('webpage_url', '')
                    clip_slug = clip_url.split('/')[-1] if clip_url else clip_id

                    clips.append({
                        'clip_id': clip_id,
                        'clip_slug': clip_slug,
                        'title': data.get('title', f'Clip {clip_id}'),
                        'upload_date': upload_date,
                        'timestamp': timestamp,
                        'duration': data.get('duration'),
                        'view_count': data.get('view_count'),
                        'thumbnail': data.get('thumbnail'),
                        'url': clip_url,
                        'language': data.get('language'),
                        'channel_name': channel_name,
                    })

                    if progress_callback:
                        progress_callback(len(clips))

                    if max_clips and len(clips) >= max_clips:
                        break

                except json.JSONDecodeError:
                    continue

            self.log(f"Found {len(clips)} clips", 'info')

            # Cache thumbnails if requested
            if cache_thumbnails and clips:
                thumbnail_urls = [c.get('thumbnail') for c in clips if c.get('thumbnail')]
                if thumbnail_urls:
                    self.log(f"Caching {len(thumbnail_urls)} thumbnails...", 'debug')
                    cached_paths = await self.thumbnail_cache.cache_thumbnails_batch(thumbnail_urls)

                    # Update clips with cached thumbnail paths
                    for clip in clips:
                        thumb_url = clip.get('thumbnail')
                        if thumb_url and thumb_url in cached_paths:
                            clip['thumbnail_cached'] = cached_paths[thumb_url]

                    self.log(f"Cached {len(cached_paths)} thumbnails", 'debug')

            return clips

        except Exception as e:
            self.log(f"Error getting channel clips: {e}", 'error')
            return []
|
|
|
|
    async def download_clip(self, clip_url: str, output_dir: Path, quality: str = 'best',
                            progress_callback=None) -> Dict:
        """
        Download a clip

        Args:
            clip_url: Twitch clip URL
            output_dir: Directory to save the clip
            quality: Quality preset
            progress_callback: Callback for download progress

        Returns:
            Dict with success status and file info
        """
        if not self.is_available():
            return {'success': False, 'error': 'yt-dlp not available'}

        try:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

            # Output template preserves title and ID
            # (title truncated to 100 chars to keep filenames manageable)
            output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s')

            # Unknown preset names silently fall back to 'best'.
            format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])

            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '-f', format_str,
                '-o', output_template,
                '--print-json',
                clip_url
            ]

            self.log(f"Downloading clip: {clip_url}", 'debug')

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            if result.returncode != 0:
                # Truncate long yt-dlp error dumps for display.
                error_msg = stderr.decode('utf-8', errors='replace').strip()
                if len(error_msg) > 200:
                    error_msg = error_msg[:200] + '...'
                return {'success': False, 'error': error_msg}

            # Parse output JSON (first parseable line from --print-json).
            clip_info = None
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                try:
                    clip_info = json.loads(line)
                    break
                except json.JSONDecodeError:
                    continue

            if not clip_info:
                # Try to find downloaded file
                # NOTE(review): picks the most recently modified .mp4 in the
                # directory -- could grab an unrelated file if multiple
                # downloads share this directory concurrently; verify callers.
                files = list(output_dir.glob('*.mp4'))
                if files:
                    file_path = max(files, key=lambda f: f.stat().st_mtime)
                    return {
                        'success': True,
                        'file_path': str(file_path),
                        'filename': file_path.name,
                        'file_size': file_path.stat().st_size
                    }
                return {'success': False, 'error': 'Could not find downloaded file'}

            # yt-dlp reports the final path under '_filename' (legacy: 'filename').
            file_path = clip_info.get('_filename') or clip_info.get('filename')
            if file_path:
                file_path = Path(file_path)

            return {
                'success': True,
                'file_path': str(file_path) if file_path else None,
                'filename': file_path.name if file_path else None,
                'file_size': file_path.stat().st_size if file_path and file_path.exists() else clip_info.get('filesize'),
                'title': clip_info.get('title'),
                'duration': clip_info.get('duration'),
                'clip_id': clip_info.get('id'),
                'upload_date': clip_info.get('upload_date'),
                'thumbnail': clip_info.get('thumbnail'),
            }

        except Exception as e:
            self.log(f"Error downloading clip: {e}", 'error')
            return {'success': False, 'error': str(e)}
|
|
|
|
async def get_channel_avatar(self, channel_name: str) -> Optional[str]:
|
|
"""
|
|
Try to fetch channel avatar from Twitch
|
|
|
|
Note: This requires either Twitch API credentials or scraping.
|
|
Returns None if avatar cannot be fetched.
|
|
"""
|
|
profile = await self.get_channel_profile(channel_name)
|
|
return profile.get('avatar') if profile else None
|
|
|
|
async def get_channel_profile(self, channel_name: str) -> Optional[Dict]:
|
|
"""
|
|
Fetch channel profile info using Twitch's GQL API.
|
|
|
|
Returns dict with avatar, banner, display_name, bio, joined_date, external_links
|
|
"""
|
|
try:
|
|
import aiohttp
|
|
|
|
async with aiohttp.ClientSession() as session:
|
|
headers = {
|
|
'Client-Id': 'kimne78kx3ncx6brgo4mv6wki5h1ko', # Public Twitch web client ID
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
|
}
|
|
|
|
# GQL query for comprehensive user info
|
|
query = '''
|
|
query {
|
|
user(login: "%s") {
|
|
id
|
|
login
|
|
displayName
|
|
description
|
|
createdAt
|
|
profileImageURL(width: 300)
|
|
bannerImageURL
|
|
offlineImageURL
|
|
channel {
|
|
socialMedias {
|
|
name
|
|
url
|
|
}
|
|
}
|
|
}
|
|
}
|
|
''' % channel_name
|
|
|
|
async with session.post(
|
|
'https://gql.twitch.tv/gql',
|
|
headers=headers,
|
|
json={'query': query},
|
|
timeout=aiohttp.ClientTimeout(total=15)
|
|
) as resp:
|
|
if resp.status == 200:
|
|
data = await resp.json()
|
|
user = data.get('data', {}).get('user')
|
|
|
|
if not user:
|
|
self.log(f"Twitch user not found: {channel_name}", 'warning')
|
|
return None
|
|
|
|
result = {}
|
|
|
|
# Avatar
|
|
if user.get('profileImageURL'):
|
|
result['avatar'] = user['profileImageURL']
|
|
|
|
# Banner - prefer offlineImageURL (larger), fall back to bannerImageURL
|
|
if user.get('offlineImageURL'):
|
|
result['banner'] = user['offlineImageURL']
|
|
elif user.get('bannerImageURL'):
|
|
result['banner'] = user['bannerImageURL']
|
|
|
|
# Display name
|
|
if user.get('displayName'):
|
|
result['display_name'] = user['displayName']
|
|
|
|
# Bio/description
|
|
if user.get('description'):
|
|
result['bio'] = user['description']
|
|
|
|
# Joined date (format: "Jun 10, 2016")
|
|
if user.get('createdAt'):
|
|
try:
|
|
created_dt = datetime.fromisoformat(user['createdAt'].replace('Z', '+00:00'))
|
|
result['joined_date'] = created_dt.strftime('%b %d, %Y')
|
|
self.log(f"Found Twitch joined date: {result['joined_date']}", 'debug')
|
|
except (ValueError, TypeError):
|
|
pass
|
|
|
|
# Social links
|
|
social_medias = user.get('channel', {}).get('socialMedias', [])
|
|
if social_medias:
|
|
links = []
|
|
for social in social_medias:
|
|
name = social.get('name', 'Link')
|
|
url = social.get('url', '')
|
|
if url:
|
|
# Capitalize first letter of name
|
|
title = name.capitalize() if name else 'Link'
|
|
links.append({'title': title, 'url': url})
|
|
if links:
|
|
result['external_links'] = json.dumps(links)
|
|
self.log(f"Found {len(links)} Twitch external links", 'debug')
|
|
|
|
if result:
|
|
self.log(f"Fetched Twitch profile via GQL for {channel_name}: {list(result.keys())}", 'debug')
|
|
return result
|
|
|
|
except Exception as e:
|
|
self.log(f"Could not fetch Twitch profile: {e}", 'debug')
|
|
|
|
return None
|
|
|
|
async def get_creator(self, channel_url: str) -> Optional[Creator]:
|
|
"""
|
|
Get Creator object from channel URL
|
|
"""
|
|
info = await self.get_channel_info(channel_url)
|
|
if not info:
|
|
return None
|
|
|
|
channel_name = info.get('channel_name') or self.extract_channel_name(channel_url)
|
|
|
|
# Try to get the actual channel avatar (not clip thumbnail)
|
|
avatar_url = await self.get_channel_avatar(channel_name)
|
|
|
|
return Creator(
|
|
creator_id=info.get('channel_id') or channel_name,
|
|
service_id='twitch',
|
|
platform='twitch',
|
|
username=channel_name or 'Unknown',
|
|
display_name=channel_name,
|
|
profile_image_url=avatar_url, # Use actual avatar, not clip thumbnail
|
|
post_count=info.get('clip_count', 0)
|
|
)
|
|
|
|
async def get_posts(self, channel_url: str, since_date: str = None,
|
|
max_clips: int = None, progress_callback=None) -> List[Post]:
|
|
"""
|
|
Get clips as Post objects
|
|
"""
|
|
clips = await self.get_channel_clips(channel_url, since_date, max_clips, progress_callback)
|
|
|
|
posts = []
|
|
for clip in clips:
|
|
# Create attachment for the clip
|
|
attachment = Attachment(
|
|
name=f"{clip['title']}.mp4",
|
|
file_type='video',
|
|
extension='.mp4',
|
|
server_path=clip['url'], # Use URL as server_path
|
|
download_url=clip['url'],
|
|
duration=clip.get('duration'),
|
|
)
|
|
|
|
post = Post(
|
|
post_id=clip['clip_id'],
|
|
service_id='twitch',
|
|
platform='twitch',
|
|
creator_id=clip.get('channel_name', ''),
|
|
title=clip['title'],
|
|
content='', # Clips don't have descriptions
|
|
published_at=clip.get('upload_date'),
|
|
attachments=[attachment],
|
|
)
|
|
posts.append(post)
|
|
|
|
return posts
|