""" Twitch Clips Client - Fetches channel clips using yt-dlp """ import aiohttp import asyncio import hashlib import json import os import re import subprocess import tempfile from datetime import datetime from pathlib import Path from typing import Dict, List, Optional from modules.base_module import LoggingMixin from .models import Creator, Post, Attachment class TwitchThumbnailCache: """Cache for Twitch clip thumbnails""" def __init__(self, cache_dir: str = None): self.cache_dir = Path(cache_dir or '/opt/media-downloader/data/cache/twitch_thumbnails') self.cache_dir.mkdir(parents=True, exist_ok=True) def _get_cache_path(self, thumbnail_url: str) -> Path: """Get local cache path for a thumbnail URL""" # Create a hash of the URL for the filename url_hash = hashlib.md5(thumbnail_url.encode()).hexdigest() # Extract extension from URL or default to jpg ext = '.jpg' if '.png' in thumbnail_url.lower(): ext = '.png' elif '.webp' in thumbnail_url.lower(): ext = '.webp' return self.cache_dir / f"{url_hash}{ext}" def get_cached(self, thumbnail_url: str) -> Optional[str]: """Get cached thumbnail path if it exists""" cache_path = self._get_cache_path(thumbnail_url) if cache_path.exists(): return str(cache_path) return None async def cache_thumbnail(self, thumbnail_url: str, session: aiohttp.ClientSession = None) -> Optional[str]: """Download and cache a thumbnail, return local path""" if not thumbnail_url: return None # Check if already cached cache_path = self._get_cache_path(thumbnail_url) if cache_path.exists(): return str(cache_path) # Download thumbnail try: close_session = False if session is None: session = aiohttp.ClientSession() close_session = True try: async with session.get(thumbnail_url, timeout=aiohttp.ClientTimeout(total=30)) as resp: if resp.status == 200: content = await resp.read() with open(cache_path, 'wb') as f: f.write(content) return str(cache_path) finally: if close_session: await session.close() except Exception: pass return None async def cache_thumbnails_batch(self, thumbnail_urls: List[str], max_concurrent: int = 5) -> Dict[str, str]: """Cache multiple thumbnails in parallel, return url->local_path mapping""" result = {} # Filter out already cached to_download = [] for url in thumbnail_urls: if not url: continue cached = self.get_cached(url) if cached: result[url] = cached else: to_download.append(url) if not to_download: return result # Download in batches async with aiohttp.ClientSession() as session: semaphore = asyncio.Semaphore(max_concurrent) async def download_one(url: str): async with semaphore: path = await self.cache_thumbnail(url, session) if path: result[url] = path await asyncio.gather(*[download_one(url) for url in to_download]) return result class TwitchClient(LoggingMixin): """ Client for fetching Twitch channel clips using yt-dlp Supports: - Channel clips URLs (twitch.tv/username/clips) - Fetching channel metadata - Listing all clips from a channel - Downloading clips """ # Quality presets for yt-dlp QUALITY_PRESETS = { 'best': 'best', '1080p': 'best[height<=1080]', '720p': 'best[height<=720]', '480p': 'best[height<=480]', } def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None, cache_dir: str = None): self._init_logger('PaidContent', log_callback, default_module='Twitch') # Find yt-dlp executable self.ytdlp_path = ytdlp_path or self._find_ytdlp() if not self.ytdlp_path: self.log("yt-dlp not found, Twitch support will be disabled", 'warning') # Store database reference for cookie access self.unified_db = unified_db self._cookies_file = None # Initialize thumbnail cache self.thumbnail_cache = TwitchThumbnailCache(cache_dir) def _find_ytdlp(self) -> Optional[str]: """Find yt-dlp executable""" common_paths = [ '/opt/media-downloader/venv/bin/yt-dlp', # Prefer venv version (kept up to date) '/usr/local/bin/yt-dlp', '/usr/bin/yt-dlp', '/opt/homebrew/bin/yt-dlp', os.path.expanduser('~/.local/bin/yt-dlp'), ] for path in common_paths: if os.path.isfile(path) and os.access(path, os.X_OK): return path try: result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True) if result.returncode == 0: return result.stdout.strip() except Exception: pass return None def is_available(self) -> bool: """Check if yt-dlp is available""" return self.ytdlp_path is not None def _get_cookies_file(self) -> Optional[str]: """Get path to cookies file, creating it from database if needed""" if self._cookies_file and os.path.exists(self._cookies_file): return self._cookies_file if not self.unified_db: return None try: with self.unified_db.get_connection() as conn: cursor = conn.cursor() # Try twitch-specific cookies first, then fall back to ytdlp for scraper_id in ['twitch', 'ytdlp']: cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,)) row = cursor.fetchone() if row and row[0]: data = json.loads(row[0]) # Support both {"cookies": [...]} and [...] formats if isinstance(data, dict) and 'cookies' in data: cookies_list = data['cookies'] elif isinstance(data, list): cookies_list = data else: cookies_list = [] if cookies_list: # Write cookies to temp file in Netscape format fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='twitch_cookies_') with os.fdopen(fd, 'w') as f: f.write("# Netscape HTTP Cookie File\n") for cookie in cookies_list: domain = cookie.get('domain', '') include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE' path = cookie.get('path', '/') secure = 'TRUE' if cookie.get('secure', False) else 'FALSE' expiry = str(int(cookie.get('expirationDate', 0))) name = cookie.get('name', '') value = cookie.get('value', '') f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n") self.log(f"Loaded {len(cookies_list)} cookies from {scraper_id} scraper", 'debug') return self._cookies_file except Exception as e: self.log(f"Could not load cookies: {e}", 'debug') return None def _get_base_cmd(self) -> List[str]: """Get base yt-dlp command with cookies if available""" cmd = [self.ytdlp_path] cookies_file = self._get_cookies_file() if cookies_file: cmd.extend(['--cookies', cookies_file]) return cmd def cleanup(self): """Clean up temporary files""" if self._cookies_file and os.path.exists(self._cookies_file): try: os.unlink(self._cookies_file) except Exception: pass self._cookies_file = None @staticmethod def extract_channel_name(url: str) -> Optional[str]: """ Extract channel name from Twitch URL Supports: - twitch.tv/username - twitch.tv/username/clips - m.twitch.tv/username/clips """ patterns = [ r'twitch\.tv/([a-zA-Z0-9_]+)(?:/clips)?', ] for pattern in patterns: match = re.search(pattern, url) if match: return match.group(1).lower() return None @staticmethod def normalize_clips_url(channel_name: str) -> str: """Convert channel name to clips URL with all-time filter""" return f"https://www.twitch.tv/{channel_name}/clips?filter=clips&range=all" async def get_channel_info(self, channel_url: str, count_clips: bool = True) -> Optional[Dict]: """ Get channel information and optionally count all clips """ if not self.is_available(): return None channel_name = self.extract_channel_name(channel_url) if not channel_name: return None try: clips_url = self.normalize_clips_url(channel_name) # First get basic info from first clip cmd = self._get_base_cmd() + [ '--no-warnings', '--flat-playlist', '-j', '--playlist-items', '1', clips_url ] result = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await result.communicate() if result.returncode != 0: self.log(f"Failed to get channel info: {stderr.decode()}", 'warning') return None first_clip_data = None for line in stdout.decode('utf-8', errors='replace').strip().split('\n'): if not line: continue try: first_clip_data = json.loads(line) break except json.JSONDecodeError: continue if not first_clip_data: return None # Count all clips if requested (this can take a while for channels with many clips) clip_count = 0 if count_clips: self.log(f"Counting clips for {channel_name}...", 'debug') count_cmd = self._get_base_cmd() + [ '--no-warnings', '--flat-playlist', '--print', 'id', clips_url ] count_result = await asyncio.create_subprocess_exec( *count_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) count_stdout, _ = await count_result.communicate() if count_result.returncode == 0: clip_count = len([l for l in count_stdout.decode('utf-8', errors='replace').strip().split('\n') if l]) self.log(f"Found {clip_count} clips for {channel_name}", 'info') return { 'channel_id': channel_name, 'channel_name': channel_name, 'channel_url': f"https://www.twitch.tv/{channel_name}", 'clips_url': clips_url, 'thumbnail': first_clip_data.get('thumbnail'), 'clip_count': clip_count, } except Exception as e: self.log(f"Error getting channel info: {e}", 'error') return None async def get_channel_clips(self, channel_url: str, since_date: str = None, max_clips: int = None, progress_callback=None, cache_thumbnails: bool = True) -> List[Dict]: """ Get all clips from a channel Args: channel_url: Twitch channel URL since_date: Only fetch clips created after this date (ISO format) max_clips: Maximum number of clips to fetch progress_callback: Callback function(count) for progress updates cache_thumbnails: Whether to download and cache thumbnails locally Returns: List of clip metadata dicts with cached thumbnail paths """ if not self.is_available(): return [] channel_name = self.extract_channel_name(channel_url) if not channel_name: self.log(f"Could not extract channel name from URL: {channel_url}", 'error') return [] try: clips_url = self.normalize_clips_url(channel_name) # Use flat-playlist for faster extraction (full metadata available in flat mode for Twitch clips) cmd = self._get_base_cmd() + [ '--no-warnings', '--flat-playlist', '-j', clips_url ] # Add date filter at yt-dlp level for efficiency if since_date: try: from datetime import datetime # Convert ISO date to YYYYMMDD format for yt-dlp date_obj = datetime.fromisoformat(since_date.replace('Z', '+00:00')) dateafter = date_obj.strftime('%Y%m%d') cmd.extend(['--dateafter', dateafter]) self.log(f"Filtering clips after {dateafter}", 'debug') except (ValueError, AttributeError): pass if max_clips: cmd.extend(['--playlist-items', f'1:{max_clips}']) self.log(f"Fetching clips from channel: {channel_name}", 'info') result = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await result.communicate() if result.returncode != 0: error = stderr.decode('utf-8', errors='replace') self.log(f"Failed to get channel clips: {error}", 'warning') return [] clips = [] for line in stdout.decode('utf-8', errors='replace').strip().split('\n'): if not line: continue try: data = json.loads(line) clip_id = data.get('id') if not clip_id: continue # Parse timestamp to ISO format timestamp = data.get('timestamp') upload_date = data.get('upload_date') if timestamp: try: upload_date = datetime.fromtimestamp(timestamp).isoformat() except (ValueError, OSError): pass elif upload_date: # Convert YYYYMMDD to ISO format try: upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat() except ValueError: pass # Check if clip is newer than since_date if since_date and upload_date and upload_date <= since_date: self.log(f"Reached clip from {upload_date}, stopping", 'debug') break # Extract clip slug from URL clip_url = data.get('url') or data.get('webpage_url', '') clip_slug = clip_url.split('/')[-1] if clip_url else clip_id clips.append({ 'clip_id': clip_id, 'clip_slug': clip_slug, 'title': data.get('title', f'Clip {clip_id}'), 'upload_date': upload_date, 'timestamp': timestamp, 'duration': data.get('duration'), 'view_count': data.get('view_count'), 'thumbnail': data.get('thumbnail'), 'url': clip_url, 'language': data.get('language'), 'channel_name': channel_name, }) if progress_callback: progress_callback(len(clips)) if max_clips and len(clips) >= max_clips: break except json.JSONDecodeError: continue self.log(f"Found {len(clips)} clips", 'info') # Cache thumbnails if requested if cache_thumbnails and clips: thumbnail_urls = [c.get('thumbnail') for c in clips if c.get('thumbnail')] if thumbnail_urls: self.log(f"Caching {len(thumbnail_urls)} thumbnails...", 'debug') cached_paths = await self.thumbnail_cache.cache_thumbnails_batch(thumbnail_urls) # Update clips with cached thumbnail paths for clip in clips: thumb_url = clip.get('thumbnail') if thumb_url and thumb_url in cached_paths: clip['thumbnail_cached'] = cached_paths[thumb_url] self.log(f"Cached {len(cached_paths)} thumbnails", 'debug') return clips except Exception as e: self.log(f"Error getting channel clips: {e}", 'error') return [] async def download_clip(self, clip_url: str, output_dir: Path, quality: str = 'best', progress_callback=None) -> Dict: """ Download a clip Args: clip_url: Twitch clip URL output_dir: Directory to save the clip quality: Quality preset progress_callback: Callback for download progress Returns: Dict with success status and file info """ if not self.is_available(): return {'success': False, 'error': 'yt-dlp not available'} try: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) # Output template preserves title and ID output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s') format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best']) cmd = self._get_base_cmd() + [ '--no-warnings', '-f', format_str, '-o', output_template, '--print-json', clip_url ] self.log(f"Downloading clip: {clip_url}", 'debug') result = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await result.communicate() if result.returncode != 0: error_msg = stderr.decode('utf-8', errors='replace').strip() if len(error_msg) > 200: error_msg = error_msg[:200] + '...' return {'success': False, 'error': error_msg} # Parse output JSON clip_info = None for line in stdout.decode('utf-8', errors='replace').strip().split('\n'): try: clip_info = json.loads(line) break except json.JSONDecodeError: continue if not clip_info: # Try to find downloaded file files = list(output_dir.glob('*.mp4')) if files: file_path = max(files, key=lambda f: f.stat().st_mtime) return { 'success': True, 'file_path': str(file_path), 'filename': file_path.name, 'file_size': file_path.stat().st_size } return {'success': False, 'error': 'Could not find downloaded file'} file_path = clip_info.get('_filename') or clip_info.get('filename') if file_path: file_path = Path(file_path) return { 'success': True, 'file_path': str(file_path) if file_path else None, 'filename': file_path.name if file_path else None, 'file_size': file_path.stat().st_size if file_path and file_path.exists() else clip_info.get('filesize'), 'title': clip_info.get('title'), 'duration': clip_info.get('duration'), 'clip_id': clip_info.get('id'), 'upload_date': clip_info.get('upload_date'), 'thumbnail': clip_info.get('thumbnail'), } except Exception as e: self.log(f"Error downloading clip: {e}", 'error') return {'success': False, 'error': str(e)} async def get_channel_avatar(self, channel_name: str) -> Optional[str]: """ Try to fetch channel avatar from Twitch Note: This requires either Twitch API credentials or scraping. Returns None if avatar cannot be fetched. """ profile = await self.get_channel_profile(channel_name) return profile.get('avatar') if profile else None async def get_channel_profile(self, channel_name: str) -> Optional[Dict]: """ Fetch channel profile info using Twitch's GQL API. Returns dict with avatar, banner, display_name, bio, joined_date, external_links """ try: import aiohttp async with aiohttp.ClientSession() as session: headers = { 'Client-Id': 'kimne78kx3ncx6brgo4mv6wki5h1ko', # Public Twitch web client ID 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', } # GQL query for comprehensive user info query = ''' query { user(login: "%s") { id login displayName description createdAt profileImageURL(width: 300) bannerImageURL offlineImageURL channel { socialMedias { name url } } } } ''' % channel_name async with session.post( 'https://gql.twitch.tv/gql', headers=headers, json={'query': query}, timeout=aiohttp.ClientTimeout(total=15) ) as resp: if resp.status == 200: data = await resp.json() user = data.get('data', {}).get('user') if not user: self.log(f"Twitch user not found: {channel_name}", 'warning') return None result = {} # Avatar if user.get('profileImageURL'): result['avatar'] = user['profileImageURL'] # Banner - prefer offlineImageURL (larger), fall back to bannerImageURL if user.get('offlineImageURL'): result['banner'] = user['offlineImageURL'] elif user.get('bannerImageURL'): result['banner'] = user['bannerImageURL'] # Display name if user.get('displayName'): result['display_name'] = user['displayName'] # Bio/description if user.get('description'): result['bio'] = user['description'] # Joined date (format: "Jun 10, 2016") if user.get('createdAt'): try: created_dt = datetime.fromisoformat(user['createdAt'].replace('Z', '+00:00')) result['joined_date'] = created_dt.strftime('%b %d, %Y') self.log(f"Found Twitch joined date: {result['joined_date']}", 'debug') except (ValueError, TypeError): pass # Social links social_medias = user.get('channel', {}).get('socialMedias', []) if social_medias: links = [] for social in social_medias: name = social.get('name', 'Link') url = social.get('url', '') if url: # Capitalize first letter of name title = name.capitalize() if name else 'Link' links.append({'title': title, 'url': url}) if links: result['external_links'] = json.dumps(links) self.log(f"Found {len(links)} Twitch external links", 'debug') if result: self.log(f"Fetched Twitch profile via GQL for {channel_name}: {list(result.keys())}", 'debug') return result except Exception as e: self.log(f"Could not fetch Twitch profile: {e}", 'debug') return None async def get_creator(self, channel_url: str) -> Optional[Creator]: """ Get Creator object from channel URL """ info = await self.get_channel_info(channel_url) if not info: return None channel_name = info.get('channel_name') or self.extract_channel_name(channel_url) # Try to get the actual channel avatar (not clip thumbnail) avatar_url = await self.get_channel_avatar(channel_name) return Creator( creator_id=info.get('channel_id') or channel_name, service_id='twitch', platform='twitch', username=channel_name or 'Unknown', display_name=channel_name, profile_image_url=avatar_url, # Use actual avatar, not clip thumbnail post_count=info.get('clip_count', 0) ) async def get_posts(self, channel_url: str, since_date: str = None, max_clips: int = None, progress_callback=None) -> List[Post]: """ Get clips as Post objects """ clips = await self.get_channel_clips(channel_url, since_date, max_clips, progress_callback) posts = [] for clip in clips: # Create attachment for the clip attachment = Attachment( name=f"{clip['title']}.mp4", file_type='video', extension='.mp4', server_path=clip['url'], # Use URL as server_path download_url=clip['url'], duration=clip.get('duration'), ) post = Post( post_id=clip['clip_id'], service_id='twitch', platform='twitch', creator_id=clip.get('channel_name', ''), title=clip['title'], content='', # Clips don't have descriptions published_at=clip.get('upload_date'), attachments=[attachment], ) posts.append(post) return posts