Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,751 @@
"""
Twitch Clips Client - Fetches channel clips using yt-dlp
"""
import asyncio
import hashlib
import json
import os
import re
import shutil
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import aiohttp

from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
class TwitchThumbnailCache:
    """Local on-disk cache for Twitch clip thumbnails.

    Files are stored under ``cache_dir`` named by the MD5 hash of the
    source URL plus the URL's file extension (MD5 is used only as a
    cache key, not for security).
    """

    def __init__(self, cache_dir: Optional[str] = None):
        # Default to the application's shared cache location.
        self.cache_dir = Path(cache_dir or '/opt/media-downloader/data/cache/twitch_thumbnails')
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_path(self, thumbnail_url: str) -> Path:
        """Get local cache path for a thumbnail URL."""
        # Hash of the full URL is the filename; the extension is cosmetic.
        url_hash = hashlib.md5(thumbnail_url.encode()).hexdigest()
        # Sniff the extension from the URL *path* only -- a query string or
        # fragment such as "?format=.png" must not influence the result.
        path_part = thumbnail_url.split('?', 1)[0].split('#', 1)[0].lower()
        ext = '.jpg'
        if path_part.endswith('.png'):
            ext = '.png'
        elif path_part.endswith('.webp'):
            ext = '.webp'
        return self.cache_dir / f"{url_hash}{ext}"

    def get_cached(self, thumbnail_url: str) -> Optional[str]:
        """Return the cached thumbnail path if it exists, else None."""
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)
        return None

    async def cache_thumbnail(self, thumbnail_url: str, session: "aiohttp.ClientSession" = None) -> Optional[str]:
        """Download and cache a thumbnail, returning its local path.

        Best-effort: returns None on any failure (empty URL, network
        error, non-200 response). A caller-supplied ``session`` is reused
        and left open; otherwise a temporary session is created and closed.
        """
        if not thumbnail_url:
            return None
        # Check if already cached
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)
        # Download thumbnail
        try:
            close_session = False
            if session is None:
                session = aiohttp.ClientSession()
                close_session = True
            try:
                async with session.get(thumbnail_url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
                    if resp.status == 200:
                        content = await resp.read()
                        # Write atomically (tmp file + rename) so an
                        # interrupted download never leaves a truncated
                        # cache entry behind.
                        tmp_path = cache_path.with_suffix(cache_path.suffix + '.tmp')
                        with open(tmp_path, 'wb') as f:
                            f.write(content)
                        os.replace(tmp_path, cache_path)
                        return str(cache_path)
            finally:
                if close_session:
                    await session.close()
        except Exception:
            # Thumbnails are cosmetic; swallow errors rather than fail the
            # caller's larger operation.
            pass
        return None

    async def cache_thumbnails_batch(self, thumbnail_urls: List[str], max_concurrent: int = 5) -> Dict[str, str]:
        """Cache multiple thumbnails concurrently.

        Returns a url -> local_path mapping for every thumbnail that is
        (or becomes) cached; failed downloads are simply omitted.
        """
        result: Dict[str, str] = {}
        # Filter out already cached
        to_download = []
        for url in thumbnail_urls:
            if not url:
                continue
            cached = self.get_cached(url)
            if cached:
                result[url] = cached
            else:
                to_download.append(url)
        if not to_download:
            return result
        # One shared session; a semaphore caps concurrent downloads.
        async with aiohttp.ClientSession() as session:
            semaphore = asyncio.Semaphore(max_concurrent)

            async def download_one(url: str):
                async with semaphore:
                    path = await self.cache_thumbnail(url, session)
                    if path:
                        result[url] = path

            await asyncio.gather(*[download_one(url) for url in to_download])
        return result
class TwitchClient(LoggingMixin):
    """
    Client for fetching Twitch channel clips using yt-dlp

    Supports:
    - Channel clips URLs (twitch.tv/username/clips)
    - Fetching channel metadata
    - Listing all clips from a channel
    - Downloading clips
    """

    # Quality presets mapped to yt-dlp format selectors.
    QUALITY_PRESETS = {
        'best': 'best',
        '1080p': 'best[height<=1080]',
        '720p': 'best[height<=720]',
        '480p': 'best[height<=480]',
    }

    def __init__(self, ytdlp_path: Optional[str] = None, unified_db=None, log_callback=None, cache_dir: Optional[str] = None):
        """
        Args:
            ytdlp_path: Explicit path to the yt-dlp executable; auto-detected when None.
            unified_db: Database handle used to look up stored cookies (optional).
            log_callback: Forwarded to LoggingMixin for log routing.
            cache_dir: Directory for the thumbnail cache; defaults inside TwitchThumbnailCache.
        """
        self._init_logger('PaidContent', log_callback, default_module='Twitch')
        # Find yt-dlp executable
        self.ytdlp_path = ytdlp_path or self._find_ytdlp()
        if not self.ytdlp_path:
            self.log("yt-dlp not found, Twitch support will be disabled", 'warning')
        # Store database reference for cookie access
        self.unified_db = unified_db
        # Lazily created temp file holding cookies in Netscape format.
        self._cookies_file = None
        # Initialize thumbnail cache
        self.thumbnail_cache = TwitchThumbnailCache(cache_dir)

    def _find_ytdlp(self) -> Optional[str]:
        """Find the yt-dlp executable, preferring known install locations."""
        common_paths = [
            '/opt/media-downloader/venv/bin/yt-dlp',  # Prefer venv version (kept up to date)
            '/usr/local/bin/yt-dlp',
            '/usr/bin/yt-dlp',
            '/opt/homebrew/bin/yt-dlp',
            os.path.expanduser('~/.local/bin/yt-dlp'),
        ]
        for path in common_paths:
            if os.path.isfile(path) and os.access(path, os.X_OK):
                return path
        # Fall back to a PATH search; shutil.which is portable and avoids
        # spawning an external `which` process. Returns None if absent.
        return shutil.which('yt-dlp')

    def is_available(self) -> bool:
        """Check if yt-dlp is available."""
        return self.ytdlp_path is not None

    def _get_cookies_file(self) -> Optional[str]:
        """Get path to a Netscape-format cookies file, creating it from the
        database on first use. Returns None when no cookies are configured."""
        # Reuse the previously written temp file if it still exists.
        if self._cookies_file and os.path.exists(self._cookies_file):
            return self._cookies_file
        if not self.unified_db:
            return None
        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                # Try twitch-specific cookies first, then fall back to ytdlp
                for scraper_id in ['twitch', 'ytdlp']:
                    cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
                    row = cursor.fetchone()
                    if row and row[0]:
                        data = json.loads(row[0])
                        # Support both {"cookies": [...]} and [...] formats
                        if isinstance(data, dict) and 'cookies' in data:
                            cookies_list = data['cookies']
                        elif isinstance(data, list):
                            cookies_list = data
                        else:
                            cookies_list = []
                        if cookies_list:
                            # Write cookies to temp file in Netscape format
                            fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='twitch_cookies_')
                            with os.fdopen(fd, 'w') as f:
                                f.write("# Netscape HTTP Cookie File\n")
                                for cookie in cookies_list:
                                    domain = cookie.get('domain', '')
                                    # Netscape format: leading-dot domains apply to subdomains.
                                    include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                                    path = cookie.get('path', '/')
                                    secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                                    expiry = str(int(cookie.get('expirationDate', 0)))
                                    name = cookie.get('name', '')
                                    value = cookie.get('value', '')
                                    f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
                            self.log(f"Loaded {len(cookies_list)} cookies from {scraper_id} scraper", 'debug')
                            return self._cookies_file
        except Exception as e:
            self.log(f"Could not load cookies: {e}", 'debug')
        return None

    def _get_base_cmd(self) -> List[str]:
        """Get base yt-dlp command with cookies if available."""
        cmd = [self.ytdlp_path]
        cookies_file = self._get_cookies_file()
        if cookies_file:
            cmd.extend(['--cookies', cookies_file])
        return cmd

    def cleanup(self):
        """Clean up the temporary cookies file, if one was created."""
        if self._cookies_file and os.path.exists(self._cookies_file):
            try:
                os.unlink(self._cookies_file)
            except Exception:
                # Best effort: a leftover temp file is harmless.
                pass
        self._cookies_file = None

    @staticmethod
    def extract_channel_name(url: str) -> Optional[str]:
        """
        Extract channel name from Twitch URL

        Supports:
        - twitch.tv/username
        - twitch.tv/username/clips
        - m.twitch.tv/username/clips

        Returns the lowercased login name, or None if the URL does not match.
        """
        patterns = [
            r'twitch\.tv/([a-zA-Z0-9_]+)(?:/clips)?',
        ]
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1).lower()
        return None

    @staticmethod
    def normalize_clips_url(channel_name: str) -> str:
        """Convert channel name to clips URL with all-time filter."""
        return f"https://www.twitch.tv/{channel_name}/clips?filter=clips&range=all"

    async def get_channel_info(self, channel_url: str, count_clips: bool = True) -> Optional[Dict]:
        """
        Get channel information and optionally count all clips.

        Args:
            channel_url: Any supported Twitch channel URL.
            count_clips: When True, enumerate the full clip list to count it
                (can be slow for channels with many clips).

        Returns:
            Dict with channel_id/name/url, clips_url, thumbnail and
            clip_count, or None on failure.
        """
        if not self.is_available():
            return None
        channel_name = self.extract_channel_name(channel_url)
        if not channel_name:
            return None
        try:
            clips_url = self.normalize_clips_url(channel_name)
            # First get basic info from the first clip only (fast).
            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '--flat-playlist',
                '-j',
                '--playlist-items', '1',
                clips_url
            ]
            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            stdout, stderr = await result.communicate()
            if result.returncode != 0:
                self.log(f"Failed to get channel info: {stderr.decode()}", 'warning')
                return None
            # yt-dlp -j emits one JSON object per line; take the first parsable one.
            first_clip_data = None
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                if not line:
                    continue
                try:
                    first_clip_data = json.loads(line)
                    break
                except json.JSONDecodeError:
                    continue
            if not first_clip_data:
                return None
            # Count all clips if requested (this can take a while for channels with many clips)
            clip_count = 0
            if count_clips:
                self.log(f"Counting clips for {channel_name}...", 'debug')
                count_cmd = self._get_base_cmd() + [
                    '--no-warnings',
                    '--flat-playlist',
                    '--print', 'id',
                    clips_url
                ]
                count_result = await asyncio.create_subprocess_exec(
                    *count_cmd,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE
                )
                count_stdout, _ = await count_result.communicate()
                if count_result.returncode == 0:
                    # One printed id per line == one clip.
                    clip_count = len([l for l in count_stdout.decode('utf-8', errors='replace').strip().split('\n') if l])
                    self.log(f"Found {clip_count} clips for {channel_name}", 'info')
            return {
                'channel_id': channel_name,
                'channel_name': channel_name,
                'channel_url': f"https://www.twitch.tv/{channel_name}",
                'clips_url': clips_url,
                'thumbnail': first_clip_data.get('thumbnail'),
                'clip_count': clip_count,
            }
        except Exception as e:
            self.log(f"Error getting channel info: {e}", 'error')
            return None

    async def get_channel_clips(self, channel_url: str, since_date: str = None,
                                max_clips: int = None, progress_callback=None,
                                cache_thumbnails: bool = True) -> List[Dict]:
        """
        Get all clips from a channel

        Args:
            channel_url: Twitch channel URL
            since_date: Only fetch clips created after this date (ISO format)
            max_clips: Maximum number of clips to fetch
            progress_callback: Callback function(count) for progress updates
            cache_thumbnails: Whether to download and cache thumbnails locally

        Returns:
            List of clip metadata dicts with cached thumbnail paths
        """
        if not self.is_available():
            return []
        channel_name = self.extract_channel_name(channel_url)
        if not channel_name:
            self.log(f"Could not extract channel name from URL: {channel_url}", 'error')
            return []
        try:
            clips_url = self.normalize_clips_url(channel_name)
            # Use flat-playlist for faster extraction (full metadata available in flat mode for Twitch clips)
            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '--flat-playlist',
                '-j',
                clips_url
            ]
            # Add date filter at yt-dlp level for efficiency.
            # NOTE(review): --dateafter relies on upload_date being present in
            # flat-playlist entries — verify against the installed yt-dlp;
            # the Python-side since_date check below is the safety net.
            if since_date:
                try:
                    # Convert ISO date to YYYYMMDD format for yt-dlp
                    date_obj = datetime.fromisoformat(since_date.replace('Z', '+00:00'))
                    dateafter = date_obj.strftime('%Y%m%d')
                    cmd.extend(['--dateafter', dateafter])
                    self.log(f"Filtering clips after {dateafter}", 'debug')
                except (ValueError, AttributeError):
                    pass
            if max_clips:
                cmd.extend(['--playlist-items', f'1:{max_clips}'])
            self.log(f"Fetching clips from channel: {channel_name}", 'info')
            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            stdout, stderr = await result.communicate()
            if result.returncode != 0:
                error = stderr.decode('utf-8', errors='replace')
                self.log(f"Failed to get channel clips: {error}", 'warning')
                return []
            clips = []
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                if not line:
                    continue
                try:
                    data = json.loads(line)
                    clip_id = data.get('id')
                    if not clip_id:
                        continue
                    # Parse timestamp to ISO format
                    timestamp = data.get('timestamp')
                    upload_date = data.get('upload_date')
                    if timestamp:
                        try:
                            # NOTE(review): fromtimestamp() yields naive local
                            # time; since_date comparison below assumes the
                            # same convention — confirm callers pass local ISO.
                            upload_date = datetime.fromtimestamp(timestamp).isoformat()
                        except (ValueError, OSError):
                            pass
                    elif upload_date:
                        # Convert YYYYMMDD to ISO format
                        try:
                            upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
                        except ValueError:
                            pass
                    # Check if clip is newer than since_date; clips arrive
                    # newest-first, so the first older clip ends the scan.
                    if since_date and upload_date and upload_date <= since_date:
                        self.log(f"Reached clip from {upload_date}, stopping", 'debug')
                        break
                    # Extract clip slug from URL
                    clip_url = data.get('url') or data.get('webpage_url', '')
                    clip_slug = clip_url.split('/')[-1] if clip_url else clip_id
                    clips.append({
                        'clip_id': clip_id,
                        'clip_slug': clip_slug,
                        'title': data.get('title', f'Clip {clip_id}'),
                        'upload_date': upload_date,
                        'timestamp': timestamp,
                        'duration': data.get('duration'),
                        'view_count': data.get('view_count'),
                        'thumbnail': data.get('thumbnail'),
                        'url': clip_url,
                        'language': data.get('language'),
                        'channel_name': channel_name,
                    })
                    if progress_callback:
                        progress_callback(len(clips))
                    if max_clips and len(clips) >= max_clips:
                        break
                except json.JSONDecodeError:
                    continue
            self.log(f"Found {len(clips)} clips", 'info')
            # Cache thumbnails if requested
            if cache_thumbnails and clips:
                thumbnail_urls = [c.get('thumbnail') for c in clips if c.get('thumbnail')]
                if thumbnail_urls:
                    self.log(f"Caching {len(thumbnail_urls)} thumbnails...", 'debug')
                    cached_paths = await self.thumbnail_cache.cache_thumbnails_batch(thumbnail_urls)
                    # Update clips with cached thumbnail paths
                    for clip in clips:
                        thumb_url = clip.get('thumbnail')
                        if thumb_url and thumb_url in cached_paths:
                            clip['thumbnail_cached'] = cached_paths[thumb_url]
                    self.log(f"Cached {len(cached_paths)} thumbnails", 'debug')
            return clips
        except Exception as e:
            self.log(f"Error getting channel clips: {e}", 'error')
            return []

    async def download_clip(self, clip_url: str, output_dir: Path, quality: str = 'best',
                            progress_callback=None) -> Dict:
        """
        Download a clip

        Args:
            clip_url: Twitch clip URL
            output_dir: Directory to save the clip
            quality: Quality preset
            progress_callback: Callback for download progress

        Returns:
            Dict with success status and file info
        """
        if not self.is_available():
            return {'success': False, 'error': 'yt-dlp not available'}
        try:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            # Output template preserves title (truncated to 100 chars) and ID
            output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s')
            format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])
            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '-f', format_str,
                '-o', output_template,
                '--print-json',
                clip_url
            ]
            self.log(f"Downloading clip: {clip_url}", 'debug')
            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            stdout, stderr = await result.communicate()
            if result.returncode != 0:
                error_msg = stderr.decode('utf-8', errors='replace').strip()
                # Keep stored error messages short.
                if len(error_msg) > 200:
                    error_msg = error_msg[:200] + '...'
                return {'success': False, 'error': error_msg}
            # Parse output JSON (first parsable line from --print-json)
            clip_info = None
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                try:
                    clip_info = json.loads(line)
                    break
                except json.JSONDecodeError:
                    continue
            if not clip_info:
                # Fallback: assume the newest .mp4 in output_dir is ours.
                files = list(output_dir.glob('*.mp4'))
                if files:
                    file_path = max(files, key=lambda f: f.stat().st_mtime)
                    return {
                        'success': True,
                        'file_path': str(file_path),
                        'filename': file_path.name,
                        'file_size': file_path.stat().st_size
                    }
                return {'success': False, 'error': 'Could not find downloaded file'}
            file_path = clip_info.get('_filename') or clip_info.get('filename')
            if file_path:
                file_path = Path(file_path)
            return {
                'success': True,
                'file_path': str(file_path) if file_path else None,
                'filename': file_path.name if file_path else None,
                'file_size': file_path.stat().st_size if file_path and file_path.exists() else clip_info.get('filesize'),
                'title': clip_info.get('title'),
                'duration': clip_info.get('duration'),
                'clip_id': clip_info.get('id'),
                'upload_date': clip_info.get('upload_date'),
                'thumbnail': clip_info.get('thumbnail'),
            }
        except Exception as e:
            self.log(f"Error downloading clip: {e}", 'error')
            return {'success': False, 'error': str(e)}

    async def get_channel_avatar(self, channel_name: str) -> Optional[str]:
        """
        Try to fetch channel avatar from Twitch

        Note: This requires either Twitch API credentials or scraping.
        Returns None if avatar cannot be fetched.
        """
        profile = await self.get_channel_profile(channel_name)
        return profile.get('avatar') if profile else None

    async def get_channel_profile(self, channel_name: str) -> Optional[Dict]:
        """
        Fetch channel profile info using Twitch's GQL API.
        Returns dict with avatar, banner, display_name, bio, joined_date, external_links
        """
        try:
            async with aiohttp.ClientSession() as session:
                headers = {
                    'Client-Id': 'kimne78kx3ncx6brgo4mv6wki5h1ko',  # Public Twitch web client ID
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                }
                # Pass the login as a GQL variable instead of interpolating it
                # into the query string, so arbitrary input cannot break or
                # inject into the query.
                query = '''
                query($login: String!) {
                    user(login: $login) {
                        id
                        login
                        displayName
                        description
                        createdAt
                        profileImageURL(width: 300)
                        bannerImageURL
                        offlineImageURL
                        channel {
                            socialMedias {
                                name
                                url
                            }
                        }
                    }
                }
                '''
                async with session.post(
                    'https://gql.twitch.tv/gql',
                    headers=headers,
                    json={'query': query, 'variables': {'login': channel_name}},
                    timeout=aiohttp.ClientTimeout(total=15)
                ) as resp:
                    if resp.status == 200:
                        data = await resp.json()
                        user = data.get('data', {}).get('user')
                        if not user:
                            self.log(f"Twitch user not found: {channel_name}", 'warning')
                            return None
                        result = {}
                        # Avatar
                        if user.get('profileImageURL'):
                            result['avatar'] = user['profileImageURL']
                        # Banner - prefer offlineImageURL (larger), fall back to bannerImageURL
                        if user.get('offlineImageURL'):
                            result['banner'] = user['offlineImageURL']
                        elif user.get('bannerImageURL'):
                            result['banner'] = user['bannerImageURL']
                        # Display name
                        if user.get('displayName'):
                            result['display_name'] = user['displayName']
                        # Bio/description
                        if user.get('description'):
                            result['bio'] = user['description']
                        # Joined date (format: "Jun 10, 2016")
                        if user.get('createdAt'):
                            try:
                                created_dt = datetime.fromisoformat(user['createdAt'].replace('Z', '+00:00'))
                                result['joined_date'] = created_dt.strftime('%b %d, %Y')
                                self.log(f"Found Twitch joined date: {result['joined_date']}", 'debug')
                            except (ValueError, TypeError):
                                pass
                        # Social links
                        social_medias = user.get('channel', {}).get('socialMedias', [])
                        if social_medias:
                            links = []
                            for social in social_medias:
                                name = social.get('name', 'Link')
                                url = social.get('url', '')
                                if url:
                                    # Capitalize first letter of name
                                    title = name.capitalize() if name else 'Link'
                                    links.append({'title': title, 'url': url})
                            if links:
                                result['external_links'] = json.dumps(links)
                                self.log(f"Found {len(links)} Twitch external links", 'debug')
                        if result:
                            self.log(f"Fetched Twitch profile via GQL for {channel_name}: {list(result.keys())}", 'debug')
                            return result
        except Exception as e:
            self.log(f"Could not fetch Twitch profile: {e}", 'debug')
        return None

    async def get_creator(self, channel_url: str) -> Optional[Creator]:
        """
        Get Creator object from channel URL
        """
        info = await self.get_channel_info(channel_url)
        if not info:
            return None
        channel_name = info.get('channel_name') or self.extract_channel_name(channel_url)
        # Try to get the actual channel avatar (not clip thumbnail)
        avatar_url = await self.get_channel_avatar(channel_name)
        return Creator(
            creator_id=info.get('channel_id') or channel_name,
            service_id='twitch',
            platform='twitch',
            username=channel_name or 'Unknown',
            display_name=channel_name,
            profile_image_url=avatar_url,  # Use actual avatar, not clip thumbnail
            post_count=info.get('clip_count', 0)
        )

    async def get_posts(self, channel_url: str, since_date: str = None,
                        max_clips: int = None, progress_callback=None) -> List[Post]:
        """
        Get clips as Post objects
        """
        clips = await self.get_channel_clips(channel_url, since_date, max_clips, progress_callback)
        posts = []
        for clip in clips:
            # Create attachment for the clip
            attachment = Attachment(
                name=f"{clip['title']}.mp4",
                file_type='video',
                extension='.mp4',
                server_path=clip['url'],  # Use URL as server_path
                download_url=clip['url'],
                duration=clip.get('duration'),
            )
            post = Post(
                post_id=clip['clip_id'],
                service_id='twitch',
                platform='twitch',
                creator_id=clip.get('channel_name', ''),
                title=clip['title'],
                content='',  # Clips don't have descriptions
                published_at=clip.get('upload_date'),
                attachments=[attachment],
            )
            posts.append(post)
        return posts