#!/usr/bin/env python3 """ Universal Video Downloader Module - Downloads videos from YouTube, Vimeo, Dailymotion, Bilibili, and more """ import os import re import json import subprocess import hashlib from pathlib import Path from datetime import datetime from typing import Dict, List, Optional, Tuple from modules.universal_logger import get_logger logger = get_logger('UniversalVideoDownloader') # Cookie/auth error patterns that indicate expired or invalid cookies COOKIE_ERROR_PATTERNS = [ r'sign in to confirm', r'login required', r'cookies.*expired', r'please sign in', r'authentication required', r'private video', r'video is unavailable.*sign in', r'age-restricted.*sign in', r'members-only content', r'this video is available to this channel', r'confirm your age', ] # Browser User-Agent strings (updated Dec 2024) BROWSER_USER_AGENTS = { 'edge': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0', 'chrome': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'firefox': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0', 'safari': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15', } # Default anti-bot settings DEFAULT_ANTIBOT_SETTINGS = { 'browser': 'edge', 'custom_user_agent': '', 'limit_rate': '2M', 'throttled_rate': '100K', 'sleep_requests_min': 1, 'sleep_requests_max': 3, 'retries': 10, 'fragment_retries': 10, 'concurrent_fragments': 1, 'socket_timeout': 30, 'enabled': True, } def is_cookie_error(output: str) -> bool: """Check if output contains cookie/auth error patterns.""" if not output: return False output_lower = output.lower() for pattern in COOKIE_ERROR_PATTERNS: if re.search(pattern, output_lower): return True return False def get_antibot_settings(unified_db) -> dict: """Get anti-bot settings from database or return defaults.""" if not unified_db: return DEFAULT_ANTIBOT_SETTINGS.copy() try: import json with unified_db.get_connection() as conn: cursor = conn.cursor() cursor.execute("SELECT value FROM settings WHERE key = 'antibot_settings'") row = cursor.fetchone() if row: settings = json.loads(row[0]) # Merge with defaults to ensure all keys exist merged = DEFAULT_ANTIBOT_SETTINGS.copy() merged.update(settings) return merged except Exception: pass return DEFAULT_ANTIBOT_SETTINGS.copy() def get_user_agent(settings: dict) -> str: """Get the user agent string based on settings.""" browser = settings.get('browser', 'edge') if browser == 'custom': custom_ua = settings.get('custom_user_agent', '').strip() if custom_ua: return custom_ua # Fall back to edge if custom is empty return BROWSER_USER_AGENTS['edge'] return BROWSER_USER_AGENTS.get(browser, BROWSER_USER_AGENTS['edge']) def format_datetime_for_db(dt: datetime = None) -> str: """Format datetime for database storage using space separator (not ISO T separator). This ensures consistent string sorting in SQLite since 'T' > ' ' would cause ISO format dates to sort incorrectly with space-separated dates. Uses UTC time for consistency with other parts of the system. """ if dt is None: dt = datetime.utcnow() return dt.strftime('%Y-%m-%d %H:%M:%S') # Platform configurations PLATFORMS = { 'youtube': { 'name': 'YouTube', 'color': 'red', 'base_path': '/opt/immich/md/youtube', 'url_patterns': [ r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})', r'youtube\.com/shorts/([a-zA-Z0-9_-]{11})', ], 'id_pattern': r'^[a-zA-Z0-9_-]{11}$' }, 'vimeo': { 'name': 'Vimeo', 'color': 'blue', 'base_path': '/opt/immich/md/vimeo', 'url_patterns': [ r'vimeo\.com/(\d+)', r'vimeo\.com/video/(\d+)', r'vimeo\.com/channels/[^/]+/(\d+)', ], 'id_pattern': r'^\d+$' }, 'dailymotion': { 'name': 'Dailymotion', 'color': 'cyan', 'base_path': '/opt/immich/md/dailymotion', 'url_patterns': [ r'dailymotion\.com/video/([a-zA-Z0-9]+)', r'dai\.ly/([a-zA-Z0-9]+)', ], 'id_pattern': r'^[a-zA-Z0-9]+$' }, 'bilibili': { 'name': 'Bilibili', 'color': 'pink', 'base_path': '/opt/immich/md/bilibili', 'url_patterns': [ r'bilibili\.com/video/(BV[a-zA-Z0-9]+)', r'bilibili\.com/video/(av\d+)', r'b23\.tv/([a-zA-Z0-9]+)', ], 'id_pattern': r'^(BV[a-zA-Z0-9]+|av\d+)$' } } # Sites that should use gallery-dl instead of yt-dlp (image/gallery focused) GALLERY_DL_SITES = { 'erome': { 'name': 'Erome', 'color': 'purple', 'base_path': '/opt/immich/md/erome', 'url_patterns': [r'erome\.com/a/([a-zA-Z0-9]+)', r'erome\.com/([a-zA-Z0-9_-]+)$'], }, 'bunkr': { 'name': 'Bunkr', 'color': 'blue', 'base_path': '/opt/immich/md/bunkr', 'url_patterns': [r'bunkr\.\w+/a/([a-zA-Z0-9]+)', r'bunkr\.\w+/v/([a-zA-Z0-9]+)'], }, 'cyberdrop': { 'name': 'Cyberdrop', 'color': 'cyan', 'base_path': '/opt/immich/md/cyberdrop', 'url_patterns': [r'cyberdrop\.\w+/a/([a-zA-Z0-9]+)'], }, 'kemono': { 'name': 'Kemono', 'color': 'green', 'base_path': '/opt/immich/md/kemono', 'url_patterns': [r'kemono\.\w+/([^/]+)/user/(\d+)'], }, 'coomer': { 'name': 'Coomer', 'color': 'pink', 'base_path': '/opt/immich/md/coomer', 'url_patterns': [r'coomer\.\w+/([^/]+)/user/(\d+)'], }, 'pixeldrain': { 'name': 'Pixeldrain', 'color': 'indigo', 'base_path': '/opt/immich/md/pixeldrain', 'url_patterns': [r'pixeldrain\.com/u/([a-zA-Z0-9]+)', r'pixeldrain\.com/l/([a-zA-Z0-9]+)'], }, 'gofile': { 'name': 'GoFile', 'color': 'yellow', 'base_path': '/opt/immich/md/gofile', 'url_patterns': [r'gofile\.io/d/([a-zA-Z0-9]+)'], }, 'imgbox': { 'name': 'ImgBox', 'color': 'gray', 'base_path': '/opt/immich/md/imgbox', 'url_patterns': [r'imgbox\.com/g/([a-zA-Z0-9]+)'], }, 'imagebam': { 'name': 'ImageBam', 'color': 'orange', 'base_path': '/opt/immich/md/imagebam', 'url_patterns': [r'imagebam\.com/gallery/([a-zA-Z0-9]+)'], }, 'fapello': { 'name': 'Fapello', 'color': 'red', 'base_path': '/opt/immich/md/fapello', 'url_patterns': [r'fapello\.com/([a-zA-Z0-9_-]+)'], }, 'imagefap': { 'name': 'ImageFap', 'color': 'green', 'base_path': '/opt/immich/md/imagefap', 'url_patterns': [r'imagefap\.com/pictures/(\d+)', r'imagefap\.com/gallery/(\d+)'], }, 'rule34': { 'name': 'Rule34', 'color': 'green', 'base_path': '/opt/immich/md/rule34', 'url_patterns': [r'rule34\.(xxx|us|paheal)'], }, 'e621': { 'name': 'e621', 'color': 'blue', 'base_path': '/opt/immich/md/e621', 'url_patterns': [r'e621\.net'], }, 'nhentai': { 'name': 'nHentai', 'color': 'pink', 'base_path': '/opt/immich/md/nhentai', 'url_patterns': [r'nhentai\.net/g/(\d+)'], }, 'hitomi': { 'name': 'Hitomi', 'color': 'pink', 'base_path': '/opt/immich/md/hitomi', 'url_patterns': [r'hitomi\.la'], }, 'gelbooru': { 'name': 'Gelbooru', 'color': 'blue', 'base_path': '/opt/immich/md/gelbooru', 'url_patterns': [r'gelbooru\.com'], }, 'danbooru': { 'name': 'Danbooru', 'color': 'blue', 'base_path': '/opt/immich/md/danbooru', 'url_patterns': [r'danbooru\.donmai\.us'], }, 'deviantart': { 'name': 'DeviantArt', 'color': 'green', 'base_path': '/opt/immich/md/deviantart', 'url_patterns': [r'deviantart\.com'], }, 'artstation': { 'name': 'ArtStation', 'color': 'blue', 'base_path': '/opt/immich/md/artstation', 'url_patterns': [r'artstation\.com'], }, 'pixiv': { 'name': 'Pixiv', 'color': 'blue', 'base_path': '/opt/immich/md/pixiv', 'url_patterns': [r'pixiv\.net'], }, 'furaffinity': { 'name': 'FurAffinity', 'color': 'orange', 'base_path': '/opt/immich/md/furaffinity', 'url_patterns': [r'furaffinity\.net'], }, 'catbox': { 'name': 'Catbox', 'color': 'purple', 'base_path': '/opt/immich/md/catbox', 'url_patterns': [r'catbox\.moe', r'files\.catbox\.moe'], }, } class UniversalVideoDownloader: """Downloads videos from multiple platforms using yt-dlp and gallery-dl""" # Default base directory for all downloads DEFAULT_BASE_DIR = '/opt/immich/md' def __init__(self, platform: str = 'youtube', base_path: Path = None, unified_db=None, cookies_file: str = None): """ Initialize Universal Video Downloader Args: platform: Platform name (youtube, vimeo, dailymotion, bilibili, or gallery-dl sites) base_path: Base path for downloads (default: from settings or platform config) unified_db: UnifiedDatabase instance (required) cookies_file: Path to cookies file for yt-dlp (optional) """ self.cookies_file = cookies_file # Check if platform is a gallery-dl site self.is_gallery_dl = platform in GALLERY_DL_SITES if platform not in PLATFORMS and platform not in GALLERY_DL_SITES: raise ValueError(f"Unsupported platform: {platform}. Supported: {', '.join(list(PLATFORMS.keys()) + list(GALLERY_DL_SITES.keys()))}") self.platform = platform if self.is_gallery_dl: self.platform_config = GALLERY_DL_SITES[platform] else: self.platform_config = PLATFORMS[platform] # Set base path - check settings first, then use default if base_path: self.base_path = Path(base_path) else: # Try to get base directory from settings config_base_dir = self._get_configured_base_dir(unified_db) self.base_path = Path(config_base_dir) / platform self.base_path.mkdir(parents=True, exist_ok=True) # Load video downloader settings self.video_settings = self._get_video_downloader_settings(unified_db) # Initialize universal logger self.logger = get_logger('UniversalVideoDownloader') # Always use unified database adapter if not unified_db: raise ValueError("Universal video downloader requires unified_db") self.unified_db = unified_db # Initialize activity status manager for real-time updates from modules.activity_status import get_activity_manager self.activity_manager = get_activity_manager(unified_db) def _get_video_downloader_settings(self, unified_db) -> dict: """Get video downloader settings from database.""" defaults = { 'base_path': '', 'max_concurrent': 3, 'cache_thumbnails': True, 'auto_generate_thumbnails': True, 'embed_metadata': True } if not unified_db: return defaults try: import json with unified_db.get_connection() as conn: cursor = conn.cursor() cursor.execute("SELECT value FROM settings WHERE key = 'video_downloader'") row = cursor.fetchone() if row: settings = json.loads(row[0]) defaults.update(settings) except Exception: pass return defaults def _get_configured_base_dir(self, unified_db) -> str: """Get base download directory from settings or use default.""" if not unified_db: return self.DEFAULT_BASE_DIR try: import json with unified_db.get_connection() as conn: cursor = conn.cursor() # First check video_downloader.base_path cursor.execute("SELECT value FROM settings WHERE key = 'video_downloader'") row = cursor.fetchone() if row: settings = json.loads(row[0]) base_path = settings.get('base_path') if base_path: return base_path # Fall back to download_settings.base_directory cursor.execute("SELECT value FROM settings WHERE key = 'download_settings'") row = cursor.fetchone() if row: settings = json.loads(row[0]) base_dir = settings.get('base_directory') if base_dir: return base_dir except Exception: pass return self.DEFAULT_BASE_DIR def _get_ytdlp_base_cmd(self) -> list: """Get base yt-dlp command with cookies if configured.""" cmd = ['/opt/media-downloader/venv/bin/yt-dlp'] # Enable remote EJS components for YouTube n-challenge solving (deno required) cmd.extend(['--remote-components', 'ejs:github']) if self.cookies_file: cmd.extend(['--cookies', self.cookies_file]) return cmd def _get_gallery_dl_base_cmd(self) -> list: """Get base gallery-dl command with cookies if configured.""" cmd = ['/opt/media-downloader/venv/bin/gallery-dl'] if self.cookies_file: cmd.extend(['--cookies', self.cookies_file]) return cmd def log(self, message: str, level: str = "info", module: str = "Download"): """Log a message with level Args: message: The message to log level: Log level ('debug', 'info', 'warning', 'error', 'success') module: Module name for logging """ level = level.lower() self.logger.log(f"[{self.platform_config['name']}] {message}", level.upper(), module=module) def detect_platform(self, url: str) -> Optional[str]: """Detect platform from URL Args: url: Video URL Returns: Platform name or None if not detected """ # Check yt-dlp platforms first for platform, config in PLATFORMS.items(): for pattern in config['url_patterns']: if re.search(pattern, url, re.IGNORECASE): return platform # Check gallery-dl sites for platform, config in GALLERY_DL_SITES.items(): for pattern in config['url_patterns']: if re.search(pattern, url, re.IGNORECASE): return platform return None @staticmethod def detect_gallery_dl_site(url: str) -> Optional[str]: """Detect if URL is a gallery-dl supported site Args: url: URL to check Returns: Site name or None if not a gallery-dl site """ for site, config in GALLERY_DL_SITES.items(): for pattern in config['url_patterns']: if re.search(pattern, url, re.IGNORECASE): return site return None def extract_video_id(self, url: str) -> Optional[str]: """Extract video ID from URL Args: url: Video URL Returns: Video ID or None if not found """ # Try patterns for current platform for pattern in self.platform_config['url_patterns']: match = re.search(pattern, url, re.IGNORECASE) if match: return match.group(1) # If URL is just the video ID if re.match(self.platform_config['id_pattern'], url): return url return None def _is_already_downloaded(self, video_id: str) -> bool: """Check if a video has already been downloaded Args: video_id: Video ID Returns: True if already downloaded """ try: with self.unified_db.get_connection() as conn: cursor = conn.cursor() cursor.execute(''' SELECT COUNT(*) as count FROM video_downloads WHERE platform = ? AND video_id = ? ''', (self.platform, video_id)) result = cursor.fetchone() return result['count'] > 0 except Exception as e: self.log(f"Error checking if video already downloaded: {e}", "error", "Database") return False def _record_download(self, video_id: str, url: str, title: str, file_path: str, uploader: str = None, upload_date: Optional[datetime] = None, duration: int = None, file_size: int = None, metadata: Dict = None): """Record a successful download in the database Args: video_id: Video ID url: Original URL title: Video title file_path: Path to downloaded file uploader: Channel/uploader name upload_date: Upload date duration: Duration in seconds file_size: File size in bytes metadata: Additional metadata """ try: # Prepare metadata for JSON serialization metadata_serializable = None if metadata: metadata_serializable = dict(metadata) # Convert datetime objects to ISO format strings if 'upload_date' in metadata_serializable and isinstance(metadata_serializable['upload_date'], datetime): metadata_serializable['upload_date'] = metadata_serializable['upload_date'].isoformat() with self.unified_db.get_connection() as conn: cursor = conn.cursor() # Check if we have cached thumbnail from preview list cursor.execute(''' SELECT thumbnail_data FROM video_preview_list WHERE platform = ? AND video_id = ? ''', (self.platform, video_id)) preview_row = cursor.fetchone() thumbnail_data = preview_row[0] if preview_row else None # Also check video_download_queue (for downloads initiated from queue) if not thumbnail_data: cursor.execute(''' SELECT thumbnail_data FROM video_download_queue WHERE platform = ? AND video_id = ? ''', (self.platform, video_id)) queue_row = cursor.fetchone() if queue_row and queue_row[0]: thumbnail_data = queue_row[0] # Fallback: fetch thumbnail from URL if not in cache if not thumbnail_data and metadata: thumbnail_url = metadata.get('thumbnail') if thumbnail_url: thumbnail_data = self._fetch_thumbnail(thumbnail_url, video_id) cursor.execute(''' INSERT INTO video_downloads (platform, video_id, url, title, uploader, upload_date, duration, file_path, file_size, metadata, download_date, thumbnail_data) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ''', ( self.platform, video_id, url, title, uploader, format_datetime_for_db(upload_date) if upload_date else None, duration, file_path, file_size, json.dumps(metadata_serializable) if metadata_serializable else None, format_datetime_for_db(), thumbnail_data )) conn.commit() self.log(f"Recorded download: {title}", "success", "Database") except Exception as e: self.log(f"Error recording download: {e}", "error", "Database") def _fetch_thumbnail(self, thumbnail_url: str, video_id: str) -> Optional[bytes]: """Fetch thumbnail from URL and return binary data. Args: thumbnail_url: URL of the thumbnail video_id: Video ID for logging Returns: Thumbnail binary data or None on failure """ import requests if not thumbnail_url: return None try: # For YouTube, try maxresdefault first (1280x720, no black bars), fallback to hqdefault url_to_fetch = thumbnail_url if 'ytimg.com' in thumbnail_url: # Try maxresdefault first (best quality, no letterboxing) for quality in ['maxresdefault', 'hqdefault']: url_to_fetch = f"https://i.ytimg.com/vi/{video_id}/{quality}.jpg" response = requests.get( url_to_fetch, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}, timeout=10 ) if response.status_code == 200 and len(response.content) > 1000: self.log(f"Fetched {quality} thumbnail for {video_id}", "debug", "Database") return response.content return None response = requests.get( url_to_fetch, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}, timeout=10 ) if response.status_code == 200 and len(response.content) > 1000: self.log(f"Fetched thumbnail for {video_id}", "debug", "Database") return response.content except Exception as e: self.log(f"Failed to fetch thumbnail for {video_id}: {e}", "warning", "Database") return None def get_video_info(self, url: str) -> Optional[Dict]: """Get video metadata using yt-dlp without downloading Args: url: Video URL Returns: Dictionary with video info or None on error """ try: self.log(f"Fetching video info for: {url}", "info", "Core") cmd = self._get_ytdlp_base_cmd() + [ '--dump-json', '--no-playlist', url ] result = subprocess.run( cmd, capture_output=True, text=True, timeout=30 ) if result.returncode != 0: self.log(f"Failed to fetch video info: {result.stderr}", "error", "Core") return None info = json.loads(result.stdout) # Extract upload date upload_date = None if 'upload_date' in info and info['upload_date']: try: upload_date = datetime.strptime(info['upload_date'], '%Y%m%d') except Exception as e: self.log(f"Error parsing upload date: {e}", "warning", "Core") # Extract video ID from info video_id = info.get('id') or self.extract_video_id(url) return { 'video_id': video_id, 'title': info.get('title'), 'uploader': info.get('uploader') or info.get('channel') or info.get('creator'), 'upload_date': upload_date, 'duration': info.get('duration'), 'description': info.get('description'), 'thumbnail': info.get('thumbnail'), 'view_count': info.get('view_count'), 'like_count': info.get('like_count'), } except subprocess.TimeoutExpired: self.log("Timeout fetching video info", "error", "Core") return None except Exception as e: self.log(f"Error fetching video info: {e}", "error", "Core") return None def get_playlist_info(self, url: str) -> Optional[Dict]: """Get playlist info including all video entries Args: url: Playlist URL Returns: Dictionary with playlist info and video entries or None on error """ try: self.log(f"Fetching playlist info for: {url}", "info", "Core") cmd = self._get_ytdlp_base_cmd() + [ '--dump-json', '--flat-playlist', # Only fetch metadata, not full video info url ] result = subprocess.run( cmd, capture_output=True, text=True, timeout=60 ) if result.returncode != 0: self.log(f"Failed to fetch playlist info: {result.stderr}", "error", "Core") return None # Parse JSONL output (one JSON object per line) videos = [] lines = result.stdout.strip().split('\n') for line in lines: if not line.strip(): continue try: entry = json.loads(line) # Skip non-video entries if entry.get('_type') == 'playlist': continue videos.append({ 'video_id': entry.get('id'), 'title': entry.get('title'), 'uploader': entry.get('uploader') or entry.get('channel'), 'upload_date': None, # Not available in flat-playlist 'duration': entry.get('duration'), 'description': '', 'thumbnail': entry.get('thumbnail'), 'view_count': entry.get('view_count'), 'like_count': entry.get('like_count'), 'url': entry.get('url') or entry.get('webpage_url'), }) except json.JSONDecodeError: continue if not videos: self.log("No videos found in playlist", "warning", "Core") return None return { 'is_playlist': True, 'playlist_count': len(videos), 'playlist_videos': videos } except subprocess.TimeoutExpired: self.log("Timeout fetching playlist info", "error", "Core") return None except Exception as e: self.log(f"Error fetching playlist info: {e}", "error", "Core") return None def get_gallery_info(self, url: str) -> Optional[Dict]: """Get gallery/album info using gallery-dl Args: url: Gallery URL Returns: Dictionary with gallery info or None on error """ try: self.log(f"Fetching gallery info for: {url}", "info", "Core") cmd = self._get_gallery_dl_base_cmd() + [ '--dump-json', '--no-download', url ] result = subprocess.run( cmd, capture_output=True, text=True, timeout=60 ) if result.returncode != 0: self.log(f"Failed to fetch gallery info: {result.stderr}", "error", "Core") return None # Parse JSON output try: entries = json.loads(result.stdout) except json.JSONDecodeError: self.log("Failed to parse gallery-dl JSON output", "error", "Core") return None if not entries: self.log("No entries found in gallery", "warning", "Core") return None # gallery-dl output format: # - Entry with [2, {album_metadata}] = album info # - Entry with [3, "url", {file_metadata}] = file entries album_metadata = {} file_entries = [] first_thumbnail = None for entry in entries: if isinstance(entry, list) and len(entry) >= 2: entry_type = entry[0] if entry_type == 2 and isinstance(entry[1], dict): # Album metadata album_metadata = entry[1] elif entry_type == 3 and len(entry) >= 3: # File entry: [3, url, metadata] file_url = entry[1] file_meta = entry[2] if isinstance(entry[2], dict) else {} file_entries.append({ 'url': file_url, 'extension': file_meta.get('extension', ''), 'filename': file_meta.get('filename', '') }) # Get first image as thumbnail if not first_thumbnail and file_meta.get('extension', '').lower() in ['jpg', 'jpeg', 'png', 'gif', 'webp']: first_thumbnail = file_url if not file_entries and not album_metadata: self.log("No valid entries found in gallery", "warning", "Core") return None # Generate a unique ID for the gallery gallery_id = album_metadata.get('album_id') or hashlib.sha256(url.encode()).hexdigest()[:12] # Count media types video_extensions = ['mp4', 'webm', 'mov', 'avi', 'mkv', 'm4v'] video_count = sum(1 for e in file_entries if e.get('extension', '').lower() in video_extensions) image_count = len(file_entries) - video_count # Get title from metadata title = (album_metadata.get('title') or album_metadata.get('album') or album_metadata.get('gallery') or f"Gallery {gallery_id}") return { 'video_id': gallery_id, 'title': title, 'uploader': album_metadata.get('user') or album_metadata.get('uploader') or album_metadata.get('author', ''), 'upload_date': album_metadata.get('date'), 'duration': 0, 'description': album_metadata.get('description', ''), 'thumbnail': first_thumbnail or (file_entries[0]['url'] if file_entries else ''), 'view_count': 0, 'like_count': 0, 'is_gallery': True, 'file_count': len(file_entries), 'image_count': image_count, 'video_count': video_count, 'url': url, 'tags': album_metadata.get('tags', []), } except subprocess.TimeoutExpired: self.log("Timeout fetching gallery info", "error", "Core") return None except Exception as e: self.log(f"Error fetching gallery info: {e}", "error", "Core") return None def download_gallery(self, url: str, progress_callback=None, gallery_info: Dict = None) -> Tuple[bool, Optional[str], Optional[Dict]]: """Download a gallery/album using gallery-dl Args: url: Gallery URL progress_callback: Optional callback for progress updates (message, percentage, speed, eta) gallery_info: Optional pre-fetched gallery info from get_gallery_info() Returns: Tuple of (success, output_directory, metadata) """ try: # Use album ID from gallery_info if available, otherwise generate hash gallery_id = gallery_info.get('video_id') if gallery_info else None if not gallery_id: gallery_id = hashlib.sha256(url.encode()).hexdigest()[:12] self.log(f"Starting gallery download: {url}", "info", "Core") if progress_callback: progress_callback(f"Starting gallery download...", 0, None, None) # Get uploader for subfolder organization uploader = gallery_info.get('uploader', '') if gallery_info else '' if not uploader: uploader = 'unknown' # Sanitize channel name for filesystem safe_channel = re.sub(r'[<>:"/\\|?*]', '', uploader) safe_channel = re.sub(r'\s+', ' ', safe_channel).strip('. ')[:50] or 'unknown' # Create output directory under channel subfolder channel_dir = self.base_path / safe_channel output_dir = channel_dir / gallery_id output_dir.mkdir(parents=True, exist_ok=True) # Build gallery-dl command cmd = self._get_gallery_dl_base_cmd() + [ '--directory', str(output_dir), '--filename', '{filename}.{extension}', '--write-metadata', '--write-info-json', url ] # Run gallery-dl with progress tracking process = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1 ) downloaded_files = [] total_files = 0 current_file = 0 for line in iter(process.stdout.readline, ''): line = line.strip() if not line: continue self.log(line, "debug", "Download") # Parse progress from gallery-dl output if line.startswith('#'): # Extract total count from "# 1/10" format match = re.search(r'#\s*(\d+)/(\d+)', line) if match: current_file = int(match.group(1)) total_files = int(match.group(2)) percentage = int((current_file / total_files) * 100) if progress_callback: progress_callback(f"Downloading file {current_file}/{total_files}", percentage, None, None) elif 'Downloading' in line or 'Saving' in line: if progress_callback: progress_callback(line, 50 if total_files == 0 else int((current_file / total_files) * 100), None, None) # Track downloaded files if output_dir.exists(): current_files = list(output_dir.glob('*')) downloaded_files = [f for f in current_files if f.is_file() and not f.name.endswith('.json')] process.wait() if process.returncode != 0: self.log(f"Gallery download failed with code {process.returncode}", "error", "Core") if progress_callback: progress_callback("Download failed", 0, None, None) return False, None, None # Get final list of downloaded files downloaded_files = [f for f in output_dir.glob('*') if f.is_file() and not f.name.endswith('.json')] if not downloaded_files: self.log("No files were downloaded", "error", "Core") return False, None, None # Parse upload_date from gallery_info upload_date = None if gallery_info and gallery_info.get('upload_date'): ud = gallery_info['upload_date'] if isinstance(ud, datetime): upload_date = ud elif isinstance(ud, str): # Try parsing common date formats for fmt in ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%S']: try: upload_date = datetime.strptime(ud, fmt) break except ValueError: continue # Set file timestamps to upload date (same as yt-dlp) if upload_date: timestamp = upload_date.timestamp() for file_path in downloaded_files: os.utime(file_path, (timestamp, timestamp)) self.log(f"Set file timestamps to {upload_date}", "info", "Core") # Calculate total size total_size = sum(f.stat().st_size for f in downloaded_files) # Use gallery_info if available for better metadata metadata = { 'video_id': gallery_id, 'title': gallery_info.get('title', f"Gallery {gallery_id}") if gallery_info else f"Gallery {gallery_id}", 'uploader': gallery_info.get('uploader', '') if gallery_info else '', 'upload_date': upload_date or datetime.now(), 'duration': 0, 'description': gallery_info.get('description', '') if gallery_info else '', 'thumbnail': gallery_info.get('thumbnail', '') if gallery_info else '', 'view_count': gallery_info.get('view_count', 0) if gallery_info else 0, 'like_count': gallery_info.get('like_count', 0) if gallery_info else 0, 'is_gallery': True, 'file_count': len(downloaded_files), 'total_size': total_size, 'files': [str(f) for f in downloaded_files], 'tags': gallery_info.get('tags', []) if gallery_info else [], } self.log(f"Gallery download complete: {len(downloaded_files)} files, {total_size} bytes", "success", "Core") if progress_callback: progress_callback(f"Downloaded {len(downloaded_files)} files", 100, None, None) # Record to video_downloads table self._record_download( video_id=gallery_id, url=url, title=metadata.get('title', f"Gallery {gallery_id}"), file_path=str(output_dir), uploader=metadata.get('uploader', ''), upload_date=upload_date, duration=0, file_size=total_size, metadata=metadata ) # Also add to general downloads table for Media/Downloads page url_hash = hashlib.sha256(url.encode()).hexdigest() post_date = format_datetime_for_db(upload_date) if upload_date else format_datetime_for_db() with self.unified_db.get_connection() as conn: cursor = conn.cursor() cursor.execute(''' INSERT OR REPLACE INTO downloads (url_hash, url, platform, source, post_date, download_date, status, file_path, filename) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) ''', ( url_hash, url, self.platform, metadata.get('uploader', ''), post_date, format_datetime_for_db(), 'completed', str(output_dir), gallery_id )) conn.commit() # Add each file to file_inventory for Media page (same as yt-dlp) created_date = format_datetime_for_db(upload_date) if upload_date else format_datetime_for_db() for file_path in downloaded_files: file_stat = file_path.stat() ext = file_path.suffix.lower() content_type = 'video' if ext in ['.mp4', '.webm', '.mov', '.avi', '.mkv'] else 'image' # Prepare metadata for JSON serialization file_metadata = { 'gallery_id': gallery_id, 'title': metadata.get('title', ''), 'uploader': metadata.get('uploader', ''), 'tags': metadata.get('tags', []), 'url': url, } self.unified_db.upsert_file_inventory( file_path=str(file_path), filename=file_path.name, platform=self.platform, source=metadata.get('uploader', ''), content_type=content_type, file_size=file_stat.st_size, location='final', metadata=file_metadata, created_date=created_date ) self.log(f"Added {len(downloaded_files)} files to file_inventory", "info", "Database") return True, str(output_dir), metadata except Exception as e: self.log(f"Error downloading gallery: {e}", "error", "Core") if progress_callback: progress_callback(f"Error: {str(e)}", 0, None, None) return False, None, None def download_video(self, url: str, progress_callback=None, update_activity: bool = True) -> Tuple[bool, Optional[str], Optional[Dict]]: """Download a video with metadata extraction Args: url: Video URL progress_callback: Optional callback for progress updates (message, percentage) update_activity: Whether to update the activity_status table (set False for queue downloads) Returns: Tuple of (success, file_path, metadata) """ try: # Extract video ID video_id = self.extract_video_id(url) if not video_id: self.log(f"Invalid {self.platform_config['name']} URL: {url}", "error", "Core") return False, None, None # Check if already downloaded if self._is_already_downloaded(video_id): self.log(f"Video {video_id} already downloaded, skipping", "info", "Core") return False, None, {'error': 'Already downloaded'} # Update activity status (only for scheduler-driven downloads, not queue) activity_key = f'{self.platform}_downloader' if update_activity: self.activity_manager.update_status(f'Downloading: {url}') if progress_callback: progress_callback("Fetching video metadata...", 5) # Get video info first info = self.get_video_info(url) if not info: if update_activity: self.activity_manager.update_status('Idle') return False, None, {'error': 'Failed to fetch video info'} self.log(f"Downloading: {info['title']}", "info", "Core") if progress_callback: progress_callback(f"Downloading: {info['title']}", 10) # Generate output filename with date prefix upload_date = info.get('upload_date') if upload_date: date_prefix = upload_date.strftime('%Y%m%d') else: date_prefix = datetime.now().strftime('%Y%m%d') # Sanitize title for filename safe_title = re.sub(r'[<>:"/\\|?*]', '_', info['title'][:100]) # Get channel/uploader for subfolder organization uploader = info.get('uploader') or info.get('channel') or info.get('creator') or 'unknown' # Sanitize channel name for filesystem safe_channel = re.sub(r'[<>:"/\\|?*]', '', uploader) safe_channel = re.sub(r'\s+', ' ', safe_channel).strip('. ')[:50] or 'unknown' # Create channel subfolder channel_dir = self.base_path / safe_channel channel_dir.mkdir(parents=True, exist_ok=True) output_template = str(channel_dir / f"{date_prefix}_{safe_title}_{video_id}.%(ext)s") # Get anti-bot settings antibot = get_antibot_settings(self.unified_db) # Build base command cmd = self._get_ytdlp_base_cmd() + [ '--no-playlist', '--format', 'bestvideo+bestaudio/best', '--merge-output-format', 'mp4', '--output', output_template, ] # Add metadata embedding based on settings if self.video_settings.get('embed_metadata', True): cmd.append('--add-metadata') # Add thumbnail embedding based on settings if self.video_settings.get('cache_thumbnails', True): cmd.append('--embed-thumbnail') # Add anti-bot measures if enabled if antibot.get('enabled', True): # User agent user_agent = get_user_agent(antibot) cmd.extend(['--user-agent', user_agent]) # Rate limiting if antibot.get('limit_rate'): cmd.extend(['--limit-rate', antibot['limit_rate']]) # Throttle detection if antibot.get('throttled_rate'): cmd.extend(['--throttled-rate', antibot['throttled_rate']]) # Sleep between requests sleep_min = antibot.get('sleep_requests_min', 1) sleep_max = antibot.get('sleep_requests_max', 3) cmd.extend(['--sleep-requests', str(sleep_min)]) # Use sleep-interval for delays between downloads (with max variant) if sleep_max > sleep_min: cmd.extend(['--sleep-interval', str(sleep_min), '--max-sleep-interval', str(sleep_max)]) # Concurrent fragments cmd.extend(['--concurrent-fragments', str(antibot.get('concurrent_fragments', 1))]) # Retries cmd.extend(['--retries', str(antibot.get('retries', 10))]) cmd.extend(['--fragment-retries', str(antibot.get('fragment_retries', 10))]) # Socket timeout cmd.extend(['--socket-timeout', str(antibot.get('socket_timeout', 30))]) # Don't abort on errors cmd.append('--no-abort-on-error') # Add URL last cmd.append(url) if progress_callback: progress_callback("Downloading video...", 20) # Run download with progress tracking process = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True ) # Collect output for error detection output_lines = [] # Parse yt-dlp output for progress for line in process.stdout: output_lines.append(line) # Look for [download] XX.X% lines # Format: [download] 45.2% of 123.45MiB at 2.5MiB/s ETA 00:32 if '[download]' in line and '%' in line: try: percent_match = re.search(r'(\d+\.?\d*)%', line) speed_match = re.search(r'at\s+([\d.]+\s*\w+/s)', line) eta_match = re.search(r'ETA\s+([\d:]+)', line) if percent_match: percent = float(percent_match.group(1)) # Scale to 20-90% range scaled_percent = 20 + (percent * 0.7) speed = speed_match.group(1) if speed_match else None eta = eta_match.group(1) if eta_match else None if progress_callback: # Build message with speed/ETA if available msg = f"Downloading: {percent:.1f}%" if speed: msg += f" • {speed}" if eta: msg += f" • ETA {eta}" progress_callback(msg, int(scaled_percent), speed, eta) except (ValueError, KeyError, TypeError): pass process.wait() # Check for cookie/auth errors in output full_output = ''.join(output_lines) if process.returncode != 0 and is_cookie_error(full_output): self.log("Download failed: Cookie/authentication error detected", "error", "Core") if update_activity: self.activity_manager.update_status('Idle') return False, None, {'error': 'Cookie expired', 'cookie_error': True} if process.returncode != 0: self.log("Download failed", "error", "Core") if update_activity: self.activity_manager.update_status('Idle') return False, None, {'error': 'Download failed'} if progress_callback: progress_callback("Processing metadata...", 95) # Find the downloaded file # Escape glob special characters (brackets, etc.) in the pattern import glob as glob_module escaped_prefix = glob_module.escape(f"{date_prefix}_{safe_title}_{video_id}") expected_pattern = f"{escaped_prefix}.*" downloaded_files = list(channel_dir.glob(expected_pattern)) if not downloaded_files: self.log("Downloaded file not found", "error", "Core") if update_activity: self.activity_manager.update_status('Idle') return False, None, {'error': 'File not found after download'} file_path = downloaded_files[0] # Set file timestamp to upload date if upload_date: timestamp = upload_date.timestamp() os.utime(file_path, (timestamp, timestamp)) self.log(f"Set file timestamp to {upload_date}", "info", "Core") # Get file size file_size = file_path.stat().st_size # Get video dimensions using yt-dlp metadata width = info.get('width') height = info.get('height') # Record download in video_downloads table self._record_download( video_id=video_id, url=url, title=info['title'], file_path=str(file_path), uploader=info.get('uploader'), upload_date=upload_date, duration=info.get('duration'), file_size=file_size, metadata=info ) # Also add to general downloads table for Media/Downloads page queries # post_date = upload date, download_date = today url_hash = hashlib.sha256(url.encode()).hexdigest() with self.unified_db.get_connection() as conn: cursor = conn.cursor() cursor.execute(''' INSERT OR IGNORE INTO downloads (url_hash, url, platform, source, post_date, download_date, status, file_path, filename) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) ''', ( url_hash, url, self.platform, info.get('uploader'), format_datetime_for_db(upload_date) if upload_date else None, format_datetime_for_db(), 'completed', str(file_path), file_path.name )) conn.commit() self.log(f"Added to downloads table: {file_path.name}", "info", "Database") # Add to file inventory for media gallery download_time = format_datetime_for_db() # Prepare metadata for JSON serialization (convert datetime to string) metadata_serializable = dict(info) if 'upload_date' in metadata_serializable and metadata_serializable['upload_date']: metadata_serializable['upload_date'] = format_datetime_for_db(metadata_serializable['upload_date']) if isinstance(metadata_serializable['upload_date'], datetime) else metadata_serializable['upload_date'] self.unified_db.upsert_file_inventory( file_path=str(file_path), filename=file_path.name, platform=self.platform, source=info.get('uploader'), content_type='video', file_size=file_size, width=width, height=height, location='final', metadata=metadata_serializable, created_date=download_time, video_id=info.get('id') # For YouTube thumbnail lookup ) self.log(f"Added to file inventory: {file_path.name}", "info", "Database") if progress_callback: progress_callback("Download complete!", 100) self.log(f"Successfully downloaded: {file_path.name}", "success", "Core") if update_activity: self.activity_manager.update_status('Idle') return True, str(file_path), info except Exception as e: self.log(f"Error downloading video: {e}", "error", "Core") if update_activity: self.activity_manager.update_status('Idle') return False, None, {'error': str(e)} def main(): """Test function""" from modules.unified_database import UnifiedDatabase db = UnifiedDatabase() print("Available platforms:") for key, config in PLATFORMS.items(): print(f" {key}: {config['name']}") platform = input("\nSelect platform: ").lower() if platform not in PLATFORMS: print(f"Invalid platform. Choose from: {', '.join(PLATFORMS.keys())}") return downloader = UniversalVideoDownloader(platform=platform, unified_db=db) # Test URL test_url = input(f"Enter {PLATFORMS[platform]['name']} URL: ") def progress(msg, pct): print(f"[{pct}%] {msg}") success, file_path, metadata = downloader.download_video(test_url, progress) if success: print(f"\nSuccess! Downloaded to: {file_path}") else: print(f"\nFailed: {metadata.get('error', 'Unknown error')}") if __name__ == '__main__': main()