Files
media-downloader/modules/universal_video_downloader.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

1434 lines
54 KiB
Python

#!/usr/bin/env python3
"""
Universal Video Downloader Module - Downloads videos from YouTube, Vimeo, Dailymotion, Bilibili, and more
"""
import hashlib
import json
import os
import re
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from modules.universal_logger import get_logger
logger = get_logger('UniversalVideoDownloader')
# Cookie/auth error patterns that indicate expired or invalid cookies.
# Matched case-insensitively (the caller lower-cases the text first) against
# raw downloader output by is_cookie_error() below.
COOKIE_ERROR_PATTERNS = [
    r'sign in to confirm',
    r'login required',
    r'cookies.*expired',
    r'please sign in',
    r'authentication required',
    r'private video',
    r'video is unavailable.*sign in',
    r'age-restricted.*sign in',
    r'members-only content',
    r'this video is available to this channel',
    r'confirm your age',
]

# Browser User-Agent strings (updated Dec 2024), keyed by the 'browser'
# anti-bot setting; used so downloader requests resemble browser traffic.
BROWSER_USER_AGENTS = {
    'edge': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
    'chrome': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'firefox': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
    'safari': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
}

# Default anti-bot settings. Persisted overrides live in the
# 'antibot_settings' row of the settings table and are merged over these.
DEFAULT_ANTIBOT_SETTINGS = {
    'browser': 'edge',
    'custom_user_agent': '',
    'limit_rate': '2M',
    'throttled_rate': '100K',
    'sleep_requests_min': 1,
    'sleep_requests_max': 3,
    'retries': 10,
    'fragment_retries': 10,
    'concurrent_fragments': 1,
    'socket_timeout': 30,
    'enabled': True,
}


def is_cookie_error(output: str) -> bool:
    """Return True if *output* contains a known cookie/auth error pattern.

    Args:
        output: Raw stderr/stdout text from yt-dlp/gallery-dl (may be empty).
    """
    if not output:
        return False
    output_lower = output.lower()
    return any(re.search(pattern, output_lower) for pattern in COOKIE_ERROR_PATTERNS)


def get_antibot_settings(unified_db) -> dict:
    """Get anti-bot settings from the database, merged over the defaults.

    Args:
        unified_db: UnifiedDatabase instance, or None/falsy for pure defaults.

    Returns:
        A fresh dict that always contains every DEFAULT_ANTIBOT_SETTINGS key.
    """
    if not unified_db:
        return DEFAULT_ANTIBOT_SETTINGS.copy()
    try:
        # json is imported at module level; the previous function-local
        # "import json" was redundant and has been removed.
        with unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT value FROM settings WHERE key = 'antibot_settings'")
            row = cursor.fetchone()
            if row:
                # Merge with defaults to ensure all keys exist even when the
                # stored blob was written by an older version.
                merged = DEFAULT_ANTIBOT_SETTINGS.copy()
                merged.update(json.loads(row[0]))
                return merged
    except Exception:
        # Best-effort: any DB/JSON problem falls through to the defaults.
        pass
    return DEFAULT_ANTIBOT_SETTINGS.copy()


def get_user_agent(settings: dict) -> str:
    """Get the User-Agent string selected by the anti-bot *settings*.

    Falls back to the Edge UA when the browser key is unknown, or when
    'custom' is selected but custom_user_agent is empty/blank.
    """
    browser = settings.get('browser', 'edge')
    if browser == 'custom':
        custom_ua = settings.get('custom_user_agent', '').strip()
        if custom_ua:
            return custom_ua
        # Fall back to edge if custom is empty
        return BROWSER_USER_AGENTS['edge']
    return BROWSER_USER_AGENTS.get(browser, BROWSER_USER_AGENTS['edge'])
def format_datetime_for_db(dt: datetime = None) -> str:
    """Format a datetime for database storage using a space separator.

    Uses '%Y-%m-%d %H:%M:%S' rather than the ISO 'T' separator: 'T' > ' ' in
    ASCII, so mixing ISO and space-separated strings would make SQLite's
    string sorting order dates incorrectly.

    Args:
        dt: Datetime to format. Defaults to the current UTC time for
            consistency with other parts of the system.

    Returns:
        A 'YYYY-MM-DD HH:MM:SS' string.
    """
    if dt is None:
        # timezone-aware replacement for datetime.utcnow(), which is
        # deprecated since Python 3.12; strftime output is unchanged.
        dt = datetime.now(timezone.utc)
    return dt.strftime('%Y-%m-%d %H:%M:%S')
# Platform configurations for yt-dlp-backed video sites.
# Each entry provides: display name, UI color, default download base path,
# url_patterns (regexes whose first capture group is the video ID), and
# id_pattern (matches a bare video ID passed instead of a full URL).
PLATFORMS = {
    'youtube': {
        'name': 'YouTube',
        'color': 'red',
        'base_path': '/opt/immich/md/youtube',
        'url_patterns': [
            # Standard watch/short-link/embed URLs; IDs are 11 chars
            r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
            r'youtube\.com/shorts/([a-zA-Z0-9_-]{11})',
        ],
        'id_pattern': r'^[a-zA-Z0-9_-]{11}$'
    },
    'vimeo': {
        'name': 'Vimeo',
        'color': 'blue',
        'base_path': '/opt/immich/md/vimeo',
        'url_patterns': [
            r'vimeo\.com/(\d+)',
            r'vimeo\.com/video/(\d+)',
            r'vimeo\.com/channels/[^/]+/(\d+)',
        ],
        'id_pattern': r'^\d+$'
    },
    'dailymotion': {
        'name': 'Dailymotion',
        'color': 'cyan',
        'base_path': '/opt/immich/md/dailymotion',
        'url_patterns': [
            r'dailymotion\.com/video/([a-zA-Z0-9]+)',
            r'dai\.ly/([a-zA-Z0-9]+)',
        ],
        'id_pattern': r'^[a-zA-Z0-9]+$'
    },
    'bilibili': {
        'name': 'Bilibili',
        'color': 'pink',
        'base_path': '/opt/immich/md/bilibili',
        'url_patterns': [
            # Both new (BV...) and legacy (av...) ID schemes are supported
            r'bilibili\.com/video/(BV[a-zA-Z0-9]+)',
            r'bilibili\.com/video/(av\d+)',
            r'b23\.tv/([a-zA-Z0-9]+)',
        ],
        'id_pattern': r'^(BV[a-zA-Z0-9]+|av\d+)$'
    }
}
# Sites that should use gallery-dl instead of yt-dlp (image/gallery focused).
# Entries mirror PLATFORMS (name, color, base_path, url_patterns) but omit
# 'id_pattern'.
# NOTE(review): several patterns below (rule34, e621, hitomi, gelbooru,
# danbooru, deviantart, artstation, pixiv, furaffinity, catbox) match the
# domain only and have no meaningful ID capture group -- verify that gallery
# IDs for those sites are derived elsewhere (e.g. hashed from the URL).
GALLERY_DL_SITES = {
    'erome': {
        'name': 'Erome',
        'color': 'purple',
        'base_path': '/opt/immich/md/erome',
        'url_patterns': [r'erome\.com/a/([a-zA-Z0-9]+)', r'erome\.com/([a-zA-Z0-9_-]+)$'],
    },
    'bunkr': {
        'name': 'Bunkr',
        'color': 'blue',
        'base_path': '/opt/immich/md/bunkr',
        # \w+ TLD because the site rotates domains (bunkr.si, bunkr.la, ...)
        'url_patterns': [r'bunkr\.\w+/a/([a-zA-Z0-9]+)', r'bunkr\.\w+/v/([a-zA-Z0-9]+)'],
    },
    'cyberdrop': {
        'name': 'Cyberdrop',
        'color': 'cyan',
        'base_path': '/opt/immich/md/cyberdrop',
        'url_patterns': [r'cyberdrop\.\w+/a/([a-zA-Z0-9]+)'],
    },
    'kemono': {
        'name': 'Kemono',
        'color': 'green',
        'base_path': '/opt/immich/md/kemono',
        # Captures (service, user_id) -- two groups, unlike most entries
        'url_patterns': [r'kemono\.\w+/([^/]+)/user/(\d+)'],
    },
    'coomer': {
        'name': 'Coomer',
        'color': 'pink',
        'base_path': '/opt/immich/md/coomer',
        'url_patterns': [r'coomer\.\w+/([^/]+)/user/(\d+)'],
    },
    'pixeldrain': {
        'name': 'Pixeldrain',
        'color': 'indigo',
        'base_path': '/opt/immich/md/pixeldrain',
        'url_patterns': [r'pixeldrain\.com/u/([a-zA-Z0-9]+)', r'pixeldrain\.com/l/([a-zA-Z0-9]+)'],
    },
    'gofile': {
        'name': 'GoFile',
        'color': 'yellow',
        'base_path': '/opt/immich/md/gofile',
        'url_patterns': [r'gofile\.io/d/([a-zA-Z0-9]+)'],
    },
    'imgbox': {
        'name': 'ImgBox',
        'color': 'gray',
        'base_path': '/opt/immich/md/imgbox',
        'url_patterns': [r'imgbox\.com/g/([a-zA-Z0-9]+)'],
    },
    'imagebam': {
        'name': 'ImageBam',
        'color': 'orange',
        'base_path': '/opt/immich/md/imagebam',
        'url_patterns': [r'imagebam\.com/gallery/([a-zA-Z0-9]+)'],
    },
    'fapello': {
        'name': 'Fapello',
        'color': 'red',
        'base_path': '/opt/immich/md/fapello',
        'url_patterns': [r'fapello\.com/([a-zA-Z0-9_-]+)'],
    },
    'imagefap': {
        'name': 'ImageFap',
        'color': 'green',
        'base_path': '/opt/immich/md/imagefap',
        'url_patterns': [r'imagefap\.com/pictures/(\d+)', r'imagefap\.com/gallery/(\d+)'],
    },
    'rule34': {
        'name': 'Rule34',
        'color': 'green',
        'base_path': '/opt/immich/md/rule34',
        'url_patterns': [r'rule34\.(xxx|us|paheal)'],
    },
    'e621': {
        'name': 'e621',
        'color': 'blue',
        'base_path': '/opt/immich/md/e621',
        'url_patterns': [r'e621\.net'],
    },
    'nhentai': {
        'name': 'nHentai',
        'color': 'pink',
        'base_path': '/opt/immich/md/nhentai',
        'url_patterns': [r'nhentai\.net/g/(\d+)'],
    },
    'hitomi': {
        'name': 'Hitomi',
        'color': 'pink',
        'base_path': '/opt/immich/md/hitomi',
        'url_patterns': [r'hitomi\.la'],
    },
    'gelbooru': {
        'name': 'Gelbooru',
        'color': 'blue',
        'base_path': '/opt/immich/md/gelbooru',
        'url_patterns': [r'gelbooru\.com'],
    },
    'danbooru': {
        'name': 'Danbooru',
        'color': 'blue',
        'base_path': '/opt/immich/md/danbooru',
        'url_patterns': [r'danbooru\.donmai\.us'],
    },
    'deviantart': {
        'name': 'DeviantArt',
        'color': 'green',
        'base_path': '/opt/immich/md/deviantart',
        'url_patterns': [r'deviantart\.com'],
    },
    'artstation': {
        'name': 'ArtStation',
        'color': 'blue',
        'base_path': '/opt/immich/md/artstation',
        'url_patterns': [r'artstation\.com'],
    },
    'pixiv': {
        'name': 'Pixiv',
        'color': 'blue',
        'base_path': '/opt/immich/md/pixiv',
        'url_patterns': [r'pixiv\.net'],
    },
    'furaffinity': {
        'name': 'FurAffinity',
        'color': 'orange',
        'base_path': '/opt/immich/md/furaffinity',
        'url_patterns': [r'furaffinity\.net'],
    },
    'catbox': {
        'name': 'Catbox',
        'color': 'purple',
        'base_path': '/opt/immich/md/catbox',
        'url_patterns': [r'catbox\.moe', r'files\.catbox\.moe'],
    },
}
class UniversalVideoDownloader:
"""Downloads videos from multiple platforms using yt-dlp and gallery-dl"""
# Default base directory for all downloads
DEFAULT_BASE_DIR = '/opt/immich/md'
def __init__(self, platform: str = 'youtube', base_path: Path = None, unified_db=None, cookies_file: str = None):
    """
    Initialize Universal Video Downloader
    Args:
        platform: Platform name (youtube, vimeo, dailymotion, bilibili, or gallery-dl sites)
        base_path: Base path for downloads (default: from settings or platform config)
        unified_db: UnifiedDatabase instance (required)
        cookies_file: Path to cookies file for yt-dlp (optional)
    Raises:
        ValueError: If the platform is unsupported or unified_db is missing
    """
    self.cookies_file = cookies_file
    # Check if platform is a gallery-dl site (image/album backend)
    self.is_gallery_dl = platform in GALLERY_DL_SITES
    if platform not in PLATFORMS and platform not in GALLERY_DL_SITES:
        raise ValueError(f"Unsupported platform: {platform}. Supported: {', '.join(list(PLATFORMS.keys()) + list(GALLERY_DL_SITES.keys()))}")
    self.platform = platform
    if self.is_gallery_dl:
        self.platform_config = GALLERY_DL_SITES[platform]
    else:
        self.platform_config = PLATFORMS[platform]
    # Set base path - check settings first, then use default
    if base_path:
        self.base_path = Path(base_path)
    else:
        # Try to get base directory from settings; downloads land in a
        # per-platform subdirectory beneath it
        config_base_dir = self._get_configured_base_dir(unified_db)
        self.base_path = Path(config_base_dir) / platform
    # Side effect: creates the download directory tree on disk
    self.base_path.mkdir(parents=True, exist_ok=True)
    # Load video downloader settings
    self.video_settings = self._get_video_downloader_settings(unified_db)
    # Initialize universal logger
    self.logger = get_logger('UniversalVideoDownloader')
    # Always use unified database adapter
    if not unified_db:
        raise ValueError("Universal video downloader requires unified_db")
    self.unified_db = unified_db
    # Initialize activity status manager for real-time updates
    # (function-scoped import -- presumably to avoid a circular import at
    # module load time; NOTE(review): confirm)
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)
def _get_video_downloader_settings(self, unified_db) -> dict:
"""Get video downloader settings from database."""
defaults = {
'base_path': '',
'max_concurrent': 3,
'cache_thumbnails': True,
'auto_generate_thumbnails': True,
'embed_metadata': True
}
if not unified_db:
return defaults
try:
import json
with unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT value FROM settings WHERE key = 'video_downloader'")
row = cursor.fetchone()
if row:
settings = json.loads(row[0])
defaults.update(settings)
except Exception:
pass
return defaults
def _get_configured_base_dir(self, unified_db) -> str:
"""Get base download directory from settings or use default."""
if not unified_db:
return self.DEFAULT_BASE_DIR
try:
import json
with unified_db.get_connection() as conn:
cursor = conn.cursor()
# First check video_downloader.base_path
cursor.execute("SELECT value FROM settings WHERE key = 'video_downloader'")
row = cursor.fetchone()
if row:
settings = json.loads(row[0])
base_path = settings.get('base_path')
if base_path:
return base_path
# Fall back to download_settings.base_directory
cursor.execute("SELECT value FROM settings WHERE key = 'download_settings'")
row = cursor.fetchone()
if row:
settings = json.loads(row[0])
base_dir = settings.get('base_directory')
if base_dir:
return base_dir
except Exception:
pass
return self.DEFAULT_BASE_DIR
def _get_ytdlp_base_cmd(self) -> list:
"""Get base yt-dlp command with cookies if configured."""
cmd = ['/opt/media-downloader/venv/bin/yt-dlp']
# Enable remote EJS components for YouTube n-challenge solving (deno required)
cmd.extend(['--remote-components', 'ejs:github'])
if self.cookies_file:
cmd.extend(['--cookies', self.cookies_file])
return cmd
def _get_gallery_dl_base_cmd(self) -> list:
"""Get base gallery-dl command with cookies if configured."""
cmd = ['/opt/media-downloader/venv/bin/gallery-dl']
if self.cookies_file:
cmd.extend(['--cookies', self.cookies_file])
return cmd
def log(self, message: str, level: str = "info", module: str = "Download"):
"""Log a message with level
Args:
message: The message to log
level: Log level ('debug', 'info', 'warning', 'error', 'success')
module: Module name for logging
"""
level = level.lower()
self.logger.log(f"[{self.platform_config['name']}] {message}", level.upper(), module=module)
def detect_platform(self, url: str) -> Optional[str]:
    """Detect which supported platform *url* belongs to.

    The yt-dlp platforms are consulted before the gallery-dl sites, so a
    URL matching both resolves to the yt-dlp platform.

    Args:
        url: Video URL
    Returns:
        Platform key, or None when no pattern matches
    """
    for registry in (PLATFORMS, GALLERY_DL_SITES):
        for name, config in registry.items():
            if any(re.search(p, url, re.IGNORECASE) for p in config['url_patterns']):
                return name
    return None
@staticmethod
def detect_gallery_dl_site(url: str) -> Optional[str]:
    """Return the gallery-dl site key whose URL patterns match *url*.

    Args:
        url: URL to check
    Returns:
        Site name, or None if not a gallery-dl site
    """
    hits = (
        site
        for site, config in GALLERY_DL_SITES.items()
        if any(re.search(p, url, re.IGNORECASE) for p in config['url_patterns'])
    )
    return next(hits, None)
def extract_video_id(self, url: str) -> Optional[str]:
"""Extract video ID from URL
Args:
url: Video URL
Returns:
Video ID or None if not found
"""
# Try patterns for current platform
for pattern in self.platform_config['url_patterns']:
match = re.search(pattern, url, re.IGNORECASE)
if match:
return match.group(1)
# If URL is just the video ID
if re.match(self.platform_config['id_pattern'], url):
return url
return None
def _is_already_downloaded(self, video_id: str) -> bool:
    """Check if a video has already been downloaded
    Args:
        video_id: Video ID
    Returns:
        True if already downloaded; False when not found OR on a database
        error (so a failed check never blocks a download attempt)
    """
    try:
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT COUNT(*) as count FROM video_downloads
                WHERE platform = ? AND video_id = ?
            ''', (self.platform, video_id))
            result = cursor.fetchone()
            # NOTE(review): result['count'] assumes the connection uses a
            # dict-like row factory (e.g. sqlite3.Row) -- verify unified_db
            # configures one, otherwise this raises and returns False.
            return result['count'] > 0
    except Exception as e:
        self.log(f"Error checking if video already downloaded: {e}", "error", "Database")
        return False
def _record_download(self, video_id: str, url: str, title: str,
                     file_path: str, uploader: str = None,
                     upload_date: Optional[datetime] = None,
                     duration: int = None, file_size: int = None,
                     metadata: Dict = None):
    """Record a successful download in the database
    Args:
        video_id: Video ID
        url: Original URL
        title: Video title
        file_path: Path to downloaded file
        uploader: Channel/uploader name
        upload_date: Upload date
        duration: Duration in seconds
        file_size: File size in bytes
        metadata: Additional metadata (stored as a JSON blob)
    Note:
        Best-effort: any exception is logged and swallowed so a failed
        bookkeeping write does not undo a completed download.
    """
    try:
        # Prepare metadata for JSON serialization (shallow copy so the
        # caller's dict is not mutated)
        metadata_serializable = None
        if metadata:
            metadata_serializable = dict(metadata)
            # Convert datetime objects to ISO format strings
            if 'upload_date' in metadata_serializable and isinstance(metadata_serializable['upload_date'], datetime):
                metadata_serializable['upload_date'] = metadata_serializable['upload_date'].isoformat()
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            # Thumbnail resolution order: preview-list cache, then the
            # download queue cache, then a live fetch from the URL.
            # Check if we have cached thumbnail from preview list
            cursor.execute('''
                SELECT thumbnail_data FROM video_preview_list
                WHERE platform = ? AND video_id = ?
            ''', (self.platform, video_id))
            preview_row = cursor.fetchone()
            thumbnail_data = preview_row[0] if preview_row else None
            # Also check video_download_queue (for downloads initiated from queue)
            if not thumbnail_data:
                cursor.execute('''
                    SELECT thumbnail_data FROM video_download_queue
                    WHERE platform = ? AND video_id = ?
                ''', (self.platform, video_id))
                queue_row = cursor.fetchone()
                if queue_row and queue_row[0]:
                    thumbnail_data = queue_row[0]
            # Fallback: fetch thumbnail from URL if not in cache
            if not thumbnail_data and metadata:
                thumbnail_url = metadata.get('thumbnail')
                if thumbnail_url:
                    thumbnail_data = self._fetch_thumbnail(thumbnail_url, video_id)
            cursor.execute('''
                INSERT INTO video_downloads
                (platform, video_id, url, title, uploader, upload_date, duration, file_path, file_size, metadata, download_date, thumbnail_data)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                self.platform,
                video_id,
                url,
                title,
                uploader,
                # Space-separated format keeps string ordering consistent
                # in SQLite (see format_datetime_for_db)
                format_datetime_for_db(upload_date) if upload_date else None,
                duration,
                file_path,
                file_size,
                json.dumps(metadata_serializable) if metadata_serializable else None,
                format_datetime_for_db(),
                thumbnail_data
            ))
            conn.commit()
        self.log(f"Recorded download: {title}", "success", "Database")
    except Exception as e:
        self.log(f"Error recording download: {e}", "error", "Database")
def _fetch_thumbnail(self, thumbnail_url: str, video_id: str) -> Optional[bytes]:
    """Fetch thumbnail from URL and return binary data.
    Args:
        thumbnail_url: URL of the thumbnail
        video_id: Video ID for logging
    Returns:
        Thumbnail binary data or None on failure
    """
    import requests
    if not thumbnail_url:
        return None
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    try:
        if 'ytimg.com' in thumbnail_url:
            # For YouTube, try maxresdefault first (1280x720, no black
            # bars), then fall back to hqdefault.  Responses under ~1KB are
            # treated as placeholder images and rejected.
            for quality in ('maxresdefault', 'hqdefault'):
                response = requests.get(
                    f"https://i.ytimg.com/vi/{video_id}/{quality}.jpg",
                    headers=headers,
                    timeout=10
                )
                if response.status_code == 200 and len(response.content) > 1000:
                    self.log(f"Fetched {quality} thumbnail for {video_id}", "debug", "Database")
                    return response.content
            return None
        # Non-YouTube: fetch the URL as given
        response = requests.get(thumbnail_url, headers=headers, timeout=10)
        if response.status_code == 200 and len(response.content) > 1000:
            self.log(f"Fetched thumbnail for {video_id}", "debug", "Database")
            return response.content
    except Exception as e:
        self.log(f"Failed to fetch thumbnail for {video_id}: {e}", "warning", "Database")
    return None
def get_video_info(self, url: str) -> Optional[Dict]:
    """Fetch video metadata via yt-dlp without downloading.

    Args:
        url: Video URL
    Returns:
        Dict with video_id, title, uploader, upload_date, duration,
        description, thumbnail, view_count and like_count, or None on
        any failure (non-zero exit, timeout, parse error).
    """
    try:
        self.log(f"Fetching video info for: {url}", "info", "Core")
        cmd = self._get_ytdlp_base_cmd() + ['--dump-json', '--no-playlist', url]
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        if proc.returncode != 0:
            self.log(f"Failed to fetch video info: {proc.stderr}", "error", "Core")
            return None
        info = json.loads(proc.stdout)
        # Parse the YYYYMMDD upload date when present
        parsed_date = None
        raw_date = info.get('upload_date')
        if raw_date:
            try:
                parsed_date = datetime.strptime(raw_date, '%Y%m%d')
            except Exception as e:
                self.log(f"Error parsing upload date: {e}", "warning", "Core")
        return {
            # Prefer yt-dlp's reported id, fall back to parsing the URL
            'video_id': info.get('id') or self.extract_video_id(url),
            'title': info.get('title'),
            'uploader': info.get('uploader') or info.get('channel') or info.get('creator'),
            'upload_date': parsed_date,
            'duration': info.get('duration'),
            'description': info.get('description'),
            'thumbnail': info.get('thumbnail'),
            'view_count': info.get('view_count'),
            'like_count': info.get('like_count'),
        }
    except subprocess.TimeoutExpired:
        self.log("Timeout fetching video info", "error", "Core")
        return None
    except Exception as e:
        self.log(f"Error fetching video info: {e}", "error", "Core")
        return None
def get_playlist_info(self, url: str) -> Optional[Dict]:
    """Fetch playlist metadata (one lightweight entry per video) via yt-dlp.

    Args:
        url: Playlist URL
    Returns:
        {'is_playlist': True, 'playlist_count': N, 'playlist_videos': [...]}
        or None on failure / empty playlist.
    """
    try:
        self.log(f"Fetching playlist info for: {url}", "info", "Core")
        # --flat-playlist fetches per-entry metadata only, not full info
        cmd = self._get_ytdlp_base_cmd() + ['--dump-json', '--flat-playlist', url]
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
        if proc.returncode != 0:
            self.log(f"Failed to fetch playlist info: {proc.stderr}", "error", "Core")
            return None
        # Output is JSONL: one JSON object per line
        videos = []
        for raw_line in proc.stdout.strip().split('\n'):
            if not raw_line.strip():
                continue
            try:
                entry = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            # Skip the playlist wrapper object itself
            if entry.get('_type') == 'playlist':
                continue
            videos.append({
                'video_id': entry.get('id'),
                'title': entry.get('title'),
                'uploader': entry.get('uploader') or entry.get('channel'),
                'upload_date': None,  # Not available in flat-playlist
                'duration': entry.get('duration'),
                'description': '',
                'thumbnail': entry.get('thumbnail'),
                'view_count': entry.get('view_count'),
                'like_count': entry.get('like_count'),
                'url': entry.get('url') or entry.get('webpage_url'),
            })
        if not videos:
            self.log("No videos found in playlist", "warning", "Core")
            return None
        return {
            'is_playlist': True,
            'playlist_count': len(videos),
            'playlist_videos': videos
        }
    except subprocess.TimeoutExpired:
        self.log("Timeout fetching playlist info", "error", "Core")
        return None
    except Exception as e:
        self.log(f"Error fetching playlist info: {e}", "error", "Core")
        return None
def get_gallery_info(self, url: str) -> Optional[Dict]:
    """Get gallery/album info using gallery-dl
    Args:
        url: Gallery URL
    Returns:
        Dictionary with gallery info (is_gallery=True, file/image/video
        counts, synthetic gallery ID) or None on error
    """
    try:
        self.log(f"Fetching gallery info for: {url}", "info", "Core")
        cmd = self._get_gallery_dl_base_cmd() + [
            '--dump-json',
            '--no-download',
            url
        ]
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode != 0:
            self.log(f"Failed to fetch gallery info: {result.stderr}", "error", "Core")
            return None
        # Parse JSON output (gallery-dl --dump-json emits a single JSON
        # array of entries)
        try:
            entries = json.loads(result.stdout)
        except json.JSONDecodeError:
            self.log("Failed to parse gallery-dl JSON output", "error", "Core")
            return None
        if not entries:
            self.log("No entries found in gallery", "warning", "Core")
            return None
        # gallery-dl output format:
        # - Entry with [2, {album_metadata}] = album info
        # - Entry with [3, "url", {file_metadata}] = file entries
        album_metadata = {}
        file_entries = []
        first_thumbnail = None
        for entry in entries:
            if isinstance(entry, list) and len(entry) >= 2:
                entry_type = entry[0]
                if entry_type == 2 and isinstance(entry[1], dict):
                    # Album metadata
                    album_metadata = entry[1]
                elif entry_type == 3 and len(entry) >= 3:
                    # File entry: [3, url, metadata]
                    file_url = entry[1]
                    file_meta = entry[2] if isinstance(entry[2], dict) else {}
                    file_entries.append({
                        'url': file_url,
                        'extension': file_meta.get('extension', ''),
                        'filename': file_meta.get('filename', '')
                    })
                    # Get first image as thumbnail (skip video files)
                    if not first_thumbnail and file_meta.get('extension', '').lower() in ['jpg', 'jpeg', 'png', 'gif', 'webp']:
                        first_thumbnail = file_url
        if not file_entries and not album_metadata:
            self.log("No valid entries found in gallery", "warning", "Core")
            return None
        # Generate a unique ID for the gallery: native album_id when
        # available, otherwise the first 12 hex chars of the URL's SHA-256
        gallery_id = album_metadata.get('album_id') or hashlib.sha256(url.encode()).hexdigest()[:12]
        # Count media types by file extension
        video_extensions = ['mp4', 'webm', 'mov', 'avi', 'mkv', 'm4v']
        video_count = sum(1 for e in file_entries if e.get('extension', '').lower() in video_extensions)
        image_count = len(file_entries) - video_count
        # Get title from metadata (first non-empty of several common keys)
        title = (album_metadata.get('title') or
                 album_metadata.get('album') or
                 album_metadata.get('gallery') or
                 f"Gallery {gallery_id}")
        # Shape mirrors get_video_info() so callers can treat both uniformly
        return {
            'video_id': gallery_id,
            'title': title,
            'uploader': album_metadata.get('user') or album_metadata.get('uploader') or album_metadata.get('author', ''),
            'upload_date': album_metadata.get('date'),
            'duration': 0,
            'description': album_metadata.get('description', ''),
            'thumbnail': first_thumbnail or (file_entries[0]['url'] if file_entries else ''),
            'view_count': 0,
            'like_count': 0,
            'is_gallery': True,
            'file_count': len(file_entries),
            'image_count': image_count,
            'video_count': video_count,
            'url': url,
            'tags': album_metadata.get('tags', []),
        }
    except subprocess.TimeoutExpired:
        self.log("Timeout fetching gallery info", "error", "Core")
        return None
    except Exception as e:
        self.log(f"Error fetching gallery info: {e}", "error", "Core")
        return None
def download_gallery(self, url: str, progress_callback=None, gallery_info: Dict = None) -> Tuple[bool, Optional[str], Optional[Dict]]:
    """Download a gallery/album using gallery-dl
    Args:
        url: Gallery URL
        progress_callback: Optional callback for progress updates (message, percentage, speed, eta)
        gallery_info: Optional pre-fetched gallery info from get_gallery_info()
    Returns:
        Tuple of (success, output_directory, metadata)
    """
    try:
        # Use album ID from gallery_info if available, otherwise generate hash
        gallery_id = gallery_info.get('video_id') if gallery_info else None
        if not gallery_id:
            gallery_id = hashlib.sha256(url.encode()).hexdigest()[:12]
        self.log(f"Starting gallery download: {url}", "info", "Core")
        if progress_callback:
            progress_callback(f"Starting gallery download...", 0, None, None)
        # Get uploader for subfolder organization
        uploader = gallery_info.get('uploader', '') if gallery_info else ''
        if not uploader:
            uploader = 'unknown'
        # Sanitize channel name for filesystem (strip reserved characters,
        # collapse whitespace, cap at 50 chars)
        safe_channel = re.sub(r'[<>:"/\\|?*]', '', uploader)
        safe_channel = re.sub(r'\s+', ' ', safe_channel).strip('. ')[:50] or 'unknown'
        # Create output directory under channel subfolder:
        # <base>/<channel>/<gallery_id>/
        channel_dir = self.base_path / safe_channel
        output_dir = channel_dir / gallery_id
        output_dir.mkdir(parents=True, exist_ok=True)
        # Build gallery-dl command
        # NOTE(review): the literal '(unknown)' filename stem means every
        # downloaded file shares the same base name and only differs by
        # extension -- this looks like a placeholder/redaction; verify the
        # intended gallery-dl filename template (e.g. '{filename}.{extension}').
        cmd = self._get_gallery_dl_base_cmd() + [
            '--directory', str(output_dir),
            '--filename', '(unknown).{extension}',
            '--write-metadata',
            '--write-info-json',
            url
        ]
        # Run gallery-dl with progress tracking; stderr is merged into
        # stdout and bufsize=1 gives line buffering in text mode
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        )
        downloaded_files = []
        total_files = 0
        current_file = 0
        for line in iter(process.stdout.readline, ''):
            line = line.strip()
            if not line:
                continue
            self.log(line, "debug", "Download")
            # Parse progress from gallery-dl output
            if line.startswith('#'):
                # Extract total count from "# 1/10" format
                match = re.search(r'#\s*(\d+)/(\d+)', line)
                if match:
                    current_file = int(match.group(1))
                    total_files = int(match.group(2))
                    percentage = int((current_file / total_files) * 100)
                    if progress_callback:
                        progress_callback(f"Downloading file {current_file}/{total_files}", percentage, None, None)
            elif 'Downloading' in line or 'Saving' in line:
                # 50% is a neutral placeholder before a total is known
                if progress_callback:
                    progress_callback(line, 50 if total_files == 0 else int((current_file / total_files) * 100), None, None)
            # Track downloaded files (non-recursive; .json metadata excluded)
            if output_dir.exists():
                current_files = list(output_dir.glob('*'))
                downloaded_files = [f for f in current_files if f.is_file() and not f.name.endswith('.json')]
        process.wait()
        if process.returncode != 0:
            self.log(f"Gallery download failed with code {process.returncode}", "error", "Core")
            if progress_callback:
                progress_callback("Download failed", 0, None, None)
            return False, None, None
        # Get final list of downloaded files
        downloaded_files = [f for f in output_dir.glob('*') if f.is_file() and not f.name.endswith('.json')]
        if not downloaded_files:
            self.log("No files were downloaded", "error", "Core")
            return False, None, None
        # Parse upload_date from gallery_info (may be a datetime already,
        # or a string in one of a few common formats)
        upload_date = None
        if gallery_info and gallery_info.get('upload_date'):
            ud = gallery_info['upload_date']
            if isinstance(ud, datetime):
                upload_date = ud
            elif isinstance(ud, str):
                # Try parsing common date formats
                for fmt in ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%S']:
                    try:
                        upload_date = datetime.strptime(ud, fmt)
                        break
                    except ValueError:
                        continue
        # Set file timestamps (atime and mtime) to upload date (same as yt-dlp)
        if upload_date:
            timestamp = upload_date.timestamp()
            for file_path in downloaded_files:
                os.utime(file_path, (timestamp, timestamp))
            self.log(f"Set file timestamps to {upload_date}", "info", "Core")
        # Calculate total size
        total_size = sum(f.stat().st_size for f in downloaded_files)
        # Use gallery_info if available for better metadata
        metadata = {
            'video_id': gallery_id,
            'title': gallery_info.get('title', f"Gallery {gallery_id}") if gallery_info else f"Gallery {gallery_id}",
            'uploader': gallery_info.get('uploader', '') if gallery_info else '',
            'upload_date': upload_date or datetime.now(),
            'duration': 0,
            'description': gallery_info.get('description', '') if gallery_info else '',
            'thumbnail': gallery_info.get('thumbnail', '') if gallery_info else '',
            'view_count': gallery_info.get('view_count', 0) if gallery_info else 0,
            'like_count': gallery_info.get('like_count', 0) if gallery_info else 0,
            'is_gallery': True,
            'file_count': len(downloaded_files),
            'total_size': total_size,
            'files': [str(f) for f in downloaded_files],
            'tags': gallery_info.get('tags', []) if gallery_info else [],
        }
        self.log(f"Gallery download complete: {len(downloaded_files)} files, {total_size} bytes", "success", "Core")
        if progress_callback:
            progress_callback(f"Downloaded {len(downloaded_files)} files", 100, None, None)
        # Record to video_downloads table
        self._record_download(
            video_id=gallery_id,
            url=url,
            title=metadata.get('title', f"Gallery {gallery_id}"),
            file_path=str(output_dir),
            uploader=metadata.get('uploader', ''),
            upload_date=upload_date,
            duration=0,
            file_size=total_size,
            metadata=metadata
        )
        # Also add to general downloads table for Media/Downloads page
        url_hash = hashlib.sha256(url.encode()).hexdigest()
        post_date = format_datetime_for_db(upload_date) if upload_date else format_datetime_for_db()
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR REPLACE INTO downloads
                (url_hash, url, platform, source, post_date, download_date, status, file_path, filename)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                url_hash,
                url,
                self.platform,
                metadata.get('uploader', ''),
                post_date,
                format_datetime_for_db(),
                'completed',
                str(output_dir),
                gallery_id
            ))
            conn.commit()
        # Add each file to file_inventory for Media page (same as yt-dlp)
        created_date = format_datetime_for_db(upload_date) if upload_date else format_datetime_for_db()
        for file_path in downloaded_files:
            file_stat = file_path.stat()
            ext = file_path.suffix.lower()
            # Classify by extension; anything non-video counts as an image
            content_type = 'video' if ext in ['.mp4', '.webm', '.mov', '.avi', '.mkv'] else 'image'
            # Prepare metadata for JSON serialization
            file_metadata = {
                'gallery_id': gallery_id,
                'title': metadata.get('title', ''),
                'uploader': metadata.get('uploader', ''),
                'tags': metadata.get('tags', []),
                'url': url,
            }
            self.unified_db.upsert_file_inventory(
                file_path=str(file_path),
                filename=file_path.name,
                platform=self.platform,
                source=metadata.get('uploader', ''),
                content_type=content_type,
                file_size=file_stat.st_size,
                location='final',
                metadata=file_metadata,
                created_date=created_date
            )
        self.log(f"Added {len(downloaded_files)} files to file_inventory", "info", "Database")
        return True, str(output_dir), metadata
    except Exception as e:
        self.log(f"Error downloading gallery: {e}", "error", "Core")
        if progress_callback:
            progress_callback(f"Error: {str(e)}", 0, None, None)
        return False, None, None
def download_video(self, url: str, progress_callback=None, update_activity: bool = True) -> Tuple[bool, Optional[str], Optional[Dict]]:
    """Download a single video with yt-dlp, then record and index the file.

    Args:
        url: Video page URL for this downloader's platform.
        progress_callback: Optional callable invoked as
            ``callback(message, percent, speed=None, eta=None)``. Some call
            sites pass only (message, percent), so callbacks should accept
            the trailing arguments with defaults.
        update_activity: Whether to mirror progress into the activity_status
            table (scheduler-driven downloads); queue downloads pass False.

    Returns:
        Tuple of (success, file_path, metadata). On failure file_path is
        None and metadata (when available) carries an 'error' key; a
        detected cookie/auth failure additionally sets 'cookie_error': True.
    """
    try:
        # Resolve the platform-specific video ID; reject unrecognized URLs.
        video_id = self.extract_video_id(url)
        if not video_id:
            self.log(f"Invalid {self.platform_config['name']} URL: {url}", "error", "Core")
            return False, None, None

        # Dedupe against the video_downloads history.
        if self._is_already_downloaded(video_id):
            self.log(f"Video {video_id} already downloaded, skipping", "info", "Core")
            return False, None, {'error': 'Already downloaded'}

        # NOTE(fix): dropped the unused `activity_key` local that was
        # assigned here but never referenced.
        if update_activity:
            self.activity_manager.update_status(f'Downloading: {url}')
        if progress_callback:
            progress_callback("Fetching video metadata...", 5)

        # Metadata probe first: title/date/uploader drive the output path.
        info = self.get_video_info(url)
        if not info:
            if update_activity:
                self.activity_manager.update_status('Idle')
            return False, None, {'error': 'Failed to fetch video info'}

        self.log(f"Downloading: {info['title']}", "info", "Core")
        if progress_callback:
            progress_callback(f"Downloading: {info['title']}", 10)

        # YYYYMMDD prefix keeps files chronologically sortable; fall back
        # to today's date when the platform reports no upload date.
        upload_date = info.get('upload_date')
        if upload_date:
            date_prefix = upload_date.strftime('%Y%m%d')
        else:
            date_prefix = datetime.now().strftime('%Y%m%d')

        # Strip characters that are illegal in Windows filenames.
        safe_title = re.sub(r'[<>:"/\\|?*]', '_', info['title'][:100])
        uploader = info.get('uploader') or info.get('channel') or info.get('creator') or 'unknown'
        safe_channel = re.sub(r'[<>:"/\\|?*]', '', uploader)
        safe_channel = re.sub(r'\s+', ' ', safe_channel).strip('. ')[:50] or 'unknown'

        # One subfolder per channel/uploader.
        channel_dir = self.base_path / safe_channel
        channel_dir.mkdir(parents=True, exist_ok=True)
        output_template = str(channel_dir / f"{date_prefix}_{safe_title}_{video_id}.%(ext)s")

        antibot = get_antibot_settings(self.unified_db)

        # Base yt-dlp invocation: best A/V merged into a single mp4.
        cmd = self._get_ytdlp_base_cmd() + [
            '--no-playlist',
            '--format', 'bestvideo+bestaudio/best',
            '--merge-output-format', 'mp4',
            '--output', output_template,
        ]
        if self.video_settings.get('embed_metadata', True):
            cmd.append('--add-metadata')
        if self.video_settings.get('cache_thumbnails', True):
            cmd.append('--embed-thumbnail')

        if antibot.get('enabled', True):
            # Spoof a browser UA and pace requests to avoid throttling/bans.
            user_agent = get_user_agent(antibot)
            cmd.extend(['--user-agent', user_agent])
            if antibot.get('limit_rate'):
                cmd.extend(['--limit-rate', antibot['limit_rate']])
            if antibot.get('throttled_rate'):
                cmd.extend(['--throttled-rate', antibot['throttled_rate']])
            sleep_min = antibot.get('sleep_requests_min', 1)
            sleep_max = antibot.get('sleep_requests_max', 3)
            cmd.extend(['--sleep-requests', str(sleep_min)])
            # Randomized inter-download delay when a valid range is given.
            if sleep_max > sleep_min:
                cmd.extend(['--sleep-interval', str(sleep_min), '--max-sleep-interval', str(sleep_max)])
            cmd.extend(['--concurrent-fragments', str(antibot.get('concurrent_fragments', 1))])
            cmd.extend(['--retries', str(antibot.get('retries', 10))])
            cmd.extend(['--fragment-retries', str(antibot.get('fragment_retries', 10))])
            cmd.extend(['--socket-timeout', str(antibot.get('socket_timeout', 30))])
            cmd.append('--no-abort-on-error')
        cmd.append(url)

        if progress_callback:
            progress_callback("Downloading video...", 20)

        # Stream yt-dlp output to surface live progress. FIX: the context
        # manager guarantees the stdout pipe is closed; the previous bare
        # Popen leaked the pipe fd until garbage collection.
        output_lines = []
        with subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True
        ) as process:
            for line in process.stdout:
                output_lines.append(line)
                # yt-dlp progress format:
                # [download]  45.2% of 123.45MiB at 2.5MiB/s ETA 00:32
                if '[download]' in line and '%' in line:
                    try:
                        percent_match = re.search(r'(\d+\.?\d*)%', line)
                        speed_match = re.search(r'at\s+([\d.]+\s*\w+/s)', line)
                        eta_match = re.search(r'ETA\s+([\d:]+)', line)
                        if percent_match:
                            percent = float(percent_match.group(1))
                            # Map 0-100% of the transfer onto the 20-90%
                            # slice of the overall pipeline progress.
                            scaled_percent = 20 + (percent * 0.7)
                            speed = speed_match.group(1) if speed_match else None
                            eta = eta_match.group(1) if eta_match else None
                            if progress_callback:
                                msg = f"Downloading: {percent:.1f}%"
                                if speed:
                                    # FIX: separator so speed is not glued
                                    # onto the percentage (matches ETA below).
                                    msg += f" • {speed}"
                                if eta:
                                    msg += f" • ETA {eta}"
                                progress_callback(msg, int(scaled_percent), speed, eta)
                    except (ValueError, KeyError, TypeError):
                        # Malformed progress lines are non-fatal; keep streaming.
                        pass
            process.wait()

        # Distinguish expired-cookie failures so callers can re-auth.
        full_output = ''.join(output_lines)
        if process.returncode != 0 and is_cookie_error(full_output):
            self.log("Download failed: Cookie/authentication error detected", "error", "Core")
            if update_activity:
                self.activity_manager.update_status('Idle')
            return False, None, {'error': 'Cookie expired', 'cookie_error': True}
        if process.returncode != 0:
            self.log("Download failed", "error", "Core")
            if update_activity:
                self.activity_manager.update_status('Idle')
            return False, None, {'error': 'Download failed'}

        if progress_callback:
            progress_callback("Processing metadata...", 95)

        # Locate the merged output; glob.escape guards titles containing
        # glob metacharacters such as [brackets].
        import glob as glob_module
        escaped_prefix = glob_module.escape(f"{date_prefix}_{safe_title}_{video_id}")
        expected_pattern = f"{escaped_prefix}.*"
        downloaded_files = list(channel_dir.glob(expected_pattern))
        if not downloaded_files:
            self.log("Downloaded file not found", "error", "Core")
            if update_activity:
                self.activity_manager.update_status('Idle')
            return False, None, {'error': 'File not found after download'}
        file_path = downloaded_files[0]

        # Backdate mtime/atime to the upload date so file browsers sort
        # by publication time rather than download time.
        if upload_date:
            timestamp = upload_date.timestamp()
            os.utime(file_path, (timestamp, timestamp))
            self.log(f"Set file timestamp to {upload_date}", "info", "Core")

        file_size = file_path.stat().st_size
        # Dimensions come from the yt-dlp metadata probe, not ffprobe.
        width = info.get('width')
        height = info.get('height')

        # Platform-specific history table.
        self._record_download(
            video_id=video_id,
            url=url,
            title=info['title'],
            file_path=str(file_path),
            uploader=info.get('uploader'),
            upload_date=upload_date,
            duration=info.get('duration'),
            file_size=file_size,
            metadata=info
        )

        # Cross-platform downloads table feeding the Media/Downloads page:
        # post_date = upload date, download_date = now.
        url_hash = hashlib.sha256(url.encode()).hexdigest()
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR IGNORE INTO downloads
                (url_hash, url, platform, source, post_date, download_date, status, file_path, filename)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                url_hash,
                url,
                self.platform,
                info.get('uploader'),
                format_datetime_for_db(upload_date) if upload_date else None,
                format_datetime_for_db(),
                'completed',
                str(file_path),
                file_path.name
            ))
            conn.commit()
        self.log(f"Added to downloads table: {file_path.name}", "info", "Database")

        # File inventory row powers the media gallery; datetime values must
        # be stringified before JSON serialization.
        download_time = format_datetime_for_db()
        metadata_serializable = dict(info)
        if isinstance(metadata_serializable.get('upload_date'), datetime):
            metadata_serializable['upload_date'] = format_datetime_for_db(metadata_serializable['upload_date'])
        self.unified_db.upsert_file_inventory(
            file_path=str(file_path),
            filename=file_path.name,
            platform=self.platform,
            source=info.get('uploader'),
            content_type='video',
            file_size=file_size,
            width=width,
            height=height,
            location='final',
            metadata=metadata_serializable,
            created_date=download_time,
            video_id=info.get('id')  # enables YouTube thumbnail lookup
        )
        self.log(f"Added to file inventory: {file_path.name}", "info", "Database")

        if progress_callback:
            progress_callback("Download complete!", 100)
        self.log(f"Successfully downloaded: {file_path.name}", "success", "Core")
        if update_activity:
            self.activity_manager.update_status('Idle')
        return True, str(file_path), info
    except Exception as e:
        self.log(f"Error downloading video: {e}", "error", "Core")
        if update_activity:
            self.activity_manager.update_status('Idle')
        return False, None, {'error': str(e)}
def main():
    """Interactive smoke test: pick a platform, paste a URL, download it."""
    from modules.unified_database import UnifiedDatabase
    db = UnifiedDatabase()
    print("Available platforms:")
    for key, config in PLATFORMS.items():
        print(f" {key}: {config['name']}")
    platform = input("\nSelect platform: ").lower()
    if platform not in PLATFORMS:
        print(f"Invalid platform. Choose from: {', '.join(PLATFORMS.keys())}")
        return
    downloader = UniversalVideoDownloader(platform=platform, unified_db=db)
    # Test URL
    test_url = input(f"Enter {PLATFORMS[platform]['name']} URL: ")

    # FIX: download_video invokes the callback with (msg, pct, speed, eta)
    # on progress lines; the old two-argument signature raised TypeError.
    def progress(msg, pct, speed=None, eta=None):
        print(f"[{pct}%] {msg}")

    success, file_path, metadata = downloader.download_video(test_url, progress)
    if success:
        print(f"\nSuccess! Downloaded to: {file_path}")
    else:
        # FIX: metadata can be None (e.g. invalid URL path returns
        # (False, None, None)); guard before calling .get().
        error = metadata.get('error', 'Unknown error') if metadata else 'Unknown error'
        print(f"\nFailed: {error}")


if __name__ == '__main__':
    main()