#!/usr/bin/env python3
"""
Multi-threaded Download Manager

Handles concurrent downloads with rate limiting, retries, and progress
tracking.  Can be used by forum_downloader, fastdl_module, and other
downloaders.
"""

import os
import re
import time
import hashlib
import requests
import threading
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Any, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock, Semaphore
from dataclasses import dataclass
import sqlite3
from urllib.parse import urlparse

from modules.base_module import LoggingMixin
from modules.universal_logger import get_logger

logger = get_logger('DownloadManager')  # For standalone/example usage


@dataclass
class DownloadItem:
    """Single download item."""
    url: str
    save_path: Path
    referer: Optional[str] = None
    headers: Optional[Dict[str, str]] = None
    metadata: Optional[Dict[str, Any]] = None
    post_date: Optional[datetime] = None  # Timestamp to set on downloaded file
    retry_count: int = 0
    max_retries: int = 3


@dataclass
class DownloadResult:
    """Result of a download."""
    success: bool
    item: DownloadItem
    file_size: Optional[int] = None
    download_time: Optional[float] = None
    error: Optional[str] = None
    file_hash: Optional[str] = None


class DownloadManager(LoggingMixin):
    """
    Multi-threaded download manager with:
    - Concurrent downloads
    - Rate limiting
    - Automatic retries
    - Progress tracking
    - Database tracking
    - Playwright support for authenticated downloads
    """

    def __init__(self, max_workers: int = 5,
                 rate_limit: float = 0.5,
                 timeout: int = 30,
                 chunk_size: int = 8192,
                 use_database: bool = False,
                 db_path: str = None,
                 show_progress: bool = True,
                 show_debug: bool = False):
        """
        Initialize download manager.

        Args:
            max_workers: Maximum concurrent downloads
            rate_limit: Seconds between downloads per thread
            timeout: Download timeout in seconds
            chunk_size: Chunk size for streaming downloads
            use_database: Track downloads in database
            db_path: Path to database file
            show_progress: Show download progress
            show_debug: Show debug messages
        """
        self.max_workers = max_workers
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.chunk_size = chunk_size
        self.use_database = use_database
        self.db_path = db_path
        self.show_progress = show_progress

        # Initialize logging via mixin
        self._init_logger('DownloadManager', None, default_module='Download',
                          show_debug=show_debug)

        # Thread synchronization
        self.download_lock = Lock()
        self.rate_limiter = Semaphore(max_workers)
        self.last_download_time = {}  # per-thread timestamp for rate limiting

        # Thread-local storage for ImageBam sessions (each thread gets its own session)
        self._imagebam_session_local = threading.local()

        # Statistics (guarded by download_lock)
        self.stats = {
            'total': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0,
            'total_bytes': 0,
            'total_time': 0
        }

        # User agent
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

        # Playwright context for authenticated downloads
        self.playwright_context = None

        # Initialize database only if explicitly enabled AND path provided
        if self.use_database and self.db_path:
            self._init_database()
        elif self.use_database and not self.db_path:
            # Disable database if no path provided to prevent creating files in CWD
            self.use_database = False

    def _init_database(self):
        """Initialize download tracking database (table + lookup indexes)."""
        if not self.db_path:
            return
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS downloads (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    url TEXT UNIQUE NOT NULL,
                    file_path TEXT NOT NULL,
                    file_hash TEXT,
                    file_size INTEGER,
                    download_date DATETIME DEFAULT CURRENT_TIMESTAMP,
                    metadata TEXT
                )
            ''')
            cursor.execute('''
                CREATE INDEX IF NOT EXISTS idx_downloads_url ON downloads(url)
            ''')
            cursor.execute('''
                CREATE INDEX IF NOT EXISTS idx_downloads_hash ON downloads(file_hash)
            ''')
            conn.commit()
        finally:
            conn.close()

    def set_playwright_context(self, context):
        """Set Playwright context for authenticated downloads.

        Also mirrors the context's cookies into ``self.cookies`` so the
        requests-based download path can reuse the authenticated session.
        """
        self.playwright_context = context
        # Extract cookies from context for requests library
        if context:
            try:
                self.cookies = {}
                cookies = context.cookies()
                for cookie in cookies:
                    self.cookies[cookie['name']] = cookie['value']
            except Exception:
                # Best-effort: a stale/closed context just means no cookies
                self.cookies = {}

    def _is_already_downloaded(self, url: str, file_path: Path) -> bool:
        """Check if file was already downloaded.

        Without a database this is a simple non-empty-file check; with one,
        the recorded size must still match the file on disk.
        """
        if not self.use_database:
            return file_path.exists() and file_path.stat().st_size > 0

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT file_path, file_size FROM downloads WHERE url = ?",
                (url,)
            )
            result = cursor.fetchone()
        finally:
            conn.close()

        if result:
            # Check if file still exists and has expected size
            saved_path = Path(result[0])
            if saved_path.exists() and saved_path.stat().st_size == result[1]:
                return True
        return False

    def _apply_rate_limit(self, thread_id: int):
        """Apply rate limiting per thread.

        BUG FIX: the wait is computed under the lock but slept OUTSIDE it,
        so one throttled worker no longer blocks every other thread's stats
        updates and rate checks for the duration of its sleep.
        """
        with self.download_lock:
            last = self.last_download_time.get(thread_id)
            wait = (self.rate_limit - (time.time() - last)) if last else 0.0
        if wait > 0:
            time.sleep(wait)
        with self.download_lock:
            self.last_download_time[thread_id] = time.time()

    def _extract_pixhost_direct_url(self, show_url: str) -> Optional[str]:
        """Extract direct image URL from pixhost show URL.

        Probes img{N}.pixhost.to hosts with HEAD requests: a list of common
        host numbers first, then a sequential scan of the remaining 1-120.
        """
        try:
            # Pattern to extract ID and filename from show URL
            show_pattern = re.compile(
                r"https?://(?:www\.)?pixhost\.to/show/(\d+)/([^/]+)$",
                re.IGNORECASE)
            match = show_pattern.match(show_url)
            if not match:
                return None

            img_id = match.group(1)
            filename = match.group(2)

            # Try common hosts in order
            common_hosts = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 60,
                            70, 80, 90, 100]
            for host_num in common_hosts:
                # BUG FIX: use the extracted filename (was a literal placeholder)
                test_url = f"https://img{host_num}.pixhost.to/images/{img_id}/{filename}"
                try:
                    # Quick HEAD request to check if URL exists
                    response = requests.head(test_url, timeout=2,
                                             allow_redirects=False)
                    if response.status_code == 200:
                        return test_url
                except requests.RequestException:
                    continue

            # Try sequential scan if common hosts don't work
            for host_num in range(1, 121):
                if host_num in common_hosts:
                    continue
                test_url = f"https://img{host_num}.pixhost.to/images/{img_id}/{filename}"
                try:
                    response = requests.head(test_url, timeout=1,
                                             allow_redirects=False)
                    if response.status_code == 200:
                        return test_url
                except requests.RequestException:
                    continue

            return None
        except Exception as e:
            self.log(f"Error extracting pixhost URL: {e}", "error")
            return None

    def _extract_imagebam_direct_url(self, imagebam_url: str) -> Optional[str]:
        """Extract direct image URL from ImageBam page.

        Uses a thread-local requests session with interstitial-bypass
        cookies, then scrapes the page HTML for the full-resolution
        (``_o`` suffix) image URL, falling back to any images.imagebam.com URL.
        """
        try:
            # Get or create thread-local ImageBam session (thread-safe)
            session = getattr(self._imagebam_session_local, 'session', None)
            if session is None:
                session = requests.Session()
                session.headers.update({
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })
                # Set cookies to bypass the interstitial ad page (both old and new cookies)
                session.cookies.set('nsfw_inter', '1', domain='.imagebam.com')
                session.cookies.set('sfw_inter', '1', domain='.imagebam.com')
                self._imagebam_session_local.session = session

            # ImageBam now requires two requests - first to get session cookies,
            # second to get image.  First request sets up the session.
            response = session.get(imagebam_url, timeout=5)
            if response.status_code != 200:
                self.log(f"ImageBam page returned {response.status_code}", "warning")
                return None

            # Check if we got the interstitial page (contains "Continue to your image")
            if 'Continue to your image' in response.text or 'Please wait' in response.text:
                # Make sure bypass cookies are set and request again
                session.cookies.set('sfw_inter', '1', domain='.imagebam.com')
                session.cookies.set('nsfw_inter', '1', domain='.imagebam.com')
                response = session.get(imagebam_url, timeout=5)

            # Look for the direct image URL in the HTML.
            # ImageBam stores the full image with _o suffix;
            # first try to find the full resolution image.
            full_img_pattern = r'(https?://images\d*\.imagebam\.com/[a-f0-9/]+/[A-Z0-9]+_o\.\w+)'
            matches = re.findall(full_img_pattern, response.text, re.IGNORECASE)
            if matches:
                # Return the first full resolution image found
                direct_url = matches[0]
                self.log(f"Extracted ImageBam direct URL: {direct_url}", "debug")
                return direct_url

            # Fallback: look for any image on images*.imagebam.com
            # BUG FIX: the <img tag prefix of the first pattern was lost in
            # the original (mangled to ']+src=...').
            fallback_patterns = [
                r'<img[^>]+src="(https?://images\d*\.imagebam\.com/[^"]+)"',
                r'"(https?://images\d*\.imagebam\.com/[^"]+\.(?:jpg|jpeg|png|gif))"',
            ]
            for pattern in fallback_patterns:
                matches = re.findall(pattern, response.text, re.IGNORECASE)
                if matches:
                    direct_url = matches[0]
                    self.log(f"Extracted ImageBam direct URL (fallback): {direct_url}", "debug")
                    return direct_url

            self.log("No direct image URL found in ImageBam HTML", "warning")
            return None

        except requests.Timeout:
            self.log(f"ImageBam extraction timed out for {imagebam_url}", "warning")
            return None
        except Exception as e:
            self.log(f"Error extracting ImageBam URL: {e}", "error")
            return None

    def _download_with_gallery_dl(self, item: DownloadItem) -> DownloadResult:
        """Download using gallery-dl for supported hosts (ImageTwist, etc.)."""
        import subprocess
        start_time = time.time()
        try:
            # Ensure parent directory exists
            item.save_path.parent.mkdir(parents=True, exist_ok=True)

            # Build gallery-dl command
            cmd = [
                "gallery-dl",
                "--dest", str(item.save_path.parent),
                "--filename", item.save_path.name,
                "--no-skip",
                "--no-part",
                "--quiet"
            ]
            # Add referer if provided
            if item.referer:
                cmd.extend(["--header", f"Referer: {item.referer}"])
            cmd.append(item.url)

            # Run gallery-dl with timeout
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=60
            )

            if result.returncode == 0 and item.save_path.exists():
                file_size = item.save_path.stat().st_size
                download_time = time.time() - start_time

                # Calculate hash (SHA256 for consistency with unified database)
                with open(item.save_path, 'rb') as f:
                    file_hash = hashlib.sha256(f.read()).hexdigest()

                # Set file timestamp if we have a date
                if item.post_date:
                    try:
                        timestamp_unix = item.post_date.timestamp()
                        os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                    except Exception as e:
                        self.log(f"Failed to set timestamp: {e}", "warning")

                self.log(f"Downloaded via gallery-dl: {item.save_path.name}", "success")
                return DownloadResult(
                    success=True,
                    item=item,
                    file_size=file_size,
                    download_time=download_time,
                    file_hash=file_hash
                )
            else:
                error_msg = result.stderr or "Unknown error"
                return DownloadResult(
                    success=False, item=item,
                    error=f"gallery-dl failed: {error_msg}"
                )
        except subprocess.TimeoutExpired:
            return DownloadResult(success=False, item=item,
                                  error="gallery-dl timed out")
        except Exception as e:
            return DownloadResult(success=False, item=item, error=str(e))

    def _download_from_imagetwist(self, item: DownloadItem) -> DownloadResult:
        """Download image from ImageTwist using gallery-dl for URL resolution.

        ImageTwist serves an error placeholder if requests come too fast,
        so a dedicated 2-second rate limit is applied around both the URL
        resolution and the actual image fetch.
        """
        import subprocess
        start_time = time.time()

        # Rate limiting for ImageTwist (they return error images if too fast)
        if not hasattr(self, '_imagetwist_last_request'):
            self._imagetwist_last_request = 0
        with self.download_lock:
            elapsed = time.time() - self._imagetwist_last_request
            if elapsed < 2.0:  # Minimum 2 seconds between ImageTwist requests
                time.sleep(2.0 - elapsed)
            self._imagetwist_last_request = time.time()

        try:
            # Use gallery-dl to get the actual image URL.
            # CONSISTENCY FIX: fall back to gallery-dl on PATH when the
            # hardcoded venv binary does not exist (matches _download_with_gallery_dl).
            gallery_dl_bin = '/opt/media-downloader/venv/bin/gallery-dl'
            if not os.path.exists(gallery_dl_bin):
                gallery_dl_bin = 'gallery-dl'
            result = subprocess.run(
                [gallery_dl_bin, '-g', item.url],
                capture_output=True, text=True, timeout=30
            )
            if result.returncode != 0 or not result.stdout.strip():
                # Fallback to manual parsing
                return self._download_from_imagetwist_fallback(item, start_time)

            img_url = result.stdout.strip().split('\n')[0]
            if not img_url or 'imagetwist' not in img_url:
                return self._download_from_imagetwist_fallback(item, start_time)

            # Rate limit again before actual download
            with self.download_lock:
                elapsed = time.time() - self._imagetwist_last_request
                if elapsed < 2.0:
                    time.sleep(2.0 - elapsed)
                self._imagetwist_last_request = time.time()

            # Download the actual image - use imagetwist page as Referer
            item.save_path.parent.mkdir(parents=True, exist_ok=True)
            headers = {
                'User-Agent': self.user_agent,
                'Referer': item.url  # Use imagetwist page URL as Referer
            }
            img_response = requests.get(img_url, headers=headers, timeout=30,
                                        stream=True)
            img_response.raise_for_status()

            # Check for ImageTwist error placeholder (8346 bytes - rate limited or deleted)
            content_length = img_response.headers.get('Content-Length', '')
            if content_length == '8346':
                self.log(f"ImageTwist rate limited or unavailable: {item.url}", "warning")
                return DownloadResult(success=False, item=item,
                                      error="ImageTwist error image (rate limited)")

            # Validate it's an image, not HTML
            chunks = []
            for chunk in img_response.iter_content(chunk_size=8192):
                if not chunks:  # First chunk
                    if chunk[:100].lower().find(b'<html') != -1:
                        return DownloadResult(success=False, item=item,
                                              error="Got HTML instead of image")
                chunks.append(chunk)

            # NOTE(review): the success path below was reconstructed — the
            # original text was garbled at this point; verify against VCS.
            content = b''.join(chunks)
            with open(item.save_path, 'wb') as f:
                f.write(content)

            # SHA256 for consistency with unified database
            file_hash = hashlib.sha256(content).hexdigest()

            # Set file timestamp if we have a date
            if item.post_date:
                try:
                    timestamp_unix = item.post_date.timestamp()
                    os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                except Exception as e:
                    self.log(f"Failed to set timestamp: {e}", "warning")

            return DownloadResult(
                success=True,
                item=item,
                file_size=len(content),
                download_time=time.time() - start_time,
                file_hash=file_hash
            )
        except subprocess.TimeoutExpired:
            return self._download_from_imagetwist_fallback(item, start_time)
        except Exception as e:
            return DownloadResult(success=False, item=item, error=str(e))

    def _download_from_imagetwist_fallback(self, item: DownloadItem,
                                           start_time: float) -> DownloadResult:
        """Fallback method using manual page parsing."""
        from bs4 import BeautifulSoup
        import re
        try:
            headers = {
                'User-Agent': self.user_agent,
                'Referer': item.referer or 'https://forum.phun.org/'
            }
            response = requests.get(item.url, headers=headers, timeout=30)
            response.raise_for_status()
            page_content = response.text

            img_url = None

            # Method 1: Look for pic class
            soup = BeautifulSoup(page_content, 'html.parser')
            pic_img = soup.find('img', class_='pic')
            if pic_img and pic_img.get('src'):
                img_url = pic_img['src']

            # Method 2: Regex for i*.imagetwist.com/i/ pattern
            if not img_url:
                match = re.search(
                    r'(https?://i\d*(?:phun)?\.imagetwist\.com/i/[^"\'>\s]+)',
                    page_content)
                if match:
                    img_url = match.group(1)

            if not img_url:
                return DownloadResult(
                    success=False, item=item,
                    error="Could not find direct image URL on ImageTwist page"
                )

            # Download the actual image
            item.save_path.parent.mkdir(parents=True, exist_ok=True)
            img_response = requests.get(img_url, headers=headers, timeout=30,
                                        stream=True)
            img_response.raise_for_status()

            chunks = []
            for chunk in img_response.iter_content(chunk_size=8192):
                if not chunks:
                    if chunk[:100].lower().find(b'<html') != -1:
                        return DownloadResult(success=False, item=item,
                                              error="Got HTML instead of image")
                chunks.append(chunk)

            # NOTE(review): success path reconstructed — the original text was
            # garbled here; verify against VCS.
            content = b''.join(chunks)
            with open(item.save_path, 'wb') as f:
                f.write(content)

            file_hash = hashlib.sha256(content).hexdigest()

            if item.post_date:
                try:
                    timestamp_unix = item.post_date.timestamp()
                    os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                except Exception as e:
                    self.log(f"Failed to set timestamp: {e}", "warning")

            return DownloadResult(
                success=True,
                item=item,
                file_size=len(content),
                download_time=time.time() - start_time,
                file_hash=file_hash
            )
        except Exception as e:
            return DownloadResult(success=False, item=item, error=str(e))

    def _download_with_playwright(self, item: DownloadItem) -> DownloadResult:
        """Download using Playwright for authenticated sessions.

        Falls back to plain requests when no Playwright context is set.
        """
        if not self.playwright_context:
            return self._download_with_requests(item)

        start_time = time.time()
        try:
            page = self.playwright_context.new_page()
            try:
                # Set headers
                headers = item.headers or {}
                if item.referer:
                    headers['Referer'] = item.referer
                if headers:
                    page.set_extra_http_headers(headers)

                # Direct download (pixhost should already be processed)
                response = page.goto(item.url, wait_until='networkidle',
                                     timeout=self.timeout * 1000)
                if response and response.ok:
                    content = response.body()
                    # Check for HTML error pages
                    if content[:1000].lower().find(b'<html') != -1:
                        return DownloadResult(success=False, item=item,
                                              error="Got HTML instead of image")

                    # NOTE(review): save/return path reconstructed — the
                    # original text was garbled here; verify against VCS.
                    item.save_path.parent.mkdir(parents=True, exist_ok=True)
                    with open(item.save_path, 'wb') as f:
                        f.write(content)

                    file_hash = hashlib.sha256(content).hexdigest()

                    if item.post_date:
                        try:
                            timestamp_unix = item.post_date.timestamp()
                            os.utime(item.save_path,
                                     (timestamp_unix, timestamp_unix))
                        except Exception as e:
                            self.log(f"Failed to set timestamp: {e}", "warning")

                    return DownloadResult(
                        success=True,
                        item=item,
                        file_size=len(content),
                        download_time=time.time() - start_time,
                        file_hash=file_hash
                    )
                return DownloadResult(
                    success=False, item=item,
                    error=f"HTTP {response.status if response else 'no response'}"
                )
            finally:
                page.close()
        except Exception as e:
            return DownloadResult(success=False, item=item, error=str(e))

    def _download_with_requests(self, item: DownloadItem) -> DownloadResult:
        """Download using requests library.

        Streams into memory first so HTML error pages can be rejected before
        anything is written to disk.
        """
        start_time = time.time()
        try:
            headers = item.headers or {}
            headers['User-Agent'] = self.user_agent
            if item.referer:
                headers['Referer'] = item.referer

            # Use cookies if available (set by set_playwright_context)
            cookies = getattr(self, 'cookies', {})
            response = requests.get(
                item.url,
                headers=headers,
                cookies=cookies if cookies else None,
                timeout=self.timeout,
                stream=True
            )
            response.raise_for_status()

            # Stream download to memory first to validate content
            item.save_path.parent.mkdir(parents=True, exist_ok=True)
            content = b''
            first_chunk_checked = False
            for chunk in response.iter_content(chunk_size=self.chunk_size):
                if chunk:
                    # Check first chunk for HTML error pages.
                    # BUG FIX: the original checked find(b'') which is always
                    # 0, rejecting every download; must look for b'<html'.
                    if not first_chunk_checked:
                        first_chunk_checked = True
                        if chunk[:100].lower().find(b'<html') != -1:
                            return DownloadResult(
                                success=False, item=item,
                                error="Got HTML instead of image"
                            )
                    content += chunk

            # Save to file only after validation
            with open(item.save_path, 'wb') as f:
                f.write(content)

            # Calculate hash (SHA256 for consistency with unified database)
            file_hash = hashlib.sha256(content).hexdigest()

            # Set file timestamp if we have a date
            if item.post_date:
                try:
                    timestamp_unix = item.post_date.timestamp()
                    os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                    self.log(f"Set timestamp to {item.post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
                except Exception as e:
                    self.log(f"Failed to set timestamp: {e}", "warning")

            download_time = time.time() - start_time
            return DownloadResult(
                success=True,
                item=item,
                file_size=len(content),
                download_time=download_time,
                file_hash=file_hash
            )
        except Exception as e:
            # Clean up partial download
            if item.save_path.exists():
                item.save_path.unlink()
            return DownloadResult(success=False, item=item, error=str(e))

    def _update_stats(self, result: DownloadResult) -> None:
        """Fold a finished download into the shared statistics counters."""
        with self.download_lock:
            if result.success:
                self.stats['successful'] += 1
                if result.file_size:
                    self.stats['total_bytes'] += result.file_size
                if result.download_time:
                    self.stats['total_time'] += result.download_time
            else:
                self.stats['failed'] += 1

    def _download_worker(self, item: DownloadItem, thread_id: int) -> DownloadResult:
        """Worker function for downloading a single item.

        Resolves image-host pages to direct URLs, skips already-downloaded
        files, applies rate limiting, downloads, retries, and records the
        result in stats/database.
        """
        # Process image hosting URLs to get direct URLs
        if 'pixhost.to/show/' in item.url:
            direct_url = self._extract_pixhost_direct_url(item.url)
            if direct_url:
                self.log(f"Converted pixhost URL to direct: {direct_url.split('/')[-1]}", "debug")
                item.url = direct_url
            else:
                self.log(f"Failed to extract pixhost direct URL: {item.url}", "warning")
        elif 'imagebam.com' in item.url:
            direct_url = self._extract_imagebam_direct_url(item.url)
            if direct_url:
                self.log(f"Converted ImageBam URL to direct: {direct_url.split('/')[-1]}", "debug")
                item.url = direct_url
            else:
                self.log(f"Failed to extract ImageBam direct URL: {item.url}", "warning")
        elif 'imagetwist.com' in item.url:
            # ImageTwist requires parsing the page to get direct image URL
            result = self._download_from_imagetwist(item)
            if result.success:
                # BUG FIX: this early return previously bypassed both the
                # database save and the statistics update.
                if self.use_database:
                    self._save_to_database(result)
                self._update_stats(result)
                return result
            self.log(f"ImageTwist download failed: {item.url}", "warning")

        # Check if already downloaded
        if self._is_already_downloaded(item.url, item.save_path):
            self.log(f"Already downloaded: {item.save_path.name}", "skip")
            # BUG FIX: skips were never counted in stats (early return
            # bypassed the accounting below).
            with self.download_lock:
                self.stats['skipped'] += 1
            return DownloadResult(
                success=True,
                item=item,
                file_size=item.save_path.stat().st_size if item.save_path.exists() else 0
            )

        # Apply rate limiting
        self._apply_rate_limit(thread_id)

        # Always use requests for direct image downloads (faster)
        result = self._download_with_requests(item)

        # Handle retries
        if not result.success and item.retry_count < item.max_retries:
            item.retry_count += 1
            self.log(f"Retrying {item.url} ({item.retry_count}/{item.max_retries})", "warning")
            time.sleep(self.rate_limit * 2)  # Extra delay before retry
            return self._download_worker(item, thread_id)

        # Save to database if successful
        if result.success and self.use_database:
            self._save_to_database(result)

        # Update statistics
        self._update_stats(result)
        return result

    def _save_to_database(self, result: DownloadResult):
        """Save successful download to database (INSERT OR REPLACE by url)."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            metadata_str = None
            if result.item.metadata:
                import json
                metadata_str = json.dumps(result.item.metadata)
            cursor.execute('''
                INSERT OR REPLACE INTO downloads
                (url, file_path, file_hash, file_size, metadata)
                VALUES (?, ?, ?, ?, ?)
            ''', (
                result.item.url,
                str(result.item.save_path),
                result.file_hash,
                result.file_size,
                metadata_str
            ))
            conn.commit()
        finally:
            conn.close()

    def download_batch(self, items: List[DownloadItem],
                       progress_callback: Optional[Callable] = None) -> List[DownloadResult]:
        """
        Download multiple items concurrently.

        Args:
            items: List of DownloadItem objects
            progress_callback: Optional callback for progress updates

        Returns:
            List of DownloadResult objects
        """
        self.stats['total'] = len(items)
        results = []

        self.log(f"Starting batch download of {len(items)} items with {self.max_workers} workers", "info")

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all downloads
            futures = {
                executor.submit(self._download_worker, item, i % self.max_workers): item
                for i, item in enumerate(items)
            }

            # Process completed downloads
            completed = 0
            for future in as_completed(futures):
                result = future.result()
                results.append(result)
                completed += 1

                # Progress update
                if progress_callback:
                    progress_callback(completed, len(items), result)
                if self.show_progress:
                    pct = (completed / len(items)) * 100
                    status = "✓" if result.success else "✗"
                    self.log(
                        f"[{completed}/{len(items)}] {pct:.1f}% - {status} {result.item.save_path.name}",
                        "success" if result.success else "error"
                    )

        # Summary
        self.log(f"Batch complete: {self.stats['successful']} successful, {self.stats['failed']} failed", "info")
        # BUG FIX: guard against division by zero when every item was
        # skipped (successful > 0 but total_time == 0).
        if self.stats['successful'] > 0 and self.stats['total_time'] > 0:
            avg_speed = self.stats['total_bytes'] / self.stats['total_time'] / 1024 / 1024
            self.log(f"Average speed: {avg_speed:.2f} MB/s", "info")

        return results

    def download_urls(self, urls: List[str], base_path: Path,
                      referer: Optional[str] = None,
                      metadata: Optional[Dict] = None) -> List[DownloadResult]:
        """
        Convenience method to download URLs to a directory.

        Args:
            urls: List of URLs to download
            base_path: Directory to save files
            referer: Optional referer header
            metadata: Optional metadata for all downloads

        Returns:
            List of DownloadResult objects
        """
        items = []
        for url in urls:
            # Derive a filename from the URL path; fall back to a hash-based
            # name when the path has no basename.
            filename = (os.path.basename(urlparse(url).path)
                        or f"download_{hashlib.sha256(url.encode()).hexdigest()[:8]}")
            save_path = base_path / filename
            items.append(DownloadItem(
                url=url,
                save_path=save_path,
                referer=referer,
                metadata=metadata
            ))
        return self.download_batch(items)

    def get_statistics(self) -> Dict:
        """Get a snapshot copy of download statistics."""
        return self.stats.copy()

    def cleanup_old_downloads(self, days: int = 30):
        """Remove download records older than *days* from the database.

        Returns the number of deleted records (0 when no database is used).
        """
        if not self.use_database:
            return 0
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            # -days concatenated with ' days' yields e.g. '-30 days'
            cursor.execute('''
                DELETE FROM downloads
                WHERE download_date < datetime('now', ? || ' days')
            ''', (-days,))
            deleted = cursor.rowcount
            conn.commit()
        finally:
            conn.close()
        self.log(f"Cleaned up {deleted} old download records", "info")
        return deleted


# Example usage
if __name__ == "__main__":
    # Test download manager
    manager = DownloadManager(
        max_workers=3,
        rate_limit=0.5,
        show_progress=True
    )

    # Test URLs
    urls = [
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
        "https://sample-videos.com/img/Sample-jpg-image-50kb.jpg",
        "https://www.w3schools.com/html/img_girl.jpg"
    ]

    # Download
    results = manager.download_urls(urls, Path("/tmp/test-downloads"))

    # Print results
    logger.info(f"Downloaded {len([r for r in results if r.success])} of {len(results)} files")
    logger.info(f"Total bytes: {manager.stats['total_bytes'] / 1024:.1f} KB")
    logger.info(f"Total time: {manager.stats['total_time']:.2f} seconds")