Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions
--- a/modules/download_manager.py
+++ b/modules/download_manager.py
@@ -0,0 +1,940 @@
+#!/usr/bin/env python3
+"""
+Multi-threaded Download Manager
+Handles concurrent downloads with rate limiting, retries, and progress tracking
+Can be used by forum_downloader, fastdl_module, and other downloaders
+"""
+
+import os
+import re
+import time
+import hashlib
+import requests
+import threading
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, List, Optional, Any, Callable
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from threading import Lock, Semaphore
+from dataclasses import dataclass
+import sqlite3
+from urllib.parse import urlparse
+from modules.base_module import LoggingMixin
+from modules.universal_logger import get_logger
+
+logger = get_logger('DownloadManager')  # For standalone/example usage
+
+
+@dataclass
+class DownloadItem:
+    """Single download item: one URL and the local path it is saved to.
+
+    Instances are mutated while they are processed: the download worker may
+    replace ``url`` with a resolved direct-image URL and increments
+    ``retry_count`` each time it retries.
+    """
+    # Source URL; may be an image-host page URL that is resolved later
+    url: str
+    # Destination file path (parent directories are created on demand)
+    save_path: Path
+    # Value sent as the HTTP Referer header, if any
+    referer: Optional[str] = None
+    # Extra HTTP request headers
+    headers: Optional[Dict[str, str]] = None
+    # Free-form data; serialized to JSON when the download is stored in the database
+    metadata: Optional[Dict[str, Any]] = None
+    post_date: Optional[datetime] = None  # Timestamp to set on downloaded file
+    # Number of retries performed so far
+    retry_count: int = 0
+    # Maximum number of retries before the item is reported as failed
+    max_retries: int = 3
+    
+    
+@dataclass
+class DownloadResult:
+    """Result of a download attempt for one ``DownloadItem``."""
+    # True when the file was saved (or was already present and skipped)
+    success: bool
+    # The item this result belongs to
+    item: DownloadItem
+    # Size of the saved file in bytes
+    file_size: Optional[int] = None
+    # Wall-clock duration of the download in seconds
+    download_time: Optional[float] = None
+    # Human-readable failure reason; None on success
+    error: Optional[str] = None
+    # SHA256 hex digest of the saved file; not set for skipped items
+    file_hash: Optional[str] = None
+
+
+class DownloadManager(LoggingMixin):
+    """
+    Multi-threaded download manager with:
+    - Concurrent downloads
+    - Rate limiting
+    - Automatic retries
+    - Progress tracking
+    - Database tracking
+    - Playwright support for authenticated downloads
+    """
+    
+    def __init__(self,
+                 max_workers: int = 5,
+                 rate_limit: float = 0.5,
+                 timeout: int = 30,
+                 chunk_size: int = 8192,
+                 use_database: bool = False,
+                 db_path: str = None,
+                 show_progress: bool = True,
+                 show_debug: bool = False):
+        """
+        Initialize download manager
+        
+        Args:
+            max_workers: Maximum concurrent downloads
+            rate_limit: Seconds between downloads per thread
+            timeout: Download timeout in seconds
+            chunk_size: Chunk size for streaming downloads
+            use_database: Track downloads in database
+            db_path: Path to database file
+            show_progress: Show download progress
+            show_debug: Show debug messages
+        """
+        self.max_workers = max_workers
+        self.rate_limit = rate_limit
+        self.timeout = timeout
+        self.chunk_size = chunk_size
+        self.use_database = use_database
+        self.db_path = db_path
+        self.show_progress = show_progress
+
+        # Initialize logging via mixin
+        self._init_logger('DownloadManager', None, default_module='Download', show_debug=show_debug)
+
+        # Thread synchronization
+        self.download_lock = Lock()
+        self.rate_limiter = Semaphore(max_workers)
+        self.last_download_time = {}
+
+        # Thread-local storage for ImageBam sessions (each thread gets its own session)
+        self._imagebam_session_local = threading.local()
+        
+        # Statistics
+        self.stats = {
+            'total': 0,
+            'successful': 0,
+            'failed': 0,
+            'skipped': 0,
+            'total_bytes': 0,
+            'total_time': 0
+        }
+        
+        # User agent
+        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+        
+        # Playwright context for authenticated downloads
+        self.playwright_context = None
+        
+        # Initialize database only if explicitly enabled AND path provided
+        if self.use_database and self.db_path:
+            self._init_database()
+        elif self.use_database and not self.db_path:
+            # Disable database if no path provided to prevent creating files in CWD
+            self.use_database = False
+
+    def _init_database(self):
+        """Initialize download tracking database"""
+        if not self.db_path:
+            return
+        conn = sqlite3.connect(self.db_path)
+        try:
+            cursor = conn.cursor()
+
+            cursor.execute('''
+                CREATE TABLE IF NOT EXISTS downloads (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    url TEXT UNIQUE NOT NULL,
+                    file_path TEXT NOT NULL,
+                    file_hash TEXT,
+                    file_size INTEGER,
+                    download_date DATETIME DEFAULT CURRENT_TIMESTAMP,
+                    metadata TEXT
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE INDEX IF NOT EXISTS idx_downloads_url ON downloads(url)
+            ''')
+            cursor.execute('''
+                CREATE INDEX IF NOT EXISTS idx_downloads_hash ON downloads(file_hash)
+            ''')
+
+            conn.commit()
+        finally:
+            conn.close()
+    
+    def set_playwright_context(self, context):
+        """Set Playwright context for authenticated downloads"""
+        self.playwright_context = context
+        # Extract cookies from context for requests library
+        if context:
+            try:
+                self.cookies = {}
+                cookies = context.cookies()
+                for cookie in cookies:
+                    self.cookies[cookie['name']] = cookie['value']
+            except Exception:
+                self.cookies = {}
+    
+    def _is_already_downloaded(self, url: str, file_path: Path) -> bool:
+        """Check if file was already downloaded"""
+        if not self.use_database:
+            return file_path.exists() and file_path.stat().st_size > 0
+
+        conn = sqlite3.connect(self.db_path)
+        try:
+            cursor = conn.cursor()
+
+            cursor.execute(
+                "SELECT file_path, file_size FROM downloads WHERE url = ?",
+                (url,)
+            )
+            result = cursor.fetchone()
+        finally:
+            conn.close()
+
+        if result:
+            # Check if file still exists and has expected size
+            saved_path = Path(result[0])
+            if saved_path.exists() and saved_path.stat().st_size == result[1]:
+                return True
+        
+        return False
+    
+    def _apply_rate_limit(self, thread_id: int):
+        """Apply rate limiting per thread"""
+        with self.download_lock:
+            if thread_id in self.last_download_time:
+                elapsed = time.time() - self.last_download_time[thread_id]
+                if elapsed < self.rate_limit:
+                    time.sleep(self.rate_limit - elapsed)
+            self.last_download_time[thread_id] = time.time()
+    
+    def _extract_pixhost_direct_url(self, show_url: str) -> Optional[str]:
+        """Extract direct image URL from pixhost show URL"""
+        try:
+            # Pattern to extract ID and filename from show URL
+            show_pattern = re.compile(r"https?://(?:www\.)?pixhost\.to/show/(\d+)/([^/]+)$", re.IGNORECASE)
+            match = show_pattern.match(show_url)
+            
+            if not match:
+                return None
+            
+            img_id = match.group(1)
+            filename = match.group(2)
+            
+            # Try common hosts in order
+            common_hosts = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100]
+            
+            for host_num in common_hosts:
+                test_url = f"https://img{host_num}.pixhost.to/images/{img_id}/{filename}"
+                
+                try:
+                    # Quick HEAD request to check if URL exists
+                    response = requests.head(test_url, timeout=2, allow_redirects=False)
+                    if response.status_code == 200:
+                        return test_url
+                except requests.RequestException:
+                    continue
+
+            # Try sequential scan if common hosts don't work
+            for host_num in range(1, 121):
+                if host_num in common_hosts:
+                    continue
+
+                test_url = f"https://img{host_num}.pixhost.to/images/{img_id}/{filename}"
+
+                try:
+                    response = requests.head(test_url, timeout=1, allow_redirects=False)
+                    if response.status_code == 200:
+                        return test_url
+                except requests.RequestException:
+                    continue
+            
+            return None
+        except Exception as e:
+            self.log(f"Error extracting pixhost URL: {e}", "error")
+            return None
+    
+    def _extract_imagebam_direct_url(self, imagebam_url: str) -> Optional[str]:
+        """Extract direct image URL from ImageBam page"""
+        try:
+            # Get or create thread-local ImageBam session (thread-safe)
+            session = getattr(self._imagebam_session_local, 'session', None)
+            if session is None:
+                session = requests.Session()
+                session.headers.update({
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+                })
+                # Set cookies to bypass the interstitial ad page (both old and new cookies)
+                session.cookies.set('nsfw_inter', '1', domain='.imagebam.com')
+                session.cookies.set('sfw_inter', '1', domain='.imagebam.com')
+                self._imagebam_session_local.session = session
+
+            # ImageBam now requires two requests - first to get session cookies, second to get image
+            # First request sets up the session
+            response = session.get(imagebam_url, timeout=5)
+
+            if response.status_code != 200:
+                self.log(f"ImageBam page returned {response.status_code}", "warning")
+                return None
+
+            # Check if we got the interstitial page (contains "Continue to your image")
+            if 'Continue to your image' in response.text or 'Please wait' in response.text:
+                # Make sure bypass cookies are set and request again
+                session.cookies.set('sfw_inter', '1', domain='.imagebam.com')
+                session.cookies.set('nsfw_inter', '1', domain='.imagebam.com')
+                response = session.get(imagebam_url, timeout=5)
+
+            # Look for the direct image URL in the HTML
+            # ImageBam stores the full image with _o suffix
+            # First try to find the full resolution image
+            full_img_pattern = r'(https?://images\d*\.imagebam\.com/[a-f0-9/]+/[A-Z0-9]+_o\.\w+)'
+            matches = re.findall(full_img_pattern, response.text, re.IGNORECASE)
+
+            if matches:
+                # Return the first full resolution image found
+                direct_url = matches[0]
+                self.log(f"Extracted ImageBam direct URL: {direct_url}", "debug")
+                return direct_url
+
+            # Fallback: look for any image on images*.imagebam.com
+            fallback_patterns = [
+                r'<img[^>]+src="(https?://images\d*\.imagebam\.com/[^"]+)"',
+                r'"(https?://images\d*\.imagebam\.com/[^"]+\.(?:jpg|jpeg|png|gif))"',
+            ]
+
+            for pattern in fallback_patterns:
+                matches = re.findall(pattern, response.text, re.IGNORECASE)
+                if matches:
+                    direct_url = matches[0]
+                    self.log(f"Extracted ImageBam direct URL (fallback): {direct_url}", "debug")
+                    return direct_url
+
+            self.log("No direct image URL found in ImageBam HTML", "warning")
+            return None
+
+        except requests.Timeout:
+            self.log(f"ImageBam extraction timed out for {imagebam_url}", "warning")
+            return None
+        except Exception as e:
+            self.log(f"Error extracting ImageBam URL: {e}", "error")
+            return None
+
+    def _download_with_gallery_dl(self, item: DownloadItem) -> DownloadResult:
+        """Download using gallery-dl for supported hosts (ImageTwist, etc.)"""
+        import subprocess
+        start_time = time.time()
+
+        try:
+            # Ensure parent directory exists
+            item.save_path.parent.mkdir(parents=True, exist_ok=True)
+
+            # Build gallery-dl command
+            cmd = [
+                "gallery-dl",
+                "--dest", str(item.save_path.parent),
+                "--filename", item.save_path.name,
+                "--no-skip",
+                "--no-part",
+                "--quiet"
+            ]
+
+            # Add referer if provided
+            if item.referer:
+                cmd.extend(["--header", f"Referer: {item.referer}"])
+
+            cmd.append(item.url)
+
+            # Run gallery-dl with timeout
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=60
+            )
+
+            if result.returncode == 0 and item.save_path.exists():
+                file_size = item.save_path.stat().st_size
+                download_time = time.time() - start_time
+
+                # Calculate hash (SHA256 for consistency with unified database)
+                with open(item.save_path, 'rb') as f:
+                    file_hash = hashlib.sha256(f.read()).hexdigest()
+
+                # Set file timestamp if we have a date
+                if item.post_date:
+                    try:
+                        timestamp_unix = item.post_date.timestamp()
+                        os.utime(item.save_path, (timestamp_unix, timestamp_unix))
+                    except Exception as e:
+                        self.log(f"Failed to set timestamp: {e}", "warning")
+
+                self.log(f"Downloaded via gallery-dl: {item.save_path.name}", "success")
+                return DownloadResult(
+                    success=True,
+                    item=item,
+                    file_size=file_size,
+                    download_time=download_time,
+                    file_hash=file_hash
+                )
+            else:
+                error_msg = result.stderr or "Unknown error"
+                return DownloadResult(
+                    success=False,
+                    item=item,
+                    error=f"gallery-dl failed: {error_msg}"
+                )
+
+        except subprocess.TimeoutExpired:
+            return DownloadResult(
+                success=False,
+                item=item,
+                error="gallery-dl timed out"
+            )
+        except Exception as e:
+            return DownloadResult(
+                success=False,
+                item=item,
+                error=str(e)
+            )
+
+    def _download_from_imagetwist(self, item: DownloadItem) -> DownloadResult:
+        """Download image from ImageTwist using gallery-dl for URL resolution"""
+        import subprocess
+        start_time = time.time()
+
+        # Rate limiting for ImageTwist (they return error images if too fast)
+        if not hasattr(self, '_imagetwist_last_request'):
+            self._imagetwist_last_request = 0
+
+        with self.download_lock:
+            elapsed = time.time() - self._imagetwist_last_request
+            if elapsed < 2.0:  # Minimum 2 seconds between ImageTwist requests
+                time.sleep(2.0 - elapsed)
+            self._imagetwist_last_request = time.time()
+
+        try:
+            # Use gallery-dl to get the actual image URL
+            result = subprocess.run(
+                ['/opt/media-downloader/venv/bin/gallery-dl', '-g', item.url],
+                capture_output=True, text=True, timeout=30
+            )
+
+            if result.returncode != 0 or not result.stdout.strip():
+                # Fallback to manual parsing
+                return self._download_from_imagetwist_fallback(item, start_time)
+
+            img_url = result.stdout.strip().split('\n')[0]
+
+            if not img_url or 'imagetwist' not in img_url:
+                return self._download_from_imagetwist_fallback(item, start_time)
+
+            # Rate limit again before actual download
+            with self.download_lock:
+                elapsed = time.time() - self._imagetwist_last_request
+                if elapsed < 2.0:
+                    time.sleep(2.0 - elapsed)
+                self._imagetwist_last_request = time.time()
+
+            # Download the actual image - use imagetwist page as Referer
+            item.save_path.parent.mkdir(parents=True, exist_ok=True)
+
+            headers = {
+                'User-Agent': self.user_agent,
+                'Referer': item.url  # Use imagetwist page URL as Referer
+            }
+
+            img_response = requests.get(img_url, headers=headers, timeout=30, stream=True)
+            img_response.raise_for_status()
+
+            # Check for ImageTwist error placeholder (8346 bytes - rate limited or deleted)
+            content_length = img_response.headers.get('Content-Length', '')
+            if content_length == '8346':
+                self.log(f"ImageTwist rate limited or unavailable: {item.url}", "warning")
+                return DownloadResult(success=False, item=item, error="ImageTwist error image (rate limited)")
+
+            # Validate it's an image, not HTML
+            chunks = []
+            for chunk in img_response.iter_content(chunk_size=8192):
+                if not chunks:  # First chunk
+                    if chunk[:100].lower().find(b'<html') != -1 or chunk[:100].lower().find(b'<!doctype') != -1:
+                        return DownloadResult(
+                            success=False,
+                            item=item,
+                            error="Got HTML instead of image"
+                        )
+                chunks.append(chunk)
+
+            # Save the image
+            with open(item.save_path, 'wb') as f:
+                for chunk in chunks:
+                    f.write(chunk)
+
+            file_size = item.save_path.stat().st_size
+            download_time = time.time() - start_time
+
+            # Calculate hash (SHA256 for consistency with unified database)
+            with open(item.save_path, 'rb') as f:
+                file_hash = hashlib.sha256(f.read()).hexdigest()
+
+            # Set file timestamp if we have a date
+            if item.post_date:
+                try:
+                    timestamp_unix = item.post_date.timestamp()
+                    os.utime(item.save_path, (timestamp_unix, timestamp_unix))
+                except Exception:
+                    pass
+
+            self.log(f"Downloaded ImageTwist: {item.save_path.name}", "success")
+            return DownloadResult(
+                success=True,
+                item=item,
+                file_size=file_size,
+                download_time=download_time,
+                file_hash=file_hash
+            )
+
+        except Exception as e:
+            return DownloadResult(
+                success=False,
+                item=item,
+                error=f"ImageTwist download failed: {e}"
+            )
+
+    def _download_from_imagetwist_fallback(self, item: DownloadItem, start_time: float) -> DownloadResult:
+        """Fallback method using manual page parsing"""
+        from bs4 import BeautifulSoup
+        import re
+
+        try:
+            headers = {
+                'User-Agent': self.user_agent,
+                'Referer': item.referer or 'https://forum.phun.org/'
+            }
+
+            response = requests.get(item.url, headers=headers, timeout=30)
+            response.raise_for_status()
+
+            page_content = response.text
+            img_url = None
+
+            # Method 1: Look for pic class
+            soup = BeautifulSoup(page_content, 'html.parser')
+            pic_img = soup.find('img', class_='pic')
+            if pic_img and pic_img.get('src'):
+                img_url = pic_img['src']
+
+            # Method 2: Regex for i*.imagetwist.com/i/ pattern
+            if not img_url:
+                match = re.search(r'(https?://i\d*(?:phun)?\.imagetwist\.com/i/[^"\'>\s]+)', page_content)
+                if match:
+                    img_url = match.group(1)
+
+            if not img_url:
+                return DownloadResult(
+                    success=False,
+                    item=item,
+                    error="Could not find direct image URL on ImageTwist page"
+                )
+
+            # Download the actual image
+            item.save_path.parent.mkdir(parents=True, exist_ok=True)
+
+            img_response = requests.get(img_url, headers=headers, timeout=30, stream=True)
+            img_response.raise_for_status()
+
+            chunks = []
+            for chunk in img_response.iter_content(chunk_size=8192):
+                if not chunks:
+                    if chunk[:100].lower().find(b'<html') != -1:
+                        return DownloadResult(success=False, item=item, error="Got HTML instead of image")
+                chunks.append(chunk)
+
+            with open(item.save_path, 'wb') as f:
+                for chunk in chunks:
+                    f.write(chunk)
+
+            file_size = item.save_path.stat().st_size
+            download_time = time.time() - start_time
+
+            with open(item.save_path, 'rb') as f:
+                file_hash = hashlib.sha256(f.read()).hexdigest()
+
+            self.log(f"Downloaded ImageTwist (fallback): {item.save_path.name}", "success")
+            return DownloadResult(success=True, item=item, file_size=file_size, download_time=download_time, file_hash=file_hash)
+
+        except Exception as e:
+            return DownloadResult(success=False, item=item, error=f"ImageTwist fallback failed: {e}")
+
+    def _download_with_playwright(self, item: DownloadItem) -> DownloadResult:
+        """Download using Playwright for authenticated sessions"""
+        if not self.playwright_context:
+            return self._download_with_requests(item)
+        
+        start_time = time.time()
+        
+        try:
+            page = self.playwright_context.new_page()
+            try:
+                # Set headers
+                headers = item.headers or {}
+                if item.referer:
+                    headers['Referer'] = item.referer
+                if headers:
+                    page.set_extra_http_headers(headers)
+                
+                # Direct download (pixhost should already be processed)
+                response = page.goto(item.url, wait_until='networkidle', 
+                                   timeout=self.timeout * 1000)
+                
+                if response and response.ok:
+                    content = response.body()
+                    
+                    # Check for HTML error pages
+                    if content[:1000].lower().find(b'<!doctype') != -1 or \
+                       content[:1000].lower().find(b'<html') != -1:
+                        return DownloadResult(
+                            success=False,
+                            item=item,
+                            error="Got HTML instead of expected file"
+                        )
+                    
+                    # Save file
+                    item.save_path.parent.mkdir(parents=True, exist_ok=True)
+                    item.save_path.write_bytes(content)
+                    
+                    # Calculate hash (SHA256 for consistency with unified database)
+                    file_hash = hashlib.sha256(content).hexdigest()
+                    
+                    # Update timestamps if we have a date
+                    if item.post_date:
+                        try:
+                            timestamp_unix = item.post_date.timestamp()
+                            os.utime(item.save_path, (timestamp_unix, timestamp_unix))
+                            self.log(f"Set timestamp to {item.post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
+                        except Exception as e:
+                            self.log(f"Failed to set timestamp: {e}", "warning")
+                    
+                    download_time = time.time() - start_time
+                    
+                    return DownloadResult(
+                        success=True,
+                        item=item,
+                        file_size=len(content),
+                        download_time=download_time,
+                        file_hash=file_hash
+                    )
+                else:
+                    return DownloadResult(
+                        success=False,
+                        item=item,
+                        error=f"HTTP {response.status if response else 'No response'}"
+                    )
+                    
+            finally:
+                page.close()
+                
+        except Exception as e:
+            return DownloadResult(
+                success=False,
+                item=item,
+                error=str(e)
+            )
+    
+    def _download_with_requests(self, item: DownloadItem) -> DownloadResult:
+        """Download using requests library"""
+        start_time = time.time()
+        
+        try:
+            headers = item.headers or {}
+            headers['User-Agent'] = self.user_agent
+            if item.referer:
+                headers['Referer'] = item.referer
+            
+            # Use cookies if available
+            cookies = getattr(self, 'cookies', {})
+            
+            response = requests.get(
+                item.url,
+                headers=headers,
+                cookies=cookies if cookies else None,
+                timeout=self.timeout,
+                stream=True
+            )
+            response.raise_for_status()
+            
+            # Stream download to memory first to validate content
+            item.save_path.parent.mkdir(parents=True, exist_ok=True)
+            content = b''
+            first_chunk_checked = False
+
+            for chunk in response.iter_content(chunk_size=self.chunk_size):
+                if chunk:
+                    # Check first chunk for HTML error pages
+                    if not first_chunk_checked:
+                        first_chunk_checked = True
+                        if chunk[:100].lower().find(b'<html') != -1 or \
+                           chunk[:100].lower().find(b'<!doctype') != -1 or \
+                           chunk[:100].lower().find(b'<head>') != -1:
+                            return DownloadResult(
+                                success=False,
+                                item=item,
+                                error="Got HTML instead of image"
+                            )
+                    content += chunk
+
+            # Save to file only after validation
+            with open(item.save_path, 'wb') as f:
+                f.write(content)
+
+            # Calculate hash (SHA256 for consistency with unified database)
+            file_hash = hashlib.sha256(content).hexdigest()
+
+            # Set file timestamp if we have a date
+            if item.post_date:
+                try:
+                    timestamp_unix = item.post_date.timestamp()
+                    os.utime(item.save_path, (timestamp_unix, timestamp_unix))
+                    self.log(f"Set timestamp to {item.post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
+                except Exception as e:
+                    self.log(f"Failed to set timestamp: {e}", "warning")
+
+            download_time = time.time() - start_time
+
+            return DownloadResult(
+                success=True,
+                item=item,
+                file_size=len(content),
+                download_time=download_time,
+                file_hash=file_hash
+            )
+            
+        except Exception as e:
+            # Clean up partial download
+            if item.save_path.exists():
+                item.save_path.unlink()
+            
+            return DownloadResult(
+                success=False,
+                item=item,
+                error=str(e)
+            )
+    
+    def _download_worker(self, item: DownloadItem, thread_id: int) -> DownloadResult:
+        """Worker function for downloading a single item"""
+        # Process image hosting URLs to get direct URLs
+        if 'pixhost.to/show/' in item.url:
+            direct_url = self._extract_pixhost_direct_url(item.url)
+            if direct_url:
+                self.log(f"Converted pixhost URL to direct: {direct_url.split('/')[-1]}", "debug")
+                item.url = direct_url
+            else:
+                self.log(f"Failed to extract pixhost direct URL: {item.url}", "warning")
+        
+        elif 'imagebam.com' in item.url:
+            direct_url = self._extract_imagebam_direct_url(item.url)
+            if direct_url:
+                self.log(f"Converted ImageBam URL to direct: {direct_url.split('/')[-1]}", "debug")
+                item.url = direct_url
+            else:
+                self.log(f"Failed to extract ImageBam direct URL: {item.url}", "warning")
+
+        elif 'imagetwist.com' in item.url:
+            # ImageTwist requires parsing the page to get direct image URL
+            result = self._download_from_imagetwist(item)
+            if result.success:
+                return result
+            self.log(f"ImageTwist download failed: {item.url}", "warning")
+
+        # Check if already downloaded
+        if self._is_already_downloaded(item.url, item.save_path):
+            self.log(f"Already downloaded: {item.save_path.name}", "skip")
+            return DownloadResult(
+                success=True,
+                item=item,
+                file_size=item.save_path.stat().st_size if item.save_path.exists() else 0
+            )
+        
+        # Apply rate limiting
+        self._apply_rate_limit(thread_id)
+        
+        # Always use requests for direct image downloads (faster)
+        result = self._download_with_requests(item)
+        
+        # Handle retries
+        if not result.success and item.retry_count < item.max_retries:
+            item.retry_count += 1
+            self.log(f"Retrying {item.url} ({item.retry_count}/{item.max_retries})", "warning")
+            time.sleep(self.rate_limit * 2)  # Extra delay before retry
+            return self._download_worker(item, thread_id)
+        
+        # Save to database if successful
+        if result.success and self.use_database:
+            self._save_to_database(result)
+        
+        # Update statistics
+        with self.download_lock:
+            if result.success:
+                self.stats['successful'] += 1
+                if result.file_size:
+                    self.stats['total_bytes'] += result.file_size
+                if result.download_time:
+                    self.stats['total_time'] += result.download_time
+            else:
+                self.stats['failed'] += 1
+        
+        return result
+    
+    def _save_to_database(self, result: DownloadResult):
+        """
+        Record a finished download in the tracking database.
+
+        Writes one row (url, file path, hash, size, JSON-encoded metadata)
+        into the ``downloads`` table using INSERT OR REPLACE. A dedicated
+        connection is opened for this call and is always closed, even when
+        the insert fails.
+
+        Args:
+            result: Download result whose item and file details are stored
+        """
+        import json
+
+        db = sqlite3.connect(self.db_path)
+        try:
+            writer = db.cursor()
+            item = result.item
+
+            # Missing or empty metadata is stored as NULL rather than as JSON
+            encoded_meta = json.dumps(item.metadata) if item.metadata else None
+            row = (
+                item.url,
+                str(item.save_path),
+                result.file_hash,
+                result.file_size,
+                encoded_meta,
+            )
+
+            writer.execute('''
+                INSERT OR REPLACE INTO downloads
+                (url, file_path, file_hash, file_size, metadata)
+                VALUES (?, ?, ?, ?, ?)
+            ''', row)
+
+            db.commit()
+        finally:
+            db.close()
+    
+    def download_batch(self, items: List[DownloadItem], 
+                      progress_callback: Optional[Callable] = None) -> List[DownloadResult]:
+        """
+        Download multiple items concurrently
+
+        Every item is submitted to ``_download_worker`` on a thread pool of
+        ``max_workers`` threads. Results are collected in completion order,
+        which is not necessarily the order of ``items``.
+
+        A worker that raises does not abort the batch: the exception is
+        logged, counted as a failure and returned as a failed DownloadResult
+        for that item, so the results of the other items are not lost.
+
+        Args:
+            items: List of DownloadItem objects
+            progress_callback: Optional callback for progress updates, called
+                after each finished item as
+                ``progress_callback(completed, total, result)``
+
+        Returns:
+            List of DownloadResult objects, one per submitted item
+        """
+        total = len(items)
+        self.stats['total'] = total
+        results = []
+
+        self.log(f"Starting batch download of {total} items with {self.max_workers} workers", "info")
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            # Submit all downloads; the second argument is passed to the
+            # worker as its thread_id
+            futures = {
+                executor.submit(self._download_worker, item, i % self.max_workers): item
+                for i, item in enumerate(items)
+            }
+
+            # Process completed downloads
+            for completed, future in enumerate(as_completed(futures), start=1):
+                try:
+                    result = future.result()
+                except Exception as exc:
+                    # future.result() re-raises whatever the worker raised;
+                    # turn it into a failed result instead of losing the batch
+                    failed_item = futures[future]
+                    self.log(f"Download crashed for {failed_item.url}: {exc}", "error")
+                    result = DownloadResult(success=False, item=failed_item, error=str(exc))
+                    with self.download_lock:
+                        self.stats['failed'] += 1
+                results.append(result)
+
+                # Progress update
+                if progress_callback:
+                    progress_callback(completed, total, result)
+
+                if self.show_progress:
+                    pct = (completed / total) * 100
+                    status = "✓" if result.success else "✗"
+                    self.log(
+                        f"[{completed}/{total}] {pct:.1f}% - {status} {result.item.save_path.name}",
+                        "success" if result.success else "error"
+                    )
+
+        # Summary
+        self.log(f"Batch complete: {self.stats['successful']} successful, {self.stats['failed']} failed", "info")
+
+        # total_time stays 0 when no successful download reported a duration,
+        # so guard the division as well as the success count
+        if self.stats['successful'] > 0 and self.stats['total_time'] > 0:
+            avg_speed = self.stats['total_bytes'] / self.stats['total_time'] / 1024 / 1024
+            self.log(f"Average speed: {avg_speed:.2f} MB/s", "info")
+
+        return results
+    
+    def download_urls(self, urls: List[str], base_path: Path, 
+                     referer: Optional[str] = None,
+                     metadata: Optional[Dict] = None) -> List[DownloadResult]:
+        """
+        Convenience method to download URLs to a directory
+
+        The file name for each URL is the last segment of its URL path. When
+        the path has no last segment (for example it ends in "/"), the name
+        is "download_" followed by the first 8 hex characters of the URL's
+        SHA-256 digest. If a URL would map to a name already claimed by a
+        different URL in the same call, that digest is inserted before the
+        extension so the two downloads do not write to the same file.
+
+        Args:
+            urls: List of URLs to download
+            base_path: Directory to save files (Path or path string)
+            referer: Optional referer header
+            metadata: Optional metadata for all downloads; the same object is
+                attached to every item
+
+        Returns:
+            List of DownloadResult objects
+        """
+        target_dir = Path(base_path)
+        name_owner: Dict[str, str] = {}  # file name -> URL that claimed it first
+        items = []
+        for url in urls:
+            digest = hashlib.sha256(url.encode()).hexdigest()[:8]
+            filename = os.path.basename(urlparse(url).path) or f"download_{digest}"
+
+            # A repeated URL keeps its original name; only a *different* URL
+            # landing on the same name is moved to a digest-suffixed one
+            if name_owner.setdefault(filename, url) != url:
+                stem, ext = os.path.splitext(filename)
+                filename = f"{stem}_{digest}{ext}"
+                name_owner.setdefault(filename, url)
+
+            items.append(DownloadItem(
+                url=url,
+                save_path=target_dir / filename,
+                referer=referer,
+                metadata=metadata
+            ))
+
+        return self.download_batch(items)
+    
+    def get_statistics(self) -> Dict:
+        """
+        Return a snapshot of the download counters.
+
+        Returns:
+            Shallow copy of ``self.stats``; adding or replacing keys in the
+            returned dict does not affect the manager's own counters
+        """
+        snapshot = self.stats.copy()
+        return snapshot
+    
+    def cleanup_old_downloads(self, days: int = 30):
+        """
+        Remove old download records from database
+
+        Deletes the rows of the ``downloads`` table whose ``download_date``
+        is earlier than SQLite's ``datetime('now', '-<days> days')``.
+
+        Args:
+            days: Age threshold in days; must not be negative
+
+        Returns:
+            Number of deleted rows, or 0 when database tracking is disabled
+
+        Raises:
+            ValueError: If days is negative
+        """
+        # A negative value would negate to a '+N days' modifier, i.e. a cutoff
+        # in the future, and silently delete every record
+        if days < 0:
+            raise ValueError(f"days must be non-negative, got {days}")
+
+        if not self.use_database:
+            return 0
+
+        conn = sqlite3.connect(self.db_path)
+        try:
+            cursor = conn.cursor()
+
+            # The bound number is concatenated with ' days' inside SQLite to
+            # form the date modifier, e.g. '-30 days'
+            cursor.execute('''
+                DELETE FROM downloads
+                WHERE download_date < datetime('now', ? || ' days')
+            ''', (-days,))
+
+            deleted = cursor.rowcount
+            conn.commit()
+        finally:
+            conn.close()
+
+        self.log(f"Cleaned up {deleted} old download records", "info")
+        return deleted
+
+
+# Example usage
+if __name__ == "__main__":
+    # Manual smoke test: fetch a few public sample files into a scratch
+    # directory and log the totals. Requires network access.
+    sample_urls = [
+        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
+        "https://sample-videos.com/img/Sample-jpg-image-50kb.jpg",
+        "https://www.w3schools.com/html/img_girl.jpg"
+    ]
+
+    demo_manager = DownloadManager(max_workers=3, rate_limit=0.5, show_progress=True)
+    outcomes = demo_manager.download_urls(sample_urls, Path("/tmp/test-downloads"))
+
+    # Report how many downloads succeeded and the accumulated counters
+    ok_count = sum(1 for outcome in outcomes if outcome.success)
+    run_stats = demo_manager.stats
+    logger.info(f"Downloaded {ok_count} of {len(outcomes)} files")
+    logger.info(f"Total bytes: {run_stats['total_bytes'] / 1024:.1f} KB")
+    logger.info(f"Total time: {run_stats['total_time']:.2f} seconds")