Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

940
modules/download_manager.py Executable file
View File

@@ -0,0 +1,940 @@
#!/usr/bin/env python3
"""
Multi-threaded Download Manager
Handles concurrent downloads with rate limiting, retries, and progress tracking
Can be used by forum_downloader, fastdl_module, and other downloaders
"""
import os
import re
import time
import hashlib
import requests
import threading
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Any, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock, Semaphore
from dataclasses import dataclass
import sqlite3
from urllib.parse import urlparse
from modules.base_module import LoggingMixin
from modules.universal_logger import get_logger
logger = get_logger('DownloadManager') # For standalone/example usage
@dataclass
class DownloadItem:
    """A single unit of download work.

    Instances are consumed by the manager's download strategies; the URL may
    be rewritten in place to a direct image URL by the worker, and
    ``retry_count`` is incremented in place on each failed attempt.
    """
    url: str                                   # source URL (may be rewritten to a resolved direct URL)
    save_path: Path                            # destination file path
    referer: Optional[str] = None              # optional Referer header value
    headers: Optional[Dict[str, str]] = None   # extra HTTP headers for the request
    metadata: Optional[Dict[str, Any]] = None  # opaque metadata; stored as JSON when DB tracking is enabled
    post_date: Optional[datetime] = None       # Timestamp to set on downloaded file (via os.utime)
    retry_count: int = 0                       # attempts made so far (mutated by the worker)
    max_retries: int = 3                       # maximum number of retries before giving up
@dataclass
class DownloadResult:
    """Outcome of one download attempt.

    ``success`` may be True with no ``download_time`` when the item was
    skipped as already downloaded.
    """
    success: bool                        # True on success (including already-downloaded skips)
    item: DownloadItem                   # the item this result refers to
    file_size: Optional[int] = None      # bytes written, when known
    download_time: Optional[float] = None  # wall-clock seconds for the transfer
    error: Optional[str] = None          # human-readable failure reason
    file_hash: Optional[str] = None      # SHA256 hex digest of the downloaded content
class DownloadManager(LoggingMixin):
    """
    Multi-threaded download manager with:
    - Concurrent downloads
    - Rate limiting
    - Automatic retries
    - Progress tracking
    - Database tracking
    - Playwright support for authenticated downloads
    """

    def __init__(self,
                 max_workers: int = 5,
                 rate_limit: float = 0.5,
                 timeout: int = 30,
                 chunk_size: int = 8192,
                 use_database: bool = False,
                 db_path: str = None,
                 show_progress: bool = True,
                 show_debug: bool = False):
        """
        Initialize download manager

        Args:
            max_workers: Maximum concurrent downloads
            rate_limit: Seconds between downloads per thread
            timeout: Download timeout in seconds
            chunk_size: Chunk size for streaming downloads
            use_database: Track downloads in database
            db_path: Path to database file
            show_progress: Show download progress
            show_debug: Show debug messages
        """
        self.max_workers = max_workers
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.chunk_size = chunk_size
        self.use_database = use_database
        self.db_path = db_path
        self.show_progress = show_progress
        # Initialize logging via mixin
        self._init_logger('DownloadManager', None, default_module='Download', show_debug=show_debug)
        # Thread synchronization
        self.download_lock = Lock()
        self.rate_limiter = Semaphore(max_workers)
        self.last_download_time = {}
        # FIX: cookies was previously created only by set_playwright_context(),
        # forcing other methods to defend with getattr(). Always present now.
        self.cookies: Dict[str, str] = {}
        # Thread-local storage for ImageBam sessions (each thread gets its own session)
        self._imagebam_session_local = threading.local()
        # Statistics (shared across threads; guarded by download_lock)
        self.stats = {
            'total': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0,
            'total_bytes': 0,
            'total_time': 0
        }
        # User agent
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        # Playwright context for authenticated downloads
        self.playwright_context = None
        # Initialize database only if explicitly enabled AND path provided
        if self.use_database and self.db_path:
            self._init_database()
        elif self.use_database and not self.db_path:
            # Disable database if no path provided to prevent creating files in CWD
            self.use_database = False
def _init_database(self):
    """Create the download-tracking table and its indexes if they are absent."""
    if not self.db_path:
        return
    schema_statements = (
        '''
        CREATE TABLE IF NOT EXISTS downloads (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE NOT NULL,
            file_path TEXT NOT NULL,
            file_hash TEXT,
            file_size INTEGER,
            download_date DATETIME DEFAULT CURRENT_TIMESTAMP,
            metadata TEXT
        )
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_downloads_url ON downloads(url)
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_downloads_hash ON downloads(file_hash)
        ''',
    )
    conn = sqlite3.connect(self.db_path)
    try:
        cursor = conn.cursor()
        for statement in schema_statements:
            cursor.execute(statement)
        conn.commit()
    finally:
        conn.close()
def set_playwright_context(self, context):
    """Attach a Playwright browser context for authenticated downloads.

    The context's cookies are mirrored into ``self.cookies`` so the plain
    requests-based download path can reuse the authenticated session.

    FIX: passing a falsy context now also clears previously mirrored
    cookies instead of leaving a stale jar behind.
    """
    self.playwright_context = context
    # Keep the requests-side cookie jar in sync with the (possibly absent) context.
    self.cookies = {}
    if context:
        try:
            for cookie in context.cookies():
                self.cookies[cookie['name']] = cookie['value']
        except Exception:
            # Best effort: an unreadable context simply yields no cookies.
            self.cookies = {}
def _is_already_downloaded(self, url: str, file_path: Path) -> bool:
    """Return True when this URL's file is already present.

    Without database tracking this is a non-empty-file check on the target
    path; with tracking, the recorded path must still exist on disk at its
    recorded size.
    """
    if not self.use_database:
        return file_path.exists() and file_path.stat().st_size > 0
    conn = sqlite3.connect(self.db_path)
    try:
        row = conn.execute(
            "SELECT file_path, file_size FROM downloads WHERE url = ?",
            (url,)
        ).fetchone()
    finally:
        conn.close()
    if not row:
        return False
    # Verify the recorded file still exists with the expected size.
    recorded_path = Path(row[0])
    return recorded_path.exists() and recorded_path.stat().st_size == row[1]
def _apply_rate_limit(self, thread_id: int):
    """Enforce the per-thread rate limit.

    FIX: the previous version slept while holding ``download_lock``, which
    serialized every worker behind a single thread's delay.  Here the
    required wait is computed under the lock but the sleep happens outside
    it, so only the throttled thread pauses.
    """
    with self.download_lock:
        last = self.last_download_time.get(thread_id)
        wait = self.rate_limit - (time.time() - last) if last is not None else 0.0
    if wait > 0:
        time.sleep(wait)
    with self.download_lock:
        self.last_download_time[thread_id] = time.time()
def _extract_pixhost_direct_url(self, show_url: str) -> Optional[str]:
    """Resolve a pixhost ``/show/`` page URL to a direct image URL.

    pixhost serves images from numbered hosts (img1..img120); the host
    number cannot be derived from the show URL, so candidates are probed
    with cheap HEAD requests until one answers 200.

    Returns:
        The direct image URL, or None if parsing or probing fails.
    """
    try:
        # Extract ID and filename from the show URL.
        show_pattern = re.compile(
            r"https?://(?:www\.)?pixhost\.to/show/(\d+)/([^/]+)$", re.IGNORECASE)
        match = show_pattern.match(show_url)
        if not match:
            return None
        img_id = match.group(1)
        filename = match.group(2)
        # Probe statistically common hosts first (longer timeout), then
        # sweep the remaining host numbers with a shorter timeout.
        common_hosts = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100]
        remaining = [n for n in range(1, 121) if n not in common_hosts]
        candidates = [(n, 2) for n in common_hosts] + [(n, 1) for n in remaining]
        for host_num, timeout in candidates:
            # FIX: build the URL from the captured filename; the previous
            # code interpolated a literal placeholder so no probe could
            # ever match the real image.
            test_url = f"https://img{host_num}.pixhost.to/images/{img_id}/{filename}"
            try:
                # Quick HEAD request to check if URL exists
                response = requests.head(test_url, timeout=timeout, allow_redirects=False)
                if response.status_code == 200:
                    return test_url
            except requests.RequestException:
                continue
        return None
    except Exception as e:
        self.log(f"Error extracting pixhost URL: {e}", "error")
        return None
def _extract_imagebam_direct_url(self, imagebam_url: str) -> Optional[str]:
    """Resolve an ImageBam viewer page to a direct image URL.

    Maintains one requests.Session per worker thread (thread-local) with
    the interstitial-bypass cookies pre-set, fetches the page, and scrapes
    the full-resolution image URL (the ``_o`` suffix variant) with regex
    fallbacks for other images hosted on images*.imagebam.com.
    """
    try:
        # Lazily build this thread's session with bypass cookies applied.
        session = getattr(self._imagebam_session_local, 'session', None)
        if session is None:
            session = requests.Session()
            session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            # Both the old and new interstitial-bypass cookies.
            for cookie_name in ('nsfw_inter', 'sfw_inter'):
                session.cookies.set(cookie_name, '1', domain='.imagebam.com')
            self._imagebam_session_local.session = session
        response = session.get(imagebam_url, timeout=5)
        if response.status_code != 200:
            self.log(f"ImageBam page returned {response.status_code}", "warning")
            return None
        # If the interstitial ad page came back anyway, re-set the bypass
        # cookies and fetch once more.
        if 'Continue to your image' in response.text or 'Please wait' in response.text:
            for cookie_name in ('sfw_inter', 'nsfw_inter'):
                session.cookies.set(cookie_name, '1', domain='.imagebam.com')
            response = session.get(imagebam_url, timeout=5)
        # Prefer the full-resolution image, which ImageBam stores with an
        # "_o" filename suffix.
        full_img_pattern = r'(https?://images\d*\.imagebam\.com/[a-f0-9/]+/[A-Z0-9]+_o\.\w+)'
        hits = re.findall(full_img_pattern, response.text, re.IGNORECASE)
        if hits:
            direct_url = hits[0]
            self.log(f"Extracted ImageBam direct URL: {direct_url}", "debug")
            return direct_url
        # Fallback: any image on images*.imagebam.com.
        for pattern in (
            r'<img[^>]+src="(https?://images\d*\.imagebam\.com/[^"]+)"',
            r'"(https?://images\d*\.imagebam\.com/[^"]+\.(?:jpg|jpeg|png|gif))"',
        ):
            hits = re.findall(pattern, response.text, re.IGNORECASE)
            if hits:
                direct_url = hits[0]
                self.log(f"Extracted ImageBam direct URL (fallback): {direct_url}", "debug")
                return direct_url
        self.log("No direct image URL found in ImageBam HTML", "warning")
        return None
    except requests.Timeout:
        self.log(f"ImageBam extraction timed out for {imagebam_url}", "warning")
        return None
    except Exception as e:
        self.log(f"Error extracting ImageBam URL: {e}", "error")
        return None
def _download_with_gallery_dl(self, item: DownloadItem) -> DownloadResult:
    """Download an item via the external ``gallery-dl`` tool.

    Used for hosts gallery-dl understands natively (ImageTwist etc.).
    On success the file's hash is computed and its mtime set to the item's
    post date, if one is available.
    """
    import subprocess
    start_time = time.time()
    try:
        # Ensure parent directory exists
        item.save_path.parent.mkdir(parents=True, exist_ok=True)
        # Assemble the gallery-dl invocation.
        cmd = [
            "gallery-dl",
            "--dest", str(item.save_path.parent),
            "--filename", item.save_path.name,
            "--no-skip",
            "--no-part",
            "--quiet"
        ]
        if item.referer:
            cmd.extend(["--header", f"Referer: {item.referer}"])
        cmd.append(item.url)
        # Run with a hard timeout so a stuck process can't hang the worker.
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=60
        )
        if proc.returncode != 0 or not item.save_path.exists():
            return DownloadResult(
                success=False,
                item=item,
                error=f"gallery-dl failed: {proc.stderr or 'Unknown error'}"
            )
        file_size = item.save_path.stat().st_size
        download_time = time.time() - start_time
        # SHA256 for consistency with the unified database.
        with open(item.save_path, 'rb') as f:
            file_hash = hashlib.sha256(f.read()).hexdigest()
        # Stamp the file with the post date when we know it.
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
            except Exception as e:
                self.log(f"Failed to set timestamp: {e}", "warning")
        self.log(f"Downloaded via gallery-dl: {item.save_path.name}", "success")
        return DownloadResult(
            success=True,
            item=item,
            file_size=file_size,
            download_time=download_time,
            file_hash=file_hash
        )
    except subprocess.TimeoutExpired:
        return DownloadResult(
            success=False,
            item=item,
            error="gallery-dl timed out"
        )
    except Exception as e:
        return DownloadResult(
            success=False,
            item=item,
            error=str(e)
        )
def _download_from_imagetwist(self, item: DownloadItem) -> DownloadResult:
    """Download an image hosted on ImageTwist.

    ImageTwist serves an error placeholder when hit too quickly, so
    requests are throttled to at most one per 2 seconds process-wide.
    The direct image URL is resolved with gallery-dl; if that fails,
    ``_download_from_imagetwist_fallback`` parses the page manually.
    """
    import shutil
    import subprocess
    start_time = time.time()
    with self.download_lock:
        # FIX: lazy init happens under the lock to avoid a first-use race
        # between workers.
        if not hasattr(self, '_imagetwist_last_request'):
            self._imagetwist_last_request = 0
        elapsed = time.time() - self._imagetwist_last_request
        if elapsed < 2.0:  # Minimum 2 seconds between ImageTwist requests
            time.sleep(2.0 - elapsed)
        self._imagetwist_last_request = time.time()
    try:
        # FIX: prefer gallery-dl from PATH (consistent with
        # _download_with_gallery_dl); fall back to the known venv install
        # so existing deployments keep working.
        gallery_dl = shutil.which('gallery-dl') or '/opt/media-downloader/venv/bin/gallery-dl'
        result = subprocess.run(
            [gallery_dl, '-g', item.url],
            capture_output=True, text=True, timeout=30
        )
        if result.returncode != 0 or not result.stdout.strip():
            # Fallback to manual parsing
            return self._download_from_imagetwist_fallback(item, start_time)
        img_url = result.stdout.strip().split('\n')[0]
        if not img_url or 'imagetwist' not in img_url:
            return self._download_from_imagetwist_fallback(item, start_time)
        # Rate limit again before the actual image download.
        with self.download_lock:
            elapsed = time.time() - self._imagetwist_last_request
            if elapsed < 2.0:
                time.sleep(2.0 - elapsed)
            self._imagetwist_last_request = time.time()
        # Download the actual image - use imagetwist page as Referer
        item.save_path.parent.mkdir(parents=True, exist_ok=True)
        headers = {
            'User-Agent': self.user_agent,
            'Referer': item.url  # Use imagetwist page URL as Referer
        }
        img_response = requests.get(img_url, headers=headers, timeout=30, stream=True)
        img_response.raise_for_status()
        # ImageTwist's error placeholder is exactly 8346 bytes (rate
        # limited or deleted image).
        if img_response.headers.get('Content-Length', '') == '8346':
            self.log(f"ImageTwist rate limited or unavailable: {item.url}", "warning")
            return DownloadResult(success=False, item=item, error="ImageTwist error image (rate limited)")
        # Buffer the payload, sniffing the first chunk for an HTML page.
        chunks = []
        for chunk in img_response.iter_content(chunk_size=8192):
            if not chunks:  # First chunk
                head = chunk[:100].lower()
                if b'<html' in head or b'<!doctype' in head:
                    return DownloadResult(
                        success=False,
                        item=item,
                        error="Got HTML instead of image"
                    )
            chunks.append(chunk)
        # Save the validated image.
        with open(item.save_path, 'wb') as f:
            for chunk in chunks:
                f.write(chunk)
        file_size = item.save_path.stat().st_size
        download_time = time.time() - start_time
        # SHA256 for consistency with the unified database.
        with open(item.save_path, 'rb') as f:
            file_hash = hashlib.sha256(f.read()).hexdigest()
        # Stamp the file with the post date when available (best effort).
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
            except Exception:
                pass
        self.log(f"Downloaded ImageTwist: {item.save_path.name}", "success")
        return DownloadResult(
            success=True,
            item=item,
            file_size=file_size,
            download_time=download_time,
            file_hash=file_hash
        )
    except Exception as e:
        return DownloadResult(
            success=False,
            item=item,
            error=f"ImageTwist download failed: {e}"
        )
def _download_from_imagetwist_fallback(self, item: DownloadItem, start_time: float) -> DownloadResult:
    """Manual-parsing fallback for ImageTwist when gallery-dl fails.

    Fetches the viewer page and locates the direct image URL either via
    the ``pic``-classed <img> tag or a regex over the raw HTML.
    """
    from bs4 import BeautifulSoup
    import re
    try:
        headers = {
            'User-Agent': self.user_agent,
            'Referer': item.referer or 'https://forum.phun.org/'
        }
        page = requests.get(item.url, headers=headers, timeout=30)
        page.raise_for_status()
        html = page.text
        # Strategy 1: the <img class="pic"> element on the viewer page.
        pic_img = BeautifulSoup(html, 'html.parser').find('img', class_='pic')
        img_url = pic_img['src'] if pic_img and pic_img.get('src') else None
        # Strategy 2: regex for the i*.imagetwist.com/i/ pattern.
        if not img_url:
            hit = re.search(r'(https?://i\d*(?:phun)?\.imagetwist\.com/i/[^"\'>\s]+)', html)
            if hit:
                img_url = hit.group(1)
        if not img_url:
            return DownloadResult(
                success=False,
                item=item,
                error="Could not find direct image URL on ImageTwist page"
            )
        # Fetch the image itself, buffering and sniffing for HTML.
        item.save_path.parent.mkdir(parents=True, exist_ok=True)
        img_response = requests.get(img_url, headers=headers, timeout=30, stream=True)
        img_response.raise_for_status()
        chunks = []
        for chunk in img_response.iter_content(chunk_size=8192):
            if not chunks and b'<html' in chunk[:100].lower():
                return DownloadResult(success=False, item=item, error="Got HTML instead of image")
            chunks.append(chunk)
        with open(item.save_path, 'wb') as f:
            for chunk in chunks:
                f.write(chunk)
        file_size = item.save_path.stat().st_size
        download_time = time.time() - start_time
        with open(item.save_path, 'rb') as f:
            file_hash = hashlib.sha256(f.read()).hexdigest()
        self.log(f"Downloaded ImageTwist (fallback): {item.save_path.name}", "success")
        return DownloadResult(success=True, item=item, file_size=file_size, download_time=download_time, file_hash=file_hash)
    except Exception as e:
        return DownloadResult(success=False, item=item, error=f"ImageTwist fallback failed: {e}")
def _download_with_playwright(self, item: DownloadItem) -> DownloadResult:
    """Download an item through the attached Playwright context.

    Falls back to plain requests when no context has been attached.  The
    page is always closed, even on failure.
    """
    if not self.playwright_context:
        return self._download_with_requests(item)
    start_time = time.time()
    try:
        page = self.playwright_context.new_page()
        try:
            # Apply per-item headers (Referer included) before navigating.
            headers = dict(item.headers) if item.headers else {}
            if item.referer:
                headers['Referer'] = item.referer
            if headers:
                page.set_extra_http_headers(headers)
            # Direct download (pixhost should already be processed)
            response = page.goto(item.url, wait_until='networkidle',
                                 timeout=self.timeout * 1000)
            if not (response and response.ok):
                return DownloadResult(
                    success=False,
                    item=item,
                    error=f"HTTP {response.status if response else 'No response'}"
                )
            content = response.body()
            # Reject HTML error pages masquerading as the file.
            head = content[:1000].lower()
            if b'<!doctype' in head or b'<html' in head:
                return DownloadResult(
                    success=False,
                    item=item,
                    error="Got HTML instead of expected file"
                )
            # Persist the payload.
            item.save_path.parent.mkdir(parents=True, exist_ok=True)
            item.save_path.write_bytes(content)
            # SHA256 for consistency with the unified database.
            file_hash = hashlib.sha256(content).hexdigest()
            # Stamp the file with the post date when available.
            if item.post_date:
                try:
                    timestamp_unix = item.post_date.timestamp()
                    os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                    self.log(f"Set timestamp to {item.post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
                except Exception as e:
                    self.log(f"Failed to set timestamp: {e}", "warning")
            return DownloadResult(
                success=True,
                item=item,
                file_size=len(content),
                download_time=time.time() - start_time,
                file_hash=file_hash
            )
        finally:
            page.close()
    except Exception as e:
        return DownloadResult(
            success=False,
            item=item,
            error=str(e)
        )
def _download_with_requests(self, item: DownloadItem) -> DownloadResult:
    """Download ``item.url`` with the requests library.

    The payload is buffered fully in memory and the first chunk sniffed
    for HTML error pages before anything is written to disk, so a failed
    download never leaves a bogus file behind.

    FIXES: copies ``item.headers`` instead of mutating the caller's dict
    in place, and accumulates into a bytearray instead of quadratic
    ``bytes +=`` concatenation.
    """
    start_time = time.time()
    try:
        headers = dict(item.headers) if item.headers else {}
        headers['User-Agent'] = self.user_agent
        if item.referer:
            headers['Referer'] = item.referer
        # Reuse cookies mirrored from the Playwright context, if any.
        cookies = getattr(self, 'cookies', {})
        response = requests.get(
            item.url,
            headers=headers,
            cookies=cookies if cookies else None,
            timeout=self.timeout,
            stream=True
        )
        response.raise_for_status()
        item.save_path.parent.mkdir(parents=True, exist_ok=True)
        # bytearray keeps accumulation linear in total size.
        buffer = bytearray()
        first_chunk_checked = False
        for chunk in response.iter_content(chunk_size=self.chunk_size):
            if chunk:
                if not first_chunk_checked:
                    first_chunk_checked = True
                    head = chunk[:100].lower()
                    if b'<html' in head or b'<!doctype' in head or b'<head>' in head:
                        return DownloadResult(
                            success=False,
                            item=item,
                            error="Got HTML instead of image"
                        )
                buffer += chunk
        content = bytes(buffer)
        # Write only after the content passed validation.
        with open(item.save_path, 'wb') as f:
            f.write(content)
        # SHA256 for consistency with the unified database.
        file_hash = hashlib.sha256(content).hexdigest()
        # Stamp the file with the post date when available.
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                self.log(f"Set timestamp to {item.post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
            except Exception as e:
                self.log(f"Failed to set timestamp: {e}", "warning")
        return DownloadResult(
            success=True,
            item=item,
            file_size=len(content),
            download_time=time.time() - start_time,
            file_hash=file_hash
        )
    except Exception as e:
        # Remove any partial file so a retry starts clean.
        if item.save_path.exists():
            item.save_path.unlink()
        return DownloadResult(
            success=False,
            item=item,
            error=str(e)
        )
def _download_worker(self, item: DownloadItem, thread_id: int) -> DownloadResult:
    """Process one item: resolve host viewer pages to direct URLs, skip
    duplicates, download with retries, and record the terminal result.

    FIXES: the 'skipped' statistic is now actually incremented, and
    successful ImageTwist downloads are recorded in stats / the tracking
    database instead of bypassing both.
    """
    # Rewrite image-host viewer pages into direct image URLs first.
    if 'pixhost.to/show/' in item.url:
        direct_url = self._extract_pixhost_direct_url(item.url)
        if direct_url:
            self.log(f"Converted pixhost URL to direct: {direct_url.split('/')[-1]}", "debug")
            item.url = direct_url
        else:
            self.log(f"Failed to extract pixhost direct URL: {item.url}", "warning")
    elif 'imagebam.com' in item.url:
        direct_url = self._extract_imagebam_direct_url(item.url)
        if direct_url:
            self.log(f"Converted ImageBam URL to direct: {direct_url.split('/')[-1]}", "debug")
            item.url = direct_url
        else:
            self.log(f"Failed to extract ImageBam direct URL: {item.url}", "warning")
    elif 'imagetwist.com' in item.url:
        # ImageTwist requires parsing the page to get direct image URL
        result = self._download_from_imagetwist(item)
        if result.success:
            return self._record_result(result)
        self.log(f"ImageTwist download failed: {item.url}", "warning")
    # Skip anything we already have.
    if self._is_already_downloaded(item.url, item.save_path):
        self.log(f"Already downloaded: {item.save_path.name}", "skip")
        with self.download_lock:
            self.stats['skipped'] += 1
        return DownloadResult(
            success=True,
            item=item,
            file_size=item.save_path.stat().st_size if item.save_path.exists() else 0
        )
    # Apply rate limiting
    self._apply_rate_limit(thread_id)
    # Always use requests for direct image downloads (faster)
    result = self._download_with_requests(item)
    # Retry with an extra delay until the item's budget is exhausted.
    if not result.success and item.retry_count < item.max_retries:
        item.retry_count += 1
        self.log(f"Retrying {item.url} ({item.retry_count}/{item.max_retries})", "warning")
        time.sleep(self.rate_limit * 2)  # Extra delay before retry
        return self._download_worker(item, thread_id)
    return self._record_result(result)

def _record_result(self, result: DownloadResult) -> DownloadResult:
    """Persist a terminal result: DB row on success (if enabled), then the
    shared statistics under the lock."""
    if result.success and self.use_database:
        self._save_to_database(result)
    with self.download_lock:
        if result.success:
            self.stats['successful'] += 1
            if result.file_size:
                self.stats['total_bytes'] += result.file_size
            if result.download_time:
                self.stats['total_time'] += result.download_time
        else:
            self.stats['failed'] += 1
    return result
def _save_to_database(self, result: DownloadResult):
    """Record a successful download in the tracking database (upsert keyed
    on the URL's UNIQUE constraint)."""
    import json
    payload = json.dumps(result.item.metadata) if result.item.metadata else None
    conn = sqlite3.connect(self.db_path)
    try:
        conn.execute('''
            INSERT OR REPLACE INTO downloads
            (url, file_path, file_hash, file_size, metadata)
            VALUES (?, ?, ?, ?, ?)
        ''', (
            result.item.url,
            str(result.item.save_path),
            result.file_hash,
            result.file_size,
            payload
        ))
        conn.commit()
    finally:
        conn.close()
def download_batch(self, items: List[DownloadItem],
                   progress_callback: Optional[Callable] = None) -> List[DownloadResult]:
    """
    Download multiple items concurrently

    Args:
        items: List of DownloadItem objects
        progress_callback: Optional callback, invoked as
            ``progress_callback(completed, total, result)`` after each item
    Returns:
        List of DownloadResult objects (in completion order, not input order)
    """
    self.stats['total'] = len(items)
    results = []
    self.log(f"Starting batch download of {len(items)} items with {self.max_workers} workers", "info")
    with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
        # Submit all downloads
        futures = {
            executor.submit(self._download_worker, item, i % self.max_workers): item
            for i, item in enumerate(items)
        }
        # Process completed downloads
        completed = 0
        for future in as_completed(futures):
            result = future.result()
            results.append(result)
            completed += 1
            # Progress update
            if progress_callback:
                progress_callback(completed, len(items), result)
            if self.show_progress:
                pct = (completed / len(items)) * 100
                status = "" if result.success else ""
                self.log(
                    f"[{completed}/{len(items)}] {pct:.1f}% - {status} {result.item.save_path.name}",
                    "success" if result.success else "error"
                )
    # Summary
    self.log(f"Batch complete: {self.stats['successful']} successful, {self.stats['failed']} failed", "info")
    # FIX: guard the average-speed division; total_time can be 0 when
    # successes carried no measured download time (e.g. skipped items).
    if self.stats['successful'] > 0 and self.stats['total_time'] > 0:
        avg_speed = self.stats['total_bytes'] / self.stats['total_time'] / 1024 / 1024
        self.log(f"Average speed: {avg_speed:.2f} MB/s", "info")
    return results
def download_urls(self, urls: List[str], base_path: Path,
                  referer: Optional[str] = None,
                  metadata: Optional[Dict] = None) -> List[DownloadResult]:
    """
    Convenience wrapper: build DownloadItem objects for *urls* under
    *base_path* and hand them to download_batch().

    Filenames are taken from the URL path; URLs with no usable basename
    get a deterministic hash-derived fallback name.

    Args:
        urls: List of URLs to download
        base_path: Directory to save files
        referer: Optional referer header
        metadata: Optional metadata for all downloads
    Returns:
        List of DownloadResult objects
    """
    def target_name(url: str) -> str:
        # Basename from the URL path, or a short hash-based fallback.
        return os.path.basename(urlparse(url).path) or f"download_{hashlib.sha256(url.encode()).hexdigest()[:8]}"

    items = [
        DownloadItem(
            url=url,
            save_path=base_path / target_name(url),
            referer=referer,
            metadata=metadata
        )
        for url in urls
    ]
    return self.download_batch(items)
def get_statistics(self) -> Dict:
    """Return a snapshot copy of the download statistics."""
    return dict(self.stats)
def cleanup_old_downloads(self, days: int = 30):
    """Delete download records older than *days* days.

    Returns the number of rows removed (0 when database tracking is off).
    """
    if not self.use_database:
        return 0
    conn = sqlite3.connect(self.db_path)
    try:
        # Negative day count makes SQLite's datetime() subtract.
        cursor = conn.execute('''
            DELETE FROM downloads
            WHERE download_date < datetime('now', ? || ' days')
        ''', (-days,))
        deleted = cursor.rowcount
        conn.commit()
    finally:
        conn.close()
    self.log(f"Cleaned up {deleted} old download records", "info")
    return deleted
# Example usage
if __name__ == "__main__":
    from pathlib import Path

    # Exercise the manager against a few small public files.
    manager = DownloadManager(
        max_workers=3,
        rate_limit=0.5,
        show_progress=True
    )
    urls = [
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
        "https://sample-videos.com/img/Sample-jpg-image-50kb.jpg",
        "https://www.w3schools.com/html/img_girl.jpg"
    ]
    results = manager.download_urls(urls, Path("/tmp/test-downloads"))
    # Report the outcome through the module logger.
    successful_count = len([r for r in results if r.success])
    logger.info(f"Downloaded {successful_count} of {len(results)} files")
    logger.info(f"Total bytes: {manager.stats['total_bytes'] / 1024:.1f} KB")
    logger.info(f"Total time: {manager.stats['total_time']:.2f} seconds")