940
modules/download_manager.py
Executable file
940
modules/download_manager.py
Executable file
@@ -0,0 +1,940 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Multi-threaded Download Manager
|
||||
Handles concurrent downloads with rate limiting, retries, and progress tracking
|
||||
Can be used by forum_downloader, fastdl_module, and other downloaders
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import hashlib
|
||||
import requests
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any, Callable
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from threading import Lock, Semaphore
|
||||
from dataclasses import dataclass
|
||||
import sqlite3
|
||||
from urllib.parse import urlparse
|
||||
from modules.base_module import LoggingMixin
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
# Module-level logger. Within this file it is only used by the example
# under ``if __name__ == "__main__"``; DownloadManager logs via ``self.log``.
logger = get_logger('DownloadManager')  # For standalone/example usage
|
||||
|
||||
|
||||
@dataclass
class DownloadItem:
    """Single download item: a source URL plus the path it is saved to.

    Field order is part of the constructor signature, so it must not change.
    """
    # Source URL. The download worker may overwrite this with a resolved
    # direct-image URL (pixhost / ImageBam).
    url: str
    # Destination file; parent directories are created by the downloaders.
    save_path: Path
    # Sent as the ``Referer`` request header when set.
    referer: Optional[str] = None
    # Extra request headers supplied by the caller.
    headers: Optional[Dict[str, str]] = None
    # Free-form data; stored as JSON when database tracking is enabled.
    metadata: Optional[Dict[str, Any]] = None
    post_date: Optional[datetime] = None  # Timestamp to set on downloaded file
    # Number of retries performed so far (incremented by the worker).
    retry_count: int = 0
    # Upper bound on retries before the failure is reported.
    max_retries: int = 3
|
||||
|
||||
|
||||
@dataclass
class DownloadResult:
    """Result of a download attempt for one ``DownloadItem``."""
    # True when the file was saved (or was already present and skipped).
    success: bool
    # The item this result belongs to.
    item: DownloadItem
    # Size of the saved file in bytes, when known.
    file_size: Optional[int] = None
    # Wall-clock seconds spent on the download, when measured.
    download_time: Optional[float] = None
    # Human-readable failure reason; None on success.
    error: Optional[str] = None
    # SHA256 hex digest of the saved content, when computed.
    file_hash: Optional[str] = None
|
||||
|
||||
|
||||
class DownloadManager(LoggingMixin):
    """
    Multi-threaded download manager with:
    - Concurrent downloads (thread pool sized by ``max_workers``)
    - Rate limiting (minimum delay between downloads per worker slot)
    - Automatic retries (bounded by each item's ``max_retries``)
    - Progress tracking (log lines and an optional callback per finished item)
    - Database tracking (optional SQLite ``downloads`` table)
    - Playwright support for authenticated downloads

    Logging goes through ``self.log`` provided by ``LoggingMixin``.
    """
|
||||
|
||||
def __init__(self,
             max_workers: int = 5,
             rate_limit: float = 0.5,
             timeout: int = 30,
             chunk_size: int = 8192,
             use_database: bool = False,
             db_path: Optional[str] = None,
             show_progress: bool = True,
             show_debug: bool = False):
    """
    Initialize download manager

    Args:
        max_workers: Maximum concurrent downloads
        rate_limit: Seconds between downloads per thread
        timeout: Download timeout in seconds
        chunk_size: Chunk size for streaming downloads
        use_database: Track downloads in database
        db_path: Path to database file; when omitted, database tracking
            is switched off even if ``use_database`` is True
        show_progress: Show download progress
        show_debug: Show debug messages
    """
    self.max_workers = max_workers
    self.rate_limit = rate_limit
    self.timeout = timeout
    self.chunk_size = chunk_size
    self.use_database = use_database
    self.db_path = db_path
    self.show_progress = show_progress

    # Initialize logging via mixin
    self._init_logger('DownloadManager', None, default_module='Download', show_debug=show_debug)

    # Thread synchronization
    self.download_lock = Lock()
    self.rate_limiter = Semaphore(max_workers)
    self.last_download_time = {}

    # Thread-local storage for ImageBam sessions (each thread gets its own session)
    self._imagebam_session_local = threading.local()

    # Time of the most recent ImageTwist request. Set here so worker
    # threads never race on first-use initialisation.
    self._imagetwist_last_request = 0

    # Statistics
    self.stats = {
        'total': 0,
        'successful': 0,
        'failed': 0,
        'skipped': 0,
        'total_bytes': 0,
        'total_time': 0,
    }

    # User agent
    self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

    # Playwright context for authenticated downloads
    self.playwright_context = None

    # Cookies copied from the Playwright context for the requests library.
    # Always defined so readers do not need getattr() fallbacks.
    self.cookies = {}

    # Initialize database only if explicitly enabled AND path provided
    if self.use_database:
        if self.db_path:
            self._init_database()
        else:
            # Disable database if no path provided to prevent creating files in CWD
            self.use_database = False
|
||||
|
||||
def _init_database(self):
|
||||
"""Initialize download tracking database"""
|
||||
if not self.db_path:
|
||||
return
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute('''
|
||||
CREATE TABLE IF NOT EXISTS downloads (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
url TEXT UNIQUE NOT NULL,
|
||||
file_path TEXT NOT NULL,
|
||||
file_hash TEXT,
|
||||
file_size INTEGER,
|
||||
download_date DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
metadata TEXT
|
||||
)
|
||||
''')
|
||||
|
||||
cursor.execute('''
|
||||
CREATE INDEX IF NOT EXISTS idx_downloads_url ON downloads(url)
|
||||
''')
|
||||
cursor.execute('''
|
||||
CREATE INDEX IF NOT EXISTS idx_downloads_hash ON downloads(file_hash)
|
||||
''')
|
||||
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
def set_playwright_context(self, context):
|
||||
"""Set Playwright context for authenticated downloads"""
|
||||
self.playwright_context = context
|
||||
# Extract cookies from context for requests library
|
||||
if context:
|
||||
try:
|
||||
self.cookies = {}
|
||||
cookies = context.cookies()
|
||||
for cookie in cookies:
|
||||
self.cookies[cookie['name']] = cookie['value']
|
||||
except Exception:
|
||||
self.cookies = {}
|
||||
|
||||
def _is_already_downloaded(self, url: str, file_path: Path) -> bool:
|
||||
"""Check if file was already downloaded"""
|
||||
if not self.use_database:
|
||||
return file_path.exists() and file_path.stat().st_size > 0
|
||||
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute(
|
||||
"SELECT file_path, file_size FROM downloads WHERE url = ?",
|
||||
(url,)
|
||||
)
|
||||
result = cursor.fetchone()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
if result:
|
||||
# Check if file still exists and has expected size
|
||||
saved_path = Path(result[0])
|
||||
if saved_path.exists() and saved_path.stat().st_size == result[1]:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _apply_rate_limit(self, thread_id: int):
|
||||
"""Apply rate limiting per thread"""
|
||||
with self.download_lock:
|
||||
if thread_id in self.last_download_time:
|
||||
elapsed = time.time() - self.last_download_time[thread_id]
|
||||
if elapsed < self.rate_limit:
|
||||
time.sleep(self.rate_limit - elapsed)
|
||||
self.last_download_time[thread_id] = time.time()
|
||||
|
||||
def _extract_pixhost_direct_url(self, show_url: str) -> Optional[str]:
|
||||
"""Extract direct image URL from pixhost show URL"""
|
||||
try:
|
||||
# Pattern to extract ID and filename from show URL
|
||||
show_pattern = re.compile(r"https?://(?:www\.)?pixhost\.to/show/(\d+)/([^/]+)$", re.IGNORECASE)
|
||||
match = show_pattern.match(show_url)
|
||||
|
||||
if not match:
|
||||
return None
|
||||
|
||||
img_id = match.group(1)
|
||||
filename = match.group(2)
|
||||
|
||||
# Try common hosts in order
|
||||
common_hosts = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100]
|
||||
|
||||
for host_num in common_hosts:
|
||||
test_url = f"https://img{host_num}.pixhost.to/images/{img_id}/{filename}"
|
||||
|
||||
try:
|
||||
# Quick HEAD request to check if URL exists
|
||||
response = requests.head(test_url, timeout=2, allow_redirects=False)
|
||||
if response.status_code == 200:
|
||||
return test_url
|
||||
except requests.RequestException:
|
||||
continue
|
||||
|
||||
# Try sequential scan if common hosts don't work
|
||||
for host_num in range(1, 121):
|
||||
if host_num in common_hosts:
|
||||
continue
|
||||
|
||||
test_url = f"https://img{host_num}.pixhost.to/images/{img_id}/{filename}"
|
||||
|
||||
try:
|
||||
response = requests.head(test_url, timeout=1, allow_redirects=False)
|
||||
if response.status_code == 200:
|
||||
return test_url
|
||||
except requests.RequestException:
|
||||
continue
|
||||
|
||||
return None
|
||||
except Exception as e:
|
||||
self.log(f"Error extracting pixhost URL: {e}", "error")
|
||||
return None
|
||||
|
||||
def _extract_imagebam_direct_url(self, imagebam_url: str) -> Optional[str]:
    """Fetch an ImageBam page and pull the direct image URL out of its HTML.

    A ``requests.Session`` is kept per thread (in thread-local storage) and
    pre-loaded with the cookies that skip the interstitial page. If the
    interstitial is returned anyway, the cookies are set again and the page
    is requested a second time.

    Returns:
        The first full-resolution (``_o``) image URL found, otherwise the
        first match of the fallback patterns, otherwise None. Timeouts and
        other errors are logged and also yield None.
    """
    bypass_cookie_domain = '.imagebam.com'
    try:
        # Get or create thread-local ImageBam session (thread-safe)
        local = self._imagebam_session_local
        session = getattr(local, 'session', None)
        if session is None:
            session = requests.Session()
            session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            # Both the old and the new interstitial-bypass cookie
            for cookie_name in ('nsfw_inter', 'sfw_inter'):
                session.cookies.set(cookie_name, '1', domain=bypass_cookie_domain)
            local.session = session

        response = session.get(imagebam_url, timeout=5)
        if response.status_code != 200:
            self.log(f"ImageBam page returned {response.status_code}", "warning")
            return None

        # Interstitial page detected: re-assert the cookies and try once more
        if 'Continue to your image' in response.text or 'Please wait' in response.text:
            for cookie_name in ('sfw_inter', 'nsfw_inter'):
                session.cookies.set(cookie_name, '1', domain=bypass_cookie_domain)
            response = session.get(imagebam_url, timeout=5)

        html = response.text

        # Preferred: the full-resolution image, which carries an "_o" suffix
        full_res = re.search(
            r'(https?://images\d*\.imagebam\.com/[a-f0-9/]+/[A-Z0-9]+_o\.\w+)',
            html,
            re.IGNORECASE,
        )
        if full_res:
            direct_url = full_res.group(1)
            self.log(f"Extracted ImageBam direct URL: {direct_url}", "debug")
            return direct_url

        # Fallback: any image hosted on images*.imagebam.com
        for pattern in (
            r'<img[^>]+src="(https?://images\d*\.imagebam\.com/[^"]+)"',
            r'"(https?://images\d*\.imagebam\.com/[^"]+\.(?:jpg|jpeg|png|gif))"',
        ):
            hit = re.search(pattern, html, re.IGNORECASE)
            if hit:
                direct_url = hit.group(1)
                self.log(f"Extracted ImageBam direct URL (fallback): {direct_url}", "debug")
                return direct_url

        self.log("No direct image URL found in ImageBam HTML", "warning")
        return None

    except requests.Timeout:
        self.log(f"ImageBam extraction timed out for {imagebam_url}", "warning")
        return None
    except Exception as e:
        self.log(f"Error extracting ImageBam URL: {e}", "error")
        return None
|
||||
|
||||
def _download_with_gallery_dl(self, item: DownloadItem) -> DownloadResult:
    """Fetch ``item`` by running the ``gallery-dl`` command-line tool.

    gallery-dl is told to write into the item's target directory under the
    item's file name and is given 60 seconds to finish. On success the file
    is hashed (SHA256) and, if the item carries a ``post_date``, its
    timestamps are set to that date.

    Returns:
        A successful DownloadResult with size, duration and hash, or a
        failed one describing the gallery-dl error, timeout or exception.
    """
    import subprocess

    started = time.time()
    target = item.save_path

    try:
        # Ensure parent directory exists
        target.parent.mkdir(parents=True, exist_ok=True)

        # Optional Referer header, then the URL as the final argument
        referer_args = ["--header", f"Referer: {item.referer}"] if item.referer else []
        cmd = [
            "gallery-dl",
            "--dest", str(target.parent),
            "--filename", target.name,
            "--no-skip",
            "--no-part",
            "--quiet",
            *referer_args,
            item.url,
        ]

        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

        # Both a clean exit and an actual file on disk are required
        if proc.returncode != 0 or not target.exists():
            error_msg = proc.stderr or "Unknown error"
            return DownloadResult(
                success=False,
                item=item,
                error=f"gallery-dl failed: {error_msg}"
            )

        file_size = target.stat().st_size
        download_time = time.time() - started

        # SHA256 for consistency with unified database
        with open(target, 'rb') as fh:
            file_hash = hashlib.sha256(fh.read()).hexdigest()

        if item.post_date:
            try:
                stamp = item.post_date.timestamp()
                os.utime(target, (stamp, stamp))
            except Exception as e:
                self.log(f"Failed to set timestamp: {e}", "warning")

        self.log(f"Downloaded via gallery-dl: {target.name}", "success")
        return DownloadResult(
            success=True,
            item=item,
            file_size=file_size,
            download_time=download_time,
            file_hash=file_hash
        )

    except subprocess.TimeoutExpired:
        return DownloadResult(success=False, item=item, error="gallery-dl timed out")
    except Exception as e:
        return DownloadResult(success=False, item=item, error=str(e))
|
||||
|
||||
def _download_from_imagetwist(self, item: DownloadItem) -> DownloadResult:
    """Download image from ImageTwist using gallery-dl for URL resolution.

    Steps:
      1. Ask gallery-dl (``-g``) for the direct image URL of the page.
      2. If gallery-dl is missing, times out, fails, or returns something
         that is not an ImageTwist URL, hand over to
         ``_download_from_imagetwist_fallback``.
      3. Download the image with the page URL as Referer, rejecting the
         8346-byte error placeholder and HTML responses.

    ImageTwist requests made through this method are spaced at least
    2 seconds apart across all threads.

    Returns:
        DownloadResult with size, duration and SHA256 hash on success, or a
        failed result carrying the reason.
    """
    import subprocess

    start_time = time.time()

    def throttle(min_gap: float = 2.0) -> None:
        # Reserve the next request slot under the lock, then sleep after
        # releasing it so unrelated workers sharing download_lock are not
        # blocked for the duration of the delay.
        with self.download_lock:
            now = time.time()
            last = getattr(self, '_imagetwist_last_request', 0)
            ready_at = max(now, last + min_gap)
            self._imagetwist_last_request = ready_at
        if ready_at > now:
            time.sleep(ready_at - now)

    # Rate limiting for ImageTwist (they return error images if too fast)
    throttle()

    try:
        # Use gallery-dl to get the actual image URL
        try:
            result = subprocess.run(
                ['/opt/media-downloader/venv/bin/gallery-dl', '-g', item.url],
                capture_output=True, text=True, timeout=30
            )
        except (OSError, subprocess.SubprocessError):
            # gallery-dl not installed at this path, or it hung: the manual
            # parser can still resolve the image.
            return self._download_from_imagetwist_fallback(item, start_time)

        resolved = result.stdout.strip() if result.returncode == 0 else ''
        img_url = resolved.split('\n')[0].strip() if resolved else ''

        if not img_url or 'imagetwist' not in img_url:
            # Fallback to manual parsing
            return self._download_from_imagetwist_fallback(item, start_time)

        # Rate limit again before actual download
        throttle()

        item.save_path.parent.mkdir(parents=True, exist_ok=True)

        headers = {
            'User-Agent': self.user_agent,
            'Referer': item.url  # Use imagetwist page URL as Referer
        }

        # The context manager releases the streamed connection on every
        # exit path, including the early returns below.
        with requests.get(img_url, headers=headers, timeout=30, stream=True) as img_response:
            img_response.raise_for_status()

            # Check for ImageTwist error placeholder (8346 bytes - rate limited or deleted)
            if img_response.headers.get('Content-Length', '') == '8346':
                self.log(f"ImageTwist rate limited or unavailable: {item.url}", "warning")
                return DownloadResult(success=False, item=item, error="ImageTwist error image (rate limited)")

            # Validate it's an image, not HTML (checked on the first chunk only)
            chunks = []
            for chunk in img_response.iter_content(chunk_size=8192):
                if not chunks:
                    head = chunk[:100].lower()
                    if b'<html' in head or b'<!doctype' in head:
                        return DownloadResult(
                            success=False,
                            item=item,
                            error="Got HTML instead of image"
                        )
                chunks.append(chunk)

        # Save the image
        content = b''.join(chunks)
        with open(item.save_path, 'wb') as f:
            f.write(content)

        file_size = len(content)
        download_time = time.time() - start_time

        # Calculate hash (SHA256 for consistency with unified database);
        # the bytes are already in memory, no need to re-read the file.
        file_hash = hashlib.sha256(content).hexdigest()

        # Set file timestamp if we have a date
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
            except Exception as e:
                # Best effort: the download itself succeeded
                self.log(f"Failed to set timestamp: {e}", "warning")

        self.log(f"Downloaded ImageTwist: {item.save_path.name}", "success")
        return DownloadResult(
            success=True,
            item=item,
            file_size=file_size,
            download_time=download_time,
            file_hash=file_hash
        )

    except Exception as e:
        return DownloadResult(
            success=False,
            item=item,
            error=f"ImageTwist download failed: {e}"
        )
|
||||
|
||||
def _download_from_imagetwist_fallback(self, item: DownloadItem, start_time: float) -> DownloadResult:
    """Fallback method using manual page parsing.

    Fetches the ImageTwist page, locates the image either through the
    ``<img class="pic">`` element or a regex for ``i*.imagetwist.com/i/``
    URLs, then downloads it with the same validation as the primary path
    (error-placeholder size, HTML instead of image) and applies the item's
    ``post_date`` to the saved file.

    Args:
        item: The download to perform.
        start_time: ``time.time()`` value taken when the overall ImageTwist
            download began; used to compute ``download_time``.

    Returns:
        DownloadResult describing success (size, duration, SHA256 hash) or
        the failure reason.
    """
    from bs4 import BeautifulSoup

    try:
        headers = {
            'User-Agent': self.user_agent,
            'Referer': item.referer or 'https://forum.phun.org/'
        }

        response = requests.get(item.url, headers=headers, timeout=30)
        response.raise_for_status()

        page_content = response.text
        img_url = None

        # Method 1: Look for pic class
        soup = BeautifulSoup(page_content, 'html.parser')
        pic_img = soup.find('img', class_='pic')
        if pic_img and pic_img.get('src'):
            img_url = pic_img['src']

        # Method 2: Regex for i*.imagetwist.com/i/ pattern
        if not img_url:
            match = re.search(r'(https?://i\d*(?:phun)?\.imagetwist\.com/i/[^"\'>\s]+)', page_content)
            if match:
                img_url = match.group(1)

        if not img_url:
            return DownloadResult(
                success=False,
                item=item,
                error="Could not find direct image URL on ImageTwist page"
            )

        # Download the actual image
        item.save_path.parent.mkdir(parents=True, exist_ok=True)

        # The context manager releases the streamed connection on every
        # exit path, including the early returns below.
        with requests.get(img_url, headers=headers, timeout=30, stream=True) as img_response:
            img_response.raise_for_status()

            # Same error placeholder check as the gallery-dl based path
            # (8346 bytes - rate limited or deleted)
            if img_response.headers.get('Content-Length', '') == '8346':
                self.log(f"ImageTwist rate limited or unavailable: {item.url}", "warning")
                return DownloadResult(success=False, item=item, error="ImageTwist error image (rate limited)")

            chunks = []
            for chunk in img_response.iter_content(chunk_size=8192):
                if not chunks:
                    head = chunk[:100].lower()
                    if b'<html' in head or b'<!doctype' in head:
                        return DownloadResult(success=False, item=item, error="Got HTML instead of image")
                chunks.append(chunk)

        content = b''.join(chunks)
        with open(item.save_path, 'wb') as f:
            f.write(content)

        file_size = len(content)
        download_time = time.time() - start_time
        file_hash = hashlib.sha256(content).hexdigest()

        # Set file timestamp if we have a date, as the other download paths do
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
            except Exception as e:
                # Best effort: the download itself succeeded
                self.log(f"Failed to set timestamp: {e}", "warning")

        self.log(f"Downloaded ImageTwist (fallback): {item.save_path.name}", "success")
        return DownloadResult(success=True, item=item, file_size=file_size, download_time=download_time, file_hash=file_hash)

    except Exception as e:
        return DownloadResult(success=False, item=item, error=f"ImageTwist fallback failed: {e}")
|
||||
|
||||
def _download_with_playwright(self, item: DownloadItem) -> DownloadResult:
    """Download using Playwright for authenticated sessions.

    Opens a new page in the stored Playwright context, navigates to the
    item's URL and saves the response body. Falls back to
    ``_download_with_requests`` when no context has been set.

    The caller's ``item.headers`` dict is copied before the Referer is
    added, so the item is not modified.

    Returns:
        DownloadResult with size, duration and SHA256 hash on success; a
        failed result for non-OK responses, HTML bodies, or any exception.
    """
    if not self.playwright_context:
        return self._download_with_requests(item)

    start_time = time.time()

    try:
        page = self.playwright_context.new_page()
        try:
            # Set headers (on a copy - never mutate the caller's dict)
            headers = dict(item.headers or {})
            if item.referer:
                headers['Referer'] = item.referer
            if headers:
                page.set_extra_http_headers(headers)

            # Direct download (pixhost should already be processed)
            response = page.goto(item.url, wait_until='networkidle',
                                 timeout=self.timeout * 1000)

            if not (response and response.ok):
                return DownloadResult(
                    success=False,
                    item=item,
                    error=f"HTTP {response.status if response else 'No response'}"
                )

            content = response.body()

            # Check for HTML error pages
            head = content[:1000].lower()
            if b'<!doctype' in head or b'<html' in head:
                return DownloadResult(
                    success=False,
                    item=item,
                    error="Got HTML instead of expected file"
                )

            # Save file
            item.save_path.parent.mkdir(parents=True, exist_ok=True)
            item.save_path.write_bytes(content)

            # Calculate hash (SHA256 for consistency with unified database)
            file_hash = hashlib.sha256(content).hexdigest()

            # Update timestamps if we have a date
            if item.post_date:
                try:
                    timestamp_unix = item.post_date.timestamp()
                    os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                    self.log(f"Set timestamp to {item.post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
                except Exception as e:
                    self.log(f"Failed to set timestamp: {e}", "warning")

            download_time = time.time() - start_time

            return DownloadResult(
                success=True,
                item=item,
                file_size=len(content),
                download_time=download_time,
                file_hash=file_hash
            )

        finally:
            # Always close the page, whatever happened above
            page.close()

    except Exception as e:
        return DownloadResult(
            success=False,
            item=item,
            error=str(e)
        )
|
||||
|
||||
def _download_with_requests(self, item: DownloadItem) -> DownloadResult:
    """Download using requests library.

    The body is streamed into memory and only written to disk once the
    first chunk has been checked for an HTML error page. Cookies copied
    from the Playwright context (if any) are sent with the request.

    The caller's ``item.headers`` dict is copied before User-Agent and
    Referer are added, so the item is not modified.

    Returns:
        DownloadResult with size, duration and SHA256 hash on success; a
        failed result for HTTP errors, HTML bodies, or any other exception.
    """
    start_time = time.time()
    # True once this call has opened (and therefore truncated) the target
    # file; cleanup must not delete a file this call never touched.
    file_opened = False

    try:
        headers = dict(item.headers or {})
        headers['User-Agent'] = self.user_agent
        if item.referer:
            headers['Referer'] = item.referer

        # Use cookies if available
        cookies = getattr(self, 'cookies', None) or None

        item.save_path.parent.mkdir(parents=True, exist_ok=True)

        # The context manager releases the streamed connection on every
        # exit path, including the early return below.
        with requests.get(
            item.url,
            headers=headers,
            cookies=cookies,
            timeout=self.timeout,
            stream=True
        ) as response:
            response.raise_for_status()

            # Stream download to memory first to validate content.
            # Chunks are collected in a list and joined once, avoiding the
            # quadratic cost of repeated bytes concatenation.
            chunks = []
            for chunk in response.iter_content(chunk_size=self.chunk_size):
                if not chunk:
                    continue
                if not chunks:
                    # Check first chunk for HTML error pages
                    head = chunk[:100].lower()
                    if b'<html' in head or b'<!doctype' in head or b'<head>' in head:
                        return DownloadResult(
                            success=False,
                            item=item,
                            error="Got HTML instead of image"
                        )
                chunks.append(chunk)

        content = b''.join(chunks)

        # Save to file only after validation
        with open(item.save_path, 'wb') as f:
            file_opened = True
            f.write(content)

        # Calculate hash (SHA256 for consistency with unified database)
        file_hash = hashlib.sha256(content).hexdigest()

        # Set file timestamp if we have a date
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                self.log(f"Set timestamp to {item.post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
            except Exception as e:
                self.log(f"Failed to set timestamp: {e}", "warning")

        download_time = time.time() - start_time

        return DownloadResult(
            success=True,
            item=item,
            file_size=len(content),
            download_time=download_time,
            file_hash=file_hash
        )

    except Exception as e:
        # Clean up partial download, but only if this call wrote it
        if file_opened:
            try:
                if item.save_path.exists():
                    item.save_path.unlink()
            except OSError:
                # Cleanup is best effort; report the original error
                pass

        return DownloadResult(
            success=False,
            item=item,
            error=str(e)
        )
|
||||
|
||||
def _download_worker(self, item: DownloadItem, thread_id: int) -> DownloadResult:
    """Worker function for downloading a single item.

    Flow:
      1. pixhost show pages and ImageBam pages are resolved to direct image
         URLs (``item.url`` is overwritten on success).
      2. Items that are already downloaded are skipped and counted in
         ``stats['skipped']``.
      3. ImageTwist pages are downloaded through the dedicated ImageTwist
         path; everything else, and ImageTwist failures, go through
         ``_download_with_requests`` after rate limiting.
      4. Failed downloads are retried up to ``item.max_retries`` times.
      5. The final result is saved to the database (if enabled) and
         reflected in the statistics - for every download path.

    Args:
        item: The download to perform.
        thread_id: Worker slot used for per-slot rate limiting.

    Returns:
        The final DownloadResult for the item.
    """
    use_imagetwist = False

    # Process image hosting URLs to get direct URLs
    if 'pixhost.to/show/' in item.url:
        direct_url = self._extract_pixhost_direct_url(item.url)
        if direct_url:
            self.log(f"Converted pixhost URL to direct: {direct_url.split('/')[-1]}", "debug")
            item.url = direct_url
        else:
            self.log(f"Failed to extract pixhost direct URL: {item.url}", "warning")

    elif 'imagebam.com' in item.url:
        # A URL on images*.imagebam.com is already the direct image (it is
        # what the extractor returns), so do not parse it as a page again
        # when this worker is re-entered for a retry.
        if not re.match(r'https?://images\d*\.imagebam\.com/', item.url, re.IGNORECASE):
            direct_url = self._extract_imagebam_direct_url(item.url)
            if direct_url:
                self.log(f"Converted ImageBam URL to direct: {direct_url.split('/')[-1]}", "debug")
                item.url = direct_url
            else:
                self.log(f"Failed to extract ImageBam direct URL: {item.url}", "warning")

    elif 'imagetwist.com' in item.url:
        # ImageTwist requires parsing the page to get direct image URL
        use_imagetwist = True

    # Check if already downloaded (before any ImageTwist traffic, so that
    # existing ImageTwist files are not fetched again)
    if self._is_already_downloaded(item.url, item.save_path):
        self.log(f"Already downloaded: {item.save_path.name}", "skip")
        with self.download_lock:
            self.stats['skipped'] = self.stats.get('skipped', 0) + 1
        return DownloadResult(
            success=True,
            item=item,
            file_size=item.save_path.stat().st_size if item.save_path.exists() else 0
        )

    result = None
    if use_imagetwist:
        result = self._download_from_imagetwist(item)
        if not result.success:
            self.log(f"ImageTwist download failed: {item.url}", "warning")
            result = None  # fall through to the generic path and its retries

    if result is None:
        # Apply rate limiting
        self._apply_rate_limit(thread_id)

        # Always use requests for direct image downloads (faster)
        result = self._download_with_requests(item)

        # Handle retries
        if not result.success and item.retry_count < item.max_retries:
            item.retry_count += 1
            self.log(f"Retrying {item.url} ({item.retry_count}/{item.max_retries})", "warning")
            time.sleep(self.rate_limit * 2)  # Extra delay before retry
            # The recursive call does its own database/statistics update
            return self._download_worker(item, thread_id)

    # Save to database if successful
    if result.success and self.use_database:
        self._save_to_database(result)

    # Update statistics
    with self.download_lock:
        if result.success:
            self.stats['successful'] += 1
            if result.file_size:
                self.stats['total_bytes'] += result.file_size
            if result.download_time:
                self.stats['total_time'] += result.download_time
        else:
            self.stats['failed'] += 1

    return result
|
||||
|
||||
def _save_to_database(self, result: DownloadResult):
|
||||
"""Save successful download to database"""
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
|
||||
metadata_str = None
|
||||
if result.item.metadata:
|
||||
import json
|
||||
metadata_str = json.dumps(result.item.metadata)
|
||||
|
||||
cursor.execute('''
|
||||
INSERT OR REPLACE INTO downloads
|
||||
(url, file_path, file_hash, file_size, metadata)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
''', (
|
||||
result.item.url,
|
||||
str(result.item.save_path),
|
||||
result.file_hash,
|
||||
result.file_size,
|
||||
metadata_str
|
||||
))
|
||||
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
def download_batch(self, items: List[DownloadItem],
                   progress_callback: Optional[Callable] = None) -> List[DownloadResult]:
    """
    Download multiple items concurrently

    Each item is handed to ``_download_worker`` on a thread pool of
    ``max_workers`` threads. Results are collected in completion order,
    not submission order. If a worker raises, that item is reported as a
    failed DownloadResult and the rest of the batch continues.

    Args:
        items: List of DownloadItem objects
        progress_callback: Optional callback for progress updates, called as
            ``progress_callback(completed, total, result)`` after each item

    Returns:
        List of DownloadResult objects
    """
    total = len(items)
    self.stats['total'] = total
    results = []

    self.log(f"Starting batch download of {total} items with {self.max_workers} workers", "info")

    with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
        # Submit all downloads; worker slots are assigned round-robin
        futures = {
            executor.submit(self._download_worker, item, i % self.max_workers): item
            for i, item in enumerate(items)
        }

        # Process completed downloads
        for completed, future in enumerate(as_completed(futures), start=1):
            try:
                result = future.result()
            except Exception as e:
                # An unexpected error in one worker must not abort the
                # batch: report it as a failure of that item.
                result = DownloadResult(success=False, item=futures[future], error=str(e))
                with self.download_lock:
                    self.stats['failed'] += 1
            results.append(result)

            # Progress update
            if progress_callback:
                progress_callback(completed, total, result)

            if self.show_progress:
                pct = (completed / total) * 100
                status = "✓" if result.success else "✗"
                self.log(
                    f"[{completed}/{total}] {pct:.1f}% - {status} {result.item.save_path.name}",
                    "success" if result.success else "error"
                )

    # Summary
    self.log(f"Batch complete: {self.stats['successful']} successful, {self.stats['failed']} failed", "info")

    # total_time can be zero when no measured download has completed
    if self.stats['successful'] > 0 and self.stats['total_time'] > 0:
        avg_speed = self.stats['total_bytes'] / self.stats['total_time'] / 1024 / 1024
        self.log(f"Average speed: {avg_speed:.2f} MB/s", "info")

    return results
|
||||
|
||||
def download_urls(self, urls: List[str], base_path: Path,
                  referer: Optional[str] = None,
                  metadata: Optional[Dict] = None) -> List[DownloadResult]:
    """
    Convenience wrapper: download a list of URLs into one directory

    The file name is the last component of the URL path. URLs whose path has
    no final component get ``download_`` plus the first 8 hex characters of
    the URL's SHA256 digest instead.

    Args:
        urls: List of URLs to download
        base_path: Directory to save files
        referer: Optional referer header, applied to every item
        metadata: Optional metadata attached to every item

    Returns:
        List of DownloadResult objects, as returned by ``download_batch``
    """
    def filename_for(url: str) -> str:
        # Prefer the name found in the URL; otherwise derive a stable one
        from_path = os.path.basename(urlparse(url).path)
        if from_path:
            return from_path
        return f"download_{hashlib.sha256(url.encode()).hexdigest()[:8]}"

    batch = [
        DownloadItem(
            url=url,
            save_path=base_path / filename_for(url),
            referer=referer,
            metadata=metadata
        )
        for url in urls
    ]
    return self.download_batch(batch)
|
||||
|
||||
def get_statistics(self) -> Dict:
|
||||
"""Get download statistics"""
|
||||
return self.stats.copy()
|
||||
|
||||
def cleanup_old_downloads(self, days: int = 30):
|
||||
"""Remove old download records from database"""
|
||||
if not self.use_database:
|
||||
return 0
|
||||
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute('''
|
||||
DELETE FROM downloads
|
||||
WHERE download_date < datetime('now', ? || ' days')
|
||||
''', (-days,))
|
||||
|
||||
deleted = cursor.rowcount
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
self.log(f"Cleaned up {deleted} old download records", "info")
|
||||
return deleted
|
||||
|
||||
|
||||
# Example usage: download a few public sample files and report totals
if __name__ == "__main__":
    from pathlib import Path

    # Test URLs
    urls = [
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
        "https://sample-videos.com/img/Sample-jpg-image-50kb.jpg",
        "https://www.w3schools.com/html/img_girl.jpg"
    ]

    # Test download manager (no database tracking)
    manager = DownloadManager(max_workers=3, rate_limit=0.5, show_progress=True)

    # Download
    results = manager.download_urls(urls, Path("/tmp/test-downloads"))

    # Print results
    succeeded = sum(1 for outcome in results if outcome.success)
    logger.info(f"Downloaded {succeeded} of {len(results)} files")
    logger.info(f"Total bytes: {manager.stats['total_bytes'] / 1024:.1f} KB")
    logger.info(f"Total time: {manager.stats['total_time']:.2f} seconds")
|
||||
Reference in New Issue
Block a user