#!/usr/bin/env python3
"""
Coppermine Photo Gallery Downloader Module

Downloads full-resolution images from Coppermine-based galleries
"""
|
import hashlib
import json
import os
import random
import re
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse, parse_qs

import requests
from bs4 import BeautifulSoup

from modules.base_module import LoggingMixin
from modules.cloudflare_handler import CloudflareHandler, SiteStatus, get_flaresolverr_user_agent
|
|
|
|
|
|
class CoppermineDownloader(LoggingMixin):
    """
    Coppermine Photo Gallery downloader.

    Walks the thumbnail pages of a Coppermine gallery, derives the
    full-resolution URL of every thumbnail and saves the images locally,
    optionally recording each download through the database adapter.

    Example usage:
        from coppermine_module import CoppermineDownloader

        downloader = CoppermineDownloader()
        file_timestamps, count = downloader.download(
            gallery_url="https://hqdiesel.net/thumbnails.php?album=lastup&cat=123",
            output_dir="downloads/coppermine",
            days_back=7
        )
        print(f"Downloaded {count} items")
    """

    # HTML responses shorter than this are treated as a Cloudflare interstitial
    _MIN_PAGE_LENGTH = 1000
    # Only the head of an HTML document is scanned for the word "challenge"
    _CHALLENGE_SCAN_LENGTH = 500
    # Date layouts tried in order (abbreviated, then full month name)
    _DATE_FORMATS = ('%b %d, %Y', '%B %d, %Y')

    def __init__(self, show_progress=True, use_database=True,
                 log_callback=None, unified_db=None, config=None):
        """
        Initialize the downloader

        Args:
            show_progress: Print progress messages
            use_database: Use database to track downloads
            log_callback: Optional callback function for logging
            unified_db: Optional UnifiedDatabase instance
            config: Optional config dict; only its 'cookie_file' entry is read,
                and only when no unified_db is supplied
        """
        # Initialize logging via mixin
        self._init_logger('Coppermine', log_callback, default_module='Download')

        self.show_progress = show_progress
        self.use_database = use_database
        self.downloaded_files = set()
        self.download_count = 0
        # Populated by download(); initialised here so that _download_image()
        # can also be called on its own without raising AttributeError.
        self.file_timestamps = {}
        self.defer_database = False
        self.unified_db = unified_db  # Store for scraper config access
        self.scraper_id = 'coppermine'  # Scraper ID in database

        # Use unified database if provided
        if unified_db and use_database:
            from modules.unified_database import CoppermineDatabaseAdapter
            self.db = CoppermineDatabaseAdapter(unified_db)
        else:
            self.db = None
            self.use_database = False

        # Initialize activity status manager for real-time updates
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)

        # Rate limiting (seconds)
        self.min_delay = 1
        self.max_delay = 3

        self.pending_downloads = []  # Track downloads for deferred database recording

        # Load scraper configuration from database if available
        self.proxy_url = None
        self.cookie_file = None  # Default to None (use database)

        if unified_db:
            scraper_config = unified_db.get_scraper(self.scraper_id)
            if scraper_config:
                # Get proxy configuration
                if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")

        # Fall back to config file for cookie_file if database not available
        if not unified_db and config:
            self.cookie_file = config.get('cookie_file', '/opt/media-downloader/cookies/coppermine_cookies.json')

        # Session with proper headers
        self.session = requests.Session()
        self.user_agent = get_flaresolverr_user_agent()
        self.session.headers.update({
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

        # Configure session proxy if available
        if self.proxy_url:
            self.session.proxies = {
                'http': self.proxy_url,
                'https': self.proxy_url
            }

        # Initialize universal Cloudflare handler with conservative expiry
        # Pass proxy_url if configured, and cookie_file=None for database storage
        self.cf_handler = CloudflareHandler(
            module_name="Coppermine",
            cookie_file=self.cookie_file,  # None when using database
            user_agent=self.user_agent,
            logger=self.logger,
            aggressive_expiry=False,  # Conservative mode for Coppermine
            proxy_url=self.proxy_url  # Pass proxy to FlareSolverr
        )

        # Keep for backwards compatibility
        self.flaresolverr_url = self.cf_handler.flaresolverr_url
        self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

        # Load cookies from database or file if available
        self._load_cookies()

    def _record_download(self, url: str, platform: str, source: str, content_type: str,
                         filename: str, file_path: str, file_size: int, file_hash: str,
                         post_date=None, metadata: Optional[dict] = None,
                         deferred: bool = False) -> bool:
        """Record a download in the database

        Args:
            url: Source URL of the downloaded file
            platform: Platform identifier stored with the record
            source: Gallery name stored with the record
            content_type: Kind of content (e.g. 'image')
            filename: Name of the saved file
            file_path: Full path of the saved file
            file_size: Size of the file in bytes
            file_hash: Hash of the file, or None when unavailable
            post_date: Upload date; converted with isoformat() when deferred
            metadata: Optional extra data stored with the record
            deferred: If True, don't record to database now - add to pending_downloads list
                      for later recording after file move is complete

        Returns:
            True if the download was queued or written, False if no database
            is in use or the write failed
        """
        # If deferred, store for later recording instead of recording now
        if deferred:
            self.pending_downloads.append({
                'url': url,
                'platform': platform,
                'source': source,
                'content_type': content_type,
                'filename': filename,
                'file_path': file_path,
                'file_size': file_size,
                'file_hash': file_hash,
                'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
                'metadata': metadata
            })
            self.log(f"Deferred recording for {filename}", "debug")
            return True

        if not self.use_database or not self.db:
            return False

        try:
            self.db.add_download(
                url=url,
                platform=platform,
                source=source,
                content_type=content_type,
                filename=filename,
                file_path=file_path,
                file_size=file_size,
                file_hash=file_hash,
                post_date=post_date,
                metadata=metadata
            )
        except Exception as e:
            # Best effort: a failed bookkeeping write must not abort the download run
            self.log(f"Failed to record download: {e}", "debug")
            return False
        return True

    def get_pending_downloads(self):
        """Get a shallow copy of the downloads that were deferred for later recording"""
        return self.pending_downloads.copy()

    def clear_pending_downloads(self):
        """Clear the pending downloads list after they've been recorded"""
        self.pending_downloads = []

    def _apply_cookies_to_session(self, cookies) -> bool:
        """Copy cookie dicts into the requests session.

        Args:
            cookies: Iterable of dicts with 'name' and 'value' keys and
                optional 'domain' / 'path' keys

        Returns:
            True if a cookie named 'cf_clearance' was among them
        """
        cf_clearance_found = False
        for cookie in cookies:
            try:
                # Set cookie with basic attributes (requests.Session compatible)
                self.session.cookies.set(
                    cookie['name'],
                    cookie['value'],
                    domain=cookie.get('domain', ''),
                    path=cookie.get('path', '/')
                )
                if cookie['name'] == 'cf_clearance':
                    cf_clearance_found = True
            except Exception as e:
                self.log(f"Error setting cookie {cookie.get('name')}: {e}", "warning")
        return cf_clearance_found

    def _load_cookies(self):
        """Load cookies from the database, falling back to the cookie file"""
        # Try database first if available
        if self.unified_db:
            try:
                cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
                if cookies:
                    if self._apply_cookies_to_session(cookies):
                        self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from database", "info")
                    else:
                        self.log(f"⚠ Loaded {len(cookies)} cookies from database but cf_clearance NOT found", "warning")

                    # Also load cookies into CloudflareHandler for consistency
                    self.cf_handler._cookies = cookies
                    return
                self.log("No cookies found in database", "debug")
            except Exception as e:
                self.log(f"Error loading cookies from database: {e}", "warning")

        # Fall back to cookie file if no database
        if not self.cookie_file:
            self.log("No cookie file configured", "debug")
            return

        cookie_path = Path(self.cookie_file)
        if not cookie_path.exists():
            self.log(f"Cookie file does not exist: {self.cookie_file}", "info")
            return

        try:
            with open(cookie_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle both old format (list) and new format (dict with 'cookies' and 'timestamp')
            if isinstance(data, dict) and 'cookies' in data:
                cookies = data['cookies']
            elif isinstance(data, list):
                cookies = data
            else:
                self.log("Invalid cookie file format", "warning")
                return

            if self._apply_cookies_to_session(cookies):
                self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from {self.cookie_file}", "info")
            else:
                self.log(f"⚠ Loaded {len(cookies)} cookies but cf_clearance NOT found", "warning")
        except Exception as e:
            self.log(f"Error loading cookies: {e}", "warning")

    def _cookies_expired(self):
        """Check if cookies are expired - delegates to CloudflareHandler"""
        return self.cf_handler.cookies_expired()

    def _save_cookies(self, cookies: list, user_agent: str = None):
        """Save cookies to database or file with timestamp

        Args:
            cookies: List of cookie dictionaries
            user_agent: User agent to associate with cookies (important for cf_clearance).
                       If not provided, uses self.user_agent as fallback.
        """
        # Use provided user_agent or fall back to self.user_agent
        ua = user_agent or self.user_agent

        # Try database first if available
        if self.unified_db:
            try:
                self.unified_db.save_scraper_cookies(
                    self.scraper_id,
                    cookies,
                    user_agent=ua,
                    merge=True  # Merge with existing cookies
                )
                self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50] if ua else 'None'}...)", "debug")
                return
            except Exception as e:
                self.log(f"Error saving cookies to database: {e}", "warning")

        # Fall back to file
        if not self.cookie_file:
            return

        try:
            cookie_path = Path(self.cookie_file)
            cookie_path.parent.mkdir(parents=True, exist_ok=True)

            storage_data = {
                'cookies': cookies,
                'timestamp': datetime.now().isoformat()
            }

            with open(cookie_path, 'w', encoding='utf-8') as f:
                json.dump(storage_data, f, indent=2)
            self.log(f"Saved {len(cookies)} cookies to {self.cookie_file}", "debug")
        except Exception as e:
            self.log(f"Error saving cookies: {e}", "warning")

    def _get_cookies_via_flaresolverr(self, url: str, max_retries: int = 2) -> bool:
        """Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler

        Args:
            url: URL to fetch
            max_retries: Maximum number of retry attempts (default: 2)

        Returns:
            True if cookies obtained successfully, False otherwise
        """
        # Delegate to CloudflareHandler
        success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)
        if not success:
            return success

        # Cookie domains never carry a port or credentials, so prefer the bare
        # hostname; fall back to netloc when the URL has no parseable host.
        parsed = urlparse(url)
        domain = parsed.hostname or parsed.netloc
        for name, value in self.cf_handler.get_cookies_dict().items():
            self.session.cookies.set(name, value, domain=domain, path='/')

        # Save cookies to database (the handler already saved to file if configured)
        if self.unified_db:
            cookies_list = self.cf_handler.get_cookies_list()
            if cookies_list:
                # CRITICAL: Get the user_agent from FlareSolverr solution, not self.user_agent
                # cf_clearance cookies are fingerprinted to the browser that solved the challenge
                flaresolverr_ua = self.cf_handler.get_user_agent()
                self._save_cookies(cookies_list, user_agent=flaresolverr_ua)

        return success

    @classmethod
    def _cloudflare_challenge_kind(cls, response) -> Optional[str]:
        """Classify a response as a likely Cloudflare challenge.

        The body heuristics are applied only to textual/HTML responses (or
        when no Content-Type is sent). Binary payloads such as images are
        never decoded, so a small image is not mistaken for a challenge page.

        Args:
            response: requests.Response to inspect

        Returns:
            'status' for HTTP 403/503, 'short' for an HTML body under
            _MIN_PAGE_LENGTH characters, 'html' when the word "challenge"
            appears near the top of the body, or None when nothing matched
        """
        if response.status_code in (403, 503):
            return 'status'

        content_type = (response.headers.get('Content-Type') or '').lower()
        if content_type and 'html' not in content_type and not content_type.startswith('text/'):
            return None

        body = response.text
        if len(body) < cls._MIN_PAGE_LENGTH:
            return 'short'
        if 'challenge' in body[:cls._CHALLENGE_SCAN_LENGTH].lower():
            return 'html'
        return None

    def _request_with_retry(self, url: str, timeout: int = 30, max_attempts: int = 2):
        """Make HTTP request with automatic Cloudflare challenge retry

        Args:
            url: URL to fetch
            timeout: Request timeout in seconds
            max_attempts: Maximum number of attempts (default: 2)

        Returns:
            requests.Response object

        Raises:
            ValueError: if max_attempts is smaller than 1
            Exception: the last error encountered if all attempts fail
        """
        if max_attempts < 1:
            raise ValueError("max_attempts must be at least 1")

        last_error = None

        for attempt in range(1, max_attempts + 1):
            try:
                response = self.session.get(url, timeout=timeout)

                # Detect Cloudflare challenges
                kind = self._cloudflare_challenge_kind(response)
                if kind == 'status':
                    self.log(f"Cloudflare challenge detected (HTTP {response.status_code})", "warning")
                elif kind == 'short':
                    self.log(f"Cloudflare challenge detected (short response: {len(response.text)} bytes)", "warning")
                elif kind == 'html':
                    self.log("Cloudflare challenge detected in HTML", "warning")

                # If Cloudflare detected and we have retry attempts left
                if kind and attempt < max_attempts:
                    if not self.flaresolverr_enabled:
                        raise Exception("Cloudflare challenge detected but FlareSolverr is disabled")
                    self.log(f"Attempt {attempt}/{max_attempts}: Refreshing cookies via FlareSolverr...", "info")
                    if not self._get_cookies_via_flaresolverr(url):
                        raise Exception("Failed to refresh cookies via FlareSolverr")
                    self.log("Cookies refreshed, retrying request...", "info")
                    continue  # Retry the request

                # No Cloudflare challenge or final attempt - check status and return
                response.raise_for_status()
                return response

            except Exception as e:
                last_error = e
                if attempt < max_attempts:
                    self.log(f"Attempt {attempt}/{max_attempts} failed: {e}", "warning")
                else:
                    self.log(f"All {max_attempts} attempts failed", "error")

        # All attempts failed
        raise last_error

    @classmethod
    def _parse_display_date(cls, text: str) -> Optional[datetime]:
        """Parse a bare Coppermine date such as 'Sep 29, 2025'.

        Returns:
            datetime object, or None if no known layout matches
        """
        for fmt in cls._DATE_FORMATS:
            try:
                return datetime.strptime(text, fmt)
            except (ValueError, TypeError):
                continue
        return None

    def _parse_date(self, date_str: str) -> Optional[datetime]:
        """
        Parse Coppermine date format: 'Date added=Sep 29, 2025'

        Both abbreviated and full month names are accepted.

        Args:
            date_str: Date string from Coppermine

        Returns:
            datetime object or None
        """
        # Extract date from "Date added=Sep 29, 2025" format
        match = re.search(r'Date added=([A-Za-z]+ \d+, \d{4})', date_str or '')
        if not match:
            return None

        parsed = self._parse_display_date(match.group(1))
        if parsed is None:
            self.log(f"Error parsing date '{date_str}': unrecognised format", "debug")
        return parsed

    def _extract_full_image_url(self, base_url: str, thumbnail_url: str) -> str:
        """
        Convert thumbnail URL to full-resolution URL

        Pattern:
            Thumbnail: albums/userpics/1052219/thumb_1000523798.jpg
            Normal:    albums/userpics/1052219/normal_1000523798.jpg
            Full:      albums/userpics/1052219/1000523798.jpg

        Only the file name (last path segment) is rewritten, so a directory
        whose name happens to start with 'thumb_' or 'normal_' is left intact.

        Args:
            base_url: URL that relative links are resolved against
            thumbnail_url: Thumbnail URL (relative or absolute)

        Returns:
            Full-resolution image URL
        """
        # Remove thumb_ or normal_ prefix from the final path segment
        full_path = re.sub(r'(^|/)(?:thumb_|normal_)(?=[^/]*$)', r'\1', thumbnail_url)
        return urljoin(base_url, full_path)

    def _parse_thumbnail_cell(self, cell, base_url: str) -> Optional[Dict]:
        """Extract the image info dict from one thumbnail table cell.

        Returns:
            Dict with image info, or None when the cell has no usable
            displayimage.php link, pid or thumbnail image
        """
        # Find image link
        link = cell.find('a', href=re.compile(r'displayimage\.php'))
        if not link:
            return None

        # Extract PID from URL
        href = link.get('href', '')
        pid = parse_qs(urlparse(href).query).get('pid', [None])[0]
        if not pid:
            return None

        # Find thumbnail image
        img = link.find('img')
        if not img:
            return None

        thumbnail_url = img.get('src', '')
        if not thumbnail_url:
            return None

        # Get image title (contains metadata)
        title = img.get('title', '')

        # Construct full-resolution URL
        full_url = self._extract_full_image_url(base_url, thumbnail_url)

        # Extract filename; fall back to the URL's file name so downstream
        # code never has to deal with a missing name
        filename_match = re.search(r'Filename=([^\s]+)', title)
        if filename_match:
            filename = filename_match.group(1)
        else:
            filename = urlparse(full_url).path.rsplit('/', 1)[-1] or None

        # Extract date from dedicated span (more reliable), else from the title
        upload_date = None
        date_span = cell.find('span', class_='thumb_caption_ctime')
        if date_span and date_span.text.strip():
            upload_date = self._parse_display_date(date_span.text.strip())
        if upload_date is None:
            upload_date = self._parse_date(title)

        # Extract uploader
        uploader = None
        uploader_link = cell.find('a', href=re.compile(r'profile\.php'))
        if uploader_link:
            uploader = uploader_link.text.strip()

        # Extract dimensions
        dimensions_match = re.search(r'Dimensions=(\d+x\d+)', title)
        dimensions = dimensions_match.group(1) if dimensions_match else None

        # Extract filesize
        filesize_match = re.search(r'Filesize=([^\s]+)', title)
        filesize = filesize_match.group(1) if filesize_match else None

        # Extract views
        views = None
        views_span = cell.find('span', class_='thumb_title_views')
        if views_span:
            views_match = re.search(r'(\d+)\s+views?', views_span.text)
            if views_match:
                views = int(views_match.group(1))

        return {
            'pid': pid,
            'filename': filename,
            'thumbnail_url': urljoin(base_url, thumbnail_url),
            'full_url': full_url,
            'upload_date': upload_date,
            'dimensions': dimensions,
            'filesize': filesize,
            'uploader': uploader,
            'views': views,
            'title': title
        }

    def _parse_gallery_page(self, html: str, base_url: str) -> List[Dict]:
        """
        Parse a Coppermine gallery page to extract image information

        Args:
            html: HTML content of the page
            base_url: URL that relative links on the page are resolved against

        Returns:
            List of dicts with image info (keys: pid, filename, thumbnail_url,
            full_url, upload_date, dimensions, filesize, uploader, views, title)
        """
        soup = BeautifulSoup(html, 'html.parser')
        images = []

        # Find all thumbnail cells
        thumbnail_cells = soup.find_all('td', class_='thumbnails')
        self.log(f"Found {len(thumbnail_cells)} thumbnail cells on page", "debug")

        for cell in thumbnail_cells:
            try:
                info = self._parse_thumbnail_cell(cell, base_url)
            except Exception as e:
                # One malformed cell must not discard the rest of the page
                self.log(f"Error parsing thumbnail cell: {e}", "debug")
                continue
            if info is not None:
                images.append(info)

        return images

    def _get_total_pages(self, html: str) -> int:
        """
        Extract total number of pages from gallery

        Args:
            html: HTML content

        Returns:
            Number of pages (1 when no pagination text is found)
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # Look for pagination info like "2005 files on 20 page(s)"
            match = re.search(r'(\d+)\s+files?\s+on\s+(\d+)\s+page', soup.get_text())
            if match:
                return int(match.group(2))
        except Exception as e:
            self.log(f"Error extracting page count: {e}", "debug")
        return 1

    @staticmethod
    def _safe_filename(filename: Optional[str], url: str, pid) -> str:
        """Reduce a remote-supplied file name to a bare, local-safe name.

        The name comes from gallery HTML, so directory components (including
        '..') are dropped to keep the file inside the output directory. When
        nothing usable remains, the URL's file name and finally the PID are
        used instead.
        """
        for candidate in (filename, urlparse(url).path):
            base = (candidate or '').replace('\\', '/').rsplit('/', 1)[-1].strip()
            if base not in ('', '.', '..'):
                return base
        return f"coppermine_{pid}"

    def _download_image(self, image_info: Dict, output_dir: Path,
                        gallery_name: str) -> Optional[str]:
        """
        Download a single image

        Args:
            image_info: Image information dict
            output_dir: Output directory
            gallery_name: Name of gallery for database tracking

        Returns:
            Path to the file on disk (newly downloaded or already present),
            or None if it was skipped, was a duplicate, or failed
        """
        try:
            url = image_info['full_url']
            pid = image_info['pid']
            filename = self._safe_filename(image_info.get('filename'), url, pid)

            # Check if already downloaded
            if self.use_database and self.db:
                if self.db.is_downloaded(url, platform='coppermine'):
                    self.log(f"Already downloaded (database): {filename} (PID: {pid})", "info")
                    return None

            # Create output directory
            output_dir.mkdir(parents=True, exist_ok=True)

            # Construct output filename
            output_file = output_dir / filename

            # Skip if file exists
            if output_file.exists():
                self.log(f"File already exists: {filename}", "info")
                return str(output_file)

            # Download image
            self.log(f"Downloading: {filename} (PID: {pid})", "info")
            response = self._request_with_retry(url, timeout=30)

            # An HTML body here is an error or challenge page, never an image
            content_type = (response.headers.get('Content-Type') or '').lower()
            if 'text/html' in content_type:
                self.log(f"Server returned HTML instead of an image for {filename}, skipping", "warning")
                return None

            payload = response.content

            # Write to a temporary name first so an interrupted write never
            # leaves a truncated file that later passes the "already exists" check
            partial_file = output_file.with_name(output_file.name + '.part')
            try:
                with open(partial_file, 'wb') as f:
                    f.write(payload)
                partial_file.replace(output_file)
            except Exception:
                if partial_file.exists():
                    partial_file.unlink()
                raise

            # Calculate file hash from saved file (consistent with other modules)
            file_hash = None
            unified = getattr(self.db, 'unified_db', None) if self.db else None
            if unified is not None:
                try:
                    file_hash = unified.get_file_hash(str(output_file))
                except Exception as e:
                    self.log(f"Failed to calculate file hash: {e}", "warning")

                # Check for duplicate hash (hash blacklist persists even if original deleted)
                if file_hash:
                    existing = unified.get_download_by_file_hash(file_hash)
                    if existing and existing.get('file_path') and str(output_file) != existing.get('file_path'):
                        # Duplicate hash found - content was already downloaded
                        # (prevents redownload of deleted content)
                        self.log(
                            f"⚠ Duplicate content detected (hash match): {filename} matches "
                            f"{existing.get('filename')} from {existing.get('platform')}/{existing.get('source')}",
                            "warning"
                        )
                        # Delete the duplicate regardless of whether original file still exists
                        try:
                            output_file.unlink()
                            self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
                        except Exception as e:
                            self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
                        return None

            # Track timestamp for this file
            if image_info.get('upload_date'):
                self.file_timestamps[filename] = image_info['upload_date']

            # Record in database
            self._record_download(
                url=url,
                platform='coppermine',
                source=gallery_name,
                content_type='image',
                filename=filename,
                file_path=str(output_file),
                file_size=len(payload),
                file_hash=file_hash,
                post_date=image_info.get('upload_date'),
                metadata={
                    'pid': pid,
                    'dimensions': image_info.get('dimensions'),
                    'filesize': image_info.get('filesize')
                },
                deferred=getattr(self, 'defer_database', False)
            )

            self.download_count += 1
            # Randomised pause between downloads
            time.sleep(random.uniform(self.min_delay, self.max_delay))

            return str(output_file)

        except Exception as e:
            self.log(f"Error downloading {image_info.get('filename', 'unknown')}: {e}", "error")
            return None

    def _try_existing_cookies(self, gallery_url: str):
        """Probe the gallery with the cookies already in the session.

        Returns:
            The response when the page came back without signs of a
            Cloudflare challenge, otherwise None (cookies expired, request
            failed, or challenge detected)
        """
        # Check for short-lived session cookies that may have expired
        if self.cf_handler.cookies_expired():
            self.log("Cookies expired, skipping test and refreshing via FlareSolverr", "info")
            return None

        cookie_count = len(self.session.cookies)
        self.log(f"Testing with {cookie_count} existing cookies...", "info")

        try:
            # Try with existing cookies first (short timeout for fast fail)
            test_response = self.session.get(gallery_url, timeout=5)
        except Exception as e:
            self.log(f"Test request failed ({type(e).__name__}: {e}), need FlareSolverr", "info")
            return None

        # Check if we got a Cloudflare challenge or error
        kind = self._cloudflare_challenge_kind(test_response)
        if kind == 'status':
            self.log(f"Existing cookies failed (HTTP {test_response.status_code}), need FlareSolverr", "info")
            return None
        if kind == 'short':
            self.log(f"Response too short ({len(test_response.text)} bytes), likely Cloudflare challenge", "info")
            return None
        if kind == 'html':
            self.log("Cloudflare challenge detected in response", "info")
            return None

        # Cookies work (or no challenge presented)!
        self.log(f"✓ Existing cookies valid ({cookie_count} cookies, skipped FlareSolverr)", "info")
        return test_response

    def download(self, gallery_url: str, output_dir: str,
                 days_back: Optional[int] = None, max_pages: Optional[int] = None,
                 gallery_name: Optional[str] = None, defer_database: bool = False) -> tuple:
        """
        Download images from a Coppermine gallery

        Args:
            gallery_url: URL to the gallery page (e.g., thumbnails.php?album=lastup&cat=123)
            output_dir: Directory to save images
            days_back: Only download images from last N days (None = all)
            max_pages: Maximum number of pages to process (None = all)
            gallery_name: Name for database tracking (extracted from URL if not provided)
            defer_database: If True, don't record to database immediately - store in
                           pending_downloads for later recording after file move is complete

        Returns:
            Tuple of (file_timestamps dict, download_count)
            file_timestamps: Dict mapping filename -> upload_date
        """
        self.defer_database = defer_database  # Store for use in download methods
        # Clear downloaded_files cache between galleries to prevent memory growth
        self.downloaded_files.clear()
        # Reset per-run state up front so no early return leaves stale results
        self.download_count = 0
        self.file_timestamps = {}  # Track timestamps for each file

        # Check site status before doing anything else
        self.log("Checking Coppermine gallery site status...", "debug")
        site_status, error_msg = self.cf_handler.check_site_status(gallery_url, timeout=10)

        if self.cf_handler.should_skip_download(site_status):
            self.log(f"Skipping download - Coppermine gallery is unavailable: {error_msg}", "warning")
            return ({}, 0)
        elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
            self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

        output_path = Path(output_dir)
        parsed_url = urlparse(gallery_url)

        if not gallery_name:
            # Extract category from URL
            query_params = parse_qs(parsed_url.query)
            cat = query_params.get('cat', ['unknown'])[0]
            album = query_params.get('album', ['unknown'])[0]
            gallery_name = f"{parsed_url.netloc}_cat{cat}_{album}"

        self.log(f"Starting download from: {gallery_url}", "info")
        self.activity_manager.update_status(f"Checking gallery: {gallery_name}")
        self.log(f"Gallery: {gallery_name}", "info")

        # Calculate cutoff date. Upload dates carry no time of day, so the
        # cutoff is floored to midnight; otherwise images uploaded on the
        # boundary day would be dropped depending on when the job runs.
        cutoff_date = None
        if days_back:
            self.log(f"Filtering: Last {days_back} days", "info")
            cutoff_date = (datetime.now() - timedelta(days=days_back)).replace(
                hour=0, minute=0, second=0, microsecond=0)

        # Only call FlareSolverr if existing cookies don't work
        first_page = self._try_existing_cookies(gallery_url)
        if first_page is None:
            if not self.flaresolverr_enabled:
                self.log("FlareSolverr disabled and cookies invalid", "error")
                return ({}, 0)
            self.log("Calling FlareSolverr to get fresh cookies...", "info")
            if not self._get_cookies_via_flaresolverr(gallery_url):
                self.log("Failed to bypass Cloudflare", "error")
                return ({}, 0)

        # Fetch first page to get total pages (reuse response if cookies were valid)
        try:
            if first_page is None:
                first_page = self._request_with_retry(gallery_url, timeout=30)
            else:
                # The probe response is reused below, so surface HTTP errors now
                first_page.raise_for_status()

            total_pages = self._get_total_pages(first_page.text)
            if max_pages:
                total_pages = min(total_pages, max_pages)

            self.log(f"Total pages to process: {total_pages}", "info")
        except Exception as e:
            self.log(f"Error fetching gallery: {e}", "error")
            return ({}, 0)

        # Set initial progress so dashboard shows 0/N immediately
        self.activity_manager.update_status(
            "Downloading images",
            progress_current=0,
            progress_total=total_pages
        )

        # Process each page
        for page_num in range(1, total_pages + 1):
            try:
                self.log(f"Processing page {page_num}/{total_pages}...", "info")

                if page_num == 1:
                    # Page 1 is the gallery URL itself and was fetched just above
                    page_url = gallery_url
                    response = first_page
                else:
                    separator = '&' if '?' in gallery_url else '?'
                    page_url = f"{gallery_url}{separator}page={page_num}"
                    # Fetch page with automatic Cloudflare retry
                    response = self._request_with_retry(page_url, timeout=30)

                # Debug: Check what we received
                self.log(f"Fetched page, status: {response.status_code}, length: {len(response.text)} bytes", "debug")
                if len(response.text) < 10000:
                    self.log(f"WARNING: Response seems too short! First 1000 chars: {response.text[:1000]}", "warning")

                # Parse images. Relative links are resolved against the page
                # URL so galleries installed in a sub-directory work too.
                images = self._parse_gallery_page(response.text, page_url)
                self.log(f"Found {len(images)} images on page {page_num}", "info")

                # Track if we found any new images on this page
                found_new_images = False
                skipped_old_images = 0

                # Filter by date and download
                for image_info in images:
                    upload_date = image_info.get('upload_date')

                    # Apply date filter (images without a date are never filtered out)
                    if cutoff_date and upload_date and upload_date < cutoff_date:
                        skipped_old_images += 1
                        self.log(f"Skipping old image: {image_info['filename']} "
                                 f"(uploaded {upload_date.date()})", "debug")
                        continue

                    # Log image being processed
                    upload_date_str = upload_date.strftime('%Y-%m-%d') if upload_date else 'unknown'
                    self.log(f"Processing image: {image_info['filename']} (uploaded {upload_date_str})", "info")

                    # This image is within date range
                    found_new_images = True

                    # Download image
                    self._download_image(image_info, output_path, gallery_name)

                # If using date filter and ALL images on this page were too old, stop processing
                # (assumes gallery is sorted newest-first, which is true for album=lastup)
                if cutoff_date and not found_new_images and len(images) > 0:
                    self.log(f"All {skipped_old_images} images on page {page_num} are older than {days_back} days. "
                             f"Stopping pagination (assuming chronological order).", "info")
                    break

                # Update activity status with page progress
                self.activity_manager.update_status(
                    "Downloading images",
                    progress_current=page_num,
                    progress_total=total_pages
                )

                # Rate limiting between pages
                if page_num < total_pages:
                    time.sleep(self.min_delay)

            except Exception as e:
                self.log(f"Error processing page {page_num}: {e}", "error")
                continue

        self.log(f"Download complete! Total: {self.download_count} images", "info")
        return (self.file_timestamps, self.download_count)

    def cleanup(self):
        """Cleanup resources (closes the HTTP session)"""
        if self.session:
            self.session.close()