Files
media-downloader/modules/coppermine_module.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

874 lines
36 KiB
Python

#!/usr/bin/env python3
"""
Coppermine Photo Gallery Downloader Module
Downloads full-resolution images from Coppermine-based galleries
"""
import hashlib
import json
import os
import random
import re
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse, parse_qs

import requests
from bs4 import BeautifulSoup

from modules.base_module import LoggingMixin
from modules.cloudflare_handler import CloudflareHandler, SiteStatus, get_flaresolverr_user_agent
class CoppermineDownloader(LoggingMixin):
    """
    Coppermine Photo Gallery downloader.

    Walks the paginated thumbnail listing of a Coppermine gallery, derives the
    full-resolution URL of every thumbnail and saves the images to disk.
    Cloudflare challenges are handled through ``CloudflareHandler`` /
    FlareSolverr, and downloads can optionally be tracked in the unified
    database.

    Example usage:
        from coppermine_module import CoppermineDownloader

        downloader = CoppermineDownloader()
        file_timestamps, count = downloader.download(
            gallery_url="https://hqdiesel.net/thumbnails.php?album=lastup&cat=123",
            output_dir="downloads/coppermine",
            days_back=7
        )
        print(f"Downloaded {count} items")
    """

    # Filename prefixes Coppermine puts on resized variants of an image
    _RESIZED_PREFIXES = ('thumb_', 'normal_')
    # Date layouts seen in captions/titles: "Sep 29, 2025" and "September 29, 2025"
    _DATE_FORMATS = ('%b %d, %Y', '%B %d, %Y')
    # HTML bodies shorter than this are treated as a probable challenge page
    _SHORT_RESPONSE_CHARS = 1000
    # Only the head of the document is scanned for the challenge marker
    _CHALLENGE_SCAN_CHARS = 500

    def __init__(self, show_progress=True, use_database=True,
                 log_callback=None, unified_db=None, config=None):
        """
        Initialize the downloader

        Args:
            show_progress: Print progress messages
            use_database: Use database to track downloads
            log_callback: Optional callback function for logging
            unified_db: Optional UnifiedDatabase instance
            config: Optional config dict with flaresolverr settings
        """
        # Initialize logging via mixin
        self._init_logger('Coppermine', log_callback, default_module='Download')

        self.show_progress = show_progress
        self.use_database = use_database
        self.downloaded_files = set()
        self.download_count = 0
        # Created here (not only in download()) so _download_image can be
        # called on a fresh instance without raising AttributeError.
        self.file_timestamps = {}
        self.defer_database = False
        self.unified_db = unified_db  # Store for scraper config access
        self.scraper_id = 'coppermine'  # Scraper ID in database

        # Use unified database if provided
        if unified_db and use_database:
            from modules.unified_database import CoppermineDatabaseAdapter
            self.db = CoppermineDatabaseAdapter(unified_db)
        else:
            self.db = None
            self.use_database = False

        # Initialize activity status manager for real-time updates
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)

        # Rate limiting (seconds)
        self.min_delay = 1
        self.max_delay = 3

        self.pending_downloads = []  # Track downloads for deferred database recording

        # Load scraper configuration from database if available
        self.proxy_url = None
        self.cookie_file = None  # Default to None (use database)
        if unified_db:
            scraper_config = unified_db.get_scraper(self.scraper_id)
            if scraper_config:
                # Get proxy configuration
                if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")

        # Fall back to config file for cookie_file if database not available
        if not unified_db and config:
            self.cookie_file = config.get('cookie_file', '/opt/media-downloader/cookies/coppermine_cookies.json')

        # Session with proper headers
        self.session = requests.Session()
        self.user_agent = get_flaresolverr_user_agent()
        self.session.headers.update({
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

        # Configure session proxy if available
        if self.proxy_url:
            self.session.proxies = {
                'http': self.proxy_url,
                'https': self.proxy_url
            }

        # Initialize universal Cloudflare handler with conservative expiry
        # Pass proxy_url if configured, and cookie_file=None for database storage
        self.cf_handler = CloudflareHandler(
            module_name="Coppermine",
            cookie_file=self.cookie_file,  # None when using database
            user_agent=self.user_agent,
            logger=self.logger,
            aggressive_expiry=False,  # Conservative mode for Coppermine
            proxy_url=self.proxy_url  # Pass proxy to FlareSolverr
        )

        # Keep for backwards compatibility
        self.flaresolverr_url = self.cf_handler.flaresolverr_url
        self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

        # Load cookies from database or file if available
        self._load_cookies()

    def _record_download(self, url: str, platform: str, source: str, content_type: str,
                         filename: str, file_path: str, file_size: int, file_hash: str,
                         post_date=None, metadata: dict = None, deferred: bool = False):
        """Record a download in the database

        Args:
            url: Source URL of the downloaded file
            platform: Platform identifier stored with the record
            source: Gallery/source name stored with the record
            content_type: Kind of content (e.g. 'image')
            filename: Name of the saved file
            file_path: Path the file was saved to
            file_size: Size of the downloaded payload in bytes
            file_hash: Hash of the saved file (may be None)
            post_date: Upload date; objects with ``isoformat()`` are serialized
                       when the record is deferred
            metadata: Optional extra metadata dict
            deferred: If True, don't record to database now - add to pending_downloads list
                      for later recording after file move is complete

        Returns:
            True if the record was queued or written, False otherwise
        """
        # If deferred, store for later recording instead of recording now
        if deferred:
            self.pending_downloads.append({
                'url': url,
                'platform': platform,
                'source': source,
                'content_type': content_type,
                'filename': filename,
                'file_path': file_path,
                'file_size': file_size,
                'file_hash': file_hash,
                'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
                'metadata': metadata
            })
            self.log(f"Deferred recording for {filename}", "debug")
            return True

        if not self.use_database or not self.db:
            return False

        try:
            self.db.add_download(
                url=url,
                platform=platform,
                source=source,
                content_type=content_type,
                filename=filename,
                file_path=file_path,
                file_size=file_size,
                file_hash=file_hash,
                post_date=post_date,
                metadata=metadata
            )
            return True
        except Exception as e:
            # Best effort: the file is already on disk, so do not abort the
            # download - but make the failure visible at warning level.
            self.log(f"Failed to record download: {e}", "warning")
            return False

    def get_pending_downloads(self):
        """Get list of downloads that were deferred for later recording"""
        return self.pending_downloads.copy()

    def clear_pending_downloads(self):
        """Clear the pending downloads list after they've been recorded"""
        self.pending_downloads = []

    def _apply_cookies_to_session(self, cookies) -> bool:
        """Copy cookie dicts into the requests session.

        Args:
            cookies: Iterable of dicts with at least 'name' and 'value' keys

        Returns:
            True if a 'cf_clearance' cookie was among those set
        """
        cf_clearance_found = False
        for cookie in cookies:
            try:
                # Set cookie with basic attributes (requests.Session compatible)
                self.session.cookies.set(
                    cookie['name'],
                    cookie['value'],
                    domain=cookie.get('domain', ''),
                    path=cookie.get('path', '/')
                )
                if cookie['name'] == 'cf_clearance':
                    cf_clearance_found = True
            except Exception as e:
                # A malformed entry must not prevent the remaining cookies from loading
                name = cookie.get('name') if isinstance(cookie, dict) else cookie
                self.log(f"Error setting cookie {name}: {e}", "warning")
        return cf_clearance_found

    def _load_cookies(self):
        """Load cookies from database or file"""
        # Try database first if available
        if self.unified_db:
            try:
                cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
                if cookies:
                    if self._apply_cookies_to_session(cookies):
                        self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from database", "info")
                    else:
                        self.log(f"⚠ Loaded {len(cookies)} cookies from database but cf_clearance NOT found", "warning")
                    # Also load cookies into CloudflareHandler for consistency
                    self.cf_handler._cookies = cookies
                    return
                self.log("No cookies found in database", "debug")
            except Exception as e:
                self.log(f"Error loading cookies from database: {e}", "warning")

        # Fall back to cookie file if no database
        if not self.cookie_file:
            self.log("No cookie file configured", "debug")
            return

        cookie_path = Path(self.cookie_file)
        if not cookie_path.exists():
            self.log(f"Cookie file does not exist: {self.cookie_file}", "info")
            return

        try:
            with open(cookie_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle both old format (list) and new format (dict with 'cookies' and 'timestamp')
            if isinstance(data, dict) and 'cookies' in data:
                cookies = data['cookies']
            elif isinstance(data, list):
                cookies = data
            else:
                self.log("Invalid cookie file format", "warning")
                return

            if self._apply_cookies_to_session(cookies):
                self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from {self.cookie_file}", "info")
            else:
                self.log(f"⚠ Loaded {len(cookies)} cookies but cf_clearance NOT found", "warning")
        except Exception as e:
            self.log(f"Error loading cookies: {e}", "warning")

    def _cookies_expired(self):
        """Check if cookies are expired - delegates to CloudflareHandler"""
        return self.cf_handler.cookies_expired()

    def _save_cookies(self, cookies: list, user_agent: str = None):
        """Save cookies to database or file with timestamp

        The database is tried first; the cookie file is only written when no
        database is configured or the database write failed.

        Args:
            cookies: List of cookie dictionaries
            user_agent: User agent to associate with cookies (important for cf_clearance).
                        If not provided, uses self.user_agent as fallback.
        """
        # Use provided user_agent or fall back to self.user_agent
        ua = user_agent or self.user_agent

        # Try database first if available
        if self.unified_db:
            try:
                self.unified_db.save_scraper_cookies(
                    self.scraper_id,
                    cookies,
                    user_agent=ua,
                    merge=True  # Merge with existing cookies
                )
                self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50] if ua else 'None'}...)", "debug")
                return
            except Exception as e:
                self.log(f"Error saving cookies to database: {e}", "warning")

        # Fall back to file
        if not self.cookie_file:
            return

        try:
            cookie_path = Path(self.cookie_file)
            cookie_path.parent.mkdir(parents=True, exist_ok=True)
            storage_data = {
                'cookies': cookies,
                'timestamp': datetime.now().isoformat()
            }
            with open(cookie_path, 'w', encoding='utf-8') as f:
                json.dump(storage_data, f, indent=2)
            self.log(f"Saved {len(cookies)} cookies to {self.cookie_file}", "debug")
        except Exception as e:
            self.log(f"Error saving cookies: {e}", "warning")

    def _get_cookies_via_flaresolverr(self, url: str, max_retries: int = 2) -> bool:
        """Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler

        Args:
            url: URL to fetch
            max_retries: Maximum number of retry attempts (default: 2)

        Returns:
            True if cookies obtained successfully, False otherwise
        """
        # Delegate to CloudflareHandler
        success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)
        if not success:
            return success

        # Cookie domains must be a bare host: netloc may carry a port or
        # userinfo, which would stop the cookie from ever matching a request.
        parsed = urlparse(url)
        domain = parsed.hostname or parsed.netloc
        for name, value in self.cf_handler.get_cookies_dict().items():
            self.session.cookies.set(name, value, domain=domain, path='/')

        # Save cookies to database (the handler already saved to file if configured)
        if self.unified_db:
            cookies_list = self.cf_handler.get_cookies_list()
            if cookies_list:
                # CRITICAL: Get the user_agent from FlareSolverr solution, not self.user_agent
                # cf_clearance cookies are fingerprinted to the browser that solved the challenge
                flaresolverr_ua = self.cf_handler.get_user_agent()
                self._save_cookies(cookies_list, user_agent=flaresolverr_ua)

        return success

    def _detect_cloudflare_challenge(self, response) -> Optional[tuple]:
        """Classify a response as a probable Cloudflare challenge.

        The body heuristics are only applied to HTML/text (or untyped)
        responses. Decoding a binary image as text is expensive and would flag
        every small image as a challenge.

        Args:
            response: requests.Response to inspect

        Returns:
            None when no challenge is suspected, otherwise a (kind, detail) tuple:
            ('status', http_status), ('short', body_length) or ('marker', None)
        """
        if response.status_code in (403, 503):
            return ('status', response.status_code)

        content_type = (response.headers.get('Content-Type') or '').lower()
        if content_type and 'html' not in content_type and not content_type.startswith('text/'):
            return None

        body = response.text
        if len(body) < self._SHORT_RESPONSE_CHARS:
            return ('short', len(body))
        # Slice first, then lowercase: only the head of the page is relevant
        if 'challenge' in body[:self._CHALLENGE_SCAN_CHARS].lower():
            return ('marker', None)
        return None

    def _request_with_retry(self, url: str, timeout: int = 30, max_attempts: int = 2):
        """Make HTTP request with automatic Cloudflare challenge retry

        Args:
            url: URL to fetch
            timeout: Request timeout in seconds
            max_attempts: Maximum number of attempts (default: 2)

        Returns:
            requests.Response object

        Raises:
            ValueError: if max_attempts is less than 1
            Exception: the last error encountered if all attempts fail
        """
        if max_attempts < 1:
            raise ValueError("max_attempts must be at least 1")

        last_error = None
        for attempt in range(1, max_attempts + 1):
            try:
                response = self.session.get(url, timeout=timeout)

                # Detect Cloudflare challenges
                challenge = self._detect_cloudflare_challenge(response)
                if challenge:
                    kind, detail = challenge
                    if kind == 'status':
                        self.log(f"Cloudflare challenge detected (HTTP {detail})", "warning")
                    elif kind == 'short':
                        self.log(f"Cloudflare challenge detected (short response: {detail} bytes)", "warning")
                    else:
                        self.log("Cloudflare challenge detected in HTML", "warning")

                # If Cloudflare detected and we have retry attempts left
                if challenge and attempt < max_attempts:
                    if not self.flaresolverr_enabled:
                        raise Exception("Cloudflare challenge detected but FlareSolverr is disabled")
                    self.log(f"Attempt {attempt}/{max_attempts}: Refreshing cookies via FlareSolverr...", "info")
                    if not self._get_cookies_via_flaresolverr(url):
                        raise Exception("Failed to refresh cookies via FlareSolverr")
                    self.log("Cookies refreshed, retrying request...", "info")
                    continue  # Retry the request

                # No Cloudflare challenge or final attempt - check status and return
                response.raise_for_status()
                return response
            except Exception as e:
                last_error = e
                if attempt < max_attempts:
                    self.log(f"Attempt {attempt}/{max_attempts} failed: {e}", "warning")
                else:
                    self.log(f"All {max_attempts} attempts failed", "error")

        # All attempts failed
        raise last_error

    def _parse_display_date(self, text: str) -> datetime:
        """Parse a bare Coppermine date such as 'Sep 29, 2025'.

        Both abbreviated and full month names are accepted.

        Raises:
            ValueError: if the text matches none of the known layouts
        """
        last_error = None
        for fmt in self._DATE_FORMATS:
            try:
                return datetime.strptime(text, fmt)
            except ValueError as e:
                last_error = e
        raise last_error

    def _parse_date(self, date_str: str) -> Optional[datetime]:
        """
        Parse Coppermine date format: 'Date added=Sep 29, 2025'

        Args:
            date_str: Date string from Coppermine

        Returns:
            datetime object or None
        """
        try:
            # Extract date from "Date added=Sep 29, 2025" format
            match = re.search(r'Date added=([A-Za-z]+ \d+, \d{4})', date_str)
            if match:
                return self._parse_display_date(match.group(1))
        except Exception as e:
            self.log(f"Error parsing date '{date_str}': {e}", "debug")
        return None

    def _extract_full_image_url(self, base_url: str, thumbnail_url: str) -> str:
        """
        Convert thumbnail URL to full-resolution URL

        Pattern:
            Thumbnail: albums/userpics/1052219/thumb_1000523798.jpg
            Normal:    albums/userpics/1052219/normal_1000523798.jpg
            Full:      albums/userpics/1052219/1000523798.jpg

        Only the final path segment (the file name) is rewritten, so directory
        names that happen to start with 'thumb_' or 'normal_' are left intact
        and a bare 'thumb_x.jpg' without any slash is handled too.

        Args:
            base_url: Base URL of the gallery (e.g., https://hqdiesel.net)
            thumbnail_url: Relative thumbnail URL

        Returns:
            Full-resolution image URL
        """
        # Keep any query string out of the path manipulation
        path, query_sep, query = thumbnail_url.partition('?')
        directory, slash, name = path.rpartition('/')
        for prefix in self._RESIZED_PREFIXES:
            if name.startswith(prefix):
                name = name[len(prefix):]
                break
        full_path = f"{directory}{slash}{name}{query_sep}{query}"
        return urljoin(base_url, full_path)

    def _parse_gallery_page(self, html: str, base_url: str) -> List[Dict]:
        """
        Parse a Coppermine gallery page to extract image information

        Args:
            html: HTML content of the page
            base_url: Base URL of the gallery

        Returns:
            List of dicts with keys: pid, filename, thumbnail_url, full_url,
            upload_date, dimensions, filesize, uploader, views, title
        """
        soup = BeautifulSoup(html, 'html.parser')
        images = []

        # Find all thumbnail cells
        thumbnail_cells = soup.find_all('td', class_='thumbnails')
        self.log(f"Found {len(thumbnail_cells)} thumbnail cells on page", "debug")

        for cell in thumbnail_cells:
            try:
                # Find image link
                link = cell.find('a', href=re.compile(r'displayimage\.php'))
                if not link:
                    continue

                # Extract PID from URL
                href = link.get('href', '')
                pid = parse_qs(urlparse(href).query).get('pid', [None])[0]
                if not pid:
                    continue

                # Find thumbnail image
                img = link.find('img')
                if not img:
                    continue
                thumbnail_url = img.get('src', '')
                if not thumbnail_url:
                    continue

                # Get image title (contains metadata)
                title = img.get('title', '')

                # Construct full-resolution URL
                full_url = self._extract_full_image_url(base_url, thumbnail_url)

                # Extract filename; when the title carries no "Filename=" entry
                # fall back to the last path segment of the full-size URL so
                # the image can still be saved.
                filename_match = re.search(r'Filename=([^\s]+)', title)
                if filename_match:
                    filename = filename_match.group(1)
                else:
                    filename = urlparse(full_url).path.rsplit('/', 1)[-1] or None

                # Extract date from dedicated span (more reliable)
                upload_date = None
                date_span = cell.find('span', class_='thumb_caption_ctime')
                if date_span and date_span.text.strip():
                    try:
                        upload_date = self._parse_display_date(date_span.text.strip())
                    except Exception:
                        # Fallback to title parsing
                        upload_date = self._parse_date(title)
                else:
                    upload_date = self._parse_date(title)

                # Extract uploader
                uploader = None
                uploader_link = cell.find('a', href=re.compile(r'profile\.php'))
                if uploader_link:
                    uploader = uploader_link.text.strip()

                # Extract dimensions
                dimensions_match = re.search(r'Dimensions=(\d+x\d+)', title)
                dimensions = dimensions_match.group(1) if dimensions_match else None

                # Extract filesize
                filesize_match = re.search(r'Filesize=([^\s]+)', title)
                filesize = filesize_match.group(1) if filesize_match else None

                # Extract views
                views = None
                views_span = cell.find('span', class_='thumb_title_views')
                if views_span:
                    views_match = re.search(r'(\d+)\s+views?', views_span.text)
                    if views_match:
                        views = int(views_match.group(1))

                images.append({
                    'pid': pid,
                    'filename': filename,
                    'thumbnail_url': urljoin(base_url, thumbnail_url),
                    'full_url': full_url,
                    'upload_date': upload_date,
                    'dimensions': dimensions,
                    'filesize': filesize,
                    'uploader': uploader,
                    'views': views,
                    'title': title
                })
            except Exception as e:
                self.log(f"Error parsing thumbnail cell: {e}", "debug")
                continue

        return images

    def _get_total_pages(self, html: str) -> int:
        """
        Extract total number of pages from gallery

        Args:
            html: HTML content

        Returns:
            Number of pages (1 when no pagination info is found)
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # Look for pagination info like "2005 files on 20 page(s)"
            text = soup.get_text()
            match = re.search(r'(\d+)\s+files?\s+on\s+(\d+)\s+page', text)
            if match:
                return int(match.group(2))
        except Exception as e:
            self.log(f"Error extracting page count: {e}", "debug")
        return 1

    def _download_image(self, image_info: Dict, output_dir: Path,
                        gallery_name: str) -> Optional[str]:
        """
        Download a single image

        Args:
            image_info: Image information dict
            output_dir: Output directory
            gallery_name: Name of gallery for database tracking

        Returns:
            Path to downloaded (or already existing) file, or None when the
            image was skipped, was a duplicate, or the download failed
        """
        try:
            url = image_info['full_url']
            pid = image_info['pid']

            # The filename comes from remote HTML. Reduce it to its final path
            # component so a crafted value ("../../x", "/etc/x", "..\\x")
            # cannot write outside output_dir.
            raw_name = image_info.get('filename') or urlparse(url).path.rsplit('/', 1)[-1]
            filename = Path(str(raw_name).replace('\\', '/')).name
            if not filename or filename in ('.', '..'):
                self.log(f"Cannot determine a safe filename for PID {pid}, skipping", "warning")
                return None

            # Check if already downloaded
            if self.use_database and self.db:
                if self.db.is_downloaded(url, platform='coppermine'):
                    self.log(f"Already downloaded (database): {filename} (PID: {pid})", "info")
                    return None

            # Create output directory
            output_dir.mkdir(parents=True, exist_ok=True)

            # Construct output filename
            output_file = output_dir / filename

            # Skip if file exists
            if output_file.exists():
                self.log(f"File already exists: {filename}", "info")
                return str(output_file)

            # Download image
            self.log(f"Downloading: {filename} (PID: {pid})", "info")
            response = self._request_with_retry(url, timeout=30)

            # A challenge or error page served with HTTP 200 must not be
            # stored on disk as if it were the image.
            if 'text/html' in (response.headers.get('Content-Type') or '').lower():
                self.log(f"Server returned HTML instead of an image for {filename} (PID: {pid}), skipping", "warning")
                return None

            # Save via a temporary file so an interrupted write never leaves a
            # truncated image that later runs would skip as "already exists".
            temp_file = output_file.with_name(output_file.name + '.part')
            try:
                with open(temp_file, 'wb') as f:
                    f.write(response.content)
                temp_file.replace(output_file)
            except Exception:
                if temp_file.exists():
                    temp_file.unlink()
                raise

            # Calculate SHA256 file hash from saved file (consistent with other
            # modules); it serves both the duplicate check and the DB record.
            file_hash = None
            unified = getattr(self.db, 'unified_db', None) if self.db else None
            if unified is not None:
                try:
                    file_hash = unified.get_file_hash(str(output_file))
                except Exception as e:
                    self.log(f"Failed to calculate file hash: {e}", "warning")

                # Check for duplicate hash (hash blacklist persists even if original deleted)
                if file_hash:
                    existing = unified.get_download_by_file_hash(file_hash)
                    if existing and existing.get('file_path') and str(output_file) != existing.get('file_path'):
                        # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
                        self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing.get('filename')} from {existing.get('platform')}/{existing.get('source')}", "warning")
                        # Delete the duplicate regardless of whether original file still exists
                        try:
                            output_file.unlink()
                            self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
                        except Exception as e:
                            self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
                        return None

            # Track timestamp for this file
            if image_info.get('upload_date'):
                self.file_timestamps[filename] = image_info['upload_date']

            # Record in database
            self._record_download(
                url=url,
                platform='coppermine',
                source=gallery_name,
                content_type='image',
                filename=filename,
                file_path=str(output_file),
                file_size=len(response.content),
                file_hash=file_hash,
                post_date=image_info.get('upload_date'),
                metadata={
                    'pid': pid,
                    'dimensions': image_info.get('dimensions'),
                    'filesize': image_info.get('filesize')
                },
                deferred=getattr(self, 'defer_database', False)
            )

            self.download_count += 1

            # Randomised delay between downloads (rate limiting)
            time.sleep(random.uniform(self.min_delay, self.max_delay))

            return str(output_file)
        except Exception as e:
            self.log(f"Error downloading {image_info.get('filename', 'unknown')}: {e}", "error")
            return None

    def _download_page_images(self, images: List[Dict], cutoff_date: Optional[datetime],
                              output_path: Path, gallery_name: str) -> tuple:
        """Apply the date filter to one page of images and download the rest.

        Images without an upload date always pass the filter.

        Returns:
            Tuple (found_new_images, skipped_old_images)
        """
        found_new_images = False
        skipped_old_images = 0

        for image_info in images:
            upload_date = image_info.get('upload_date')

            # Apply date filter
            if cutoff_date and upload_date and upload_date < cutoff_date:
                skipped_old_images += 1
                self.log(f"Skipping old image: {image_info['filename']} "
                         f"(uploaded {upload_date.date()})", "debug")
                continue

            # Log image being processed
            upload_date_str = upload_date.strftime('%Y-%m-%d') if upload_date else 'unknown'
            self.log(f"Processing image: {image_info['filename']} (uploaded {upload_date_str})", "info")

            # This image is within date range
            found_new_images = True
            self._download_image(image_info, output_path, gallery_name)

        return (found_new_images, skipped_old_images)

    def download(self, gallery_url: str, output_dir: str,
                 days_back: Optional[int] = None, max_pages: Optional[int] = None,
                 gallery_name: Optional[str] = None, defer_database: bool = False) -> tuple:
        """
        Download images from a Coppermine gallery

        Args:
            gallery_url: URL to the gallery page (e.g., thumbnails.php?album=lastup&cat=123)
            output_dir: Directory to save images
            days_back: Only download images from last N days (None = all)
            max_pages: Maximum number of pages to process (None = all)
            gallery_name: Name for database tracking (extracted from URL if not provided)
            defer_database: If True, don't record to database immediately - store in
                            pending_downloads for later recording after file move is complete

        Returns:
            Tuple of (file_timestamps dict, download_count)
            file_timestamps: Dict mapping filename -> upload_date
        """
        self.defer_database = defer_database  # Store for use in download methods

        # Clear downloaded_files cache between galleries to prevent memory growth
        self.downloaded_files.clear()

        # Reset per-run state up front so an early return never leaves
        # counters from a previous gallery behind.
        self.download_count = 0
        self.file_timestamps = {}  # Track timestamps for each file

        # Check site status before doing anything else
        self.log("Checking Coppermine gallery site status...", "debug")
        site_status, error_msg = self.cf_handler.check_site_status(gallery_url, timeout=10)
        if self.cf_handler.should_skip_download(site_status):
            self.log(f"Skipping download - Coppermine gallery is unavailable: {error_msg}", "warning")
            return ({}, 0)
        elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
            self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

        output_path = Path(output_dir)

        # Extract base URL and gallery name
        parsed_url = urlparse(gallery_url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        if not gallery_name:
            # Extract category from URL
            query_params = parse_qs(parsed_url.query)
            cat = query_params.get('cat', ['unknown'])[0]
            album = query_params.get('album', ['unknown'])[0]
            gallery_name = f"{parsed_url.netloc}_cat{cat}_{album}"

        self.log(f"Starting download from: {gallery_url}", "info")
        self.activity_manager.update_status(f"Checking gallery: {gallery_name}")
        self.log(f"Gallery: {gallery_name}", "info")
        if days_back:
            self.log(f"Filtering: Last {days_back} days", "info")

        # Calculate cutoff date
        cutoff_date = None
        if days_back:
            cutoff_date = datetime.now() - timedelta(days=days_back)

        # Probe with the existing cookies; a successful probe doubles as the
        # fetch of page 1.
        first_response = None
        cookie_count = len(self.session.cookies)

        # Check for short-lived session cookies that may have expired
        if self.cf_handler.cookies_expired():
            self.log("Cookies expired, skipping test and refreshing via FlareSolverr", "info")
        else:
            self.log(f"Testing with {cookie_count} existing cookies...", "info")
            try:
                # Try with existing cookies first (short timeout for fast fail)
                test_response = self.session.get(gallery_url, timeout=5)

                # Check if we got a Cloudflare challenge or error
                challenge = self._detect_cloudflare_challenge(test_response)
                if challenge is None:
                    # Cookies work (or no challenge presented)!
                    first_response = test_response
                    self.log(f"✓ Existing cookies valid ({cookie_count} cookies, skipped FlareSolverr)", "info")
                else:
                    kind, detail = challenge
                    if kind == 'status':
                        self.log(f"Existing cookies failed (HTTP {detail}), need FlareSolverr", "info")
                    elif kind == 'short':
                        self.log(f"Response too short ({detail} bytes), likely Cloudflare challenge", "info")
                    else:
                        self.log("Cloudflare challenge detected in response", "info")
            except Exception as e:
                self.log(f"Test request failed ({type(e).__name__}: {e}), need FlareSolverr", "info")

        # Only call FlareSolverr if existing cookies don't work
        if first_response is None:
            if not self.flaresolverr_enabled:
                self.log("FlareSolverr disabled and cookies invalid", "error")
                return ({}, 0)
            self.log("Calling FlareSolverr to get fresh cookies...", "info")
            if not self._get_cookies_via_flaresolverr(gallery_url):
                self.log("Failed to bypass Cloudflare", "error")
                return ({}, 0)

        # Fetch first page to get total pages (reuse response if cookies were valid)
        try:
            if first_response is None:
                first_response = self._request_with_retry(gallery_url, timeout=30)
            else:
                # The probe only ruled out a challenge; surface other HTTP
                # errors (404, 500, ...) instead of parsing an error page.
                first_response.raise_for_status()
            total_pages = self._get_total_pages(first_response.text)
            if max_pages:
                total_pages = min(total_pages, max_pages)
            self.log(f"Total pages to process: {total_pages}", "info")
        except Exception as e:
            self.log(f"Error fetching gallery: {e}", "error")
            return ({}, 0)

        # Set initial progress so dashboard shows 0/N immediately
        self.activity_manager.update_status(
            "Downloading images",
            progress_current=0,
            progress_total=total_pages
        )

        # Process each page
        for page_num in range(1, total_pages + 1):
            try:
                self.log(f"Processing page {page_num}/{total_pages}...", "info")

                if page_num == 1:
                    # Page 1 is the gallery URL itself and was already fetched above
                    response = first_response
                else:
                    # Construct page URL
                    separator = '&' if '?' in gallery_url else '?'
                    page_url = f"{gallery_url}{separator}page={page_num}"
                    # Fetch page with automatic Cloudflare retry
                    response = self._request_with_retry(page_url, timeout=30)

                # Debug: Check what we received
                self.log(f"Fetched page, status: {response.status_code}, length: {len(response.text)} bytes", "debug")
                if len(response.text) < 10000:
                    self.log(f"WARNING: Response seems too short! First 1000 chars: {response.text[:1000]}", "warning")

                # Parse images
                images = self._parse_gallery_page(response.text, base_url)
                self.log(f"Found {len(images)} images on page {page_num}", "info")

                # Filter by date and download
                found_new_images, skipped_old_images = self._download_page_images(
                    images, cutoff_date, output_path, gallery_name
                )

                # If using date filter and ALL images on this page were too old, stop processing
                # (assumes gallery is sorted newest-first, which is true for album=lastup)
                if cutoff_date and not found_new_images and len(images) > 0:
                    self.log(f"All {skipped_old_images} images on page {page_num} are older than {days_back} days. "
                             f"Stopping pagination (assuming chronological order).", "info")
                    break

                # Update activity status with page progress
                self.activity_manager.update_status(
                    "Downloading images",
                    progress_current=page_num,
                    progress_total=total_pages
                )

                # Rate limiting between pages
                if page_num < total_pages:
                    time.sleep(self.min_delay)
            except Exception as e:
                self.log(f"Error processing page {page_num}: {e}", "error")
                continue

        self.log(f"Download complete! Total: {self.download_count} images", "info")
        return (self.file_timestamps, self.download_count)

    def cleanup(self):
        """Cleanup resources"""
        if self.session:
            self.session.close()