#!/usr/bin/env python3
"""
Coppermine Photo Gallery Downloader Module
Downloads full-resolution images from Coppermine-based galleries
"""

import os
import re
import json
import time
import random
import hashlib
import requests
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Set
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs

from modules.base_module import LoggingMixin
from modules.cloudflare_handler import CloudflareHandler, SiteStatus, get_flaresolverr_user_agent


class CoppermineDownloader(LoggingMixin):
    """
    Coppermine Photo Gallery downloader

    Example usage:
        from coppermine_module import CoppermineDownloader
        downloader = CoppermineDownloader()
        count = downloader.download(
            gallery_url="https://hqdiesel.net/thumbnails.php?album=lastup&cat=123",
            output_dir="downloads/coppermine",
            days_back=7
        )
        print(f"Downloaded {count} items")
    """

    def __init__(self, show_progress=True, use_database=True, log_callback=None,
                 unified_db=None, config=None):
        """
        Initialize the downloader

        Args:
            show_progress: Print progress messages
            use_database: Use database to track downloads
            log_callback: Optional callback function for logging
            unified_db: Optional UnifiedDatabase instance
            config: Optional config dict with flaresolverr settings
        """
        # Initialize logging via mixin
        self._init_logger('Coppermine', log_callback, default_module='Download')

        self.show_progress = show_progress
        self.use_database = use_database
        self.downloaded_files = set()
        self.download_count = 0
        self.unified_db = unified_db  # Store for scraper config access
        self.scraper_id = 'coppermine'  # Scraper ID in database

        # Use unified database if provided
        if unified_db and use_database:
            from modules.unified_database import CoppermineDatabaseAdapter
            self.db = CoppermineDatabaseAdapter(unified_db)
        else:
            self.db = None
            self.use_database = False

        # Initialize activity status manager for real-time updates
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)

        # Rate limiting (seconds between image downloads)
        self.min_delay = 1
        self.max_delay = 3

        self.pending_downloads = []  # Track downloads for deferred database recording
        # FIX: initialize here so _download_image never hits an AttributeError
        # if called before download() (which resets it per run anyway)
        self.file_timestamps = {}  # filename -> upload_date
        self.defer_database = False  # Per-run flag, set by download()

        # Load scraper configuration from database if available
        self.proxy_url = None
        self.cookie_file = None  # Default to None (use database)
        if unified_db:
            scraper_config = unified_db.get_scraper(self.scraper_id)
            if scraper_config:
                # Get proxy configuration
                if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")

        # Fall back to config file for cookie_file if database not available
        if not unified_db and config:
            self.cookie_file = config.get('cookie_file', '/opt/media-downloader/cookies/coppermine_cookies.json')

        # Session with proper headers
        self.session = requests.Session()
        self.user_agent = get_flaresolverr_user_agent()
        self.session.headers.update({
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

        # Configure session proxy if available
        if self.proxy_url:
            self.session.proxies = {
                'http': self.proxy_url,
                'https': self.proxy_url
            }

        # Initialize universal Cloudflare handler with conservative expiry
        # Pass proxy_url if configured, and cookie_file=None for database storage
        self.cf_handler = CloudflareHandler(
            module_name="Coppermine",
            cookie_file=self.cookie_file,  # None when using database
            user_agent=self.user_agent,
            logger=self.logger,
            aggressive_expiry=False,  # Conservative mode for Coppermine
            proxy_url=self.proxy_url  # Pass proxy to FlareSolverr
        )

        # Keep for backwards compatibility
        self.flaresolverr_url = self.cf_handler.flaresolverr_url
        self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

        # Load cookies from file if exists
        self._load_cookies()

    def _record_download(self, url: str, platform: str, source: str, content_type: str,
                         filename: str, file_path: str, file_size: int, file_hash: str,
                         post_date=None, metadata: dict = None, deferred: bool = False):
        """Record a download in the database

        Args:
            deferred: If True, don't record to database now - add to pending_downloads
                      list for later recording after file move is complete

        Returns:
            True when the record was stored (deferred or immediate), None when
            database tracking is disabled or the insert failed.
        """
        # If deferred, store for later recording instead of recording now
        if deferred:
            self.pending_downloads.append({
                'url': url,
                'platform': platform,
                'source': source,
                'content_type': content_type,
                'filename': filename,
                'file_path': file_path,
                'file_size': file_size,
                'file_hash': file_hash,
                'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
                'metadata': metadata
            })
            # FIX: log the actual filename (message previously contained a
            # corrupted "(unknown)" placeholder)
            self.log(f"Deferred recording for {filename}", "debug")
            return True

        if not self.use_database or not self.db:
            return

        try:
            self.db.add_download(
                url=url,
                platform=platform,
                source=source,
                content_type=content_type,
                filename=filename,
                file_path=file_path,
                file_size=file_size,
                file_hash=file_hash,
                post_date=post_date,
                metadata=metadata
            )
            return True
        except Exception as e:
            self.log(f"Failed to record download: {e}", "debug")

    def get_pending_downloads(self):
        """Get list of downloads that were deferred for later recording"""
        return self.pending_downloads.copy()

    def clear_pending_downloads(self):
        """Clear the pending downloads list after they've been recorded"""
        self.pending_downloads = []

    def _load_cookies(self):
        """Load cookies from database or file

        Prefers the unified database; falls back to the JSON cookie file.
        Cookies are installed into ``self.session`` and, when loaded from the
        database, mirrored into the CloudflareHandler for consistency.
        """
        # Try database first if available
        if self.unified_db:
            try:
                cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
                if cookies:
                    cf_clearance_found = False
                    for cookie in cookies:
                        try:
                            self.session.cookies.set(
                                cookie['name'],
                                cookie['value'],
                                domain=cookie.get('domain', ''),
                                path=cookie.get('path', '/')
                            )
                            if cookie['name'] == 'cf_clearance':
                                cf_clearance_found = True
                        except Exception as e:
                            self.log(f"Error setting cookie {cookie.get('name')}: {e}", "warning")
                    if cf_clearance_found:
                        self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from database", "info")
                    else:
                        self.log(f"⚠ Loaded {len(cookies)} cookies from database but cf_clearance NOT found", "warning")
                    # Also load cookies into CloudflareHandler for consistency
                    self.cf_handler._cookies = cookies
                    return
                else:
                    self.log("No cookies found in database", "debug")
            except Exception as e:
                self.log(f"Error loading cookies from database: {e}", "warning")

        # Fall back to cookie file if no database
        if not self.cookie_file:
            self.log("No cookie file configured", "debug")
            return

        cookie_path = Path(self.cookie_file)
        if not cookie_path.exists():
            self.log(f"Cookie file does not exist: {self.cookie_file}", "info")
            return

        try:
            with open(cookie_path, 'r') as f:
                data = json.load(f)

            # Handle both old format (list) and new format (dict with 'cookies' and 'timestamp')
            if isinstance(data, dict) and 'cookies' in data:
                cookies = data['cookies']
            elif isinstance(data, list):
                cookies = data
            else:
                self.log(f"Invalid cookie file format", "warning")
                return

            # Count critical cookies
            cf_clearance_found = False
            for cookie in cookies:
                try:
                    # Set cookie with basic attributes (requests.Session compatible)
                    self.session.cookies.set(
                        cookie['name'],
                        cookie['value'],
                        domain=cookie.get('domain', ''),
                        path=cookie.get('path', '/')
                    )
                    if cookie['name'] == 'cf_clearance':
                        cf_clearance_found = True
                except Exception as e:
                    self.log(f"Error setting cookie {cookie.get('name')}: {e}", "warning")

            if cf_clearance_found:
                self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from {self.cookie_file}", "info")
            else:
                self.log(f"⚠ Loaded {len(cookies)} cookies but cf_clearance NOT found", "warning")

        except Exception as e:
            self.log(f"Error loading cookies: {e}", "warning")

    def _cookies_expired(self):
        """Check if cookies are expired - delegates to CloudflareHandler"""
        return self.cf_handler.cookies_expired()

    def _save_cookies(self, cookies: list, user_agent: str = None):
        """Save cookies to database or file with timestamp

        Args:
            cookies: List of cookie dictionaries
            user_agent: User agent to associate with cookies (important for
                        cf_clearance). If not provided, uses self.user_agent
                        as fallback.
        """
        # Use provided user_agent or fall back to self.user_agent
        ua = user_agent or self.user_agent

        # Try database first if available
        if self.unified_db:
            try:
                self.unified_db.save_scraper_cookies(
                    self.scraper_id,
                    cookies,
                    user_agent=ua,
                    merge=True  # Merge with existing cookies
                )
                self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50] if ua else 'None'}...)", "debug")
                return
            except Exception as e:
                self.log(f"Error saving cookies to database: {e}", "warning")

        # Fall back to file
        if not self.cookie_file:
            return

        try:
            cookie_path = Path(self.cookie_file)
            cookie_path.parent.mkdir(parents=True, exist_ok=True)
            storage_data = {
                'cookies': cookies,
                'timestamp': datetime.now().isoformat()
            }
            with open(cookie_path, 'w') as f:
                json.dump(storage_data, f, indent=2)
            self.log(f"Saved {len(cookies)} cookies to {self.cookie_file}", "debug")
        except Exception as e:
            self.log(f"Error saving cookies: {e}", "warning")

    def _get_cookies_via_flaresolverr(self, url: str, max_retries: int = 2) -> bool:
        """Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler

        Args:
            url: URL to fetch
            max_retries: Maximum number of retry attempts (default: 2)

        Returns:
            True if cookies obtained successfully, False otherwise
        """
        # Delegate to CloudflareHandler
        success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)

        # If successful, also load cookies into the session and save to database
        if success:
            cookies_dict = self.cf_handler.get_cookies_dict()
            # FIX: parse the URL once instead of once per cookie (urlparse is
            # already imported at module level)
            domain = urlparse(url).netloc
            for name, value in cookies_dict.items():
                self.session.cookies.set(name, value, domain=domain, path='/')

            # Save cookies to database (the handler already saved to file if configured)
            if self.unified_db:
                cookies_list = self.cf_handler.get_cookies_list()
                if cookies_list:
                    # CRITICAL: Get the user_agent from FlareSolverr solution, not self.user_agent
                    # cf_clearance cookies are fingerprinted to the browser that solved the challenge
                    flaresolverr_ua = self.cf_handler.get_user_agent()
                    self._save_cookies(cookies_list, user_agent=flaresolverr_ua)

        return success

    def _request_with_retry(self, url: str, timeout: int = 30, max_attempts: int = 2):
        """Make HTTP request with automatic Cloudflare challenge retry

        Args:
            url: URL to fetch
            timeout: Request timeout in seconds
            max_attempts: Maximum number of attempts (default: 2)

        Returns:
            requests.Response object

        Raises:
            Exception if all retry attempts fail
        """
        last_error = None
        for attempt in range(1, max_attempts + 1):
            try:
                response = self.session.get(url, timeout=timeout)

                # Detect Cloudflare challenges (status code, suspiciously short
                # body, or challenge markers in the first 500 chars of HTML)
                is_cloudflare = False
                if response.status_code in [403, 503]:
                    is_cloudflare = True
                    self.log(f"Cloudflare challenge detected (HTTP {response.status_code})", "warning")
                elif len(response.text) < 1000:
                    is_cloudflare = True
                    self.log(f"Cloudflare challenge detected (short response: {len(response.text)} bytes)", "warning")
                elif 'challenge' in response.text.lower()[:500]:
                    is_cloudflare = True
                    self.log("Cloudflare challenge detected in HTML", "warning")

                # If Cloudflare detected and we have retry attempts left
                if is_cloudflare and attempt < max_attempts:
                    if self.flaresolverr_enabled:
                        self.log(f"Attempt {attempt}/{max_attempts}: Refreshing cookies via FlareSolverr...", "info")
                        if self._get_cookies_via_flaresolverr(url):
                            self.log("Cookies refreshed, retrying request...", "info")
                            continue  # Retry the request
                        else:
                            raise Exception("Failed to refresh cookies via FlareSolverr")
                    else:
                        raise Exception("Cloudflare challenge detected but FlareSolverr is disabled")

                # No Cloudflare challenge or final attempt - check status and return
                response.raise_for_status()
                return response

            except Exception as e:
                last_error = e
                if attempt < max_attempts:
                    self.log(f"Attempt {attempt}/{max_attempts} failed: {e}", "warning")
                else:
                    self.log(f"All {max_attempts} attempts failed", "error")

        # All attempts failed
        raise last_error

    def _parse_date(self, date_str: str) -> Optional[datetime]:
        """
        Parse Coppermine date format: 'Date added=Sep 29, 2025'

        Args:
            date_str: Date string from Coppermine

        Returns:
            datetime object or None
        """
        try:
            # Extract date from "Date added=Sep 29, 2025" format
            match = re.search(r'Date added=([A-Za-z]+ \d+, \d{4})', date_str)
            if match:
                date_part = match.group(1)
                return datetime.strptime(date_part, '%b %d, %Y')
        except Exception as e:
            self.log(f"Error parsing date '{date_str}': {e}", "debug")
        return None

    def _extract_full_image_url(self, base_url: str, thumbnail_url: str) -> str:
        """
        Convert thumbnail URL to full-resolution URL

        Pattern:
            Thumbnail: albums/userpics/1052219/thumb_1000523798.jpg
            Normal:    albums/userpics/1052219/normal_1000523798.jpg
            Full:      albums/userpics/1052219/1000523798.jpg

        Args:
            base_url: Base URL of the gallery (e.g., https://hqdiesel.net)
            thumbnail_url: Relative thumbnail URL

        Returns:
            Full-resolution image URL
        """
        # Remove thumb_ or normal_ prefix
        full_path = re.sub(r'/(thumb_|normal_)', '/', thumbnail_url)
        return urljoin(base_url, full_path)

    def _parse_gallery_page(self, html: str, base_url: str) -> List[Dict]:
        """
        Parse a Coppermine gallery page to extract image information

        Args:
            html: HTML content of the page
            base_url: Base URL of the gallery

        Returns:
            List of dicts with image info (pid, filename, thumbnail_url,
            full_url, upload_date, dimensions, filesize, uploader, views,
            title). ``filename``/``upload_date`` may be None when the
            metadata could not be extracted.
        """
        soup = BeautifulSoup(html, 'html.parser')
        images = []

        # Find all thumbnail cells
        thumbnail_cells = soup.find_all('td', class_='thumbnails')
        self.log(f"Found {len(thumbnail_cells)} thumbnail cells on page", "debug")

        for cell in thumbnail_cells:
            try:
                # Find image link
                link = cell.find('a', href=re.compile(r'displayimage\.php'))
                if not link:
                    continue

                # Extract PID from URL
                href = link.get('href', '')
                parsed = parse_qs(urlparse(href).query)
                pid = parsed.get('pid', [None])[0]
                if not pid:
                    continue

                # Find thumbnail image
                img = link.find('img')
                if not img:
                    continue

                thumbnail_url = img.get('src', '')
                if not thumbnail_url:
                    continue

                # Get image title (contains metadata)
                title = img.get('title', '')

                # Extract filename
                filename_match = re.search(r'Filename=([^\s]+)', title)
                filename = filename_match.group(1) if filename_match else None

                # Extract date from dedicated span (more reliable)
                upload_date = None
                date_span = cell.find('span', class_='thumb_caption_ctime')
                if date_span and date_span.text.strip():
                    try:
                        upload_date = datetime.strptime(date_span.text.strip(), '%b %d, %Y')
                    except Exception:
                        # Fallback to title parsing
                        upload_date = self._parse_date(title)
                else:
                    upload_date = self._parse_date(title)

                # Extract uploader
                uploader = None
                uploader_link = cell.find('a', href=re.compile(r'profile\.php'))
                if uploader_link:
                    uploader = uploader_link.text.strip()

                # Extract dimensions
                dimensions_match = re.search(r'Dimensions=(\d+x\d+)', title)
                dimensions = dimensions_match.group(1) if dimensions_match else None

                # Extract filesize
                filesize_match = re.search(r'Filesize=([^\s]+)', title)
                filesize = filesize_match.group(1) if filesize_match else None

                # Extract views
                views = None
                views_span = cell.find('span', class_='thumb_title_views')
                if views_span:
                    views_match = re.search(r'(\d+)\s+views?', views_span.text)
                    if views_match:
                        views = int(views_match.group(1))

                # Construct full-resolution URL
                full_url = self._extract_full_image_url(base_url, thumbnail_url)

                images.append({
                    'pid': pid,
                    'filename': filename,
                    'thumbnail_url': urljoin(base_url, thumbnail_url),
                    'full_url': full_url,
                    'upload_date': upload_date,
                    'dimensions': dimensions,
                    'filesize': filesize,
                    'uploader': uploader,
                    'views': views,
                    'title': title
                })

            except Exception as e:
                self.log(f"Error parsing thumbnail cell: {e}", "debug")
                continue

        return images

    def _get_total_pages(self, html: str) -> int:
        """
        Extract total number of pages from gallery

        Args:
            html: HTML content

        Returns:
            Number of pages (1 when pagination info cannot be found)
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # Look for pagination info like "2005 files on 20 page(s)"
            text = soup.get_text()
            match = re.search(r'(\d+)\s+files?\s+on\s+(\d+)\s+page', text)
            if match:
                return int(match.group(2))
        except Exception as e:
            self.log(f"Error extracting page count: {e}", "debug")
        return 1

    def _download_image(self, image_info: Dict, output_dir: Path, gallery_name: str) -> Optional[str]:
        """
        Download a single image

        Args:
            image_info: Image information dict
            output_dir: Output directory
            gallery_name: Name of gallery for database tracking

        Returns:
            Path to downloaded file or None
        """
        try:
            url = image_info['full_url']
            pid = image_info['pid']
            filename = image_info['filename']
            # FIX: filename can be None when the thumbnail title lacked a
            # Filename= field; fall back to the URL basename so the path join
            # below cannot raise TypeError
            if not filename:
                filename = os.path.basename(urlparse(url).path) or f"coppermine_{pid}.jpg"

            # Check if already downloaded
            if self.use_database and self.db:
                if self.db.is_downloaded(url, platform='coppermine'):
                    self.log(f"Already downloaded (database): {filename} (PID: {pid})", "info")
                    return None

            # Create output directory
            output_dir.mkdir(parents=True, exist_ok=True)

            # Construct output filename
            output_file = output_dir / filename

            # Skip if file exists
            if output_file.exists():
                self.log(f"File already exists: {output_file}", "info")
                return str(output_file)

            # Download image
            self.log(f"Downloading: {filename} (PID: {pid})", "info")
            response = self._request_with_retry(url, timeout=30)

            # Save image
            with open(output_file, 'wb') as f:
                f.write(response.content)

            # Check for duplicate hash before recording
            if self.db and hasattr(self.db, 'unified_db'):
                # Check for duplicate hash (hash blacklist persists even if original deleted)
                file_hash_check = self.db.unified_db.get_file_hash(str(output_file))
                if file_hash_check:
                    existing = self.db.unified_db.get_download_by_file_hash(file_hash_check)
                    if existing and existing.get('file_path') and str(output_file) != existing.get('file_path'):
                        # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
                        self.log(f"⚠ Duplicate content detected (hash match): {filename} matches "
                                 f"{existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                        # Delete the duplicate regardless of whether original file still exists
                        try:
                            output_file.unlink()
                            self.log(f"Deleted duplicate (hash blacklist): {output_file}", "debug")
                            return
                        except Exception as e:
                            self.log(f"Failed to delete duplicate {output_file}: {e}", "warning")
                            return

            # Calculate SHA256 file hash from saved file (consistent with other modules)
            file_hash = None
            if self.db and hasattr(self.db, 'unified_db'):
                try:
                    file_hash = self.db.unified_db.get_file_hash(str(output_file))
                except Exception as e:
                    self.log(f"Failed to calculate file hash: {e}", "warning")

            # Track timestamp for this file
            if image_info.get('upload_date'):
                self.file_timestamps[filename] = image_info['upload_date']

            # Record in database
            self._record_download(
                url=url,
                platform='coppermine',
                source=gallery_name,
                content_type='image',
                filename=filename,
                file_path=str(output_file),
                file_size=len(response.content),
                file_hash=file_hash,
                post_date=image_info.get('upload_date'),
                metadata={
                    'pid': pid,
                    'dimensions': image_info.get('dimensions'),
                    'filesize': image_info.get('filesize')
                },
                deferred=getattr(self, 'defer_database', False)
            )

            self.download_count += 1
            # Random polite delay between downloads
            # (FIX: replaced inline __import__('random') hack with the
            # module-level import; uniform(a, b) == a + (b - a) * random())
            time.sleep(random.uniform(self.min_delay, self.max_delay))

            return str(output_file)

        except Exception as e:
            self.log(f"Error downloading {image_info.get('filename', 'unknown')}: {e}", "error")
            return None

    def download(self, gallery_url: str, output_dir: str, days_back: Optional[int] = None,
                 max_pages: Optional[int] = None, gallery_name: Optional[str] = None,
                 defer_database: bool = False) -> tuple:
        """
        Download images from a Coppermine gallery

        Args:
            gallery_url: URL to the gallery page (e.g., thumbnails.php?album=lastup&cat=123)
            output_dir: Directory to save images
            days_back: Only download images from last N days (None = all)
            max_pages: Maximum number of pages to process (None = all)
            gallery_name: Name for database tracking (extracted from URL if not provided)
            defer_database: If True, don't record to database immediately - store in
                            pending_downloads for later recording after file move is complete

        Returns:
            Tuple of (file_timestamps dict, download_count)
            file_timestamps: Dict mapping filename -> upload_date
        """
        self.defer_database = defer_database  # Store for use in download methods

        # Clear downloaded_files cache between galleries to prevent memory growth
        self.downloaded_files.clear()

        # Check site status before doing anything else
        self.log("Checking Coppermine gallery site status...", "debug")
        site_status, error_msg = self.cf_handler.check_site_status(gallery_url, timeout=10)
        if self.cf_handler.should_skip_download(site_status):
            self.log(f"Skipping download - Coppermine gallery is unavailable: {error_msg}", "warning")
            return ({}, 0)
        elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
            self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

        self.download_count = 0
        self.file_timestamps = {}  # Track timestamps for each file
        output_path = Path(output_dir)

        # Extract base URL and gallery name
        parsed_url = urlparse(gallery_url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

        if not gallery_name:
            # Extract category from URL
            query_params = parse_qs(parsed_url.query)
            cat = query_params.get('cat', ['unknown'])[0]
            album = query_params.get('album', ['unknown'])[0]
            gallery_name = f"{parsed_url.netloc}_cat{cat}_{album}"

        self.log(f"Starting download from: {gallery_url}", "info")
        self.activity_manager.update_status(f"Checking gallery: {gallery_name}")
        self.log(f"Gallery: {gallery_name}", "info")
        if days_back:
            self.log(f"Filtering: Last {days_back} days", "info")

        # Calculate cutoff date
        cutoff_date = None
        if days_back:
            cutoff_date = datetime.now() - timedelta(days=days_back)

        # Check if cookies have expired before testing
        cookies_valid = False
        cookie_count = len(self.session.cookies)

        # Check for short-lived session cookies that may have expired
        if self.cf_handler.cookies_expired():
            self.log(f"Cookies expired, skipping test and refreshing via FlareSolverr", "info")
        else:
            self.log(f"Testing with {cookie_count} existing cookies...", "info")
            try:
                # Try with existing cookies first (short timeout for fast fail)
                test_response = self.session.get(gallery_url, timeout=5)
                # Check if we got a Cloudflare challenge or error
                if test_response.status_code == 403 or test_response.status_code == 503:
                    self.log(f"Existing cookies failed (HTTP {test_response.status_code}), need FlareSolverr", "info")
                elif len(test_response.text) < 1000:
                    self.log(f"Response too short ({len(test_response.text)} bytes), likely Cloudflare challenge", "info")
                elif 'challenge' in test_response.text.lower()[:500]:
                    self.log("Cloudflare challenge detected in response", "info")
                else:
                    # Cookies work (or no challenge presented)!
                    cookies_valid = True
                    self.log(f"✓ Existing cookies valid ({cookie_count} cookies, skipped FlareSolverr)", "info")
                    response = test_response
            except Exception as e:
                self.log(f"Test request failed ({type(e).__name__}: {e}), need FlareSolverr", "info")

        # Only call FlareSolverr if existing cookies don't work
        if not cookies_valid:
            if self.flaresolverr_enabled:
                self.log("Calling FlareSolverr to get fresh cookies...", "info")
                if not self._get_cookies_via_flaresolverr(gallery_url):
                    self.log("Failed to bypass Cloudflare", "error")
                    return ({}, 0)
            else:
                self.log("FlareSolverr disabled and cookies invalid", "error")
                return ({}, 0)

        # Fetch first page to get total pages (reuse response if cookies were valid)
        try:
            if not cookies_valid:
                response = self._request_with_retry(gallery_url, timeout=30)
            total_pages = self._get_total_pages(response.text)
            if max_pages:
                total_pages = min(total_pages, max_pages)
            self.log(f"Total pages to process: {total_pages}", "info")
        except Exception as e:
            self.log(f"Error fetching gallery: {e}", "error")
            return ({}, 0)

        # Set initial progress so dashboard shows 0/N immediately
        self.activity_manager.update_status(
            "Downloading images",
            progress_current=0,
            progress_total=total_pages
        )

        # Process each page
        for page_num in range(1, total_pages + 1):
            try:
                # Construct page URL
                if page_num == 1:
                    page_url = gallery_url
                else:
                    separator = '&' if '?' in gallery_url else '?'
                    page_url = f"{gallery_url}{separator}page={page_num}"

                self.log(f"Processing page {page_num}/{total_pages}...", "info")

                # Fetch page with automatic Cloudflare retry
                response = self._request_with_retry(page_url, timeout=30)

                # Debug: Check what we received
                self.log(f"Fetched page, status: {response.status_code}, length: {len(response.text)} bytes", "debug")
                if len(response.text) < 10000:
                    self.log(f"WARNING: Response seems too short! First 1000 chars: {response.text[:1000]}", "warning")

                # Parse images
                images = self._parse_gallery_page(response.text, base_url)
                self.log(f"Found {len(images)} images on page {page_num}", "info")

                # Track if we found any new images on this page
                found_new_images = False
                skipped_old_images = 0

                # Filter by date and download
                for image_info in images:
                    # Apply date filter
                    if cutoff_date and image_info.get('upload_date'):
                        if image_info['upload_date'] < cutoff_date:
                            skipped_old_images += 1
                            self.log(f"Skipping old image: {image_info['filename']} "
                                     f"(uploaded {image_info['upload_date'].date()})", "debug")
                            continue

                    # Log image being processed
                    upload_date_str = image_info.get('upload_date').strftime('%Y-%m-%d') if image_info.get('upload_date') else 'unknown'
                    self.log(f"Processing image: {image_info['filename']} (uploaded {upload_date_str})", "info")

                    # This image is within date range
                    found_new_images = True

                    # Download image
                    self._download_image(image_info, output_path, gallery_name)

                # If using date filter and ALL images on this page were too old, stop processing
                # (assumes gallery is sorted newest-first, which is true for album=lastup)
                if cutoff_date and not found_new_images and len(images) > 0:
                    self.log(f"All {skipped_old_images} images on page {page_num} are older than {days_back} days. "
                             f"Stopping pagination (assuming chronological order).", "info")
                    break

                # Update activity status with page progress
                self.activity_manager.update_status(
                    "Downloading images",
                    progress_current=page_num,
                    progress_total=total_pages
                )

                # Rate limiting between pages
                if page_num < total_pages:
                    time.sleep(self.min_delay)

            except Exception as e:
                self.log(f"Error processing page {page_num}: {e}", "error")
                continue

        self.log(f"Download complete! Total: {self.download_count} images", "info")
        return (self.file_timestamps, self.download_count)

    def cleanup(self):
        """Cleanup resources"""
        if self.session:
            self.session.close()