media-downloader/modules/coppermine_module.py

#!/usr/bin/env python3
"""
Coppermine Photo Gallery Downloader Module
Downloads full-resolution images from Coppermine-based galleries
"""

import os
import re
import time
import hashlib
import requests
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Set
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs
from modules.base_module import LoggingMixin
from modules.cloudflare_handler import CloudflareHandler, SiteStatus, get_flaresolverr_user_agent


class CoppermineDownloader(LoggingMixin):
    """
    Coppermine Photo Gallery downloader

    Example usage:
        from coppermine_module import CoppermineDownloader

        downloader = CoppermineDownloader()
        _file_timestamps, count = downloader.download(
            gallery_url="https://hqdiesel.net/thumbnails.php?album=lastup&cat=123",
            output_dir="downloads/coppermine",
            days_back=7
        )
        print(f"Downloaded {count} items")
    """

    def __init__(self, show_progress=True, use_database=True,
                 log_callback=None, unified_db=None, config=None):
        """
        Initialize the downloader

        Args:
            show_progress: Print progress messages
            use_database: Use database to track downloads (only honoured when
                          unified_db is also provided)
            log_callback: Optional callback function for logging
            unified_db: Optional UnifiedDatabase instance; also the source of
                        proxy settings and stored cookies
            config: Optional config dict; only 'cookie_file' is read here, and
                    only when no unified_db is given
        """
        # Initialize logging via mixin
        self._init_logger('Coppermine', log_callback, default_module='Download')

        self.show_progress = show_progress
        self.use_database = use_database
        self.downloaded_files = set()
        self.download_count = 0
        self.unified_db = unified_db  # Store for scraper config access
        self.scraper_id = 'coppermine'  # Scraper ID in database

        # Per-run state that download() resets on every call. Defined here as
        # well so that helpers such as _download_image() do not hit an
        # AttributeError when used before the first download() call.
        self.file_timestamps = {}
        self.defer_database = False

        # Use unified database if provided
        if unified_db and use_database:
            from modules.unified_database import CoppermineDatabaseAdapter
            self.db = CoppermineDatabaseAdapter(unified_db)
        else:
            self.db = None
            self.use_database = False

        # Initialize activity status manager for real-time updates
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)

        # Rate limiting (seconds; a random delay in this range follows each download)
        self.min_delay = 1
        self.max_delay = 3

        self.pending_downloads = []  # Track downloads for deferred database recording

        # Load scraper configuration from database if available
        self.proxy_url = None
        self.cookie_file = None  # Default to None (use database)

        if unified_db:
            scraper_config = unified_db.get_scraper(self.scraper_id)
            if scraper_config:
                # Get proxy configuration
                if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")

        # Fall back to config file for cookie_file if database not available
        if not unified_db and config:
            self.cookie_file = config.get('cookie_file', '/opt/media-downloader/cookies/coppermine_cookies.json')

        # Session with proper headers
        self.session = requests.Session()
        self.user_agent = get_flaresolverr_user_agent()
        self.session.headers.update({
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

        # Configure session proxy if available
        if self.proxy_url:
            self.session.proxies = {
                'http': self.proxy_url,
                'https': self.proxy_url
            }

        # Initialize universal Cloudflare handler with conservative expiry
        # Pass proxy_url if configured, and cookie_file=None for database storage
        self.cf_handler = CloudflareHandler(
            module_name="Coppermine",
            cookie_file=self.cookie_file,  # None when using database
            user_agent=self.user_agent,
            logger=self.logger,
            aggressive_expiry=False,  # Conservative mode for Coppermine
            proxy_url=self.proxy_url  # Pass proxy to FlareSolverr
        )

        # Keep for backwards compatibility
        self.flaresolverr_url = self.cf_handler.flaresolverr_url
        self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

        # Load cookies from database or file into the session
        self._load_cookies()

    def _record_download(self, url: str, platform: str, source: str, content_type: str,
                        filename: str, file_path: str, file_size: int, file_hash: str,
                        post_date=None, metadata: dict = None, deferred: bool = False):
        """Record a download in the database

        Args:
            deferred: If True, don't record to database now - add to pending_downloads list
                     for later recording after file move is complete
        """
        # If deferred, store for later recording instead of recording now
        if deferred:
            self.pending_downloads.append({
                'url': url,
                'platform': platform,
                'source': source,
                'content_type': content_type,
                'filename': filename,
                'file_path': file_path,
                'file_size': file_size,
                'file_hash': file_hash,
                'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
                'metadata': metadata
            })
            self.log(f"Deferred recording for {filename}", "debug")
            return True

        if not self.use_database or not self.db:
            return

        try:
            self.db.add_download(
                url=url,
                platform=platform,
                source=source,
                content_type=content_type,
                filename=filename,
                file_path=file_path,
                file_size=file_size,
                file_hash=file_hash,
                post_date=post_date,
                metadata=metadata
            )
        except Exception as e:
            self.log(f"Failed to record download: {e}", "debug")

    def get_pending_downloads(self):
        """Get list of downloads that were deferred for later recording"""
        return self.pending_downloads.copy()

    def clear_pending_downloads(self):
        """Clear the pending downloads list after they've been recorded"""
        self.pending_downloads = []

    def _load_cookies(self):
        """Load cookies from database or file"""
        # Try database first if available
        if self.unified_db:
            try:
                cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
                if cookies:
                    cf_clearance_found = False
                    for cookie in cookies:
                        try:
                            self.session.cookies.set(
                                cookie['name'],
                                cookie['value'],
                                domain=cookie.get('domain', ''),
                                path=cookie.get('path', '/')
                            )
                            if cookie['name'] == 'cf_clearance':
                                cf_clearance_found = True
                        except Exception as e:
                            self.log(f"Error setting cookie {cookie.get('name')}: {e}", "warning")

                    if cf_clearance_found:
                        self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from database", "info")
                    else:
                        self.log(f"⚠ Loaded {len(cookies)} cookies from database but cf_clearance NOT found", "warning")

                    # Also load cookies into CloudflareHandler for consistency
                    self.cf_handler._cookies = cookies
                    return
                else:
                    self.log("No cookies found in database", "debug")
            except Exception as e:
                self.log(f"Error loading cookies from database: {e}", "warning")

        # Fall back to cookie file if no database
        if not self.cookie_file:
            self.log("No cookie file configured", "debug")
            return

        cookie_path = Path(self.cookie_file)
        if not cookie_path.exists():
            self.log(f"Cookie file does not exist: {self.cookie_file}", "info")
            return

        try:
            import json
            with open(cookie_path, 'r') as f:
                data = json.load(f)

            # Handle both old format (list) and new format (dict with 'cookies' and 'timestamp')
            if isinstance(data, dict) and 'cookies' in data:
                cookies = data['cookies']
            elif isinstance(data, list):
                cookies = data
            else:
                self.log(f"Invalid cookie file format", "warning")
                return

            # Count critical cookies
            cf_clearance_found = False
            for cookie in cookies:
                try:
                    # Set cookie with basic attributes (requests.Session compatible)
                    self.session.cookies.set(
                        cookie['name'],
                        cookie['value'],
                        domain=cookie.get('domain', ''),
                        path=cookie.get('path', '/')
                    )
                    if cookie['name'] == 'cf_clearance':
                        cf_clearance_found = True
                except Exception as e:
                    self.log(f"Error setting cookie {cookie.get('name')}: {e}", "warning")

            if cf_clearance_found:
                self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from {self.cookie_file}", "info")
            else:
                self.log(f"⚠ Loaded {len(cookies)} cookies but cf_clearance NOT found", "warning")

        except Exception as e:
            self.log(f"Error loading cookies: {e}", "warning")

    def _cookies_expired(self):
        """Check if cookies are expired - delegates to CloudflareHandler"""
        return self.cf_handler.cookies_expired()

    def _save_cookies(self, cookies: list, user_agent: str = None):
        """Save cookies to database or file with timestamp

        Args:
            cookies: List of cookie dictionaries
            user_agent: User agent to associate with cookies (important for cf_clearance).
                       If not provided, uses self.user_agent as fallback.
        """
        # Use provided user_agent or fall back to self.user_agent
        ua = user_agent or self.user_agent

        # Try database first if available
        if self.unified_db:
            try:
                self.unified_db.save_scraper_cookies(
                    self.scraper_id,
                    cookies,
                    user_agent=ua,
                    merge=True  # Merge with existing cookies
                )
                self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50] if ua else 'None'}...)", "debug")
                return
            except Exception as e:
                self.log(f"Error saving cookies to database: {e}", "warning")

        # Fall back to file
        if not self.cookie_file:
            return

        try:
            import json
            from datetime import datetime
            cookie_path = Path(self.cookie_file)
            cookie_path.parent.mkdir(parents=True, exist_ok=True)

            storage_data = {
                'cookies': cookies,
                'timestamp': datetime.now().isoformat()
            }

            with open(cookie_path, 'w') as f:
                json.dump(storage_data, f, indent=2)
            self.log(f"Saved {len(cookies)} cookies to {self.cookie_file}", "debug")
        except Exception as e:
            self.log(f"Error saving cookies: {e}", "warning")

    def _get_cookies_via_flaresolverr(self, url: str, max_retries: int = 2) -> bool:
        """Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler

        Args:
            url: URL to fetch
            max_retries: Maximum number of retry attempts (default: 2)

        Returns:
            True if cookies obtained successfully, False otherwise
        """
        # Delegate to CloudflareHandler
        success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)

        # If successful, also load cookies into the session and save to database
        if success:
            cookies_dict = self.cf_handler.get_cookies_dict()
            for name, value in cookies_dict.items():
                # Extract domain from URL
                from urllib.parse import urlparse
                parsed = urlparse(url)
                domain = parsed.netloc
                self.session.cookies.set(name, value, domain=domain, path='/')

            # Save cookies to database (the handler already saved to file if configured)
            if self.unified_db:
                cookies_list = self.cf_handler.get_cookies_list()
                if cookies_list:
                    # CRITICAL: Get the user_agent from FlareSolverr solution, not self.user_agent
                    # cf_clearance cookies are fingerprinted to the browser that solved the challenge
                    flaresolverr_ua = self.cf_handler.get_user_agent()
                    self._save_cookies(cookies_list, user_agent=flaresolverr_ua)

        return success

    def _request_with_retry(self, url: str, timeout: int = 30, max_attempts: int = 2):
        """Make HTTP request with automatic Cloudflare challenge retry

        Args:
            url: URL to fetch
            timeout: Request timeout in seconds
            max_attempts: Maximum number of attempts (default: 2)

        Returns:
            requests.Response object

        Raises:
            Exception if all retry attempts fail
        """
        last_error = None

        for attempt in range(1, max_attempts + 1):
            try:
                response = self.session.get(url, timeout=timeout)

                # Detect Cloudflare challenges
                is_cloudflare = False
                if response.status_code in [403, 503]:
                    is_cloudflare = True
                    self.log(f"Cloudflare challenge detected (HTTP {response.status_code})", "warning")
                elif len(response.text) < 1000:
                    is_cloudflare = True
                    self.log(f"Cloudflare challenge detected (short response: {len(response.text)} bytes)", "warning")
                elif 'challenge' in response.text.lower()[:500]:
                    is_cloudflare = True
                    self.log("Cloudflare challenge detected in HTML", "warning")

                # If Cloudflare detected and we have retry attempts left
                if is_cloudflare and attempt < max_attempts:
                    if self.flaresolverr_enabled:
                        self.log(f"Attempt {attempt}/{max_attempts}: Refreshing cookies via FlareSolverr...", "info")
                        if self._get_cookies_via_flaresolverr(url):
                            self.log("Cookies refreshed, retrying request...", "info")
                            continue  # Retry the request
                        else:
                            raise Exception("Failed to refresh cookies via FlareSolverr")
                    else:
                        raise Exception("Cloudflare challenge detected but FlareSolverr is disabled")

                # No Cloudflare challenge or final attempt - check status and return
                response.raise_for_status()
                return response

            except Exception as e:
                last_error = e
                if attempt < max_attempts:
                    self.log(f"Attempt {attempt}/{max_attempts} failed: {e}", "warning")
                else:
                    self.log(f"All {max_attempts} attempts failed", "error")

        # All attempts failed
        raise last_error

    def _parse_date(self, date_str: str) -> Optional[datetime]:
        """
        Parse Coppermine date format: 'Date added=Sep 29, 2025'

        Args:
            date_str: Date string from Coppermine

        Returns:
            datetime object or None
        """
        try:
            # Extract date from "Date added=Sep 29, 2025" format
            match = re.search(r'Date added=([A-Za-z]+ \d+, \d{4})', date_str)
            if match:
                date_part = match.group(1)
                return datetime.strptime(date_part, '%b %d, %Y')
        except Exception as e:
            self.log(f"Error parsing date '{date_str}': {e}", "debug")
        return None

    def _extract_full_image_url(self, base_url: str, thumbnail_url: str) -> str:
        """
        Convert thumbnail URL to full-resolution URL

        Pattern:
            Thumbnail: albums/userpics/1052219/thumb_1000523798.jpg
            Normal:    albums/userpics/1052219/normal_1000523798.jpg
            Full:      albums/userpics/1052219/1000523798.jpg

        Args:
            base_url: Base URL of the gallery (e.g., https://hqdiesel.net)
            thumbnail_url: Relative thumbnail URL

        Returns:
            Full-resolution image URL
        """
        # Remove thumb_ or normal_ prefix
        full_path = re.sub(r'/(thumb_|normal_)', '/', thumbnail_url)
        return urljoin(base_url, full_path)

    def _parse_gallery_page(self, html: str, base_url: str) -> List[Dict]:
        """
        Collect per-image metadata from one Coppermine thumbnail page

        Args:
            html: HTML content of the page
            base_url: Base URL of the gallery, used to absolutize image URLs

        Returns:
            List of dicts, one per thumbnail cell that has a displayimage.php
            link carrying a pid and an <img> with a src. Keys: pid, filename,
            thumbnail_url, full_url, upload_date, dimensions, filesize,
            uploader, views, title. Cells that raise while being parsed are
            logged and skipped.
        """
        def first_group(pattern, text):
            # Capture group 1 of the first match, or None when nothing matches
            hit = re.search(pattern, text)
            return hit.group(1) if hit else None

        page = BeautifulSoup(html, 'html.parser')
        cells = page.find_all('td', class_='thumbnails')
        self.log(f"Found {len(cells)} thumbnail cells on page", "debug")

        collected = []
        for cell in cells:
            try:
                # The link to the image detail page carries the picture id
                anchor = cell.find('a', href=re.compile(r'displayimage\.php'))
                if not anchor:
                    continue

                link_query = parse_qs(urlparse(anchor.get('href', '')).query)
                pid = link_query.get('pid', [None])[0]
                if not pid:
                    continue

                thumb = anchor.find('img')
                if not thumb:
                    continue

                thumbnail_url = thumb.get('src', '')
                if not thumbnail_url:
                    continue

                # The img title attribute packs "Key=value" metadata
                title = thumb.get('title', '')
                filename = first_group(r'Filename=([^\s]+)', title)

                # Prefer the dedicated date span; fall back to the title text
                date_span = cell.find('span', class_='thumb_caption_ctime')
                caption_text = date_span.text.strip() if date_span else ''
                if caption_text:
                    try:
                        upload_date = datetime.strptime(caption_text, '%b %d, %Y')
                    except Exception:
                        upload_date = self._parse_date(title)
                else:
                    upload_date = self._parse_date(title)

                profile_link = cell.find('a', href=re.compile(r'profile\.php'))
                uploader = profile_link.text.strip() if profile_link else None

                dimensions = first_group(r'Dimensions=(\d+x\d+)', title)
                filesize = first_group(r'Filesize=([^\s]+)', title)

                views = None
                views_span = cell.find('span', class_='thumb_title_views')
                if views_span:
                    views_hit = re.search(r'(\d+)\s+views?', views_span.text)
                    if views_hit:
                        views = int(views_hit.group(1))

                full_url = self._extract_full_image_url(base_url, thumbnail_url)

                record = {
                    'pid': pid,
                    'filename': filename,
                    'thumbnail_url': urljoin(base_url, thumbnail_url),
                    'full_url': full_url,
                    'upload_date': upload_date,
                    'dimensions': dimensions,
                    'filesize': filesize,
                    'uploader': uploader,
                    'views': views,
                    'title': title
                }
                collected.append(record)

            except Exception as e:
                self.log(f"Error parsing thumbnail cell: {e}", "debug")
                continue

        return collected

    def _get_total_pages(self, html: str) -> int:
        """
        Read the page count from the gallery's pagination summary

        Args:
            html: HTML content

        Returns:
            Number of pages; 1 when no summary such as
            "2005 files on 20 page(s)" is found or parsing fails
        """
        page_count = 1
        try:
            page_text = BeautifulSoup(html, 'html.parser').get_text()
            # Group 1 is the file count, group 2 the page count
            summary = re.search(r'(\d+)\s+files?\s+on\s+(\d+)\s+page', page_text)
            if summary:
                page_count = int(summary.group(2))
        except Exception as e:
            self.log(f"Error extracting page count: {e}", "debug")
        return page_count

    def _download_image(self, image_info: Dict, output_dir: Path,
                       gallery_name: str) -> Optional[str]:
        """
        Download a single image

        Args:
            image_info: Image information dict
            output_dir: Output directory
            gallery_name: Name of gallery for database tracking

        Returns:
            Path to downloaded file or None
        """
        try:
            url = image_info['full_url']
            pid = image_info['pid']
            filename = image_info['filename']

            # Check if already downloaded
            if self.use_database and self.db:
                if self.db.is_downloaded(url, platform='coppermine'):
                    self.log(f"Already downloaded (database): {filename} (PID: {pid})", "info")
                    return None

            # Create output directory
            output_dir.mkdir(parents=True, exist_ok=True)

            # Construct output filename
            output_file = output_dir / filename

            # Skip if file exists
            if output_file.exists():
                self.log(f"File already exists: {filename}", "info")
                return str(output_file)

            # Download image
            self.log(f"Downloading: {filename} (PID: {pid})", "info")

            response = self._request_with_retry(url, timeout=30)

            # Save image
            with open(output_file, 'wb') as f:
                f.write(response.content)

            # Check for duplicate hash before recording
            if self.db and hasattr(self.db, 'unified_db'):
                from pathlib import Path as PathLib
                # Check for duplicate hash (hash blacklist persists even if original deleted)
                file_hash_check = self.db.unified_db.get_file_hash(str(output_file))
                if file_hash_check:
                    existing = self.db.unified_db.get_download_by_file_hash(file_hash_check)
                    if existing and existing.get('file_path') and str(output_file) != existing.get('file_path'):
                        # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
                        self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                        # Delete the duplicate regardless of whether original file still exists
                        try:
                            output_file.unlink()
                            self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
                            return
                        except Exception as e:
                            self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
                            return

            # Calculate SHA256 file hash from saved file (consistent with other modules)
            file_hash = None
            if self.db and hasattr(self.db, 'unified_db'):
                try:
                    file_hash = self.db.unified_db.get_file_hash(str(output_file))
                except Exception as e:
                    self.log(f"Failed to calculate file hash: {e}", "warning")

            # Track timestamp for this file
            if image_info.get('upload_date'):
                self.file_timestamps[filename] = image_info['upload_date']

            # Record in database
            self._record_download(
                url=url,
                platform='coppermine',
                source=gallery_name,
                content_type='image',
                filename=filename,
                file_path=str(output_file),
                file_size=len(response.content),
                file_hash=file_hash,
                post_date=image_info.get('upload_date'),
                metadata={
                    'pid': pid,
                    'dimensions': image_info.get('dimensions'),
                    'filesize': image_info.get('filesize')
                },
                deferred=getattr(self, 'defer_database', False)
            )

            self.download_count += 1
            time.sleep(self.min_delay + (self.max_delay - self.min_delay) * __import__('random').random())

            return str(output_file)

        except Exception as e:
            self.log(f"Error downloading {image_info.get('filename', 'unknown')}: {e}", "error")
            return None

    def download(self, gallery_url: str, output_dir: str,
                 days_back: Optional[int] = None, max_pages: Optional[int] = None,
                 gallery_name: Optional[str] = None, defer_database: bool = False) -> tuple:
        """
        Download images from a Coppermine gallery

        Args:
            gallery_url: URL to the gallery page (e.g., thumbnails.php?album=lastup&cat=123)
            output_dir: Directory to save images
            days_back: Only download images from last N days (None = all)
            max_pages: Maximum number of pages to process (None = all)
            gallery_name: Name for database tracking (extracted from URL if not provided)
            defer_database: If True, don't record to database immediately - store in
                           pending_downloads for later recording after file move is complete

        Returns:
            Tuple of (file_timestamps dict, download_count)
            file_timestamps: Dict mapping filename -> upload_date
        """
        self.defer_database = defer_database  # Store for use in download methods
        # Clear downloaded_files cache between galleries to prevent memory growth
        self.downloaded_files.clear()

        # Check site status before doing anything else
        self.log("Checking Coppermine gallery site status...", "debug")
        site_status, error_msg = self.cf_handler.check_site_status(gallery_url, timeout=10)

        if self.cf_handler.should_skip_download(site_status):
            self.log(f"Skipping download - Coppermine gallery is unavailable: {error_msg}", "warning")
            return ({}, 0)
        elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
            self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

        self.download_count = 0
        self.file_timestamps = {}  # Track timestamps for each file
        output_path = Path(output_dir)

        # Extract base URL and gallery name
        parsed_url = urlparse(gallery_url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

        if not gallery_name:
            # Extract category from URL
            query_params = parse_qs(parsed_url.query)
            cat = query_params.get('cat', ['unknown'])[0]
            album = query_params.get('album', ['unknown'])[0]
            gallery_name = f"{parsed_url.netloc}_cat{cat}_{album}"

        self.log(f"Starting download from: {gallery_url}", "info")
        self.activity_manager.update_status(f"Checking gallery: {gallery_name}")
        self.log(f"Gallery: {gallery_name}", "info")
        if days_back:
            self.log(f"Filtering: Last {days_back} days", "info")

        # Calculate cutoff date
        cutoff_date = None
        if days_back:
            cutoff_date = datetime.now() - timedelta(days=days_back)

        # Check if cookies have expired before testing
        cookies_valid = False
        cookie_count = len(self.session.cookies)

        # Check for short-lived session cookies that may have expired
        if self.cf_handler.cookies_expired():
            self.log(f"Cookies expired, skipping test and refreshing via FlareSolverr", "info")
        else:
            self.log(f"Testing with {cookie_count} existing cookies...", "info")

            try:
                # Try with existing cookies first (short timeout for fast fail)
                test_response = self.session.get(gallery_url, timeout=5)

                # Check if we got a Cloudflare challenge or error
                if test_response.status_code == 403 or test_response.status_code == 503:
                    self.log(f"Existing cookies failed (HTTP {test_response.status_code}), need FlareSolverr", "info")
                elif len(test_response.text) < 1000:
                    self.log(f"Response too short ({len(test_response.text)} bytes), likely Cloudflare challenge", "info")
                elif 'challenge' in test_response.text.lower()[:500]:
                    self.log("Cloudflare challenge detected in response", "info")
                else:
                    # Cookies work (or no challenge presented)!
                    cookies_valid = True
                    self.log(f"✓ Existing cookies valid ({cookie_count} cookies, skipped FlareSolverr)", "info")
                    response = test_response
            except Exception as e:
                self.log(f"Test request failed ({type(e).__name__}: {e}), need FlareSolverr", "info")

        # Only call FlareSolverr if existing cookies don't work
        if not cookies_valid:
            if self.flaresolverr_enabled:
                self.log("Calling FlareSolverr to get fresh cookies...", "info")
                if not self._get_cookies_via_flaresolverr(gallery_url):
                    self.log("Failed to bypass Cloudflare", "error")
                    return ({}, 0)
            else:
                self.log("FlareSolverr disabled and cookies invalid", "error")
                return ({}, 0)

        # Fetch first page to get total pages (reuse response if cookies were valid)
        try:
            if not cookies_valid:
                response = self._request_with_retry(gallery_url, timeout=30)

            total_pages = self._get_total_pages(response.text)

            if max_pages:
                total_pages = min(total_pages, max_pages)

            self.log(f"Total pages to process: {total_pages}", "info")

        except Exception as e:
            self.log(f"Error fetching gallery: {e}", "error")
            return ({}, 0)

        # Set initial progress so dashboard shows 0/N immediately
        self.activity_manager.update_status(
            "Downloading images",
            progress_current=0,
            progress_total=total_pages
        )

        # Process each page
        for page_num in range(1, total_pages + 1):
            try:
                # Construct page URL
                if page_num == 1:
                    page_url = gallery_url
                else:
                    separator = '&' if '?' in gallery_url else '?'
                    page_url = f"{gallery_url}{separator}page={page_num}"

                self.log(f"Processing page {page_num}/{total_pages}...", "info")

                # Fetch page with automatic Cloudflare retry
                response = self._request_with_retry(page_url, timeout=30)

                # Debug: Check what we received
                self.log(f"Fetched page, status: {response.status_code}, length: {len(response.text)} bytes", "debug")
                if len(response.text) < 10000:
                    self.log(f"WARNING: Response seems too short! First 1000 chars: {response.text[:1000]}", "warning")

                # Parse images
                images = self._parse_gallery_page(response.text, base_url)
                self.log(f"Found {len(images)} images on page {page_num}", "info")

                # Track if we found any new images on this page
                found_new_images = False
                skipped_old_images = 0

                # Filter by date and download
                for image_info in images:
                    # Apply date filter
                    if cutoff_date and image_info.get('upload_date'):
                        if image_info['upload_date'] < cutoff_date:
                            skipped_old_images += 1
                            self.log(f"Skipping old image: {image_info['filename']} "
                                   f"(uploaded {image_info['upload_date'].date()})", "debug")
                            continue

                    # Log image being processed
                    upload_date_str = image_info.get('upload_date').strftime('%Y-%m-%d') if image_info.get('upload_date') else 'unknown'
                    self.log(f"Processing image: {image_info['filename']} (uploaded {upload_date_str})", "info")

                    # This image is within date range
                    found_new_images = True

                    # Download image
                    self._download_image(image_info, output_path, gallery_name)

                # If using date filter and ALL images on this page were too old, stop processing
                # (assumes gallery is sorted newest-first, which is true for album=lastup)
                if cutoff_date and not found_new_images and len(images) > 0:
                    self.log(f"All {skipped_old_images} images on page {page_num} are older than {days_back} days. "
                           f"Stopping pagination (assuming chronological order).", "info")
                    break

                # Update activity status with page progress
                self.activity_manager.update_status(
                    "Downloading images",
                    progress_current=page_num,
                    progress_total=total_pages
                )

                # Rate limiting between pages
                if page_num < total_pages:
                    time.sleep(self.min_delay)

            except Exception as e:
                self.log(f"Error processing page {page_num}: {e}", "error")
                continue

        self.log(f"Download complete! Total: {self.download_count} images", "info")
        return (self.file_timestamps, self.download_count)

    def cleanup(self):
        """Close the downloader's HTTP session, if one is set.

        Does nothing when ``self.session`` is falsy (for example ``None``).
        """
        active_session = self.session
        if not active_session:
            # Nothing to release.
            return
        active_session.close()