#!/usr/bin/env python3 """ Universal Cloudflare Handler Module Provides centralized Cloudflare bypass, error detection, and cookie management for all download modules (imginn, fastdl, toolzu, snapchat, coppermine). Features: - FlareSolverr integration with retry logic - Site status detection (down, challenge, working) - Cookie management for Playwright and requests - Standardized error handling (500, 403, 503, timeouts) - Skip logic for when sites are unavailable """ import json import time import requests from pathlib import Path from datetime import datetime, timedelta from typing import Dict, List, Optional, Tuple, Union from enum import Enum from modules.universal_logger import get_logger # Module-level cache for FlareSolverr browser fingerprint _flaresolverr_fingerprint_cache = { 'fingerprint': None, 'timestamp': None, 'chrome_version': None, 'loaded_from_db': False } # Database instance for persistent storage (set via set_fingerprint_database) _fingerprint_db = None # Default fallback fingerprint (Chrome 142 on Linux) DEFAULT_FINGERPRINT = { 'user_agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', 'sec_ch_ua': '"Not_A Brand";v="99", "Chromium";v="142"', 'sec_ch_ua_mobile': '?0', 'sec_ch_ua_platform': '"Linux"', 'accept_language': 'en-US,en;q=0.9', 'platform': 'Linux', 'viewport': {'width': 1920, 'height': 1080}, 'locale': 'en-US', 'timezone_id': 'America/New_York', 'color_scheme': 'light' } # Default fallback user-agent (for backwards compatibility) DEFAULT_USER_AGENT = DEFAULT_FINGERPRINT['user_agent'] def _extract_chrome_version(user_agent: str) -> Optional[str]: """Extract Chrome version from user-agent string.""" import re match = re.search(r'Chrome/(\d+)', user_agent) return match.group(1) if match else None def set_fingerprint_database(unified_db): """Set the database instance for persistent fingerprint storage. Call this once at application startup with your UnifiedDatabase instance. """ global _fingerprint_db _fingerprint_db = unified_db def _load_fingerprint_from_db() -> Optional[Dict]: """Load fingerprint from database (instant).""" global _fingerprint_db if not _fingerprint_db: return None try: data = _fingerprint_db.get_setting('flaresolverr_fingerprint') if data: return json.loads(data) if isinstance(data, str) else data except Exception: pass return None def _save_fingerprint_to_db(fingerprint: Dict): """Save fingerprint to database.""" global _fingerprint_db if not _fingerprint_db: return try: _fingerprint_db.set_setting('flaresolverr_fingerprint', json.dumps(fingerprint)) except Exception: pass def _fetch_fingerprint_from_flaresolverr(flaresolverr_url: str) -> Optional[Dict]: """Fetch fingerprint directly from FlareSolverr (blocking call).""" try: response = requests.post( flaresolverr_url, json={ "cmd": "request.get", "url": "https://httpbin.org/headers", "maxTimeout": 15000 }, timeout=20 ) if response.status_code == 200: data = response.json() if data.get('status') == 'ok' and data.get('solution'): solution = data['solution'] user_agent = solution.get('userAgent', DEFAULT_USER_AGENT) new_version = _extract_chrome_version(user_agent) # Parse headers from response response_text = solution.get('response', '') headers = {} try: import re json_start = response_text.find('{"headers"') if json_start >= 0: brace_count = 0 json_end = json_start for i, char in enumerate(response_text[json_start:]): if char == '{': brace_count += 1 elif char == '}': brace_count -= 1 if brace_count == 0: json_end = json_start + i + 1 break json_str = response_text[json_start:json_end] headers_data = json.loads(json_str) headers = headers_data.get('headers', {}) except Exception: pass return { 'user_agent': user_agent, 'sec_ch_ua': headers.get('Sec-Ch-Ua', DEFAULT_FINGERPRINT['sec_ch_ua']), 'sec_ch_ua_mobile': headers.get('Sec-Ch-Ua-Mobile', DEFAULT_FINGERPRINT['sec_ch_ua_mobile']), 'sec_ch_ua_platform': headers.get('Sec-Ch-Ua-Platform', DEFAULT_FINGERPRINT['sec_ch_ua_platform']), 'accept_language': headers.get('Accept-Language', DEFAULT_FINGERPRINT['accept_language']), 'platform': 'Linux' if 'Linux' in user_agent else ('Windows' if 'Windows' in user_agent else 'macOS'), 'viewport': DEFAULT_FINGERPRINT['viewport'], 'locale': 'en-US', 'timezone_id': 'America/New_York', 'color_scheme': 'light', 'chrome_version': new_version } except Exception: pass return None def get_flaresolverr_fingerprint(flaresolverr_url: str = "http://localhost:8191/v1", force_refresh: bool = False) -> Dict: """ Get FlareSolverr's complete browser fingerprint - INSTANT from cache. On first call: loads from database (instant, survives restarts). Use force_refresh=True or refresh_fingerprint_if_changed() to update. Args: flaresolverr_url: FlareSolverr API endpoint force_refresh: Force fetch from FlareSolverr (blocking) Returns: Dictionary containing full browser fingerprint """ global _flaresolverr_fingerprint_cache # Force refresh - fetch directly from FlareSolverr if force_refresh: fingerprint = _fetch_fingerprint_from_flaresolverr(flaresolverr_url) if fingerprint: old_version = _flaresolverr_fingerprint_cache.get('chrome_version') new_version = fingerprint.get('chrome_version') if old_version and new_version and old_version != new_version: print(f"[Cloudflare] FlareSolverr Chrome version changed: {old_version} -> {new_version}") _flaresolverr_fingerprint_cache['fingerprint'] = fingerprint _flaresolverr_fingerprint_cache['timestamp'] = datetime.now() _flaresolverr_fingerprint_cache['chrome_version'] = new_version _save_fingerprint_to_db(fingerprint) return fingerprint # Fall through to cache if fetch failed # Return memory cache if available (instant) if _flaresolverr_fingerprint_cache['fingerprint']: return _flaresolverr_fingerprint_cache['fingerprint'] # Load from database (instant, survives restarts) if not _flaresolverr_fingerprint_cache['loaded_from_db']: _flaresolverr_fingerprint_cache['loaded_from_db'] = True db_fingerprint = _load_fingerprint_from_db() if db_fingerprint: _flaresolverr_fingerprint_cache['fingerprint'] = db_fingerprint _flaresolverr_fingerprint_cache['chrome_version'] = db_fingerprint.get('chrome_version') return db_fingerprint # No cache available - fetch from FlareSolverr (first time only, blocking) fingerprint = _fetch_fingerprint_from_flaresolverr(flaresolverr_url) if fingerprint: _flaresolverr_fingerprint_cache['fingerprint'] = fingerprint _flaresolverr_fingerprint_cache['timestamp'] = datetime.now() _flaresolverr_fingerprint_cache['chrome_version'] = fingerprint.get('chrome_version') _save_fingerprint_to_db(fingerprint) return fingerprint # Last resort - return default return DEFAULT_FINGERPRINT.copy() def refresh_fingerprint_if_changed(flaresolverr_url: str = "http://localhost:8191/v1") -> bool: """ Check if FlareSolverr's Chrome version changed and refresh if needed. Call this periodically (e.g., before each scraping session) for instant updates. Returns: True if fingerprint was updated, False otherwise """ global _flaresolverr_fingerprint_cache new_fingerprint = _fetch_fingerprint_from_flaresolverr(flaresolverr_url) if not new_fingerprint: return False new_version = new_fingerprint.get('chrome_version') old_version = _flaresolverr_fingerprint_cache.get('chrome_version') if new_version and old_version and new_version != old_version: print(f"[Cloudflare] FlareSolverr Chrome version updated: {old_version} -> {new_version}") _flaresolverr_fingerprint_cache['fingerprint'] = new_fingerprint _flaresolverr_fingerprint_cache['timestamp'] = datetime.now() _flaresolverr_fingerprint_cache['chrome_version'] = new_version _save_fingerprint_to_db(new_fingerprint) return True # Update cache even if version same (refresh timestamp) if new_fingerprint: _flaresolverr_fingerprint_cache['fingerprint'] = new_fingerprint _flaresolverr_fingerprint_cache['timestamp'] = datetime.now() _save_fingerprint_to_db(new_fingerprint) return False def get_flaresolverr_user_agent(flaresolverr_url: str = "http://localhost:8191/v1") -> str: """ Get FlareSolverr's user-agent dynamically. This is a convenience wrapper around get_flaresolverr_fingerprint() for backwards compatibility. Args: flaresolverr_url: FlareSolverr API endpoint Returns: User-agent string from FlareSolverr, or default fallback """ fingerprint = get_flaresolverr_fingerprint(flaresolverr_url) return fingerprint.get('user_agent', DEFAULT_USER_AGENT) def get_playwright_context_options(flaresolverr_url: str = "http://localhost:8191/v1") -> Dict: """ Get Playwright browser context options that match FlareSolverr's fingerprint. Use this when creating a Playwright browser context to ensure the browser fingerprint matches FlareSolverr, making cookies work correctly. Args: flaresolverr_url: FlareSolverr API endpoint Returns: Dictionary of options for browser.new_context() Example: context_options = get_playwright_context_options() context = browser.new_context(**context_options) """ fingerprint = get_flaresolverr_fingerprint(flaresolverr_url) return { 'viewport': fingerprint['viewport'], 'user_agent': fingerprint['user_agent'], 'locale': fingerprint['locale'], 'timezone_id': fingerprint['timezone_id'], 'color_scheme': fingerprint['color_scheme'], 'extra_http_headers': { 'Accept-Language': fingerprint['accept_language'], 'Sec-Ch-Ua': fingerprint['sec_ch_ua'], 'Sec-Ch-Ua-Mobile': fingerprint['sec_ch_ua_mobile'], 'Sec-Ch-Ua-Platform': fingerprint['sec_ch_ua_platform'] } } def get_playwright_stealth_scripts() -> str: """ Get JavaScript code to inject into Playwright pages for anti-detection. Use this with page.add_init_script() to make Playwright harder to detect. Returns: JavaScript code string Example: page.add_init_script(get_playwright_stealth_scripts()) """ return """ // Hide webdriver property Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); // Override plugins to look more realistic Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); // Override languages to match fingerprint Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); // Fix permissions API const originalQuery = window.navigator.permissions.query; window.navigator.permissions.query = (parameters) => ( parameters.name === 'notifications' ? Promise.resolve({ state: Notification.permission }) : originalQuery(parameters) ); // Add chrome runtime stub window.chrome = { runtime: {} }; // Override connection property Object.defineProperty(navigator, 'connection', { get: () => ({ effectiveType: '4g', rtt: 50, downlink: 10, saveData: false }) }); // Fix iframe contentWindow access const originalContentWindow = Object.getOwnPropertyDescriptor(HTMLIFrameElement.prototype, 'contentWindow'); Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', { get: function() { try { return originalContentWindow.get.call(this); } catch (e) { return window; } } }); // Override hardware concurrency Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 }); // Override device memory Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 }); """ def invalidate_fingerprint_cache(): """ Invalidate the fingerprint cache, forcing a refresh on next request. Call this if FlareSolverr is updated or if you suspect fingerprint mismatch. """ global _flaresolverr_fingerprint_cache _flaresolverr_fingerprint_cache['fingerprint'] = None _flaresolverr_fingerprint_cache['timestamp'] = None class SiteStatus(Enum): """Site status enumeration""" WORKING = "working" # Site is accessible CLOUDFLARE_CHALLENGE = "challenge" # Cloudflare challenge detected SERVER_ERROR = "server_error" # 500, 502, 503, 504 errors FORBIDDEN = "forbidden" # 403 forbidden TIMEOUT = "timeout" # Request timeout UNKNOWN_ERROR = "unknown_error" # Other errors class CloudflareHandler: """Universal Cloudflare bypass and error handler""" def __init__(self, module_name: str, cookie_file: str = None, flaresolverr_url: str = "http://localhost:8191/v1", flaresolverr_enabled: bool = True, user_agent: str = None, logger=None, aggressive_expiry: bool = True, proxy_url: str = None): """ Initialize Cloudflare handler Args: module_name: Name of the module (for logging) cookie_file: Path to cookie storage file (optional, can be None for DB storage) flaresolverr_url: FlareSolverr API endpoint flaresolverr_enabled: Enable/disable FlareSolverr user_agent: User-Agent string (default: Chrome 141) logger: Logger instance (optional) aggressive_expiry: Use aggressive cookie expiration (7 days) vs conservative (only expired) proxy_url: Proxy URL for FlareSolverr and requests (e.g., "socks5://user:pass@host:port") """ self.module_name = module_name # Cookie file is optional (can use DB storage instead) if cookie_file: self.cookie_file = Path(cookie_file) self.cookie_file.parent.mkdir(parents=True, exist_ok=True) else: self.cookie_file = None self.flaresolverr_url = flaresolverr_url self.flaresolverr_enabled = flaresolverr_enabled # Get User-Agent dynamically from FlareSolverr (or use provided/default) self.user_agent = user_agent or get_flaresolverr_user_agent(flaresolverr_url) self.aggressive_expiry = aggressive_expiry # Proxy support for FlareSolverr and requests self.proxy_url = proxy_url # In-memory cookie storage (for when cookie_file is None) self._cookies = [] self._cookies_user_agent = None # Use provided logger or create universal logger if logger: self.logger = logger else: self.logger = get_logger(f'Cloudflare.{module_name}') def log(self, message: str, level: str = "info", module: str = "Core"): """Log message using universal logger Args: message: Log message level: Log level (debug, info, warning, error) module: Module context """ level_upper = level.upper() if level_upper == "DEBUG": self.logger.debug(message, module=module) elif level_upper == "WARNING": self.logger.warning(message, module=module) elif level_upper == "ERROR": self.logger.error(message, module=module) else: self.logger.info(message, module=module) # ==================== Cookie Management ==================== def cookies_expired(self) -> bool: """ Check if cookies are expired Returns: True if cookies need refresh, False otherwise """ # If no cookie file configured, check in-memory cookies if not self.cookie_file: if not self._cookies: return True cookies = self._cookies data = {'cookies': cookies, 'timestamp': datetime.now().isoformat()} elif not self.cookie_file.exists(): return True else: try: with open(self.cookie_file, 'r') as f: data = json.load(f) except Exception as e: self.log(f"Error reading cookie file: {e}", "warning") return True try: # Check if cookies array is empty cookies = data.get('cookies', []) if not cookies or len(cookies) == 0: return True if self.aggressive_expiry: # Aggressive: refresh if cf_clearance or site-functional cookies expire within 7 days current_timestamp = int(time.time()) days_threshold = 7 * 24 * 60 * 60 # 7 days in seconds # Only check cookies that matter for Cloudflare bypass and site functionality. # Third-party ad/tracking cookies (panoramaId, __cf_bm on ad domains, # KTPCACOOKIE, etc.) are short-lived by design and irrelevant. # # Strategy: find the primary site domain (most common domain in cookies), # then only check cf_clearance + first-party non-ad cookies. from collections import Counter domain_counts = Counter(c.get('domain', '') for c in cookies) site_domain = domain_counts.most_common(1)[0][0] if domain_counts else '' # Known ad/tracking cookie name prefixes (even on first-party domains) ad_prefixes = ('_ga', '_gat', '_gid', '_gcl', '__utm', 'panoramaId', '_cc_', '_pubcid', 'cto_', 'FCCDCF', 'FCOEC', 'FCNEC', 'IABGPP', 'usprivacy', '__gads', '__gpi', '__eoi') for cookie in cookies: cookie_name = cookie.get('name', '') cookie_domain = cookie.get('domain', '') # Only check cf_clearance (critical) and first-party site cookies is_cf = cookie_name == 'cf_clearance' is_first_party = site_domain and site_domain == cookie_domain if not is_cf and not is_first_party: continue # Skip known ad/tracking cookies even on the site's domain if any(cookie_name.startswith(p) for p in ad_prefixes): continue expiry = cookie.get('expiry') or cookie.get('expires') # Skip session cookies (expires = -1) if expiry and expiry != -1: if expiry < current_timestamp: self.log(f"Cookie '{cookie_name}' has expired", "debug") return True elif expiry < (current_timestamp + days_threshold): days_remaining = (expiry - current_timestamp) / (24 * 60 * 60) self.log(f"Cookie '{cookie_name}' expires in {days_remaining:.1f} days, forcing refresh", "debug") return True # Check file age as fallback saved_time = datetime.fromisoformat(data['timestamp']) age = datetime.now() - saved_time return age > timedelta(hours=12) else: # Conservative: refresh if cf_clearance OR any critical short-lived cookie expired current_timestamp = int(time.time()) one_hour = 3600 # 1 hour in seconds # Check cf_clearance (main Cloudflare cookie) cf_clearance = next((c for c in cookies if c['name'] == 'cf_clearance'), None) if cf_clearance: expiry = cf_clearance.get('expiry') or cf_clearance.get('expires') if expiry and expiry != -1 and expiry < current_timestamp: self.log("cf_clearance cookie has expired", "debug") return True # Check for short-lived session cookies (common names: gc_session_id, session, sid, etc) # If any cookie expires within 1 hour, consider it expired # Skip analytics cookies that naturally expire quickly (_ga, _gat, _gid) analytics_prefixes = ('_ga', '_gat', '_gid', '_gcl', '__utm') for cookie in cookies: cookie_name = cookie.get('name', 'unknown') # Skip Google Analytics and tracking cookies - they're not critical for CF bypass if cookie_name.startswith(analytics_prefixes): continue expiry = cookie.get('expiry') or cookie.get('expires') if expiry and expiry != -1: if expiry < (current_timestamp + one_hour): time_remaining = (expiry - current_timestamp) / 60 # minutes if time_remaining <= 0: self.log(f"Cookie '{cookie_name}' has expired", "debug") return True elif 'session' in cookie_name.lower(): # Session cookies expiring soon are critical self.log(f"Session cookie '{cookie_name}' expires in {time_remaining:.0f} min, refreshing", "debug") return True return False except Exception as e: self.log(f"Error checking cookie expiration: {e}", "warning") return True def get_cookies_dict(self) -> Dict[str, str]: """ Get cookies as dictionary for requests library Returns: Dictionary of cookie name->value pairs """ # If no cookie file, use in-memory cookies if not self.cookie_file: return {c['name']: c['value'] for c in self._cookies} if not self.cookie_file.exists(): return {} try: with open(self.cookie_file, 'r') as f: data = json.load(f) cookies = {} for cookie in data.get('cookies', []): cookies[cookie['name']] = cookie['value'] return cookies except Exception as e: self.log(f"Error loading cookies: {e}", "warning") return {} def get_cookies_list(self) -> List[Dict]: """ Get cookies as list for Playwright Returns: List of cookie dictionaries with 'expiry' converted to 'expires' for Playwright compatibility """ # If no cookie file, use in-memory cookies if not self.cookie_file: cookies = self._cookies.copy() elif not self.cookie_file.exists(): return [] else: try: with open(self.cookie_file, 'r') as f: data = json.load(f) cookies = data.get('cookies', []) except Exception as e: self.log(f"Error loading cookies: {e}", "warning") return [] # Convert 'expiry' to 'expires' for Playwright compatibility # FlareSolverr uses 'expiry' but Playwright expects 'expires' converted = [] for cookie in cookies: c = dict(cookie) if 'expiry' in c and 'expires' not in c: c['expires'] = c.pop('expiry') converted.append(c) return converted def get_user_agent(self) -> Optional[str]: """ Get the user agent associated with the stored cookies. This is important because cf_clearance cookies are tied to browser fingerprint. Returns: User agent string or None if not stored """ # If no cookie file, use in-memory user agent if not self.cookie_file: return self._cookies_user_agent if not self.cookie_file.exists(): return None try: with open(self.cookie_file, 'r') as f: data = json.load(f) return data.get('user_agent') except Exception: return None def save_cookies(self, cookies: Union[List[Dict], Dict[str, str]], user_agent: str = None): """ Save cookies to file or in-memory storage Args: cookies: Either list of cookie dicts (Playwright) or dict (requests) user_agent: Browser user agent (important for cf_clearance cookies) """ # Convert dict to list format if needed if isinstance(cookies, dict): cookies_list = [ {'name': k, 'value': v, 'domain': '', 'path': '/'} for k, v in cookies.items() ] else: cookies_list = cookies # If no cookie file, store in memory if not self.cookie_file: self._cookies = cookies_list self._cookies_user_agent = user_agent self.log(f"Saved {len(cookies_list)} cookies to memory", "debug") return storage_data = { 'cookies': cookies_list, 'timestamp': datetime.now().isoformat() } if user_agent: storage_data['user_agent'] = user_agent try: with open(self.cookie_file, 'w') as f: json.dump(storage_data, f, indent=2) self.log(f"Saved {len(cookies_list)} cookies to {self.cookie_file}", "debug") except Exception as e: self.log(f"Error saving cookies: {e}", "error") def load_cookies_to_playwright(self, context): """ Load cookies into Playwright browser context Args: context: Playwright browser context """ cookies = self.get_cookies_list() if cookies: # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect try: context.clear_cookies() except Exception: pass context.add_cookies(cookies) self.log(f"Loaded {len(cookies)} cookies into browser", "debug") def save_cookies_from_playwright(self, context): """ Save cookies from Playwright browser context Args: context: Playwright browser context """ cookies = context.cookies() self.save_cookies(cookies) def load_cookies_to_requests(self, session: requests.Session): """ Load cookies into requests Session Args: session: requests.Session object """ cookies = self.get_cookies_dict() session.cookies.update(cookies) self.log(f"Loaded {len(cookies)} cookies into requests session", "debug") # ==================== FlareSolverr Integration ==================== def get_cookies_via_flaresolverr(self, url: str, max_retries: int = 2) -> bool: """ Use FlareSolverr to bypass Cloudflare and get fresh cookies Args: url: URL to fetch max_retries: Maximum number of retry attempts Returns: True if cookies obtained successfully, False otherwise """ if not self.flaresolverr_enabled: self.log("FlareSolverr is disabled", "debug") return False for attempt in range(1, max_retries + 1): try: if attempt > 1: self.log(f"Retrying FlareSolverr (attempt {attempt}/{max_retries})...", "info") else: self.log("Using FlareSolverr to bypass Cloudflare...", "info") payload = { "cmd": "request.get", "url": url, "maxTimeout": 120000 # 120 seconds for difficult challenges } # Add proxy if configured if self.proxy_url: payload["proxy"] = {"url": self.proxy_url} self.log(f"Using proxy: {self.proxy_url}", "debug") response = requests.post(self.flaresolverr_url, json=payload, timeout=130) data = response.json() if data.get('status') == 'ok' and data.get('solution'): solution = data['solution'] cookies = solution.get('cookies', []) user_agent = solution.get('userAgent') if cookies: has_cf_clearance = any(c['name'] == 'cf_clearance' for c in cookies) if has_cf_clearance: self.log(f"✓ FlareSolverr bypassed Cloudflare! Got {len(cookies)} cookies", "info") else: self.log(f"✓ FlareSolverr succeeded! Got {len(cookies)} cookies", "info") # Save cookies with user_agent (important for cf_clearance) self.save_cookies(cookies, user_agent=user_agent) return True else: self.log("FlareSolverr returned no cookies", "warning") return False else: error_msg = data.get('message', 'Unknown error') self.log(f"FlareSolverr failed: {error_msg}", "warning") # Retry on timeout errors if 'timeout' in error_msg.lower() and attempt < max_retries: continue return False except requests.exceptions.Timeout: self.log(f"FlareSolverr request timed out (attempt {attempt}/{max_retries})", "warning") if attempt < max_retries: continue return False except Exception as e: self.log(f"FlareSolverr error: {e}", "error") return False return False # ==================== Site Status Detection ==================== def check_site_status(self, url: str, timeout: int = 10) -> Tuple[SiteStatus, Optional[str]]: """ Check site status and detect Cloudflare challenges or errors Args: url: URL to check timeout: Request timeout in seconds Returns: Tuple of (SiteStatus, error_message) """ try: response = requests.get( url, timeout=timeout, headers={'User-Agent': self.user_agent}, cookies=self.get_cookies_dict(), allow_redirects=True ) status_code = response.status_code content = response.text.lower() # Check for server errors (5xx) if status_code >= 500: error_msg = f"Server error {status_code}" if status_code == 500: error_msg = "Internal server error (500)" elif status_code == 502: error_msg = "Bad gateway (502)" elif status_code == 503: error_msg = "Service unavailable (503)" elif status_code == 504: error_msg = "Gateway timeout (504)" self.log(error_msg, "warning") return (SiteStatus.SERVER_ERROR, error_msg) # Check for forbidden (403) # Note: 403 is often just expired cookies, which FlareSolverr will fix if status_code == 403: self.log("Access forbidden (403) - cookies may be expired", "debug") return (SiteStatus.FORBIDDEN, "Access forbidden (403)") # Check for Cloudflare challenge indicators challenge_indicators = [ 'challenge-platform', 'checking your browser', 'just a moment', 'verify you are human', 'cloudflare', 'cf-challenge', 'cf_clearance' ] # Short response likely indicates challenge page if len(response.text) < 1000: for indicator in challenge_indicators: if indicator in content: self.log("Cloudflare challenge detected", "info") return (SiteStatus.CLOUDFLARE_CHALLENGE, "Cloudflare challenge detected") # Check first 500 chars for challenge indicators if any(indicator in content[:500] for indicator in challenge_indicators): self.log("Cloudflare challenge detected in page content", "info") return (SiteStatus.CLOUDFLARE_CHALLENGE, "Cloudflare challenge detected") # If we got here, site appears to be working self.log(f"Site appears to be working (status {status_code})", "debug") return (SiteStatus.WORKING, None) except requests.exceptions.Timeout: self.log(f"Request timed out after {timeout}s", "warning") return (SiteStatus.TIMEOUT, f"Request timed out after {timeout}s") except requests.exceptions.ConnectionError as e: self.log(f"Connection error: {e}", "error") return (SiteStatus.UNKNOWN_ERROR, f"Connection error: {e}") except Exception as e: self.log(f"Error checking site status: {e}", "error") return (SiteStatus.UNKNOWN_ERROR, str(e)) def should_skip_download(self, status: SiteStatus) -> bool: """ Determine if download should be skipped based on site status Args: status: SiteStatus enum value Returns: True if download should be skipped, False if retry is possible """ # Skip on server errors and timeouts (site is down) if status in [SiteStatus.SERVER_ERROR, SiteStatus.TIMEOUT]: return True # Don't skip on Cloudflare challenges (we can try to bypass) if status == SiteStatus.CLOUDFLARE_CHALLENGE: return False # Don't skip on forbidden (might be temporary) if status == SiteStatus.FORBIDDEN: return False # Skip on unknown errors to be safe if status == SiteStatus.UNKNOWN_ERROR: return True return False # ==================== High-Level Helper Methods ==================== def ensure_cookies(self, url: str) -> bool: """ Ensure we have valid cookies, getting new ones via FlareSolverr if needed Args: url: URL to use for FlareSolverr request Returns: True if cookies are available, False otherwise """ if not self.cookies_expired(): self.log("Using existing cookies", "debug") return True self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info") return self.get_cookies_via_flaresolverr(url) def check_and_bypass(self, url: str, auto_flaresolverr: bool = True) -> Tuple[SiteStatus, bool]: """ Check site status and automatically attempt FlareSolverr bypass if needed Args: url: URL to check auto_flaresolverr: Automatically call FlareSolverr on challenges Returns: Tuple of (SiteStatus, cookies_obtained) """ status, error_msg = self.check_site_status(url) # If site is down or timing out, don't bother with FlareSolverr if self.should_skip_download(status): return (status, False) # If Cloudflare challenge detected and auto-bypass enabled if status == SiteStatus.CLOUDFLARE_CHALLENGE and auto_flaresolverr: self.log("Attempting automatic Cloudflare bypass...", "info") success = self.get_cookies_via_flaresolverr(url) return (status, success) return (status, True) def wait_for_cloudflare_playwright(self, page, max_wait: int = 120) -> bool: """ Wait for Cloudflare challenge to resolve in Playwright page Args: page: Playwright page object max_wait: Maximum wait time in seconds Returns: True if challenge resolved, False if still blocked """ start_time = time.time() challenge_indicators = [ 'challenge-platform', 'checking your browser', 'just a moment' ] while time.time() - start_time < max_wait: try: content = page.content().lower() # Check if challenge is still present has_challenge = any(indicator in content for indicator in challenge_indicators) if not has_challenge: self.log("Cloudflare challenge resolved", "info") return True # Log progress every 15 seconds elapsed = int(time.time() - start_time) if elapsed % 15 == 0 and elapsed > 0: self.log(f"Still waiting for Cloudflare ({elapsed}s)...", "debug") time.sleep(1) except Exception as e: self.log(f"Error checking for Cloudflare: {e}", "warning") time.sleep(1) self.log(f"Cloudflare challenge did not resolve after {max_wait}s", "warning") return False