Files
media-downloader/modules/cloudflare_handler.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

1025 lines
38 KiB
Python

#!/usr/bin/env python3
"""
Universal Cloudflare Handler Module
Provides centralized Cloudflare bypass, error detection, and cookie management
for all download modules (imginn, fastdl, toolzu, snapchat, coppermine).
Features:
- FlareSolverr integration with retry logic
- Site status detection (down, challenge, working)
- Cookie management for Playwright and requests
- Standardized error handling (500, 403, 503, timeouts)
- Skip logic for when sites are unavailable
"""
import json
import time
import requests
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple, Union
from enum import Enum
from modules.universal_logger import get_logger
# In-memory cache of the browser fingerprint reported by FlareSolverr.
_flaresolverr_fingerprint_cache = dict(
    fingerprint=None,
    timestamp=None,
    chrome_version=None,
    loaded_from_db=False,
)

# Database used for persistent fingerprint storage; assigned by
# set_fingerprint_database() and None until then.
_fingerprint_db = None

# Fallback fingerprint (Chrome 142 on Linux), used when nothing better is known.
DEFAULT_FINGERPRINT = {
    'user_agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36'
    ),
    'sec_ch_ua': '"Not_A Brand";v="99", "Chromium";v="142"',
    'sec_ch_ua_mobile': '?0',
    'sec_ch_ua_platform': '"Linux"',
    'accept_language': 'en-US,en;q=0.9',
    'platform': 'Linux',
    'viewport': {'width': 1920, 'height': 1080},
    'locale': 'en-US',
    'timezone_id': 'America/New_York',
    'color_scheme': 'light',
}

# Fallback user-agent string, kept as its own name for backwards compatibility.
DEFAULT_USER_AGENT = DEFAULT_FINGERPRINT['user_agent']
def _extract_chrome_version(user_agent: str) -> Optional[str]:
    """Return the major Chrome version found in a user-agent string.

    Args:
        user_agent: Browser User-Agent string. ``None`` and other non-string
            values are tolerated and treated as "no version".

    Returns:
        The digits that follow ``Chrome/`` (for example ``"142"``), or None
        when the string contains no such token.
    """
    import re  # kept function-local, as in the original implementation

    if not isinstance(user_agent, str):
        return None
    match = re.search(r'Chrome/(\d+)', user_agent)
    return match.group(1) if match else None
def set_fingerprint_database(unified_db):
    """Register the database used for persistent fingerprint storage.

    Meant to be called once at application startup with the application's
    UnifiedDatabase instance. The instance is stored in the module-level
    ``_fingerprint_db`` variable.
    """
    global _fingerprint_db
    _fingerprint_db = unified_db
def _load_fingerprint_from_db() -> Optional[Dict]:
    """Load the persisted fingerprint from the configured database.

    Reads the ``flaresolverr_fingerprint`` setting and decodes it from JSON
    when it is stored as a string.

    Returns:
        The fingerprint dict, or None when no database is configured, the
        setting is empty, reading/decoding fails, or the stored value is not
        a JSON object.
    """
    if not _fingerprint_db:
        return None
    try:
        stored = _fingerprint_db.get_setting('flaresolverr_fingerprint')
        if isinstance(stored, str):
            stored = json.loads(stored)
    except Exception:
        # Best effort: an unreadable or corrupt setting counts as "nothing stored".
        return None
    # Callers use dict methods on the result, so any other JSON type is rejected.
    if isinstance(stored, dict) and stored:
        return stored
    return None
def _save_fingerprint_to_db(fingerprint: Dict):
    """Persist a fingerprint as JSON under the ``flaresolverr_fingerprint`` setting.

    Does nothing when no database is configured. Any error raised while
    serializing or storing is deliberately ignored (best effort).

    Args:
        fingerprint: Fingerprint dictionary to store.
    """
    global _fingerprint_db
    if _fingerprint_db:
        try:
            payload = json.dumps(fingerprint)
            _fingerprint_db.set_setting('flaresolverr_fingerprint', payload)
        except Exception:
            pass
def _parse_echoed_headers(page_text: str) -> Dict:
    """Extract the ``headers`` object from an httpbin ``/headers`` page body.

    Args:
        page_text: Page source returned by FlareSolverr. The JSON document may
            be surrounded by other text (for example HTML wrapping).

    Returns:
        The decoded ``headers`` mapping, or an empty dict when no decodable
        JSON object with that key is found.
    """
    import re  # function-local import; re is not imported at module level

    # Allow whitespace after the opening brace: the body may be pretty-printed JSON.
    anchor = re.search(r'\{\s*"headers"\s*:', page_text)
    if not anchor:
        return {}
    try:
        # raw_decode parses exactly one JSON value at the start of the string
        # and ignores whatever follows it (such as closing markup).
        document, _ = json.JSONDecoder().raw_decode(page_text[anchor.start():])
    except ValueError:
        return {}
    headers = document.get('headers')
    return headers if isinstance(headers, dict) else {}


def _fetch_fingerprint_from_flaresolverr(flaresolverr_url: str) -> Optional[Dict]:
    """Fetch the browser fingerprint directly from FlareSolverr (blocking call).

    Asks FlareSolverr to load ``https://httpbin.org/headers`` and builds the
    fingerprint from the reported ``userAgent`` plus the request headers echoed
    in the page body. Header values that cannot be extracted fall back to
    DEFAULT_FINGERPRINT.

    Args:
        flaresolverr_url: FlareSolverr API endpoint.

    Returns:
        A fingerprint dict, or None when the request fails, the HTTP status is
        not 200, or FlareSolverr does not report an ``ok`` solution.
    """
    try:
        response = requests.post(
            flaresolverr_url,
            json={
                "cmd": "request.get",
                "url": "https://httpbin.org/headers",
                "maxTimeout": 15000
            },
            timeout=20
        )
        if response.status_code != 200:
            return None

        data = response.json()
        if data.get('status') != 'ok' or not data.get('solution'):
            return None

        solution = data['solution']
        # `or` also covers a userAgent key that is present but null/empty.
        user_agent = solution.get('userAgent') or DEFAULT_USER_AGENT
        headers = _parse_echoed_headers(solution.get('response') or '')

        if 'Linux' in user_agent:
            platform = 'Linux'
        elif 'Windows' in user_agent:
            platform = 'Windows'
        else:
            platform = 'macOS'

        return {
            'user_agent': user_agent,
            'sec_ch_ua': headers.get('Sec-Ch-Ua', DEFAULT_FINGERPRINT['sec_ch_ua']),
            'sec_ch_ua_mobile': headers.get('Sec-Ch-Ua-Mobile', DEFAULT_FINGERPRINT['sec_ch_ua_mobile']),
            'sec_ch_ua_platform': headers.get('Sec-Ch-Ua-Platform', DEFAULT_FINGERPRINT['sec_ch_ua_platform']),
            'accept_language': headers.get('Accept-Language', DEFAULT_FINGERPRINT['accept_language']),
            'platform': platform,
            'viewport': DEFAULT_FINGERPRINT['viewport'],
            'locale': 'en-US',
            'timezone_id': 'America/New_York',
            'color_scheme': 'light',
            'chrome_version': _extract_chrome_version(user_agent)
        }
    except Exception:
        # Best effort: any network, decoding or shape error means "no fingerprint".
        return None
def get_flaresolverr_fingerprint(flaresolverr_url: str = "http://localhost:8191/v1", force_refresh: bool = False) -> Dict:
    """
    Get FlareSolverr's complete browser fingerprint, preferring cached data.

    Lookup order: a forced fetch (only when force_refresh is True), the
    in-memory cache, the database (tried once per process), a blocking fetch
    from FlareSolverr, and finally a copy of DEFAULT_FINGERPRINT.

    Args:
        flaresolverr_url: FlareSolverr API endpoint
        force_refresh: Force fetch from FlareSolverr (blocking)

    Returns:
        Dictionary containing full browser fingerprint
    """
    cache = _flaresolverr_fingerprint_cache

    def _remember(fingerprint: Dict) -> Dict:
        """Store a freshly fetched fingerprint in memory and in the database."""
        cache['fingerprint'] = fingerprint
        cache['timestamp'] = datetime.now()
        cache['chrome_version'] = fingerprint.get('chrome_version')
        _save_fingerprint_to_db(fingerprint)
        return fingerprint

    if force_refresh:
        fingerprint = _fetch_fingerprint_from_flaresolverr(flaresolverr_url)
        if fingerprint:
            old_version = cache.get('chrome_version')
            new_version = fingerprint.get('chrome_version')
            if old_version and new_version and old_version != new_version:
                print(f"[Cloudflare] FlareSolverr Chrome version changed: {old_version} -> {new_version}")
            return _remember(fingerprint)
        # Fetch failed: fall through to whatever is cached.

    # Memory cache (instant)
    if cache['fingerprint']:
        return cache['fingerprint']

    # Database (survives restarts); attempted only once per process
    if not cache['loaded_from_db']:
        cache['loaded_from_db'] = True
        db_fingerprint = _load_fingerprint_from_db()
        if db_fingerprint:
            cache['fingerprint'] = db_fingerprint
            cache['chrome_version'] = db_fingerprint.get('chrome_version')
            return db_fingerprint

    # Nothing cached: fetch from FlareSolverr (blocking). A forced fetch that
    # just failed is not repeated, to avoid a second back-to-back stall.
    if not force_refresh:
        fingerprint = _fetch_fingerprint_from_flaresolverr(flaresolverr_url)
        if fingerprint:
            return _remember(fingerprint)

    # Last resort: a private copy of the defaults, including the nested viewport
    # dict, so callers cannot mutate DEFAULT_FINGERPRINT through the result.
    fallback = DEFAULT_FINGERPRINT.copy()
    fallback['viewport'] = dict(DEFAULT_FINGERPRINT['viewport'])
    return fallback
def refresh_fingerprint_if_changed(flaresolverr_url: str = "http://localhost:8191/v1") -> bool:
    """
    Re-fetch the fingerprint from FlareSolverr and report Chrome version changes.

    Performs a blocking fetch. Whenever the fetch succeeds, the in-memory cache
    and the database are updated with the new fingerprint, whether or not the
    version changed.

    Args:
        flaresolverr_url: FlareSolverr API endpoint

    Returns:
        True only if both the cached and the fetched Chrome versions are known
        and differ; False otherwise (including when the fetch fails)
    """
    new_fingerprint = _fetch_fingerprint_from_flaresolverr(flaresolverr_url)
    if not new_fingerprint:
        return False

    cache = _flaresolverr_fingerprint_cache
    new_version = new_fingerprint.get('chrome_version')
    old_version = cache.get('chrome_version')
    version_changed = bool(new_version and old_version and new_version != old_version)
    if version_changed:
        print(f"[Cloudflare] FlareSolverr Chrome version updated: {old_version} -> {new_version}")

    cache['fingerprint'] = new_fingerprint
    cache['timestamp'] = datetime.now()
    # Record the version even when no previous one was cached; otherwise a
    # later change could never be detected from this code path.
    if new_version:
        cache['chrome_version'] = new_version
    _save_fingerprint_to_db(new_fingerprint)
    return version_changed
def get_flaresolverr_user_agent(flaresolverr_url: str = "http://localhost:8191/v1") -> str:
    """
    Return the user-agent string from FlareSolverr's fingerprint.

    Convenience wrapper around get_flaresolverr_fingerprint(), kept for
    backwards compatibility.

    Args:
        flaresolverr_url: FlareSolverr API endpoint

    Returns:
        The fingerprint's 'user_agent' entry, or DEFAULT_USER_AGENT when the
        fingerprint has no such entry
    """
    return get_flaresolverr_fingerprint(flaresolverr_url).get('user_agent', DEFAULT_USER_AGENT)
def get_playwright_context_options(flaresolverr_url: str = "http://localhost:8191/v1") -> Dict:
    """
    Get Playwright browser context options that match FlareSolverr's fingerprint.

    Use this when creating a Playwright browser context so the browser
    fingerprint matches the one FlareSolverr used to obtain cookies.

    Args:
        flaresolverr_url: FlareSolverr API endpoint

    Returns:
        Dictionary of options for browser.new_context()

    Example:
        context_options = get_playwright_context_options()
        context = browser.new_context(**context_options)
    """
    # A fingerprint restored from the database may lack keys; fill any gaps
    # from the defaults instead of failing with KeyError.
    fingerprint = {**DEFAULT_FINGERPRINT, **get_flaresolverr_fingerprint(flaresolverr_url)}
    return {
        'viewport': fingerprint['viewport'],
        'user_agent': fingerprint['user_agent'],
        'locale': fingerprint['locale'],
        'timezone_id': fingerprint['timezone_id'],
        'color_scheme': fingerprint['color_scheme'],
        'extra_http_headers': {
            'Accept-Language': fingerprint['accept_language'],
            'Sec-Ch-Ua': fingerprint['sec_ch_ua'],
            'Sec-Ch-Ua-Mobile': fingerprint['sec_ch_ua_mobile'],
            'Sec-Ch-Ua-Platform': fingerprint['sec_ch_ua_platform']
        }
    }
def get_playwright_stealth_scripts() -> str:
    """
    Get JavaScript code to inject into Playwright pages for anti-detection.

    The script overrides several navigator properties, stubs window.chrome,
    and wraps permissions.query and the iframe contentWindow getter.

    Returns:
        JavaScript code string

    Example:
        page.add_init_script(get_playwright_stealth_scripts())
    """
    return """
    // Hide webdriver property
    Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined
    });
    // Override plugins to look more realistic
    Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5]
    });
    // Override languages to match fingerprint
    Object.defineProperty(navigator, 'languages', {
        get: () => ['en-US', 'en']
    });
    // Fix permissions API
    // The original method is bound to its receiver: calling it detached
    // throws "Illegal invocation" for every non-notification query.
    const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
    window.navigator.permissions.query = (parameters) => (
        parameters.name === 'notifications' ?
            Promise.resolve({ state: Notification.permission }) :
            originalQuery(parameters)
    );
    // Add chrome runtime stub
    window.chrome = { runtime: {} };
    // Override connection property
    Object.defineProperty(navigator, 'connection', {
        get: () => ({
            effectiveType: '4g',
            rtt: 50,
            downlink: 10,
            saveData: false
        })
    });
    // Fix iframe contentWindow access
    const originalContentWindow = Object.getOwnPropertyDescriptor(HTMLIFrameElement.prototype, 'contentWindow');
    Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
        get: function() {
            try {
                return originalContentWindow.get.call(this);
            } catch (e) {
                return window;
            }
        }
    });
    // Override hardware concurrency
    Object.defineProperty(navigator, 'hardwareConcurrency', {
        get: () => 8
    });
    // Override device memory
    Object.defineProperty(navigator, 'deviceMemory', {
        get: () => 8
    });
    """
def invalidate_fingerprint_cache():
    """
    Drop the in-memory fingerprint so the next lookup has to re-resolve it.

    Only the cached 'fingerprint' and 'timestamp' entries are cleared; the
    'chrome_version' and 'loaded_from_db' entries are left as they are.
    Call this if FlareSolverr is updated or a fingerprint mismatch is suspected.
    """
    _flaresolverr_fingerprint_cache.update(fingerprint=None, timestamp=None)
class SiteStatus(Enum):
    """Possible outcomes of a site status check."""

    # Site is accessible
    WORKING = "working"
    # Cloudflare challenge detected
    CLOUDFLARE_CHALLENGE = "challenge"
    # 500, 502, 503, 504 errors
    SERVER_ERROR = "server_error"
    # 403 forbidden
    FORBIDDEN = "forbidden"
    # Request timeout
    TIMEOUT = "timeout"
    # Other errors
    UNKNOWN_ERROR = "unknown_error"
class CloudflareHandler:
    """Universal Cloudflare bypass and error handler.

    Bundles cookie storage (JSON file or in-memory), cookie acquisition via
    FlareSolverr, and site status classification for one download module.
    """

    def __init__(self,
                 module_name: str,
                 cookie_file: Optional[str] = None,
                 flaresolverr_url: str = "http://localhost:8191/v1",
                 flaresolverr_enabled: bool = True,
                 user_agent: Optional[str] = None,
                 logger=None,
                 aggressive_expiry: bool = True,
                 proxy_url: Optional[str] = None):
        """
        Initialize Cloudflare handler

        Args:
            module_name: Name of the module (for logging)
            cookie_file: Path to cookie storage file (optional; when omitted,
                cookies are kept in memory on this instance)
            flaresolverr_url: FlareSolverr API endpoint
            flaresolverr_enabled: Enable/disable FlareSolverr
            user_agent: User-Agent string (default: taken from
                get_flaresolverr_user_agent() for the given endpoint)
            logger: Logger instance (optional)
            aggressive_expiry: Use aggressive cookie expiration (7 days) vs conservative (only expired)
            proxy_url: Proxy URL passed to FlareSolverr (e.g., "socks5://user:pass@host:port")
        """
        self.module_name = module_name

        # Cookie file is optional; its parent directory is created eagerly.
        if cookie_file:
            self.cookie_file = Path(cookie_file)
            self.cookie_file.parent.mkdir(parents=True, exist_ok=True)
        else:
            self.cookie_file = None

        self.flaresolverr_url = flaresolverr_url
        self.flaresolverr_enabled = flaresolverr_enabled
        self.user_agent = user_agent or get_flaresolverr_user_agent(flaresolverr_url)
        self.aggressive_expiry = aggressive_expiry
        self.proxy_url = proxy_url

        # In-memory cookie storage (used when cookie_file is None)
        self._cookies = []
        self._cookies_user_agent = None

        self.logger = logger if logger else get_logger(f'Cloudflare.{module_name}')

    def log(self, message: str, level: str = "info", module: str = "Core"):
        """Log message using universal logger

        Args:
            message: Log message
            level: Log level (debug, info, warning, error); matched
                case-insensitively, anything else is logged as info
            module: Module context
        """
        method_name = {
            "DEBUG": "debug",
            "WARNING": "warning",
            "ERROR": "error",
        }.get(level.upper(), "info")
        getattr(self.logger, method_name)(message, module=module)

    # ==================== Cookie Management ====================

    def cookies_expired(self) -> bool:
        """
        Check if cookies are expired

        Missing, empty or unreadable cookie data counts as expired, as does
        any error raised while evaluating the cookies.

        Returns:
            True if cookies need refresh, False otherwise
        """
        if not self.cookie_file:
            if not self._cookies:
                return True
            # In-memory cookies carry no save time, so "now" is used.
            data = {'cookies': self._cookies, 'timestamp': datetime.now().isoformat()}
        elif not self.cookie_file.exists():
            return True
        else:
            try:
                with open(self.cookie_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except Exception as e:
                self.log(f"Error reading cookie file: {e}", "warning")
                return True

        try:
            cookies = data.get('cookies', [])
            if not cookies:
                return True
            if self.aggressive_expiry:
                return self._expired_aggressive(cookies, data)
            return self._expired_conservative(cookies)
        except Exception as e:
            self.log(f"Error checking cookie expiration: {e}", "warning")
            return True

    def _expired_aggressive(self, cookies: List[Dict], data: Dict) -> bool:
        """Aggressive policy: refresh when a relevant cookie expires within 7 days.

        Only cf_clearance and cookies on the most common cookie domain are
        considered, and known ad/tracking cookies are ignored. If none of
        them forces a refresh, the stored data must be younger than 12 hours.
        """
        from collections import Counter

        current_timestamp = int(time.time())
        days_threshold = 7 * 24 * 60 * 60  # 7 days in seconds

        # The most common cookie domain is treated as the primary site domain.
        domain_counts = Counter(c.get('domain', '') for c in cookies)
        site_domain = domain_counts.most_common(1)[0][0] if domain_counts else ''

        # Known ad/tracking cookie name prefixes (even on first-party domains)
        ad_prefixes = ('_ga', '_gat', '_gid', '_gcl', '__utm', 'panoramaId',
                       '_cc_', '_pubcid', 'cto_', 'FCCDCF', 'FCOEC', 'FCNEC',
                       'IABGPP', 'usprivacy', '__gads', '__gpi', '__eoi')

        for cookie in cookies:
            cookie_name = cookie.get('name', '')
            is_cf = cookie_name == 'cf_clearance'
            is_first_party = site_domain and site_domain == cookie.get('domain', '')
            if not is_cf and not is_first_party:
                continue
            if cookie_name.startswith(ad_prefixes):
                continue

            expiry = cookie.get('expiry') or cookie.get('expires')
            # Session cookies (no expiry, or -1) are skipped
            if not expiry or expiry == -1:
                continue
            if expiry < current_timestamp:
                self.log(f"Cookie '{cookie_name}' has expired", "debug")
                return True
            if expiry < (current_timestamp + days_threshold):
                days_remaining = (expiry - current_timestamp) / (24 * 60 * 60)
                self.log(f"Cookie '{cookie_name}' expires in {days_remaining:.1f} days, forcing refresh", "debug")
                return True

        # Fallback: age of the stored data
        saved_time = datetime.fromisoformat(data['timestamp'])
        return datetime.now() - saved_time > timedelta(hours=12)

    def _expired_conservative(self, cookies: List[Dict]) -> bool:
        """Conservative policy: refresh only for expired or soon-expiring session cookies.

        Returns True when cf_clearance has expired, when any non-analytics
        cookie has expired, or when a cookie whose name contains "session"
        expires within one hour.
        """
        current_timestamp = int(time.time())
        one_hour = 3600  # 1 hour in seconds

        cf_clearance = next((c for c in cookies if c['name'] == 'cf_clearance'), None)
        if cf_clearance:
            expiry = cf_clearance.get('expiry') or cf_clearance.get('expires')
            if expiry and expiry != -1 and expiry < current_timestamp:
                self.log("cf_clearance cookie has expired", "debug")
                return True

        # Analytics cookies expire quickly by design and are not relevant here
        analytics_prefixes = ('_ga', '_gat', '_gid', '_gcl', '__utm')
        for cookie in cookies:
            cookie_name = cookie.get('name', 'unknown')
            if cookie_name.startswith(analytics_prefixes):
                continue
            expiry = cookie.get('expiry') or cookie.get('expires')
            if not expiry or expiry == -1:
                continue
            if expiry < (current_timestamp + one_hour):
                time_remaining = (expiry - current_timestamp) / 60  # minutes
                if time_remaining <= 0:
                    self.log(f"Cookie '{cookie_name}' has expired", "debug")
                    return True
                if 'session' in cookie_name.lower():
                    self.log(f"Session cookie '{cookie_name}' expires in {time_remaining:.0f} min, refreshing", "debug")
                    return True
        return False

    def get_cookies_dict(self) -> Dict[str, str]:
        """
        Get cookies as dictionary for requests library

        Returns:
            Dictionary of cookie name->value pairs (empty when the cookie file
            is missing or cannot be read)
        """
        if not self.cookie_file:
            return {c['name']: c['value'] for c in self._cookies}
        if not self.cookie_file.exists():
            return {}
        try:
            with open(self.cookie_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return {cookie['name']: cookie['value'] for cookie in data.get('cookies', [])}
        except Exception as e:
            self.log(f"Error loading cookies: {e}", "warning")
            return {}

    def get_cookies_list(self) -> List[Dict]:
        """
        Get cookies as list for Playwright

        Returns:
            List of cookie dictionaries with 'expiry' converted to 'expires' for Playwright compatibility
        """
        if not self.cookie_file:
            cookies = self._cookies.copy()
        elif not self.cookie_file.exists():
            return []
        else:
            try:
                with open(self.cookie_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                cookies = data.get('cookies', [])
            except Exception as e:
                self.log(f"Error loading cookies: {e}", "warning")
                return []

        # FlareSolverr uses 'expiry' but Playwright expects 'expires'
        converted = []
        for cookie in cookies:
            entry = dict(cookie)  # copy, so stored cookies are not modified
            if 'expiry' in entry and 'expires' not in entry:
                entry['expires'] = entry.pop('expiry')
            converted.append(entry)
        return converted

    def get_user_agent(self) -> Optional[str]:
        """
        Get the user agent associated with the stored cookies.
        This is important because cf_clearance cookies are tied to browser fingerprint.

        Returns:
            User agent string or None if not stored (or the file is unreadable)
        """
        if not self.cookie_file:
            return self._cookies_user_agent
        if not self.cookie_file.exists():
            return None
        try:
            with open(self.cookie_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return data.get('user_agent')
        except Exception:
            return None

    def save_cookies(self, cookies: Union[List[Dict], Dict[str, str]], user_agent: Optional[str] = None):
        """
        Save cookies to file or in-memory storage

        Args:
            cookies: Either list of cookie dicts (Playwright) or dict (requests)
            user_agent: Browser user agent (important for cf_clearance cookies)
        """
        # A plain name->value dict is expanded into cookie dicts
        if isinstance(cookies, dict):
            cookies_list = [
                {'name': k, 'value': v, 'domain': '', 'path': '/'}
                for k, v in cookies.items()
            ]
        else:
            cookies_list = cookies

        if not self.cookie_file:
            self._cookies = cookies_list
            self._cookies_user_agent = user_agent
            self.log(f"Saved {len(cookies_list)} cookies to memory", "debug")
            return

        storage_data = {
            'cookies': cookies_list,
            'timestamp': datetime.now().isoformat()
        }
        if user_agent:
            storage_data['user_agent'] = user_agent
        try:
            with open(self.cookie_file, 'w', encoding='utf-8') as f:
                json.dump(storage_data, f, indent=2)
            self.log(f"Saved {len(cookies_list)} cookies to {self.cookie_file}", "debug")
        except Exception as e:
            self.log(f"Error saving cookies: {e}", "error")

    def load_cookies_to_playwright(self, context):
        """
        Load cookies into Playwright browser context

        Does nothing when no cookies are stored.

        Args:
            context: Playwright browser context
        """
        cookies = self.get_cookies_list()
        if not cookies:
            return
        # Clear existing cookies first so the stored cf_clearance takes effect;
        # a failure to clear is ignored.
        try:
            context.clear_cookies()
        except Exception:
            pass
        context.add_cookies(cookies)
        self.log(f"Loaded {len(cookies)} cookies into browser", "debug")

    def save_cookies_from_playwright(self, context):
        """
        Save cookies from Playwright browser context

        The user agent already stored alongside the cookies is kept.

        Args:
            context: Playwright browser context
        """
        cookies = context.cookies()
        # save_cookies() without a user agent would drop the stored one.
        self.save_cookies(cookies, user_agent=self.get_user_agent())

    def load_cookies_to_requests(self, session: "requests.Session"):
        """
        Load cookies into requests Session

        Args:
            session: requests.Session object
        """
        cookies = self.get_cookies_dict()
        session.cookies.update(cookies)
        self.log(f"Loaded {len(cookies)} cookies into requests session", "debug")

    # ==================== FlareSolverr Integration ====================

    @staticmethod
    def _redact_proxy_url(proxy_url: str) -> str:
        """Return the proxy URL with any ``user:pass@`` credentials masked, for logging."""
        from urllib.parse import urlsplit, urlunsplit

        try:
            parts = urlsplit(proxy_url)
            if '@' not in parts.netloc:
                return proxy_url
            host_part = parts.netloc.rpartition('@')[2]
            return urlunsplit(parts._replace(netloc=f"***:***@{host_part}"))
        except (ValueError, TypeError, AttributeError):
            return "<redacted>"

    def get_cookies_via_flaresolverr(self, url: str, max_retries: int = 2) -> bool:
        """
        Use FlareSolverr to bypass Cloudflare and get fresh cookies

        Only timeouts are retried; any other failure ends the attempt loop.

        Args:
            url: URL to fetch
            max_retries: Maximum number of attempts

        Returns:
            True if cookies obtained successfully, False otherwise
        """
        if not self.flaresolverr_enabled:
            self.log("FlareSolverr is disabled", "debug")
            return False

        for attempt in range(1, max_retries + 1):
            try:
                if attempt > 1:
                    self.log(f"Retrying FlareSolverr (attempt {attempt}/{max_retries})...", "info")
                else:
                    self.log("Using FlareSolverr to bypass Cloudflare...", "info")

                payload = {
                    "cmd": "request.get",
                    "url": url,
                    "maxTimeout": 120000  # 120 seconds for difficult challenges
                }
                if self.proxy_url:
                    payload["proxy"] = {"url": self.proxy_url}
                    # Credentials embedded in the proxy URL must not reach the log.
                    self.log(f"Using proxy: {self._redact_proxy_url(self.proxy_url)}", "debug")

                # HTTP timeout sits above maxTimeout so FlareSolverr can answer first.
                response = requests.post(self.flaresolverr_url, json=payload, timeout=130)
                data = response.json()

                if data.get('status') == 'ok' and data.get('solution'):
                    solution = data['solution']
                    cookies = solution.get('cookies', [])
                    user_agent = solution.get('userAgent')
                    if not cookies:
                        self.log("FlareSolverr returned no cookies", "warning")
                        return False
                    if any(c['name'] == 'cf_clearance' for c in cookies):
                        self.log(f"✓ FlareSolverr bypassed Cloudflare! Got {len(cookies)} cookies", "info")
                    else:
                        self.log(f"✓ FlareSolverr succeeded! Got {len(cookies)} cookies", "info")
                    # The user agent is stored with the cookies (important for cf_clearance)
                    self.save_cookies(cookies, user_agent=user_agent)
                    return True

                error_msg = data.get('message', 'Unknown error')
                self.log(f"FlareSolverr failed: {error_msg}", "warning")
                # Retry on timeout errors
                if 'timeout' in error_msg.lower() and attempt < max_retries:
                    continue
                return False
            except requests.exceptions.Timeout:
                self.log(f"FlareSolverr request timed out (attempt {attempt}/{max_retries})", "warning")
                if attempt < max_retries:
                    continue
                return False
            except Exception as e:
                self.log(f"FlareSolverr error: {e}", "error")
                return False
        return False

    # ==================== Site Status Detection ====================

    def check_site_status(self, url: str, timeout: int = 10) -> "Tuple[SiteStatus, Optional[str]]":
        """
        Check site status and detect Cloudflare challenges or errors

        Args:
            url: URL to check
            timeout: Request timeout in seconds

        Returns:
            Tuple of (SiteStatus, error_message); error_message is None when
            the site appears to be working
        """
        try:
            response = requests.get(
                url,
                timeout=timeout,
                headers={'User-Agent': self.user_agent},
                cookies=self.get_cookies_dict(),
                allow_redirects=True
            )
            status_code = response.status_code
            content = response.text.lower()

            # Server errors (5xx)
            if status_code >= 500:
                error_msg = {
                    500: "Internal server error (500)",
                    502: "Bad gateway (502)",
                    503: "Service unavailable (503)",
                    504: "Gateway timeout (504)",
                }.get(status_code, f"Server error {status_code}")
                self.log(error_msg, "warning")
                return (SiteStatus.SERVER_ERROR, error_msg)

            # Forbidden (403) is often just expired cookies, hence debug level
            if status_code == 403:
                self.log("Access forbidden (403) - cookies may be expired", "debug")
                return (SiteStatus.FORBIDDEN, "Access forbidden (403)")

            challenge_indicators = (
                'challenge-platform',
                'checking your browser',
                'just a moment',
                'verify you are human',
                'cloudflare',
                'cf-challenge',
                'cf_clearance',
            )

            # A short response is searched in full for challenge markers
            is_short = len(response.text) < 1000
            if is_short and any(indicator in content for indicator in challenge_indicators):
                self.log("Cloudflare challenge detected", "info")
                return (SiteStatus.CLOUDFLARE_CHALLENGE, "Cloudflare challenge detected")

            # Otherwise only the first 500 characters are searched
            head = content[:500]
            if any(indicator in head for indicator in challenge_indicators):
                self.log("Cloudflare challenge detected in page content", "info")
                return (SiteStatus.CLOUDFLARE_CHALLENGE, "Cloudflare challenge detected")

            self.log(f"Site appears to be working (status {status_code})", "debug")
            return (SiteStatus.WORKING, None)
        except requests.exceptions.Timeout:
            self.log(f"Request timed out after {timeout}s", "warning")
            return (SiteStatus.TIMEOUT, f"Request timed out after {timeout}s")
        except requests.exceptions.ConnectionError as e:
            self.log(f"Connection error: {e}", "error")
            return (SiteStatus.UNKNOWN_ERROR, f"Connection error: {e}")
        except Exception as e:
            self.log(f"Error checking site status: {e}", "error")
            return (SiteStatus.UNKNOWN_ERROR, str(e))

    def should_skip_download(self, status: "SiteStatus") -> bool:
        """
        Determine if download should be skipped based on site status

        Server errors, timeouts and unknown errors are skipped; Cloudflare
        challenges, 403 responses and working sites are not.

        Args:
            status: SiteStatus enum value

        Returns:
            True if download should be skipped, False if retry is possible
        """
        return status in (
            SiteStatus.SERVER_ERROR,
            SiteStatus.TIMEOUT,
            SiteStatus.UNKNOWN_ERROR,
        )

    # ==================== High-Level Helper Methods ====================

    def ensure_cookies(self, url: str) -> bool:
        """
        Ensure we have valid cookies, getting new ones via FlareSolverr if needed

        Args:
            url: URL to use for FlareSolverr request

        Returns:
            True if cookies are available, False otherwise
        """
        if self.cookies_expired():
            self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info")
            return self.get_cookies_via_flaresolverr(url)
        self.log("Using existing cookies", "debug")
        return True

    def check_and_bypass(self, url: str, auto_flaresolverr: bool = True) -> "Tuple[SiteStatus, bool]":
        """
        Check site status and automatically attempt FlareSolverr bypass if needed

        Args:
            url: URL to check
            auto_flaresolverr: Automatically call FlareSolverr on challenges

        Returns:
            Tuple of (SiteStatus, cookies_obtained). The flag is False when
            the download should be skipped or the bypass failed; it is True
            for every other status without a bypass attempt.
        """
        status, _error_msg = self.check_site_status(url)

        # If the site is down or timing out, don't bother with FlareSolverr
        if self.should_skip_download(status):
            return (status, False)

        if status == SiteStatus.CLOUDFLARE_CHALLENGE and auto_flaresolverr:
            self.log("Attempting automatic Cloudflare bypass...", "info")
            return (status, self.get_cookies_via_flaresolverr(url))

        return (status, True)

    def wait_for_cloudflare_playwright(self, page, max_wait: int = 120) -> bool:
        """
        Wait for Cloudflare challenge to resolve in Playwright page

        Polls the page content about once per second.

        Args:
            page: Playwright page object
            max_wait: Maximum wait time in seconds

        Returns:
            True if challenge resolved, False if still blocked
        """
        start_time = time.time()
        challenge_indicators = (
            'challenge-platform',
            'checking your browser',
            'just a moment',
        )
        while time.time() - start_time < max_wait:
            try:
                content = page.content().lower()
                if not any(indicator in content for indicator in challenge_indicators):
                    self.log("Cloudflare challenge resolved", "info")
                    return True
                # Progress is logged when the elapsed time is a multiple of 15 seconds
                elapsed = int(time.time() - start_time)
                if elapsed % 15 == 0 and elapsed > 0:
                    self.log(f"Still waiting for Cloudflare ({elapsed}s)...", "debug")
                time.sleep(1)
            except Exception as e:
                self.log(f"Error checking for Cloudflare: {e}", "warning")
                time.sleep(1)
        self.log(f"Cloudflare challenge did not resolve after {max_wait}s", "warning")
        return False