1025 lines
38 KiB
Python
1025 lines
38 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Universal Cloudflare Handler Module
|
|
|
|
Provides centralized Cloudflare bypass, error detection, and cookie management
|
|
for all download modules (imginn, fastdl, toolzu, snapchat, coppermine).
|
|
|
|
Features:
|
|
- FlareSolverr integration with retry logic
|
|
- Site status detection (down, challenge, working)
|
|
- Cookie management for Playwright and requests
|
|
- Standardized error handling (500, 403, 503, timeouts)
|
|
- Skip logic for when sites are unavailable
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
import requests
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, List, Optional, Tuple, Union
|
|
from enum import Enum
|
|
|
|
from modules.universal_logger import get_logger
|
|
|
|
|
|
# In-memory cache of the browser fingerprint reported by FlareSolverr.
#   fingerprint    - fingerprint dict currently in use (None until populated)
#   timestamp      - datetime of the most recent fetch from FlareSolverr
#   chrome_version - Chrome major version of the cached fingerprint
#   loaded_from_db - True once a load from the persistent store was attempted
_flaresolverr_fingerprint_cache = dict(
    fingerprint=None,
    timestamp=None,
    chrome_version=None,
    loaded_from_db=False,
)

# Persistent storage backend; assigned through set_fingerprint_database().
_fingerprint_db = None

# Fingerprint used when neither the cache, the database nor FlareSolverr can
# supply one (Chrome 142 on Linux).
DEFAULT_FINGERPRINT = {
    'user_agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36'
    ),
    'sec_ch_ua': '"Not_A Brand";v="99", "Chromium";v="142"',
    'sec_ch_ua_mobile': '?0',
    'sec_ch_ua_platform': '"Linux"',
    'accept_language': 'en-US,en;q=0.9',
    'platform': 'Linux',
    'viewport': {'width': 1920, 'height': 1080},
    'locale': 'en-US',
    'timezone_id': 'America/New_York',
    'color_scheme': 'light',
}

# Kept as a separate name for backwards compatibility with older callers.
DEFAULT_USER_AGENT = DEFAULT_FINGERPRINT['user_agent']
|
|
|
|
|
|
def _extract_chrome_version(user_agent: Optional[str]) -> Optional[str]:
    """Return the Chrome major version found in a user-agent string.

    Args:
        user_agent: Raw ``User-Agent`` value. ``None`` and other non-string
            values are tolerated so callers can pass through whatever an
            upstream JSON payload contained.

    Returns:
        The digits following ``Chrome/`` (for example ``"142"``), or ``None``
        when the input is not a string or has no ``Chrome/<digits>`` token.
    """
    # Local import: ``re`` is not among this module's top-level imports.
    import re

    if not isinstance(user_agent, str):
        return None
    match = re.search(r'Chrome/(\d+)', user_agent)
    return match.group(1) if match else None
|
|
|
|
|
|
def set_fingerprint_database(unified_db):
    """Register the database used to persist the FlareSolverr fingerprint.

    Intended to be called once during application start-up. The object is
    stored in the module-level ``_fingerprint_db`` slot, which the fingerprint
    load/save helpers consult.

    Args:
        unified_db: Database instance (the helpers in this module call its
            ``get_setting`` / ``set_setting`` methods). Passing ``None``
            disables persistence.
    """
    global _fingerprint_db
    _fingerprint_db = unified_db
|
|
|
|
|
|
def _load_fingerprint_from_db() -> Optional[Dict]:
    """Load the persisted fingerprint from the registered database.

    Returns:
        The stored fingerprint as a dict, or ``None`` when no database is
        registered, nothing is stored, the lookup fails, or the stored value
        does not decode to a dict.
    """
    if not _fingerprint_db:
        return None

    try:
        data = _fingerprint_db.get_setting('flaresolverr_fingerprint')
        if isinstance(data, str):
            data = json.loads(data)
    except Exception:
        # Deliberately best-effort: persistence is an optimisation, so any
        # storage or decoding failure is treated as "nothing stored".
        return None

    # Callers use ``.get`` on the result; anything that is not a non-empty
    # dict (e.g. a JSON list or number left by a bad write) is unusable.
    if isinstance(data, dict) and data:
        return data
    return None
|
|
|
|
|
|
def _save_fingerprint_to_db(fingerprint: Dict):
    """Persist a fingerprint to the registered database, best-effort.

    The fingerprint is JSON-encoded and written under the
    ``flaresolverr_fingerprint`` setting. Nothing happens when no database is
    registered, and any failure while writing is swallowed.

    Args:
        fingerprint: Fingerprint dictionary to store.
    """
    if _fingerprint_db:
        try:
            serialized = json.dumps(fingerprint)
            _fingerprint_db.set_setting('flaresolverr_fingerprint', serialized)
        except Exception:
            # Persistence is optional; never let it break the caller.
            pass
|
|
|
|
|
|
def _parse_httpbin_headers(response_text) -> Dict:
    """Extract the ``headers`` object from a page body that embeds httpbin JSON.

    The body may wrap the JSON in markup and the JSON may be pretty-printed,
    so the start is located with a whitespace-tolerant pattern and the object
    is decoded with the standard JSON decoder (which handles braces inside
    string values correctly).

    Args:
        response_text: Page body returned by FlareSolverr.

    Returns:
        The decoded ``headers`` mapping, or an empty dict when it cannot be
        found or decoded.
    """
    # Local import: ``re`` is not among this module's top-level imports.
    import re

    if not isinstance(response_text, str):
        return {}

    start = re.search(r'\{\s*"headers"', response_text)
    if start is None:
        return {}

    try:
        payload, _end = json.JSONDecoder().raw_decode(response_text, start.start())
    except ValueError:
        return {}

    headers = payload.get('headers') if isinstance(payload, dict) else None
    return headers if isinstance(headers, dict) else {}


def _fetch_fingerprint_from_flaresolverr(flaresolverr_url: str) -> Optional[Dict]:
    """Fetch the browser fingerprint directly from FlareSolverr (blocking).

    Asks FlareSolverr to load ``https://httpbin.org/headers`` and builds the
    fingerprint from the reported ``userAgent`` plus the request headers that
    httpbin echoes back. Header values that cannot be recovered fall back to
    ``DEFAULT_FINGERPRINT``.

    Args:
        flaresolverr_url: FlareSolverr API endpoint.

    Returns:
        Fingerprint dictionary (including ``chrome_version``), or ``None``
        when FlareSolverr is unreachable or does not return a solution.
    """
    try:
        response = requests.post(
            flaresolverr_url,
            json={
                "cmd": "request.get",
                "url": "https://httpbin.org/headers",
                "maxTimeout": 15000,
            },
            # Slightly longer than FlareSolverr's own 15 s budget.
            timeout=20,
        )
        if response.status_code != 200:
            return None
        data = response.json()
    except Exception:
        # Best-effort: any transport or decoding failure means "no fingerprint".
        return None

    if not isinstance(data, dict) or data.get('status') != 'ok':
        return None
    solution = data.get('solution')
    if not isinstance(solution, dict) or not solution:
        return None

    # ``or`` also covers an explicit null userAgent in the payload.
    user_agent = solution.get('userAgent') or DEFAULT_USER_AGENT
    headers = _parse_httpbin_headers(solution.get('response', ''))

    if 'Linux' in user_agent:
        platform = 'Linux'
    elif 'Windows' in user_agent:
        platform = 'Windows'
    else:
        platform = 'macOS'

    return {
        'user_agent': user_agent,
        'sec_ch_ua': headers.get('Sec-Ch-Ua', DEFAULT_FINGERPRINT['sec_ch_ua']),
        'sec_ch_ua_mobile': headers.get('Sec-Ch-Ua-Mobile', DEFAULT_FINGERPRINT['sec_ch_ua_mobile']),
        'sec_ch_ua_platform': headers.get('Sec-Ch-Ua-Platform', DEFAULT_FINGERPRINT['sec_ch_ua_platform']),
        'accept_language': headers.get('Accept-Language', DEFAULT_FINGERPRINT['accept_language']),
        'platform': platform,
        'viewport': DEFAULT_FINGERPRINT['viewport'],
        'locale': 'en-US',
        'timezone_id': 'America/New_York',
        'color_scheme': 'light',
        'chrome_version': _extract_chrome_version(user_agent),
    }
|
|
|
|
|
|
def get_flaresolverr_fingerprint(flaresolverr_url: str = "http://localhost:8191/v1", force_refresh: bool = False) -> Dict:
    """
    Return FlareSolverr's browser fingerprint, preferring cached copies.

    Lookup order: (optional forced fetch) -> in-memory cache -> database
    (attempted once per process) -> blocking fetch from FlareSolverr ->
    a copy of DEFAULT_FINGERPRINT.

    Args:
        flaresolverr_url: FlareSolverr API endpoint
        force_refresh: Fetch from FlareSolverr first (blocking); if that
            fails, the normal lookup order applies

    Returns:
        Dictionary containing the browser fingerprint
    """
    cache = _flaresolverr_fingerprint_cache

    def _remember(fresh: Dict, version) -> Dict:
        # Record a freshly fetched fingerprint in memory and in the database.
        cache['fingerprint'] = fresh
        cache['timestamp'] = datetime.now()
        cache['chrome_version'] = version
        _save_fingerprint_to_db(fresh)
        return fresh

    if force_refresh:
        fetched = _fetch_fingerprint_from_flaresolverr(flaresolverr_url)
        if fetched:
            old_version = cache.get('chrome_version')
            new_version = fetched.get('chrome_version')
            version_changed = bool(old_version and new_version and old_version != new_version)
            if version_changed:
                print(f"[Cloudflare] FlareSolverr Chrome version changed: {old_version} -> {new_version}")
            return _remember(fetched, new_version)
        # Forced fetch failed: continue with whatever is cached.

    in_memory = cache['fingerprint']
    if in_memory:
        return in_memory

    if not cache['loaded_from_db']:
        # Flag first so a failing or empty database is only consulted once.
        cache['loaded_from_db'] = True
        stored = _load_fingerprint_from_db()
        if stored:
            cache['fingerprint'] = stored
            cache['chrome_version'] = stored.get('chrome_version')
            return stored

    # Nothing cached anywhere: blocking fetch.
    fetched = _fetch_fingerprint_from_flaresolverr(flaresolverr_url)
    if fetched:
        return _remember(fetched, fetched.get('chrome_version'))

    # Last resort; a copy so callers cannot mutate the module constant.
    return DEFAULT_FINGERPRINT.copy()
|
|
|
|
|
|
def refresh_fingerprint_if_changed(flaresolverr_url: str = "http://localhost:8191/v1") -> bool:
    """
    Re-fetch the fingerprint from FlareSolverr and report Chrome version changes.

    Every successful fetch replaces the cached fingerprint, refreshes the
    cache timestamp and is persisted. The cached Chrome version is now always
    recorded, so a change can be detected on the next call even when the
    cache started out empty.

    Args:
        flaresolverr_url: FlareSolverr API endpoint

    Returns:
        True if a previously known Chrome version differs from the newly
        fetched one, False otherwise (including when the fetch fails or no
        previous version was known)
    """
    new_fingerprint = _fetch_fingerprint_from_flaresolverr(flaresolverr_url)
    if not new_fingerprint:
        return False

    cache = _flaresolverr_fingerprint_cache
    new_version = new_fingerprint.get('chrome_version')
    old_version = cache.get('chrome_version')

    version_changed = bool(new_version and old_version and new_version != old_version)
    if version_changed:
        print(f"[Cloudflare] FlareSolverr Chrome version updated: {old_version} -> {new_version}")

    cache['fingerprint'] = new_fingerprint
    cache['timestamp'] = datetime.now()
    if new_version:
        # Keep the last known version if this fetch could not determine one.
        cache['chrome_version'] = new_version
    _save_fingerprint_to_db(new_fingerprint)

    return version_changed
|
|
|
|
|
|
def get_flaresolverr_user_agent(flaresolverr_url: str = "http://localhost:8191/v1") -> str:
    """
    Return the user-agent string from FlareSolverr's fingerprint.

    Thin backwards-compatibility wrapper over get_flaresolverr_fingerprint().

    Args:
        flaresolverr_url: FlareSolverr API endpoint

    Returns:
        The fingerprint's ``user_agent`` entry, or DEFAULT_USER_AGENT when
        the fingerprint has none
    """
    return get_flaresolverr_fingerprint(flaresolverr_url).get('user_agent', DEFAULT_USER_AGENT)
|
|
|
|
|
|
def get_playwright_context_options(flaresolverr_url: str = "http://localhost:8191/v1") -> Dict:
    """
    Get Playwright browser context options that match FlareSolverr's fingerprint.

    Use this when creating a Playwright browser context so the browser
    presents the same fingerprint FlareSolverr used when it obtained cookies.

    Keys that are missing (or null) in the resolved fingerprint - for example
    one persisted by an older version of this module - are filled in from
    DEFAULT_FINGERPRINT instead of raising KeyError.

    Args:
        flaresolverr_url: FlareSolverr API endpoint

    Returns:
        Dictionary of options for browser.new_context()

    Example:
        context_options = get_playwright_context_options()
        context = browser.new_context(**context_options)
    """
    resolved = get_flaresolverr_fingerprint(flaresolverr_url) or {}
    fingerprint = dict(DEFAULT_FINGERPRINT)
    fingerprint.update({key: value for key, value in resolved.items() if value is not None})

    return {
        'viewport': fingerprint['viewport'],
        'user_agent': fingerprint['user_agent'],
        'locale': fingerprint['locale'],
        'timezone_id': fingerprint['timezone_id'],
        'color_scheme': fingerprint['color_scheme'],
        'extra_http_headers': {
            'Accept-Language': fingerprint['accept_language'],
            'Sec-Ch-Ua': fingerprint['sec_ch_ua'],
            'Sec-Ch-Ua-Mobile': fingerprint['sec_ch_ua_mobile'],
            'Sec-Ch-Ua-Platform': fingerprint['sec_ch_ua_platform'],
        },
    }
|
|
|
|
|
|
def get_playwright_stealth_scripts() -> str:
    """
    Get JavaScript code to inject into Playwright pages for anti-detection.

    Use this with page.add_init_script() to make Playwright harder to detect.

    The script is wrapped in an IIFE so its helper bindings stay local. The
    permissions override binds the original ``query`` to its owner object
    (calling the detached native method throws "Illegal invocation"), an
    existing ``window.chrome`` object is extended rather than replaced, and
    the permissions/iframe patches are skipped when the underlying API is
    not present.

    Returns:
        JavaScript code string

    Example:
        page.add_init_script(get_playwright_stealth_scripts())
    """
    return """
(() => {
    // Hide webdriver property
    Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined
    });

    // Override plugins to look more realistic
    Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5]
    });

    // Override languages to match fingerprint
    Object.defineProperty(navigator, 'languages', {
        get: () => ['en-US', 'en']
    });

    // Fix permissions API
    const permissions = window.navigator.permissions;
    if (permissions && typeof permissions.query === 'function') {
        const originalQuery = permissions.query.bind(permissions);
        permissions.query = (parameters) => (
            parameters && parameters.name === 'notifications' ?
                Promise.resolve({ state: Notification.permission }) :
                originalQuery(parameters)
        );
    }

    // Add chrome runtime stub (keep anything the browser already exposes)
    window.chrome = window.chrome || {};
    window.chrome.runtime = window.chrome.runtime || {};

    // Override connection property
    Object.defineProperty(navigator, 'connection', {
        get: () => ({
            effectiveType: '4g',
            rtt: 50,
            downlink: 10,
            saveData: false
        })
    });

    // Fix iframe contentWindow access
    const originalContentWindow = Object.getOwnPropertyDescriptor(HTMLIFrameElement.prototype, 'contentWindow');
    if (originalContentWindow && originalContentWindow.get) {
        Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {
            get: function() {
                try {
                    return originalContentWindow.get.call(this);
                } catch (e) {
                    return window;
                }
            }
        });
    }

    // Override hardware concurrency
    Object.defineProperty(navigator, 'hardwareConcurrency', {
        get: () => 8
    });

    // Override device memory
    Object.defineProperty(navigator, 'deviceMemory', {
        get: () => 8
    });
})();
"""
|
|
|
|
|
|
def invalidate_fingerprint_cache():
    """
    Drop the in-memory fingerprint so the next lookup has to re-resolve it.

    Clears the cached fingerprint and its timestamp; the remembered Chrome
    version and the "already loaded from database" flag are left untouched.

    Call this if FlareSolverr is updated or if you suspect fingerprint mismatch.
    """
    _flaresolverr_fingerprint_cache.update(fingerprint=None, timestamp=None)
|
|
|
|
|
|
class SiteStatus(Enum):
    """Outcome of probing a site before attempting a download."""

    # Site responded normally.
    WORKING = "working"
    # A Cloudflare challenge page was detected.
    CLOUDFLARE_CHALLENGE = "challenge"
    # HTTP 5xx (500, 502, 503, 504, ...).
    SERVER_ERROR = "server_error"
    # HTTP 403.
    FORBIDDEN = "forbidden"
    # The request timed out.
    TIMEOUT = "timeout"
    # Any other failure.
    UNKNOWN_ERROR = "unknown_error"
|
|
|
|
|
|
class CloudflareHandler:
    """Universal Cloudflare bypass and error handler.

    Bundles cookie storage (JSON file or in-memory), FlareSolverr-based
    cookie acquisition, site status probing and a Playwright wait helper for
    one download module.
    """

    # Cookie-name prefixes ignored by the aggressive expiry check (ads/tracking).
    _AD_COOKIE_PREFIXES = ('_ga', '_gat', '_gid', '_gcl', '__utm', 'panoramaId',
                           '_cc_', '_pubcid', 'cto_', 'FCCDCF', 'FCOEC', 'FCNEC',
                           'IABGPP', 'usprivacy', '__gads', '__gpi', '__eoi')

    # Cookie-name prefixes ignored by the conservative expiry check (analytics).
    _ANALYTICS_COOKIE_PREFIXES = ('_ga', '_gat', '_gid', '_gcl', '__utm')

    # Substrings (lower-case) that mark a response body as a challenge page.
    _CHALLENGE_INDICATORS = (
        'challenge-platform',
        'checking your browser',
        'just a moment',
        'verify you are human',
        'cloudflare',
        'cf-challenge',
        'cf_clearance',
    )

    # Narrower set used while polling a live Playwright page.
    _PLAYWRIGHT_CHALLENGE_INDICATORS = (
        'challenge-platform',
        'checking your browser',
        'just a moment',
    )

    _SERVER_ERROR_MESSAGES = {
        500: "Internal server error (500)",
        502: "Bad gateway (502)",
        503: "Service unavailable (503)",
        504: "Gateway timeout (504)",
    }

    def __init__(self,
                 module_name: str,
                 cookie_file: Optional[str] = None,
                 flaresolverr_url: str = "http://localhost:8191/v1",
                 flaresolverr_enabled: bool = True,
                 user_agent: Optional[str] = None,
                 logger=None,
                 aggressive_expiry: bool = True,
                 proxy_url: Optional[str] = None):
        """
        Initialize Cloudflare handler

        Args:
            module_name: Name of the module (for logging)
            cookie_file: Path to cookie storage file (optional; when omitted
                cookies are kept in memory on this instance)
            flaresolverr_url: FlareSolverr API endpoint
            flaresolverr_enabled: Enable/disable FlareSolverr
            user_agent: User-Agent string (default: the user agent reported
                by get_flaresolverr_user_agent())
            logger: Logger instance (optional)
            aggressive_expiry: Use aggressive cookie expiration (7 days) vs conservative (only expired)
            proxy_url: Proxy URL for FlareSolverr and requests (e.g., "socks5://user:pass@host:port")
        """
        self.module_name = module_name

        # Cookie file is optional (in-memory storage is used without it).
        if cookie_file:
            self.cookie_file = Path(cookie_file)
            self.cookie_file.parent.mkdir(parents=True, exist_ok=True)
        else:
            self.cookie_file = None

        self.flaresolverr_url = flaresolverr_url
        self.flaresolverr_enabled = flaresolverr_enabled

        # Explicit user agent wins; otherwise ask the fingerprint helpers.
        self.user_agent = user_agent or get_flaresolverr_user_agent(flaresolverr_url)

        self.aggressive_expiry = aggressive_expiry
        self.proxy_url = proxy_url

        # In-memory cookie storage (used when cookie_file is None).
        self._cookies = []
        self._cookies_user_agent = None
        # When the in-memory cookies were last saved; drives the age check.
        self._cookies_saved_at = None

        self.logger = logger if logger else get_logger(f'Cloudflare.{module_name}')

    def log(self, message: str, level: str = "info", module: str = "Core"):
        """Log message using universal logger

        Args:
            message: Log message
            level: Log level (debug, info, warning, error); anything else is
                logged at info level
            module: Module context
        """
        method_name = {
            "DEBUG": "debug",
            "WARNING": "warning",
            "ERROR": "error",
        }.get(str(level).upper(), "info")
        getattr(self.logger, method_name)(message, module=module)

    # ==================== Cookie Management ====================

    def _load_storage(self) -> Optional[Dict]:
        """Return the stored cookie payload, or None when no cookie file exists.

        The payload has the shape written by save_cookies(): ``cookies``,
        ``timestamp`` and optionally ``user_agent``. For in-memory storage an
        equivalent dict is synthesised. Read/parse errors propagate so each
        caller can log them in its own words.
        """
        if not self.cookie_file:
            # Instances created before a save have no timestamp; "now" keeps
            # the age check neutral for them.
            saved_at = getattr(self, '_cookies_saved_at', None) or datetime.now()
            return {
                'cookies': self._cookies,
                'timestamp': saved_at.isoformat(),
                'user_agent': self._cookies_user_agent,
            }

        if not self.cookie_file.exists():
            return None

        with open(self.cookie_file, 'r') as f:
            return json.load(f)

    def cookies_expired(self) -> bool:
        """
        Check if cookies are expired

        Returns:
            True if cookies need refresh (missing, unreadable, expired or,
            in aggressive mode, close to expiry / older than 12 hours),
            False otherwise
        """
        try:
            data = self._load_storage()
        except Exception as e:
            self.log(f"Error reading cookie file: {e}", "warning")
            return True

        if data is None:
            return True

        try:
            cookies = data.get('cookies', [])
            if not cookies:
                return True

            if self.aggressive_expiry:
                return self._expired_aggressive(cookies, data)
            return self._expired_conservative(cookies)

        except Exception as e:
            # Malformed cookie data is treated as "needs refresh".
            self.log(f"Error checking cookie expiration: {e}", "warning")
            return True

    def _expired_aggressive(self, cookies: List[Dict], data: Dict) -> bool:
        """Aggressive policy: refresh when a relevant cookie expires within 7 days.

        Only ``cf_clearance`` and cookies on the most common cookie domain
        (taken to be the site's own domain) are considered, and known
        ad/tracking cookies are skipped. If none of those force a refresh,
        cookies saved more than 12 hours ago are considered stale.
        """
        from collections import Counter

        now = int(time.time())
        horizon = now + 7 * 24 * 60 * 60  # 7 days ahead, in seconds

        domain_counts = Counter(c.get('domain', '') for c in cookies)
        site_domain = domain_counts.most_common(1)[0][0] if domain_counts else ''

        for cookie in cookies:
            cookie_name = cookie.get('name', '')

            is_cf = cookie_name == 'cf_clearance'
            is_first_party = bool(site_domain) and cookie.get('domain', '') == site_domain
            if not (is_cf or is_first_party):
                continue

            # Skip known ad/tracking cookies even on the site's domain.
            if cookie_name.startswith(self._AD_COOKIE_PREFIXES):
                continue

            expiry = cookie.get('expiry') or cookie.get('expires')
            # No expiry or -1 means a session cookie: nothing to compare.
            if not expiry or expiry == -1:
                continue

            if expiry < now:
                self.log(f"Cookie '{cookie_name}' has expired", "debug")
                return True
            if expiry < horizon:
                days_remaining = (expiry - now) / (24 * 60 * 60)
                self.log(f"Cookie '{cookie_name}' expires in {days_remaining:.1f} days, forcing refresh", "debug")
                return True

        # Fallback: age of the stored cookie set.
        saved_time = datetime.fromisoformat(data['timestamp'])
        return datetime.now() - saved_time > timedelta(hours=12)

    def _expired_conservative(self, cookies: List[Dict]) -> bool:
        """Conservative policy: refresh only on actual expiry.

        A refresh is required when ``cf_clearance`` has expired, when any
        non-analytics cookie has expired, or when a cookie whose name
        contains "session" expires within the next hour.
        """
        now = int(time.time())
        one_hour = 3600

        cf_clearance = next((c for c in cookies if c.get('name') == 'cf_clearance'), None)
        if cf_clearance:
            expiry = cf_clearance.get('expiry') or cf_clearance.get('expires')
            if expiry and expiry != -1 and expiry < now:
                self.log("cf_clearance cookie has expired", "debug")
                return True

        for cookie in cookies:
            cookie_name = cookie.get('name', 'unknown')

            # Analytics cookies are short-lived by design and not needed for bypass.
            if cookie_name.startswith(self._ANALYTICS_COOKIE_PREFIXES):
                continue

            expiry = cookie.get('expiry') or cookie.get('expires')
            if not expiry or expiry == -1:
                continue
            if expiry >= now + one_hour:
                continue

            minutes_remaining = (expiry - now) / 60
            if minutes_remaining <= 0:
                self.log(f"Cookie '{cookie_name}' has expired", "debug")
                return True
            if 'session' in cookie_name.lower():
                # Session cookies expiring soon are critical.
                self.log(f"Session cookie '{cookie_name}' expires in {minutes_remaining:.0f} min, refreshing", "debug")
                return True

        return False

    def get_cookies_dict(self) -> Dict[str, str]:
        """
        Get cookies as dictionary for requests library

        Returns:
            Dictionary of cookie name->value pairs; empty when nothing is
            stored or the stored data cannot be read
        """
        try:
            data = self._load_storage()
            stored = data.get('cookies', []) if data else []
            return {cookie['name']: cookie['value'] for cookie in stored}
        except Exception as e:
            self.log(f"Error loading cookies: {e}", "warning")
            return {}

    def get_cookies_list(self) -> List[Dict]:
        """
        Get cookies as list for Playwright

        Returns:
            List of cookie dictionaries with 'expiry' converted to 'expires' for Playwright compatibility
        """
        try:
            data = self._load_storage()
        except Exception as e:
            self.log(f"Error loading cookies: {e}", "warning")
            return []

        cookies = data.get('cookies', []) if isinstance(data, dict) else []

        # FlareSolverr uses 'expiry' but Playwright expects 'expires'. The
        # 'expiry' key is always removed so the two never coexist; an
        # existing 'expires' value takes precedence.
        converted = []
        for cookie in cookies:
            c = dict(cookie)
            expiry = c.pop('expiry', None)
            if expiry is not None and 'expires' not in c:
                c['expires'] = expiry
            converted.append(c)
        return converted

    def get_user_agent(self) -> Optional[str]:
        """
        Get the user agent associated with the stored cookies.
        This is important because cf_clearance cookies are tied to browser fingerprint.

        Returns:
            User agent string or None if not stored
        """
        if not self.cookie_file:
            return self._cookies_user_agent

        try:
            data = self._load_storage()
        except Exception:
            return None
        return data.get('user_agent') if isinstance(data, dict) else None

    def save_cookies(self, cookies: Union[List[Dict], Dict[str, str]], user_agent: Optional[str] = None):
        """
        Save cookies to file or in-memory storage

        Args:
            cookies: Either list of cookie dicts (Playwright) or dict (requests)
            user_agent: Browser user agent (important for cf_clearance cookies)
        """
        # Normalise the requests-style mapping to the list-of-dicts format.
        if isinstance(cookies, dict):
            cookies_list = [
                {'name': name, 'value': value, 'domain': '', 'path': '/'}
                for name, value in cookies.items()
            ]
        else:
            cookies_list = cookies

        if not self.cookie_file:
            self._cookies = cookies_list
            self._cookies_user_agent = user_agent
            # Remember when these were stored so the age check can work.
            self._cookies_saved_at = datetime.now()
            self.log(f"Saved {len(cookies_list)} cookies to memory", "debug")
            return

        storage_data = {
            'cookies': cookies_list,
            'timestamp': datetime.now().isoformat(),
        }
        if user_agent:
            storage_data['user_agent'] = user_agent

        try:
            with open(self.cookie_file, 'w') as f:
                json.dump(storage_data, f, indent=2)
            self.log(f"Saved {len(cookies_list)} cookies to {self.cookie_file}", "debug")
        except Exception as e:
            self.log(f"Error saving cookies: {e}", "error")

    def load_cookies_to_playwright(self, context):
        """
        Load cookies into Playwright browser context

        Args:
            context: Playwright browser context
        """
        cookies = self.get_cookies_list()
        if not cookies:
            return

        # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
        try:
            context.clear_cookies()
        except Exception as e:
            self.log(f"Could not clear existing browser cookies: {e}", "debug")

        context.add_cookies(cookies)
        self.log(f"Loaded {len(cookies)} cookies into browser", "debug")

    def save_cookies_from_playwright(self, context):
        """
        Save cookies from Playwright browser context

        The user agent already associated with the stored cookies is carried
        over, so get_user_agent() keeps returning it after this save.
        NOTE(review): assumes the context was created with that same user
        agent - confirm against callers.

        Args:
            context: Playwright browser context
        """
        self.save_cookies(context.cookies(), user_agent=self.get_user_agent())

    def load_cookies_to_requests(self, session: "requests.Session"):
        """
        Load cookies into requests Session

        Args:
            session: requests.Session object
        """
        cookies = self.get_cookies_dict()
        session.cookies.update(cookies)
        self.log(f"Loaded {len(cookies)} cookies into requests session", "debug")

    # ==================== FlareSolverr Integration ====================

    @staticmethod
    def _redact_proxy_url(proxy_url: str) -> str:
        """Return the proxy URL with any ``user:password@`` part masked for logging."""
        # Local import: ``re`` is not among this module's top-level imports.
        import re

        # Greedy up to the last '@' before the first '/', so passwords that
        # themselves contain '@' are fully masked.
        return re.sub(r'(?<=://)[^/\s]*@', '***@', str(proxy_url), count=1)

    def get_cookies_via_flaresolverr(self, url: str, max_retries: int = 2) -> bool:
        """
        Use FlareSolverr to bypass Cloudflare and get fresh cookies

        Args:
            url: URL to fetch
            max_retries: Maximum number of attempts; only timeouts are retried

        Returns:
            True if cookies obtained successfully, False otherwise
        """
        if not self.flaresolverr_enabled:
            self.log("FlareSolverr is disabled", "debug")
            return False

        for attempt in range(1, max_retries + 1):
            try:
                if attempt > 1:
                    self.log(f"Retrying FlareSolverr (attempt {attempt}/{max_retries})...", "info")
                else:
                    self.log("Using FlareSolverr to bypass Cloudflare...", "info")

                payload = {
                    "cmd": "request.get",
                    "url": url,
                    "maxTimeout": 120000,  # 120 seconds for difficult challenges
                }

                if self.proxy_url:
                    payload["proxy"] = {"url": self.proxy_url}
                    # Never write proxy credentials to the log.
                    self.log(f"Using proxy: {self._redact_proxy_url(self.proxy_url)}", "debug")

                # HTTP timeout slightly above FlareSolverr's own maxTimeout.
                response = requests.post(self.flaresolverr_url, json=payload, timeout=130)
                data = response.json()

                if data.get('status') == 'ok' and data.get('solution'):
                    solution = data['solution']
                    cookies = solution.get('cookies', [])
                    user_agent = solution.get('userAgent')

                    if not cookies:
                        self.log("FlareSolverr returned no cookies", "warning")
                        return False

                    if any(c.get('name') == 'cf_clearance' for c in cookies):
                        self.log(f"✓ FlareSolverr bypassed Cloudflare! Got {len(cookies)} cookies", "info")
                    else:
                        self.log(f"✓ FlareSolverr succeeded! Got {len(cookies)} cookies", "info")

                    # Save cookies with user_agent (important for cf_clearance)
                    self.save_cookies(cookies, user_agent=user_agent)
                    return True

                error_msg = str(data.get('message') or 'Unknown error')
                self.log(f"FlareSolverr failed: {error_msg}", "warning")

                # Retry on timeout errors
                if 'timeout' in error_msg.lower() and attempt < max_retries:
                    continue
                return False

            except requests.exceptions.Timeout:
                self.log(f"FlareSolverr request timed out (attempt {attempt}/{max_retries})", "warning")
                if attempt < max_retries:
                    continue
                return False
            except Exception as e:
                self.log(f"FlareSolverr error: {e}", "error")
                return False

        return False

    # ==================== Site Status Detection ====================

    def check_site_status(self, url: str, timeout: int = 10) -> "Tuple[SiteStatus, Optional[str]]":
        """
        Check site status and detect Cloudflare challenges or errors

        The request carries the stored cookies together with the user agent
        they were saved with (falling back to this handler's user agent).

        NOTE(review): proxy_url is not applied to this request even though
        the constructor documents it for "FlareSolverr and requests" -
        confirm whether the probe should go through the proxy as well.

        Args:
            url: URL to check
            timeout: Request timeout in seconds

        Returns:
            Tuple of (SiteStatus, error_message)
        """
        try:
            response = requests.get(
                url,
                timeout=timeout,
                headers={'User-Agent': self.get_user_agent() or self.user_agent},
                cookies=self.get_cookies_dict(),
                allow_redirects=True,
            )

            status_code = response.status_code
            content = response.text.lower()

            # Server errors (5xx)
            if status_code >= 500:
                error_msg = self._SERVER_ERROR_MESSAGES.get(status_code, f"Server error {status_code}")
                self.log(error_msg, "warning")
                return (SiteStatus.SERVER_ERROR, error_msg)

            # Forbidden (403). Often just expired cookies, which FlareSolverr will fix.
            if status_code == 403:
                self.log("Access forbidden (403) - cookies may be expired", "debug")
                return (SiteStatus.FORBIDDEN, "Access forbidden (403)")

            indicators = self._CHALLENGE_INDICATORS

            # Short responses are scanned in full: likely a bare challenge page.
            if len(response.text) < 1000 and any(indicator in content for indicator in indicators):
                self.log("Cloudflare challenge detected", "info")
                return (SiteStatus.CLOUDFLARE_CHALLENGE, "Cloudflare challenge detected")

            # Longer pages: only the first 500 characters are inspected.
            head = content[:500]
            if any(indicator in head for indicator in indicators):
                self.log("Cloudflare challenge detected in page content", "info")
                return (SiteStatus.CLOUDFLARE_CHALLENGE, "Cloudflare challenge detected")

            self.log(f"Site appears to be working (status {status_code})", "debug")
            return (SiteStatus.WORKING, None)

        except requests.exceptions.Timeout:
            self.log(f"Request timed out after {timeout}s", "warning")
            return (SiteStatus.TIMEOUT, f"Request timed out after {timeout}s")
        except requests.exceptions.ConnectionError as e:
            self.log(f"Connection error: {e}", "error")
            return (SiteStatus.UNKNOWN_ERROR, f"Connection error: {e}")
        except Exception as e:
            self.log(f"Error checking site status: {e}", "error")
            return (SiteStatus.UNKNOWN_ERROR, str(e))

    def should_skip_download(self, status: "SiteStatus") -> bool:
        """
        Determine if download should be skipped based on site status

        Args:
            status: SiteStatus enum value

        Returns:
            True if download should be skipped, False if retry is possible
        """
        # Server errors and timeouts mean the site is down; unknown errors are
        # skipped to be safe. Challenges and 403s can be bypassed/retried.
        return status in (
            SiteStatus.SERVER_ERROR,
            SiteStatus.TIMEOUT,
            SiteStatus.UNKNOWN_ERROR,
        )

    # ==================== High-Level Helper Methods ====================

    def ensure_cookies(self, url: str) -> bool:
        """
        Ensure we have valid cookies, getting new ones via FlareSolverr if needed

        Args:
            url: URL to use for FlareSolverr request

        Returns:
            True if cookies are available, False otherwise
        """
        if not self.cookies_expired():
            self.log("Using existing cookies", "debug")
            return True

        self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info")
        return self.get_cookies_via_flaresolverr(url)

    def check_and_bypass(self, url: str, auto_flaresolverr: bool = True) -> "Tuple[SiteStatus, bool]":
        """
        Check site status and automatically attempt FlareSolverr bypass if needed

        NOTE(review): for every non-skipped status other than an auto-bypassed
        challenge (including FORBIDDEN) the second element is True without
        any cookie refresh being attempted - confirm callers expect that.

        Args:
            url: URL to check
            auto_flaresolverr: Automatically call FlareSolverr on challenges

        Returns:
            Tuple of (SiteStatus, cookies_obtained)
        """
        status, _error_msg = self.check_site_status(url)

        # If site is down or timing out, don't bother with FlareSolverr
        if self.should_skip_download(status):
            return (status, False)

        if status == SiteStatus.CLOUDFLARE_CHALLENGE and auto_flaresolverr:
            self.log("Attempting automatic Cloudflare bypass...", "info")
            return (status, self.get_cookies_via_flaresolverr(url))

        return (status, True)

    def wait_for_cloudflare_playwright(self, page, max_wait: int = 120) -> bool:
        """
        Wait for Cloudflare challenge to resolve in Playwright page

        Polls the page content roughly once per second. Elapsed time is
        measured with a monotonic clock and progress is logged each time
        another 15 seconds have passed.

        Args:
            page: Playwright page object
            max_wait: Maximum wait time in seconds

        Returns:
            True if challenge resolved, False if still blocked
        """
        started = time.monotonic()
        next_progress_log = 15

        while True:
            elapsed = time.monotonic() - started
            if elapsed >= max_wait:
                break

            try:
                content = page.content().lower()

                if not any(indicator in content for indicator in self._PLAYWRIGHT_CHALLENGE_INDICATORS):
                    self.log("Cloudflare challenge resolved", "info")
                    return True

                if elapsed >= next_progress_log:
                    self.log(f"Still waiting for Cloudflare ({int(elapsed)}s)...", "debug")
                    next_progress_log += 15

            except Exception as e:
                # The page may be mid-navigation; keep polling.
                self.log(f"Error checking for Cloudflare: {e}", "warning")

            time.sleep(1)

        self.log(f"Cloudflare challenge did not resolve after {max_wait}s", "warning")
        return False
|