""" Scrapers Router Handles scraper management and error monitoring: - Scraper configuration (list, get, update) - Cookie management (test connection, upload, clear) - Error tracking (recent, count, dismiss, mark viewed) """ import json import re from datetime import datetime, timedelta from pathlib import Path from typing import Dict, List, Optional import requests from fastapi import APIRouter, Body, Depends, Query, Request from pydantic import BaseModel from slowapi import Limiter from slowapi.util import get_remote_address from ..core.dependencies import get_current_user, require_admin, get_app_state from ..core.exceptions import handle_exceptions, NotFoundError, ValidationError from modules.universal_logger import get_logger logger = get_logger('API') router = APIRouter(prefix="/api", tags=["Scrapers"]) limiter = Limiter(key_func=get_remote_address) # ============================================================================ # PYDANTIC MODELS # ============================================================================ class ScraperUpdate(BaseModel): enabled: Optional[bool] = None proxy_enabled: Optional[bool] = None proxy_url: Optional[str] = None flaresolverr_required: Optional[bool] = None base_url: Optional[str] = None class CookieUpload(BaseModel): cookies: List[dict] merge: bool = True user_agent: Optional[str] = None class DismissErrors(BaseModel): error_ids: Optional[List[int]] = None dismiss_all: bool = False class MarkErrorsViewed(BaseModel): error_ids: Optional[List[int]] = None mark_all: bool = False # ============================================================================ # SCRAPER ENDPOINTS # ============================================================================ @router.get("/scrapers") @limiter.limit("60/minute") @handle_exceptions async def get_scrapers( request: Request, current_user: Dict = Depends(get_current_user), type_filter: Optional[str] = Query(None, alias="type", description="Filter by type") ): """Get all scrapers with optional type filter.""" app_state = get_app_state() scrapers = app_state.db.get_all_scrapers(type_filter=type_filter) # Filter out scrapers whose related modules are all hidden hidden_modules = app_state.config.get('hidden_modules', []) if hidden_modules: # Map scraper IDs to the modules that use them. # A scraper is only hidden if ALL related modules are hidden. scraper_to_modules = { 'instagram': ['instagram', 'instagram_client'], 'snapchat': ['snapchat', 'snapchat_client'], 'fastdl': ['fastdl'], 'imginn': ['imginn'], 'toolzu': ['toolzu'], 'tiktok': ['tiktok'], 'coppermine': ['coppermine'], } # Forum scrapers map to the 'forums' module filtered = [] for scraper in scrapers: sid = scraper.get('id', '') if sid.startswith('forum_'): related = ['forums'] else: related = scraper_to_modules.get(sid, []) # Only hide if ALL related modules are hidden if related and all(m in hidden_modules for m in related): continue filtered.append(scraper) scrapers = filtered # Don't send cookies_json to frontend (too large) for scraper in scrapers: if 'cookies_json' in scraper: del scraper['cookies_json'] return {"scrapers": scrapers} # ============================================================================ # PLATFORM CREDENTIALS (UNIFIED COOKIE MANAGEMENT) # ============================================================================ # Platform definitions for the unified credentials view _SCRAPER_PLATFORMS = [ {'id': 'instagram', 'name': 'Instagram', 'type': 'cookies', 'source': 'scraper', 'used_by': ['Scheduler']}, {'id': 'tiktok', 'name': 'TikTok', 'type': 'cookies', 'source': 'scraper', 'used_by': ['Scheduler']}, {'id': 'snapchat', 'name': 'Snapchat', 'type': 'cookies', 'source': 'scraper', 'used_by': ['Scheduler']}, {'id': 'ytdlp', 'name': 'YouTube', 'type': 'cookies', 'source': 'scraper', 'used_by': ['Scheduler']}, {'id': 'pornhub', 'name': 'PornHub', 'type': 'cookies', 'source': 'scraper', 'used_by': ['Scheduler']}, {'id': 'xhamster', 'name': 'xHamster', 'type': 'cookies', 'source': 'scraper', 'used_by': ['Scheduler']}, ] _PAID_CONTENT_PLATFORMS = [ {'id': 'onlyfans_direct', 'name': 'OnlyFans', 'type': 'token', 'source': 'paid_content', 'used_by': ['Paid Content'], 'base_url': 'https://onlyfans.com'}, {'id': 'fansly_direct', 'name': 'Fansly', 'type': 'token', 'source': 'paid_content', 'used_by': ['Paid Content'], 'base_url': 'https://fansly.com'}, {'id': 'coomer', 'name': 'Coomer', 'type': 'session', 'source': 'paid_content', 'used_by': ['Paid Content'], 'base_url': 'https://coomer.su'}, {'id': 'kemono', 'name': 'Kemono', 'type': 'session', 'source': 'paid_content', 'used_by': ['Paid Content'], 'base_url': 'https://kemono.su'}, {'id': 'twitch', 'name': 'Twitch', 'type': 'session', 'source': 'paid_content', 'used_by': ['Paid Content'], 'base_url': 'https://twitch.tv'}, {'id': 'bellazon', 'name': 'Bellazon', 'type': 'session', 'source': 'paid_content', 'used_by': ['Paid Content'], 'base_url': 'https://www.bellazon.com'}, ] @router.get("/scrapers/platform-credentials") @limiter.limit("30/minute") @handle_exceptions async def get_platform_credentials( request: Request, current_user: Dict = Depends(get_current_user) ): """Get aggregated credential status for all platforms + monitoring preferences.""" app_state = get_app_state() db = app_state.db platforms = [] def _get_monitoring_flag(platform_id: str) -> bool: """Read monitoring preference from settings.""" try: val = app_state.settings.get(f"cookie_monitoring:{platform_id}") if val is not None: return str(val).lower() not in ('false', '0', 'no') except Exception: pass return True # 1. Scraper platforms for platform_def in _SCRAPER_PLATFORMS: scraper = db.get_scraper(platform_def['id']) cookies_count = 0 updated_at = None if scraper: raw = scraper.get('cookies_json') if raw: try: data = json.loads(raw) if isinstance(data, list): cookies_count = len(data) elif isinstance(data, dict): c = data.get('cookies', []) cookies_count = len(c) if isinstance(c, list) else 0 except (json.JSONDecodeError, TypeError): pass updated_at = scraper.get('cookies_updated_at') monitoring_enabled = _get_monitoring_flag(platform_def['id']) platforms.append({ 'id': platform_def['id'], 'name': platform_def['name'], 'type': platform_def['type'], 'source': platform_def['source'], 'cookies_count': cookies_count, 'has_credentials': cookies_count > 0, 'updated_at': updated_at, 'used_by': platform_def['used_by'], 'monitoring_enabled': monitoring_enabled, }) # 2. Paid content platforms try: from modules.paid_content import PaidContentDBAdapter paid_db = PaidContentDBAdapter(db) paid_services = {svc['id']: svc for svc in paid_db.get_services()} except Exception: paid_services = {} for platform_def in _PAID_CONTENT_PLATFORMS: svc = paid_services.get(platform_def['id'], {}) session_val = svc.get('session_cookie') or '' has_creds = bool(session_val) updated_at = svc.get('session_updated_at') # Count credentials: for JSON objects count keys, for JSON arrays count items, otherwise 1 if set cookies_count = 0 if has_creds: try: parsed = json.loads(session_val) if isinstance(parsed, dict): cookies_count = len(parsed) elif isinstance(parsed, list): cookies_count = len(parsed) else: cookies_count = 1 except (json.JSONDecodeError, TypeError): cookies_count = 1 platforms.append({ 'id': platform_def['id'], 'name': platform_def['name'], 'type': platform_def['type'], 'source': platform_def['source'], 'base_url': platform_def.get('base_url'), 'cookies_count': cookies_count, 'has_credentials': has_creds, 'updated_at': updated_at, 'used_by': platform_def['used_by'], 'monitoring_enabled': _get_monitoring_flag(platform_def['id']), }) # 3. Reddit (private gallery) reddit_has_creds = False reddit_cookies_count = 0 reddit_locked = True try: from modules.reddit_community_monitor import RedditCommunityMonitor, REDDIT_MONITOR_KEY_FILE from modules.private_gallery_crypto import get_private_gallery_crypto, load_key_from_file db_path = str(Path(__file__).parent.parent.parent.parent / 'database' / 'media_downloader.db') reddit_monitor = RedditCommunityMonitor(db_path) crypto = get_private_gallery_crypto() reddit_locked = not crypto.is_initialized() # If gallery is locked, try loading crypto from key file (exported on unlock) active_crypto = crypto if not reddit_locked else load_key_from_file(REDDIT_MONITOR_KEY_FILE) if active_crypto and active_crypto.is_initialized(): reddit_has_creds = reddit_monitor.has_cookies(active_crypto) if reddit_has_creds: try: conn = reddit_monitor._get_connection() cursor = conn.cursor() cursor.execute("SELECT value FROM private_media_config WHERE key = 'reddit_monitor_encrypted_cookies'") row = cursor.fetchone() conn.close() if row and row['value']: decrypted = active_crypto.decrypt_field(row['value']) parsed = json.loads(decrypted) if isinstance(parsed, list): reddit_cookies_count = len(parsed) except Exception: reddit_cookies_count = 1 if reddit_has_creds else 0 except Exception: pass platforms.append({ 'id': 'reddit', 'name': 'Reddit', 'type': 'cookies', 'source': 'private_gallery', 'base_url': 'https://reddit.com', 'cookies_count': reddit_cookies_count, 'has_credentials': reddit_has_creds, 'gallery_locked': reddit_locked, 'updated_at': None, 'used_by': ['Private Gallery'], 'monitoring_enabled': _get_monitoring_flag('reddit'), }) return { 'platforms': platforms, 'global_monitoring_enabled': _get_monitoring_flag('global'), } @router.put("/scrapers/platform-credentials/{platform_id}/monitoring") @limiter.limit("30/minute") @handle_exceptions async def toggle_platform_monitoring( request: Request, platform_id: str, current_user: Dict = Depends(require_admin) ): """Toggle health monitoring for a single platform.""" app_state = get_app_state() body = await request.json() enabled = body.get('enabled', True) app_state.settings.set( key=f"cookie_monitoring:{platform_id}", value=str(enabled).lower(), category="cookie_monitoring", updated_by=current_user.get('username', 'user') ) return { 'success': True, 'message': f"Monitoring {'enabled' if enabled else 'disabled'} for {platform_id}", } @router.put("/scrapers/platform-credentials/monitoring") @limiter.limit("30/minute") @handle_exceptions async def toggle_global_monitoring( request: Request, current_user: Dict = Depends(require_admin) ): """Toggle global cookie health monitoring.""" app_state = get_app_state() body = await request.json() enabled = body.get('enabled', True) app_state.settings.set( key="cookie_monitoring:global", value=str(enabled).lower(), category="cookie_monitoring", updated_by=current_user.get('username', 'user') ) return { 'success': True, 'message': f"Global cookie monitoring {'enabled' if enabled else 'disabled'}", } @router.get("/scrapers/{scraper_id}") @limiter.limit("60/minute") @handle_exceptions async def get_scraper( request: Request, scraper_id: str, current_user: Dict = Depends(get_current_user) ): """Get a single scraper configuration.""" app_state = get_app_state() scraper = app_state.db.get_scraper(scraper_id) if not scraper: raise NotFoundError(f"Scraper '{scraper_id}' not found") if 'cookies_json' in scraper: del scraper['cookies_json'] cookies = app_state.db.get_scraper_cookies(scraper_id) scraper['cookies_count'] = len(cookies) if cookies else 0 return scraper @router.put("/scrapers/{scraper_id}") @limiter.limit("30/minute") @handle_exceptions async def update_scraper( request: Request, scraper_id: str, current_user: Dict = Depends(require_admin) ): """Update scraper settings (proxy, enabled, base_url).""" app_state = get_app_state() body = await request.json() scraper = app_state.db.get_scraper(scraper_id) if not scraper: raise NotFoundError(f"Scraper '{scraper_id}' not found") success = app_state.db.update_scraper(scraper_id, body) if not success: raise ValidationError("No valid fields to update") return {"success": True, "message": f"Scraper '{scraper_id}' updated"} @router.post("/scrapers/{scraper_id}/test") @limiter.limit("10/minute") @handle_exceptions async def test_scraper_connection( request: Request, scraper_id: str, current_user: Dict = Depends(require_admin) ): """ Test scraper connection via FlareSolverr (if required). On success, saves cookies to database. For CLI tools (yt-dlp, gallery-dl), tests that the tool is installed and working. """ import subprocess from modules.cloudflare_handler import CloudflareHandler app_state = get_app_state() scraper = app_state.db.get_scraper(scraper_id) if not scraper: raise NotFoundError(f"Scraper '{scraper_id}' not found") # Handle CLI tools specially - test that they're installed and working if scraper.get('type') == 'cli_tool': cli_tests = { 'ytdlp': { 'cmd': ['/opt/media-downloader/venv/bin/yt-dlp', '--version'], 'name': 'yt-dlp' }, 'gallerydl': { 'cmd': ['/opt/media-downloader/venv/bin/gallery-dl', '--version'], 'name': 'gallery-dl' } } test_config = cli_tests.get(scraper_id) if test_config: try: result = subprocess.run( test_config['cmd'], capture_output=True, text=True, timeout=10 ) if result.returncode == 0: version = result.stdout.strip().split('\n')[0] cookies_count = 0 # Check if cookies are configured if scraper.get('cookies_json'): try: import json data = json.loads(scraper['cookies_json']) # Support both {"cookies": [...]} and [...] formats if isinstance(data, dict) and 'cookies' in data: cookies = data['cookies'] elif isinstance(data, list): cookies = data else: cookies = [] cookies_count = len(cookies) if cookies else 0 except (json.JSONDecodeError, TypeError, KeyError) as e: logger.debug(f"Failed to parse cookies for {scraper_id}: {e}") app_state.db.update_scraper_test_status(scraper_id, 'success') msg = f"{test_config['name']} v{version} installed" if cookies_count > 0: msg += f", {cookies_count} cookies configured" return { "success": True, "message": msg } else: error_msg = result.stderr.strip() or "Command failed" app_state.db.update_scraper_test_status(scraper_id, 'failed', error_msg) return { "success": False, "message": f"{test_config['name']} error: {error_msg}" } except subprocess.TimeoutExpired: app_state.db.update_scraper_test_status(scraper_id, 'failed', "Command timed out") return {"success": False, "message": "Command timed out"} except FileNotFoundError: app_state.db.update_scraper_test_status(scraper_id, 'failed', "Tool not installed") return {"success": False, "message": f"{test_config['name']} not installed"} else: # Unknown CLI tool app_state.db.update_scraper_test_status(scraper_id, 'success') return {"success": True, "message": "CLI tool registered"} base_url = scraper.get('base_url') if not base_url: raise ValidationError(f"Scraper '{scraper_id}' has no base_url configured") proxy_url = None if scraper.get('proxy_enabled') and scraper.get('proxy_url'): proxy_url = scraper['proxy_url'] cf_handler = CloudflareHandler( module_name=scraper_id, cookie_file=None, proxy_url=proxy_url if proxy_url else None, flaresolverr_enabled=scraper.get('flaresolverr_required', False) ) if scraper.get('flaresolverr_required'): success = cf_handler.get_cookies_via_flaresolverr(base_url, max_retries=2) if success: cookies = cf_handler.get_cookies_list() user_agent = cf_handler.get_user_agent() app_state.db.save_scraper_cookies(scraper_id, cookies, user_agent=user_agent) app_state.db.update_scraper_test_status(scraper_id, 'success') return { "success": True, "message": f"Connection successful, {len(cookies)} cookies saved", "cookies_count": len(cookies) } else: error_msg = "FlareSolverr returned no cookies" if proxy_url: error_msg += " (check proxy connection)" app_state.db.update_scraper_test_status(scraper_id, 'failed', error_msg) return { "success": False, "message": error_msg } else: try: proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None response = requests.get( base_url, timeout=10, proxies=proxies, headers={'User-Agent': cf_handler.user_agent} ) if response.status_code < 400: app_state.db.update_scraper_test_status(scraper_id, 'success') return { "success": True, "message": f"Connection successful (HTTP {response.status_code})" } else: app_state.db.update_scraper_test_status( scraper_id, 'failed', f"HTTP {response.status_code}" ) return { "success": False, "message": f"Connection failed with HTTP {response.status_code}" } except requests.exceptions.Timeout: app_state.db.update_scraper_test_status(scraper_id, 'timeout', 'Request timed out') return {"success": False, "message": "Connection timed out"} except Exception as e: app_state.db.update_scraper_test_status(scraper_id, 'failed', str(e)) return {"success": False, "message": str(e)} @router.post("/scrapers/{scraper_id}/cookies") @limiter.limit("20/minute") @handle_exceptions async def upload_scraper_cookies( request: Request, scraper_id: str, current_user: Dict = Depends(require_admin) ): """Upload cookies for a scraper (from browser extension export).""" app_state = get_app_state() scraper = app_state.db.get_scraper(scraper_id) if not scraper: raise NotFoundError(f"Scraper '{scraper_id}' not found") body = await request.json() # Support both {cookies: [...]} and bare [...] formats if isinstance(body, list): cookies = body merge = True user_agent = None else: cookies = body.get('cookies', []) merge = body.get('merge', True) user_agent = body.get('user_agent') if not cookies or not isinstance(cookies, list): raise ValidationError("Invalid cookies format. Expected {cookies: [...]}") for i, cookie in enumerate(cookies): if not isinstance(cookie, dict): raise ValidationError(f"Cookie {i} is not an object") if 'name' not in cookie or 'value' not in cookie: raise ValidationError(f"Cookie {i} missing 'name' or 'value'") success = app_state.db.save_scraper_cookies( scraper_id, cookies, user_agent=user_agent, merge=merge ) if success: all_cookies = app_state.db.get_scraper_cookies(scraper_id) count = len(all_cookies) if all_cookies else 0 return { "success": True, "message": f"{'Merged' if merge else 'Replaced'} {len(cookies)} cookies (total: {count})", "cookies_count": count } else: raise ValidationError("Failed to save cookies") @router.delete("/scrapers/{scraper_id}/cookies") @limiter.limit("20/minute") @handle_exceptions async def clear_scraper_cookies( request: Request, scraper_id: str, current_user: Dict = Depends(require_admin) ): """Clear all cookies for a scraper.""" app_state = get_app_state() scraper = app_state.db.get_scraper(scraper_id) if not scraper: raise NotFoundError(f"Scraper '{scraper_id}' not found") success = app_state.db.clear_scraper_cookies(scraper_id) return { "success": success, "message": f"Cookies cleared for '{scraper_id}'" if success else "Failed to clear cookies" } # ============================================================================ # ERROR MONITORING ENDPOINTS # ============================================================================ @router.get("/errors/recent") @limiter.limit("30/minute") @handle_exceptions async def get_recent_errors( request: Request, limit: int = Query(50, ge=1, le=500, description="Maximum number of errors to return"), since_visit: bool = Query(False, description="Only show errors since last dashboard visit (default: show ALL unviewed)"), include_dismissed: bool = Query(False, description="Include dismissed errors"), current_user: Dict = Depends(get_current_user) ): """Get recent errors from database. By default, shows ALL unviewed/undismissed errors regardless of when they occurred. This ensures errors are not missed just because the user visited the dashboard. Errors are recorded in real-time by universal_logger.py. """ app_state = get_app_state() # By default, show ALL unviewed errors (since=None) # Only filter by visit time if explicitly requested since = None if since_visit: since = app_state.db.get_last_dashboard_visit() if not since: since = datetime.now() - timedelta(hours=24) errors = app_state.db.get_recent_errors(since=since, include_dismissed=include_dismissed, limit=limit) return { "errors": errors, "total_count": len(errors), "since": since.isoformat() if since else None, "unviewed_count": app_state.db.get_unviewed_error_count(since=None) # Always count ALL unviewed } @router.get("/errors/count") @limiter.limit("60/minute") @handle_exceptions async def get_error_count( request: Request, current_user: Dict = Depends(get_current_user) ): """Get count of ALL unviewed/undismissed errors. Errors are recorded in real-time by universal_logger.py. """ app_state = get_app_state() # Count ALL unviewed errors total_unviewed = app_state.db.get_unviewed_error_count(since=None) # Count errors since last dashboard visit last_visit = app_state.db.get_last_dashboard_visit() since_last_visit = app_state.db.get_unviewed_error_count(since=last_visit) if last_visit else total_unviewed return { "unviewed_count": total_unviewed, "total_recent": total_unviewed, "since_last_visit": since_last_visit } @router.post("/errors/dismiss") @limiter.limit("20/minute") @handle_exceptions async def dismiss_errors( request: Request, body: Dict = Body(...), current_user: Dict = Depends(get_current_user) ): """Dismiss errors by ID or all.""" app_state = get_app_state() error_ids = body.get("error_ids", []) dismiss_all = body.get("dismiss_all", False) if dismiss_all: dismissed = app_state.db.dismiss_errors(dismiss_all=True) elif error_ids: dismissed = app_state.db.dismiss_errors(error_ids=error_ids) else: return {"success": False, "dismissed": 0, "message": "No errors specified"} return { "success": True, "dismissed": dismissed, "message": f"Dismissed {dismissed} error(s)" } @router.post("/errors/mark-viewed") @limiter.limit("20/minute") @handle_exceptions async def mark_errors_viewed( request: Request, body: Dict = Body(...), current_user: Dict = Depends(get_current_user) ): """Mark errors as viewed.""" app_state = get_app_state() error_ids = body.get("error_ids", []) mark_all = body.get("mark_all", False) if mark_all: marked = app_state.db.mark_errors_viewed(mark_all=True) elif error_ids: marked = app_state.db.mark_errors_viewed(error_ids=error_ids) else: return {"success": False, "marked": 0} return { "success": True, "marked": marked } @router.post("/errors/update-visit") @limiter.limit("30/minute") @handle_exceptions async def update_dashboard_visit( request: Request, current_user: Dict = Depends(get_current_user) ): """Update the last dashboard visit timestamp.""" app_state = get_app_state() success = app_state.db.update_dashboard_visit() return {"success": success} @router.get("/logs/context") @limiter.limit("30/minute") @handle_exceptions async def get_log_context( request: Request, timestamp: str = Query(..., description="ISO timestamp of the error"), module: Optional[str] = Query(None, description="Module name to filter"), minutes_before: int = Query(1, description="Minutes of context before error"), minutes_after: int = Query(1, description="Minutes of context after error"), current_user: Dict = Depends(get_current_user) ): """Get log lines around a specific timestamp for debugging context.""" target_time = datetime.fromisoformat(timestamp) start_time = target_time - timedelta(minutes=minutes_before) end_time = target_time + timedelta(minutes=minutes_after) log_dir = Path('/opt/media-downloader/logs') log_pattern = re.compile( r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ' r'\[MediaDownloader\.(\w+)\] ' r'\[(\w+)\] ' r'\[(\w+)\] ' r'(.+)$' ) date_str = target_time.strftime('%Y%m%d') matching_lines = [] for log_file in log_dir.glob(f'{date_str}_*.log'): if module and module.lower() not in log_file.stem.lower(): continue try: lines = log_file.read_text(errors='replace').splitlines() for line in lines: match = log_pattern.match(line) if match: timestamp_str, _, log_module, level, message = match.groups() try: line_time = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S') if start_time <= line_time <= end_time: matching_lines.append({ 'timestamp': timestamp_str, 'module': log_module, 'level': level, 'message': message, 'is_target': abs((line_time - target_time).total_seconds()) < 2 }) except ValueError: continue except Exception: continue matching_lines.sort(key=lambda x: x['timestamp']) return { "context": matching_lines, "target_timestamp": timestamp, "range": { "start": start_time.isoformat(), "end": end_time.isoformat() } }