#!/usr/bin/env python3
"""
WebSocket event emitter for scraper monitoring

Provides real-time events for the scraping monitor page:
- Scraper sessions starting/completing
- File downloads and movements
- Progress updates

Broadcasting is delegated to the WebSocket manager's ``broadcast_sync``
method so events can be emitted from background threads.
NOTE(review): the session bookkeeping on ``app_state`` is a plain
read/modify/write with no locking in this module - confirm callers do not
rely on this class alone for thread-safety.
"""

from datetime import datetime
from typing import Optional, Dict, Any


class ScraperEventEmitter:
    """Emits WebSocket events for real-time scraper monitoring"""

    def __init__(self, websocket_manager=None, app_state=None):
        """
        Initialize event emitter

        Args:
            websocket_manager: WebSocket connection manager (optional). Must
                expose ``broadcast_sync(message)``; when omitted, events are
                not broadcast.
            app_state: Application state for tracking active sessions
                (optional). Sessions are tracked only when it has an
                ``active_scraper_sessions`` mapping attribute.
        """
        self.websocket_manager = websocket_manager
        self.app_state = app_state

    def _active_sessions(self):
        """Return ``app_state.active_scraper_sessions``, or None when unavailable."""
        if self.app_state and hasattr(self.app_state, 'active_scraper_sessions'):
            return self.app_state.active_scraper_sessions
        return None

    def emit_scraper_started(self, session_id: str, platform: str, account: str,
                             content_type: str, estimated_count: int = 0,
                             accounts_list: Optional[list] = None) -> None:
        """
        Emit when scraper session begins

        Args:
            session_id: Unique session identifier
            platform: Platform name (instagram, snapchat, etc.)
            account: Account/username being scraped (or comma-separated list)
            content_type: Type of content (stories, posts, etc.)
            estimated_count: Estimated number of items to download
            accounts_list: Optional list of all accounts to be processed
        """
        # Capture the time once so the broadcast event and the stored session
        # report the same start moment.
        started_at = datetime.now().isoformat()

        event_data = {
            'session_id': session_id,
            'platform': platform,
            'account': account,
            'content_type': content_type,
            'estimated_count': estimated_count,
            'timestamp': started_at,
        }

        # Include accounts list only if provided (and non-empty)
        if accounts_list:
            event_data['accounts_list'] = accounts_list

        # Store session in app_state for API retrieval.
        # The record shape is meant to match the scheduler's data structure
        # (per the original author's note; the scheduler is not visible here).
        sessions = self._active_sessions()
        if sessions is not None:
            sessions[session_id] = {
                'session_id': session_id,
                'platform': platform,
                'account': account,
                'content_type': content_type,
                'start_time': started_at,
                'status': 'Starting...',
                'detailed_status': 'Starting...',
                'progress': {
                    'current': 0,
                    # Fall back to 100 when no estimate (0/None) was given
                    'total': estimated_count or 100,
                },
                'stats': {'media': 0, 'review': 0, 'failed': 0},
            }

        self._broadcast({
            'type': 'scraper_started',
            'data': event_data,
        })

    def emit_scraper_progress(self, session_id: str, status: str,
                              current: int, total: int,
                              current_account: Optional[str] = None,
                              completed_accounts: Optional[list] = None) -> None:
        """
        Emit progress update

        Args:
            session_id: Session identifier
            status: Status message (e.g., "Downloading stories...")
            current: Current item count
            total: Total item count
            current_account: Currently active account/forum name (optional)
            completed_accounts: List of completed accounts (optional)
        """
        event_data = {
            'session_id': session_id,
            'status': status,
            'progress_current': current,
            'progress_total': total,
            'timestamp': datetime.now().isoformat(),
        }

        # Optional fields are only attached when truthy
        if current_account:
            event_data['current_account'] = current_account
        if completed_accounts:
            event_data['completed_accounts'] = completed_accounts

        # Update the tracked session, if any. Unknown session ids are not
        # created here - only sessions registered by emit_scraper_started
        # (or elsewhere) are updated.
        sessions = self._active_sessions()
        if sessions is not None and session_id in sessions:
            session = sessions[session_id]
            session['status'] = status
            session['detailed_status'] = status

            # Update account to current account if provided
            if current_account:
                session['account'] = current_account

            # Nested progress structure, matching the record created on start
            session['progress'] = {
                'current': current,
                'total': total,
            }

            if completed_accounts:
                session['completed_accounts'] = completed_accounts

        self._broadcast({
            'type': 'scraper_progress',
            'data': event_data,
        })

    def emit_scraper_completed(self, session_id: str, stats: Dict[str, int]) -> None:
        """
        Emit when scraper session completes

        Args:
            session_id: Session identifier
            stats: Statistics dict; forwarded unchanged in the event. Expected
                keys per the original documentation: total_downloaded, moved,
                review, duplicates, failed
        """
        # Remove session from app_state; a missing id is ignored
        sessions = self._active_sessions()
        if sessions is not None:
            sessions.pop(session_id, None)

        self._broadcast({
            'type': 'scraper_completed',
            'data': {
                'session_id': session_id,
                'stats': stats,
                'timestamp': datetime.now().isoformat(),
            },
        })

    def emit_file_moved(self, session_id: str, platform: str, account: str,
                        filename: str, media_type: str, destination_type: str,
                        destination_path: str,
                        thumbnail_url: Optional[str] = None,
                        face_match: Optional[Dict[str, Any]] = None) -> None:
        """
        Emit when file is moved to destination

        Args:
            session_id: Session identifier
            platform: Platform name
            account: Account/username
            filename: File name
            media_type: 'image' or 'video'
            destination_type: 'media', 'review', or 'recycle'
            destination_path: Full path to destination file
            thumbnail_url: URL to thumbnail (optional)
            face_match: Face recognition result dict (optional); when missing
                or empty, ``{'matched': False}`` is sent instead
        """
        self._broadcast({
            'type': 'file_moved',
            'data': {
                'session_id': session_id,
                'platform': platform,
                'account': account,
                'filename': filename,
                'media_type': media_type,
                'destination_type': destination_type,
                'destination_path': destination_path,
                'thumbnail_url': thumbnail_url,
                'face_match': face_match or {'matched': False},
                'timestamp': datetime.now().isoformat(),
            },
        })

    def _broadcast(self, message: dict) -> None:
        """
        Broadcast a message to WebSocket clients

        Does nothing when no websocket manager was configured.

        Args:
            message: Event message dict
        """
        if self.websocket_manager:
            # broadcast_sync is the manager's synchronous entry point, used so
            # events can be emitted from background threads.
            self.websocket_manager.broadcast_sync(message)