media-downloader/modules/scraper_event_emitter.py

#!/usr/bin/env python3
"""
Thread-safe WebSocket event emitter for scraper monitoring

Provides real-time events for the scraping monitor page:
- Scraper sessions starting/completing
- File downloads and movements
- Progress updates
"""

from datetime import datetime
from typing import Optional, Dict, Any


class ScraperEventEmitter:
    """Emit WebSocket events for real-time scraper monitoring.

    Each ``emit_*`` method builds an event payload, optionally mirrors the
    session state into ``app_state.active_scraper_sessions`` (when that
    attribute is available) and hands the message to the WebSocket manager.
    Both collaborators are optional: when one is missing, the corresponding
    step is skipped.
    """

    def __init__(self, websocket_manager=None, app_state=None):
        """
        Initialize event emitter

        Args:
            websocket_manager: WebSocket connection manager (optional).
                Only its ``broadcast_sync(message)`` method is used.
            app_state: Application state for tracking active sessions
                (optional). Only its ``active_scraper_sessions`` mapping
                is used, and only if the attribute exists.
        """
        self.websocket_manager = websocket_manager
        self.app_state = app_state

    def _active_sessions(self):
        """Return the ``active_scraper_sessions`` mapping, or None if absent."""
        # getattr with a default also covers ``app_state is None``.
        return getattr(self.app_state, 'active_scraper_sessions', None)

    def emit_scraper_started(self, session_id: str, platform: str, account: str,
                             content_type: str, estimated_count: int = 0,
                             accounts_list: Optional[list] = None) -> None:
        """
        Emit when scraper session begins

        Registers the session in ``app_state.active_scraper_sessions`` (when
        available) and broadcasts a ``scraper_started`` event.

        Args:
            session_id: Unique session identifier
            platform: Platform name (instagram, snapchat, etc.)
            account: Account/username being scraped (or comma-separated list)
            content_type: Type of content (stories, posts, etc.)
            estimated_count: Estimated number of items to download
            accounts_list: Optional list of all accounts to be processed;
                only included in the event when non-empty
        """
        # Take the timestamp once so the broadcast event and the stored
        # session record agree on when the session started.
        timestamp = datetime.now().isoformat()

        event_data = {
            'session_id': session_id,
            'platform': platform,
            'account': account,
            'content_type': content_type,
            'estimated_count': estimated_count,
            'timestamp': timestamp
        }

        # Include accounts list if provided
        if accounts_list:
            event_data['accounts_list'] = accounts_list

        # Store session in app_state for API retrieval.
        # The record layout is meant to match the scheduler's data structure.
        sessions = self._active_sessions()
        if sessions is not None:
            sessions[session_id] = {
                'session_id': session_id,
                'platform': platform,
                'account': account,
                'content_type': content_type,
                'start_time': timestamp,
                'status': 'Starting...',
                'detailed_status': 'Starting...',
                'progress': {
                    'current': 0,
                    # Fall back to 100 when no estimate (0/None) was given.
                    'total': estimated_count or 100
                },
                'stats': {'media': 0, 'review': 0, 'failed': 0}
            }

        self._broadcast({
            'type': 'scraper_started',
            'data': event_data
        })

    def emit_scraper_progress(self, session_id: str, status: str,
                              current: int, total: int,
                              current_account: Optional[str] = None,
                              completed_accounts: Optional[list] = None) -> None:
        """
        Emit progress update

        Updates the stored session (if it is still registered) and broadcasts
        a ``scraper_progress`` event. The event is broadcast even when the
        session is not present in ``app_state``.

        Args:
            session_id: Session identifier
            status: Status message (e.g., "Downloading stories...")
            current: Current item count
            total: Total item count
            current_account: Currently active account/forum name (optional)
            completed_accounts: List of completed accounts (optional)
        """
        event_data = {
            'session_id': session_id,
            'status': status,
            'progress_current': current,
            'progress_total': total,
            'timestamp': datetime.now().isoformat()
        }

        # Include current account if provided
        if current_account:
            event_data['current_account'] = current_account

        # Include completed accounts if provided
        if completed_accounts:
            event_data['completed_accounts'] = completed_accounts

        # Update session in app_state - match scheduler structure.
        # A single .get() avoids a KeyError if the session is removed by
        # another thread between a membership check and the lookup.
        sessions = self._active_sessions()
        session = sessions.get(session_id) if sessions is not None else None
        if session is not None:
            session['status'] = status
            session['detailed_status'] = status
            # Update account to current account if provided
            if current_account:
                session['account'] = current_account
            # Use nested progress structure to match scheduler
            session['progress'] = {
                'current': current,
                'total': total
            }
            if completed_accounts:
                session['completed_accounts'] = completed_accounts

        self._broadcast({
            'type': 'scraper_progress',
            'data': event_data
        })

    def emit_scraper_completed(self, session_id: str, stats: Dict[str, int]) -> None:
        """
        Emit when scraper session completes

        Removes the session from ``app_state.active_scraper_sessions`` (a
        missing session is ignored) and broadcasts ``scraper_completed``.

        Args:
            session_id: Session identifier
            stats: Statistics dict with keys: total_downloaded, moved, review, duplicates, failed
        """
        # Remove session from app_state
        sessions = self._active_sessions()
        if sessions is not None:
            sessions.pop(session_id, None)

        self._broadcast({
            'type': 'scraper_completed',
            'data': {
                'session_id': session_id,
                'stats': stats,
                'timestamp': datetime.now().isoformat()
            }
        })

    def emit_file_moved(self, session_id: str, platform: str, account: str,
                        filename: str, media_type: str, destination_type: str,
                        destination_path: str, thumbnail_url: Optional[str] = None,
                        face_match: Optional[Dict[str, Any]] = None) -> None:
        """
        Emit when file is moved to destination

        Args:
            session_id: Session identifier
            platform: Platform name
            account: Account/username
            filename: File name
            media_type: 'image' or 'video'
            destination_type: 'media', 'review', or 'recycle'
            destination_path: Full path to destination file
            thumbnail_url: URL to thumbnail (optional)
            face_match: Face recognition result dict (optional); defaults to
                ``{'matched': False}`` in the event when missing or empty
        """
        self._broadcast({
            'type': 'file_moved',
            'data': {
                'session_id': session_id,
                'platform': platform,
                'account': account,
                'filename': filename,
                'media_type': media_type,
                'destination_type': destination_type,
                'destination_path': destination_path,
                'thumbnail_url': thumbnail_url,
                'face_match': face_match or {'matched': False},
                'timestamp': datetime.now().isoformat()
            }
        })

    def _broadcast(self, message: dict) -> None:
        """
        Broadcast a message to WebSocket clients

        Does nothing when no WebSocket manager was configured.

        Args:
            message: Event message dict
        """
        if self.websocket_manager:
            # Delegate to the manager's broadcast_sync, which is intended
            # for emission from background (non-event-loop) threads.
            self.websocket_manager.broadcast_sync(message)