Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions
--- a/modules/scraper_event_emitter.py
+++ b/modules/scraper_event_emitter.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+"""
+Thread-safe WebSocket event emitter for scraper monitoring
+
+Provides real-time events for the scraping monitor page:
+- Scraper sessions starting/completing
+- File downloads and movements
+- Progress updates
+"""
+
+from datetime import datetime
+from typing import Optional, Dict, Any
+
+
+class ScraperEventEmitter:
+    """Emits WebSocket events for real-time scraper monitoring.
+
+    Session events are also mirrored into app_state.active_scraper_sessions.
+    """
+
+    def __init__(self, websocket_manager=None, app_state=None):
+        """
+        Initialize event emitter
+
+        Args:
+            websocket_manager: WebSocket connection manager (optional)
+            app_state: Application state for tracking active sessions (optional)
+        """
+        self.websocket_manager = websocket_manager
+        self.app_state = app_state
+
+    def _active_sessions(self):
+        """Return app_state.active_scraper_sessions, or None if unavailable."""
+        state = self.app_state
+        return getattr(state, 'active_scraper_sessions', None) if state else None
+
+    def emit_scraper_started(self, session_id: str, platform: str, account: str,
+                            content_type: str, estimated_count: int = 0,
+                            accounts_list: Optional[list] = None):
+        """
+        Emit when scraper session begins
+
+        Args:
+            session_id: Unique session identifier
+            platform: Platform name (instagram, snapchat, etc.)
+            account: Account/username being scraped (or comma-separated list)
+            content_type: Type of content (stories, posts, etc.)
+            estimated_count: Estimated number of items to download
+            accounts_list: Optional list of all accounts to be processed
+        """
+        # One clock read, so the event timestamp equals the stored start_time
+        started_at = datetime.now().isoformat()
+        event_data = {
+            'session_id': session_id,
+            'platform': platform,
+            'account': account,
+            'content_type': content_type,
+            'estimated_count': estimated_count,
+            'timestamp': started_at
+        }
+
+        # Include accounts list if provided
+        if accounts_list:
+            event_data['accounts_list'] = accounts_list
+
+        # Store session in app_state for API retrieval (scheduler's structure)
+        sessions = self._active_sessions()
+        if sessions is not None:
+            sessions[session_id] = {
+                'session_id': session_id,
+                'platform': platform,
+                'account': account,
+                'content_type': content_type,
+                'start_time': started_at,
+                'status': 'Starting...',
+                'detailed_status': 'Starting...',
+                'progress': {'current': 0, 'total': estimated_count or 100},
+                'stats': {'media': 0, 'review': 0, 'failed': 0}
+            }
+
+        self._broadcast({'type': 'scraper_started', 'data': event_data})
+
+    def emit_scraper_progress(self, session_id: str, status: str,
+                             current: int, total: int, current_account: Optional[str] = None,
+                             completed_accounts: Optional[list] = None):
+        """
+        Emit progress update
+
+        Args:
+            session_id: Session identifier
+            status: Status message (e.g., "Downloading stories...")
+            current: Current item count
+            total: Total item count
+            current_account: Currently active account/forum name (optional)
+            completed_accounts: List of completed accounts (optional)
+        """
+        event_data = {
+            'session_id': session_id,
+            'status': status,
+            'progress_current': current,
+            'progress_total': total,
+            'timestamp': datetime.now().isoformat()
+        }
+
+        # Optional fields are only included when provided
+        if current_account:
+            event_data['current_account'] = current_account
+        if completed_accounts:
+            event_data['completed_accounts'] = completed_accounts
+
+        # Update session in app_state - match scheduler structure.
+        # A single .get() replaces "in" + indexing: emit_scraper_completed may pop
+        # the session from another thread in between, which would raise KeyError.
+        sessions = self._active_sessions()
+        session = sessions.get(session_id) if sessions is not None else None
+        if session is not None:
+            session['status'] = status
+            session['detailed_status'] = status
+            # Update account to current account if provided
+            if current_account:
+                session['account'] = current_account
+            # Use nested progress structure to match scheduler
+            session['progress'] = {'current': current, 'total': total}
+            if completed_accounts:
+                session['completed_accounts'] = completed_accounts
+
+        self._broadcast({'type': 'scraper_progress', 'data': event_data})
+
+    def emit_scraper_completed(self, session_id: str, stats: Dict[str, int]):
+        """
+        Emit when scraper session completes
+
+        Args:
+            session_id: Session identifier
+            stats: Statistics dict with keys: total_downloaded, moved, review, duplicates, failed
+        """
+        # Remove session from app_state (no error if it was never stored)
+        sessions = self._active_sessions()
+        if sessions is not None:
+            sessions.pop(session_id, None)
+
+        self._broadcast({
+            'type': 'scraper_completed',
+            'data': {
+                'session_id': session_id,
+                'stats': stats,
+                'timestamp': datetime.now().isoformat()
+            }
+        })
+
+    def emit_file_moved(self, session_id: str, platform: str, account: str,
+                       filename: str, media_type: str, destination_type: str,
+                       destination_path: str, thumbnail_url: Optional[str] = None,
+                       face_match: Optional[Dict[str, Any]] = None):
+        """
+        Emit when file is moved to destination
+
+        Args:
+            session_id: Session identifier
+            platform: Platform name
+            account: Account/username
+            filename: File name
+            media_type: 'image' or 'video'
+            destination_type: 'media', 'review', or 'recycle'
+            destination_path: Full path to destination file
+            thumbnail_url: URL to thumbnail (optional)
+            face_match: Face recognition result dict (optional)
+        """
+        self._broadcast({
+            'type': 'file_moved',
+            'data': {
+                'session_id': session_id,
+                'platform': platform,
+                'account': account,
+                'filename': filename,
+                'media_type': media_type,
+                'destination_type': destination_type,
+                'destination_path': destination_path,
+                'thumbnail_url': thumbnail_url,
+                'face_match': face_match or {'matched': False},
+                'timestamp': datetime.now().isoformat()
+            }
+        })
+
+    def _broadcast(self, message: dict):
+        """
+        Broadcast to WebSocket clients (no-op without a websocket_manager)
+
+        Args:
+            message: Event message dict
+        """
+        if self.websocket_manager:
+            # Use broadcast_sync for thread-safe emission from background threads
+            self.websocket_manager.broadcast_sync(message)