195 lines
7.1 KiB
Python
195 lines
7.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Thread-safe WebSocket event emitter for scraper monitoring
|
|
|
|
Provides real-time events for the scraping monitor page:
|
|
- Scraper sessions starting/completing
|
|
- File downloads and movements
|
|
- Progress updates
|
|
"""
|
|
|
|
from datetime import datetime
|
|
from typing import Optional, Dict, Any
|
|
|
|
|
|
class ScraperEventEmitter:
    """Emits WebSocket events for real-time scraper monitoring.

    Every ``emit_*`` method builds a ``{'type': ..., 'data': ...}`` message
    and hands it to ``websocket_manager.broadcast_sync``.  When an
    ``app_state`` exposing an ``active_scraper_sessions`` mapping is
    supplied, session start/progress/completion are mirrored into that
    mapping as well.

    Both collaborators are optional: without a manager nothing is
    broadcast, and without an app state nothing is recorded.
    """

    def __init__(self, websocket_manager=None, app_state=None):
        """
        Initialize event emitter

        Args:
            websocket_manager: WebSocket connection manager (optional).
                Only its ``broadcast_sync(message)`` method is used.
            app_state: Application state for tracking active sessions
                (optional). Only its ``active_scraper_sessions`` attribute
                is used, and only when present.
        """
        self.websocket_manager = websocket_manager
        self.app_state = app_state

    def _active_sessions(self):
        """Return the ``active_scraper_sessions`` mapping, or ``None``.

        ``None`` is returned when no app state was supplied, when the app
        state is falsy, or when it has no (or a ``None``)
        ``active_scraper_sessions`` attribute.
        """
        if not self.app_state:
            return None
        return getattr(self.app_state, 'active_scraper_sessions', None)

    def emit_scraper_started(self, session_id: str, platform: str, account: str,
                             content_type: str, estimated_count: int = 0,
                             accounts_list: Optional[list] = None) -> None:
        """
        Emit when scraper session begins

        Args:
            session_id: Unique session identifier
            platform: Platform name (instagram, snapchat, etc.)
            account: Account/username being scraped (or comma-separated list)
            content_type: Type of content (stories, posts, etc.)
            estimated_count: Estimated number of items to download
            accounts_list: Optional list of all accounts to be processed;
                only included in the event when non-empty
        """
        # Take the clock reading once so the broadcast 'timestamp' and the
        # stored 'start_time' describe the same instant.
        started_at = datetime.now().isoformat()

        event_data = {
            'session_id': session_id,
            'platform': platform,
            'account': account,
            'content_type': content_type,
            'estimated_count': estimated_count,
            'timestamp': started_at,
        }

        # Include accounts list if provided
        if accounts_list:
            event_data['accounts_list'] = accounts_list

        # Store session in app_state for API retrieval.
        # Per the original comment this layout mirrors the scheduler's
        # session structure (defined outside this class).
        sessions = self._active_sessions()
        if sessions is not None:
            sessions[session_id] = {
                'session_id': session_id,
                'platform': platform,
                'account': account,
                'content_type': content_type,
                'start_time': started_at,
                'status': 'Starting...',
                'detailed_status': 'Starting...',
                'progress': {
                    'current': 0,
                    # A missing/zero estimate falls back to 100.
                    'total': estimated_count or 100,
                },
                'stats': {'media': 0, 'review': 0, 'failed': 0},
            }

        self._broadcast({
            'type': 'scraper_started',
            'data': event_data,
        })

    def emit_scraper_progress(self, session_id: str, status: str,
                              current: int, total: int,
                              current_account: Optional[str] = None,
                              completed_accounts: Optional[list] = None) -> None:
        """
        Emit progress update

        The event is always broadcast; the stored session is updated only
        if ``session_id`` is currently registered.

        Args:
            session_id: Session identifier
            status: Status message (e.g., "Downloading stories...")
            current: Current item count
            total: Total item count
            current_account: Currently active account/forum name (optional)
            completed_accounts: List of completed accounts (optional)
        """
        event_data = {
            'session_id': session_id,
            'status': status,
            'progress_current': current,
            'progress_total': total,
            'timestamp': datetime.now().isoformat(),
        }

        # Include current account if provided
        if current_account:
            event_data['current_account'] = current_account

        # Include completed accounts if provided
        if completed_accounts:
            event_data['completed_accounts'] = completed_accounts

        # Update session in app_state.  A single .get() replaces the former
        # "in" check followed by indexing, so a session removed by another
        # thread between the two steps can no longer raise KeyError.
        sessions = self._active_sessions()
        session = sessions.get(session_id) if sessions is not None else None
        if session is not None:
            session['status'] = status
            session['detailed_status'] = status
            # Update account to current account if provided
            if current_account:
                session['account'] = current_account
            # Nested progress structure, same shape as written at start
            session['progress'] = {
                'current': current,
                'total': total,
            }
            if completed_accounts:
                session['completed_accounts'] = completed_accounts

        self._broadcast({
            'type': 'scraper_progress',
            'data': event_data,
        })

    def emit_scraper_completed(self, session_id: str, stats: Dict[str, int]) -> None:
        """
        Emit when scraper session completes

        Removes the session from the active-session mapping (a missing
        session is ignored) and broadcasts the final statistics.

        Args:
            session_id: Session identifier
            stats: Statistics dict with keys: total_downloaded, moved,
                review, duplicates, failed
        """
        # Remove session from app_state
        sessions = self._active_sessions()
        if sessions is not None:
            sessions.pop(session_id, None)

        self._broadcast({
            'type': 'scraper_completed',
            'data': {
                'session_id': session_id,
                'stats': stats,
                'timestamp': datetime.now().isoformat(),
            },
        })

    def emit_file_moved(self, session_id: str, platform: str, account: str,
                        filename: str, media_type: str, destination_type: str,
                        destination_path: str,
                        thumbnail_url: Optional[str] = None,
                        face_match: Optional[Dict[str, Any]] = None) -> None:
        """
        Emit when file is moved to destination

        Args:
            session_id: Session identifier
            platform: Platform name
            account: Account/username
            filename: File name
            media_type: 'image' or 'video'
            destination_type: 'media', 'review', or 'recycle'
            destination_path: Full path to destination file
            thumbnail_url: URL to thumbnail (optional)
            face_match: Face recognition result dict (optional); a missing
                or empty value is sent as ``{'matched': False}``
        """
        self._broadcast({
            'type': 'file_moved',
            'data': {
                'session_id': session_id,
                'platform': platform,
                'account': account,
                'filename': filename,
                'media_type': media_type,
                'destination_type': destination_type,
                'destination_path': destination_path,
                'thumbnail_url': thumbnail_url,
                'face_match': face_match or {'matched': False},
                'timestamp': datetime.now().isoformat(),
            },
        })

    def _broadcast(self, message: dict) -> None:
        """
        Broadcast a message to WebSocket clients, if a manager is configured

        Args:
            message: Event message dict
        """
        if self.websocket_manager:
            # broadcast_sync is the manager's synchronous entry point; the
            # original comment describes it as safe to call from background
            # threads (implementation not visible here -- confirm).
            self.websocket_manager.broadcast_sync(message)
|