194
modules/scraper_event_emitter.py
Normal file
194
modules/scraper_event_emitter.py
Normal file
@@ -0,0 +1,194 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
Thread-safe WebSocket event emitter for scraper monitoring

Provides real-time events for the scraping monitor page:
- Scraper sessions starting/completing
- File downloads and movements
- Progress updates
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
|
||||
class ScraperEventEmitter:
    """Emit WebSocket events for real-time scraper monitoring.

    Every ``emit_*`` method builds a ``{'type': ..., 'data': ...}`` message
    and hands it to ``websocket_manager.broadcast_sync``.  When an
    ``app_state`` object exposing an ``active_scraper_sessions`` mapping is
    supplied, the session lifecycle methods also mirror the session into
    that mapping so it can be read back outside the WebSocket channel.

    Both collaborators are optional: without a manager nothing is
    broadcast, and without a session registry nothing is tracked.

    NOTE(review): this class takes no locks of its own.  Broadcasting is
    delegated to the manager's ``broadcast_sync``; updates to the session
    mapping are plain dict operations.
    """

    def __init__(self, websocket_manager=None, app_state=None):
        """Initialize the event emitter.

        Args:
            websocket_manager: Object providing ``broadcast_sync(message)``
                (optional).
            app_state: Application state object; if it has an
                ``active_scraper_sessions`` mapping, active sessions are
                tracked there (optional).
        """
        self.websocket_manager = websocket_manager
        self.app_state = app_state

    def _active_sessions(self):
        """Return the shared session registry, or ``None`` when unavailable.

        ``getattr`` with a default covers a missing ``app_state``, an
        ``app_state`` without the attribute, and a state object that
        happens to evaluate as falsy, all in a single lookup.
        """
        return getattr(self.app_state, 'active_scraper_sessions', None)

    def emit_scraper_started(self, session_id: str, platform: str, account: str,
                             content_type: str, estimated_count: int = 0,
                             accounts_list: Optional[list] = None) -> None:
        """Emit a ``scraper_started`` event and register the session.

        Args:
            session_id: Unique session identifier.
            platform: Platform name (instagram, snapchat, etc.).
            account: Account/username being scraped (or comma-separated list).
            content_type: Type of content (stories, posts, etc.).
            estimated_count: Estimated number of items to download.
            accounts_list: Optional list of all accounts to be processed;
                only included in the event when non-empty.
        """
        # Take one timestamp so the broadcast event and the stored session
        # agree on exactly when the session started.
        started_at = datetime.now().isoformat()

        event_data = {
            'session_id': session_id,
            'platform': platform,
            'account': account,
            'content_type': content_type,
            'estimated_count': estimated_count,
            'timestamp': started_at,
        }
        if accounts_list:
            event_data['accounts_list'] = accounts_list

        # Store the session for API retrieval, matching the scheduler's
        # data structure.
        sessions = self._active_sessions()
        if sessions is not None:
            sessions[session_id] = {
                'session_id': session_id,
                'platform': platform,
                'account': account,
                'content_type': content_type,
                'start_time': started_at,
                'status': 'Starting...',
                'detailed_status': 'Starting...',
                'progress': {
                    'current': 0,
                    # Fall back to 100 when no estimate was given so the
                    # stored total is never zero.
                    'total': estimated_count or 100,
                },
                'stats': {'media': 0, 'review': 0, 'failed': 0},
            }

        self._broadcast({'type': 'scraper_started', 'data': event_data})

    def emit_scraper_progress(self, session_id: str, status: str,
                              current: int, total: int,
                              current_account: Optional[str] = None,
                              completed_accounts: Optional[list] = None) -> None:
        """Emit a ``scraper_progress`` event and update the tracked session.

        Sessions that are not currently registered are not created here;
        the event is still broadcast.

        Args:
            session_id: Session identifier.
            status: Status message (e.g., "Downloading stories...").
            current: Current item count.
            total: Total item count.
            current_account: Currently active account/forum name (optional).
            completed_accounts: List of completed accounts (optional).
        """
        event_data = {
            'session_id': session_id,
            'status': status,
            'progress_current': current,
            'progress_total': total,
            'timestamp': datetime.now().isoformat(),
        }
        if current_account:
            event_data['current_account'] = current_account
        if completed_accounts:
            event_data['completed_accounts'] = completed_accounts

        sessions = self._active_sessions()
        if sessions is not None:
            # Single lookup instead of "in" followed by indexing: the entry
            # may be popped by emit_scraper_completed between the two steps.
            session = sessions.get(session_id)
            if session is not None:
                session['status'] = status
                session['detailed_status'] = status
                if current_account:
                    session['account'] = current_account
                # Nested progress structure to match the scheduler.
                session['progress'] = {'current': current, 'total': total}
                if completed_accounts:
                    session['completed_accounts'] = completed_accounts

        self._broadcast({'type': 'scraper_progress', 'data': event_data})

    def emit_scraper_completed(self, session_id: str, stats: Dict[str, int]) -> None:
        """Emit a ``scraper_completed`` event and unregister the session.

        Args:
            session_id: Session identifier.
            stats: Statistics dict with keys: total_downloaded, moved,
                review, duplicates, failed.
        """
        sessions = self._active_sessions()
        if sessions is not None:
            # pop with a default: completing an unknown session is a no-op.
            sessions.pop(session_id, None)

        self._broadcast({
            'type': 'scraper_completed',
            'data': {
                'session_id': session_id,
                'stats': stats,
                'timestamp': datetime.now().isoformat(),
            },
        })

    def emit_file_moved(self, session_id: str, platform: str, account: str,
                        filename: str, media_type: str, destination_type: str,
                        destination_path: str, thumbnail_url: Optional[str] = None,
                        face_match: Optional[Dict[str, Any]] = None) -> None:
        """Emit a ``file_moved`` event for a file placed at its destination.

        Args:
            session_id: Session identifier.
            platform: Platform name.
            account: Account/username.
            filename: File name.
            media_type: 'image' or 'video'.
            destination_type: 'media', 'review', or 'recycle'.
            destination_path: Full path to destination file.
            thumbnail_url: URL to thumbnail (optional).
            face_match: Face recognition result dict (optional); when
                omitted or empty, ``{'matched': False}`` is sent.
        """
        self._broadcast({
            'type': 'file_moved',
            'data': {
                'session_id': session_id,
                'platform': platform,
                'account': account,
                'filename': filename,
                'media_type': media_type,
                'destination_type': destination_type,
                'destination_path': destination_path,
                'thumbnail_url': thumbnail_url,
                'face_match': face_match or {'matched': False},
                'timestamp': datetime.now().isoformat(),
            },
        })

    def _broadcast(self, message: dict) -> None:
        """Send *message* to WebSocket clients via the manager, if one is set.

        Args:
            message: Event message dict.
        """
        if self.websocket_manager:
            # broadcast_sync is the manager's entry point intended for
            # emission from background threads.
            self.websocket_manager.broadcast_sync(message)
|
||||
Reference in New Issue
Block a user