319
modules/service_health_monitor.py
Normal file
319
modules/service_health_monitor.py
Normal file
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Service Health Monitor - Tracks service failures and sends alerts
|
||||
Only active during scheduler mode for unattended operation monitoring
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Optional
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
|
||||
class ServiceHealthMonitor:
    """Monitor service health and send alerts when services get stuck.

    Tracks consecutive failures per service in a JSON state file and sends
    Pushover notifications when a service crosses the failure threshold,
    plus optional recovery notifications when it comes back. Monitoring is
    only active in scheduler mode (unattended operation).
    """

    def __init__(self,
                 state_file: str = "/opt/media-downloader/database/service_health.json",
                 config: dict = None,
                 error_monitoring_config: dict = None,
                 pushover_notifier=None,
                 scheduler_mode: bool = False):
        """
        Initialize health monitor

        Args:
            state_file: Path to JSON file storing health state
            config: Configuration dict from settings.json
            error_monitoring_config: Error monitoring settings (for push alert delay)
            pushover_notifier: Instance of PushoverNotifier for alerts
            scheduler_mode: Only monitor when True (scheduler mode)
        """
        self.state_file = Path(state_file)
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        self.pushover = pushover_notifier
        self.scheduler_mode = scheduler_mode
        self.error_monitoring_config = error_monitoring_config or {}

        # Setup logging FIRST: _load_state() logs via self.logger on a read
        # failure, so the logger must exist before the state file is loaded.
        # (Previously the logger was created after _load_state(), so a corrupt
        # state file raised AttributeError instead of being logged/recovered.)
        self.logger = get_logger('ServiceHealthMonitor')

        # Default configuration
        self.config = {
            'enabled': True,
            'notification_cooldown_hours': 24,
            'min_consecutive_failures': 2,  # Number of consecutive run failures before alerting
            'services': {
                'fastdl': {'monitor': True, 'notify': True},
                'imginn': {'monitor': True, 'notify': True},
                'snapchat': {'monitor': True, 'notify': True},
                'toolzu': {'monitor': True, 'notify': True},
                'tiktok': {'monitor': True, 'notify': True},
                'forums': {'monitor': True, 'notify': True}
            },
            'pushover': {
                'enabled': True,
                'priority': 0,
                'sound': 'pushover'
            }
        }

        # Merge user config. NOTE: this is a shallow merge — a user-supplied
        # 'services' or 'pushover' key replaces the entire default sub-dict.
        if config:
            self.config.update(config)

        # Load or initialize state
        self.state = self._load_state()

    def _load_state(self) -> Dict:
        """Load health state from file; fall back to empty state on any error."""
        if self.state_file.exists():
            try:
                with open(self.state_file, 'r') as f:
                    return json.load(f)
            except Exception as e:
                # Best-effort: a corrupt/unreadable file is logged and replaced
                # with fresh state rather than crashing the monitor.
                self.logger.error(f"Failed to load health state: {e}")

        # Initialize empty state
        return {'service_health': {}}

    def _save_state(self):
        """Persist health state to file (best-effort; errors are logged)."""
        try:
            with open(self.state_file, 'w') as f:
                # default=str guards against any non-JSON-serializable value
                # (e.g. a datetime) that slips into the state dict.
                json.dump(self.state, f, indent=2, default=str)
        except Exception as e:
            self.logger.error(f"Failed to save health state: {e}")

    def _get_service_state(self, service: str) -> Dict:
        """Get state for a service, initializing a healthy record if absent."""
        if service not in self.state['service_health']:
            self.state['service_health'][service] = {
                'status': 'healthy',
                'consecutive_failures': 0,
                'last_success': None,
                'last_failure': None,
                'last_notification_sent': None,
                'failure_type': None,
                'total_failures': 0,
                'total_successes': 0
            }
        return self.state['service_health'][service]

    def record_success(self, service: str):
        """
        Record successful operation for a service

        Resets the consecutive-failure counter, marks the service healthy,
        and sends a recovery notification if it was previously stuck.

        Args:
            service: Service name (fastdl, imginn, snapchat, etc.)
        """
        # Only monitor in scheduler mode
        if not self.scheduler_mode:
            return

        # Check if service is monitored
        if not self._is_monitored(service):
            return

        state = self._get_service_state(service)
        now = datetime.now()

        # Was service previously stuck? Send recovery notification
        was_stuck = state['status'] == 'stuck'

        # Update state
        state['status'] = 'healthy'
        state['consecutive_failures'] = 0
        state['last_success'] = now.isoformat()
        state['failure_type'] = None
        state['total_successes'] += 1

        self._save_state()

        # Send recovery notification if service was stuck
        if was_stuck and self._should_notify(service):
            self._send_recovery_notification(service, now)

    def record_failure(self, service: str, reason: str = 'unknown'):
        """
        Record failure for a service

        Increments the consecutive-failure counter; once it reaches
        min_consecutive_failures the service is marked 'stuck' and an
        alert is sent (subject to the notification cooldown).

        Args:
            service: Service name (fastdl, imginn, snapchat, etc.)
            reason: Reason for failure (cloudflare, rate_limit, timeout, etc.)
        """
        # Only monitor in scheduler mode
        if not self.scheduler_mode:
            return

        # Check if service is monitored
        if not self._is_monitored(service):
            return

        state = self._get_service_state(service)
        now = datetime.now()

        # Update state - increment consecutive failures
        state['consecutive_failures'] += 1
        state['last_failure'] = now.isoformat()
        state['failure_type'] = reason
        state['total_failures'] += 1

        # Check if service should be marked as stuck based on consecutive run failures
        min_failures = self.config.get('min_consecutive_failures', 2)
        if state['consecutive_failures'] >= min_failures:
            state['status'] = 'stuck'

            # Send notification if cooldown period has passed
            if self._should_notify(service) and self._notification_cooldown_expired(service):
                self._send_alert_notification(service, reason, now)
                state['last_notification_sent'] = now.isoformat()

        self._save_state()

    def _is_monitored(self, service: str) -> bool:
        """Check if service should be monitored (global + per-service flag)."""
        if not self.config.get('enabled', True):
            return False

        service_config = self.config.get('services', {}).get(service, {})
        return service_config.get('monitor', True)

    def _should_notify(self, service: str) -> bool:
        """Check if notifications are enabled for this service."""
        # No notifier instance means notifications are impossible.
        if not self.pushover:
            return False

        if not self.config.get('pushover', {}).get('enabled', True):
            return False

        service_config = self.config.get('services', {}).get(service, {})
        return service_config.get('notify', True)

    def _notification_cooldown_expired(self, service: str) -> bool:
        """Check if the per-service notification cooldown period has expired."""
        state = self._get_service_state(service)
        last_sent = state.get('last_notification_sent')

        if not last_sent:
            return True  # Never sent, can send now

        try:
            last_sent_time = datetime.fromisoformat(last_sent)
            # Use push_alert_delay_hours from error_monitoring config if available,
            # otherwise fall back to notification_cooldown_hours or default 24
            cooldown_hours = self.error_monitoring_config.get('push_alert_delay_hours',
                                                              self.config.get('notification_cooldown_hours', 24))
            cooldown_period = timedelta(hours=cooldown_hours)

            return datetime.now() - last_sent_time > cooldown_period
        except (ValueError, TypeError):
            return True  # Error parsing date, allow notification

    def _send_alert_notification(self, service: str, reason: str, now: datetime):
        """Send Pushover alert notification for a stuck service."""
        state = self._get_service_state(service)

        # Calculate time since last success (human-readable)
        time_stuck = "Unknown"
        if state['last_success']:
            try:
                last_success = datetime.fromisoformat(state['last_success'])
                delta = now - last_success
                hours = int(delta.total_seconds() / 3600)
                if hours < 1:
                    time_stuck = f"{int(delta.total_seconds() / 60)} minutes ago"
                elif hours < 48:
                    time_stuck = f"{hours} hours ago"
                else:
                    days = int(hours / 24)
                    time_stuck = f"{days} days ago"
            except (ValueError, TypeError):
                pass

        # Format service name
        service_name = service.replace('_', ' ').title()

        # Map machine reason codes to human-readable text
        reason_map = {
            'cloudflare': 'Cloudflare Challenge',
            'cloudflare_challenge': 'Cloudflare Challenge',
            'rate_limit': 'Rate Limited (429)',
            'forbidden': 'Access Forbidden (403)',
            'timeout': 'Connection Timeout',
            'authentication': 'Authentication Required',
            'captcha': 'CAPTCHA Challenge',
            'blocked': 'IP Blocked',
            'unknown': 'Unknown Error'
        }
        reason_text = reason_map.get(reason.lower(), reason)

        # Build message
        title = f"⚠️ Service Alert: {service_name}"
        message = f"""Status: Stuck/Blocked
Issue: {reason_text}
Failed Since: {now.strftime('%b %d, %I:%M %p')} ({state['consecutive_failures']} consecutive failures)

Last successful download: {time_stuck if state['last_success'] else 'Never'}

Action may be required.
"""

        # Send notification (best-effort; a notifier error must not break the run)
        try:
            priority = self.config.get('pushover', {}).get('priority', 0)
            sound = self.config.get('pushover', {}).get('sound', 'pushover')

            self.pushover.send_notification(
                title=title,
                message=message,
                priority=priority,
                sound=sound
            )

            self.logger.info(f"Sent alert notification for {service}: {reason}")
        except Exception as e:
            self.logger.error(f"Failed to send alert notification: {e}")

    def _send_recovery_notification(self, service: str, now: datetime):
        """Send recovery notification (optional, off by default)."""
        # Recovery notifications are optional - can be disabled
        if not self.config.get('send_recovery_notifications', False):
            return

        state = self._get_service_state(service)
        service_name = service.replace('_', ' ').title()

        title = f"✅ Service Recovered: {service_name}"
        message = f"""Status: Healthy
Service is working again.

Recovered at: {now.strftime('%b %d, %I:%M %p')}
"""

        try:
            self.pushover.send_notification(
                title=title,
                message=message,
                priority=-1,  # Low priority for recovery
                sound='magic'
            )

            self.logger.info(f"Sent recovery notification for {service}")
        except Exception as e:
            self.logger.error(f"Failed to send recovery notification: {e}")

    def get_service_status(self, service: str) -> Dict:
        """Get current status for a service (shallow copy)."""
        return self._get_service_state(service).copy()

    def get_all_status(self) -> Dict:
        """Get status for all services (shallow copy)."""
        return self.state['service_health'].copy()

    def reset_service(self, service: str):
        """Reset state for a service (removes its record and persists)."""
        if service in self.state['service_health']:
            del self.state['service_health'][service]
            self._save_state()
|
||||
Reference in New Issue
Block a user