#!/usr/bin/env python3
"""
Base Module - Shared functionality for all media downloader modules

Provides:
- LoggingMixin: Consistent logging with universal logger and
  backwards-compatible callback support
- CookieManagerMixin: Centralized cookie loading/saving for scrapers
- RateLimitMixin: Smart delay handling for rate limiting
- DeferredDownloadsMixin: Track downloads for batch database recording
- BaseDatabaseAdapter: Common base for platform-specific DB adapters
"""

import random
import time
from typing import Any, Dict, List, Optional

from modules.universal_logger import get_logger


class LoggingMixin:
    """
    Mixin providing consistent logging across all modules.

    Uses the universal logger for all logging, with optional callback
    support for backwards compatibility with existing code.

    Usage:
        class MyModule(LoggingMixin):
            def __init__(self, log_callback=None):
                self._init_logger('MyModule', log_callback)
                # ... rest of init

            def do_something(self):
                self.log("Starting operation", "info")
                # ...
                self.log("Operation complete", "success")
    """

    # Class-level defaults so the mixin is safe to use even if
    # _init_logger() was never called on the instance.
    _logger_name: str = 'Unknown'
    _default_module: str = 'Core'
    logger = None
    log_callback = None
    show_debug: bool = True

    def _init_logger(self, logger_name: str, log_callback=None,
                     default_module: str = 'Core', show_debug: bool = True):
        """
        Initialize logging for this module.

        Args:
            logger_name: Name for the logger (e.g., 'Instagram', 'TikTok', 'Forum')
            log_callback: Optional callback function for backwards compatibility
            default_module: Default module name for log messages (default: 'Core')
            show_debug: Whether to show debug messages (default: True)
        """
        self._logger_name = logger_name
        self._default_module = default_module
        self.log_callback = log_callback
        self.show_debug = show_debug
        self.logger = get_logger(logger_name)

    def log(self, message: str, level: str = "info", module: Optional[str] = None):
        """
        Log a message using universal logger with optional callback.

        Args:
            message: The message to log
            level: Log level ('debug', 'info', 'warning', 'error', 'success', 'critical')
            module: Module name for the log entry (default: uses _default_module)
        """
        level_lower = level.lower()

        # Skip debug messages if show_debug is False
        if level_lower == "debug" and not self.show_debug:
            return

        # Use universal logger (always log here first)
        actual_module = module or self._default_module
        self.logger.log(message, level.upper(), module=actual_module)

        # Call log_callback for backwards compatibility
        if self.log_callback:
            self.log_callback(f"[{self._logger_name}] {message}", level_lower)


class CookieManagerMixin:
    """
    Mixin providing centralized cookie management for scrapers.

    Handles loading and saving cookies to/from the database.

    Usage:
        class MyScraper(LoggingMixin, CookieManagerMixin):
            def __init__(self, unified_db=None):
                self._init_logger('MyScraper')
                self._init_cookie_manager(unified_db, 'my_scraper')
                self._load_cookies_from_db()

            def after_auth(self, cookies):
                self._save_cookies_to_db(cookies)
    """

    unified_db = None
    scraper_id: str = ''
    cf_handler = None  # CloudflareHandler if used
    user_agent: str = ''

    def _init_cookie_manager(self, unified_db, scraper_id: str,
                             cf_handler=None, user_agent: str = ''):
        """
        Initialize cookie management.

        Args:
            unified_db: UnifiedDatabase instance
            scraper_id: ID for this scraper in database
            cf_handler: Optional CloudflareHandler instance
            user_agent: User agent string
        """
        self.unified_db = unified_db
        self.scraper_id = scraper_id
        self.cf_handler = cf_handler
        self.user_agent = user_agent

    def _load_cookies_from_db(self) -> Optional[List[Dict]]:
        """
        Load cookies from database if available.

        Returns:
            List of cookie dicts or None if not available
        """
        if not self.unified_db:
            return None

        try:
            cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
            if cookies:
                # Load into CloudflareHandler if available
                if self.cf_handler:
                    self.cf_handler._cookies = cookies

                # hasattr guard: LoggingMixin may not be mixed in
                if hasattr(self, 'log'):
                    self.log(f"Loaded {len(cookies)} cookies from database", "debug")
                return cookies
        except Exception as e:
            if hasattr(self, 'log'):
                self.log(f"Error loading cookies from database: {e}", "warning")

        return None

    def _save_cookies_to_db(self, cookies: List[Dict], merge: bool = True,
                            user_agent: Optional[str] = None):
        """
        Save cookies to database.

        Args:
            cookies: List of cookie dicts
            merge: Whether to merge with existing cookies
            user_agent: User agent to associate with cookies (important for
                        cf_clearance). If not provided, uses self.user_agent
                        as fallback.
        """
        if not self.unified_db:
            return

        try:
            # Use provided user_agent or fall back to self.user_agent
            ua = user_agent or self.user_agent
            self.unified_db.save_scraper_cookies(
                self.scraper_id, cookies, user_agent=ua, merge=merge
            )
            if hasattr(self, 'log'):
                # Truncate long UAs in the debug line; only append the
                # ellipsis when there actually is a UA (fixes "UA: None...").
                ua_display = f"{ua[:50]}..." if ua else "None"
                self.log(f"Saved {len(cookies)} cookies to database (UA: {ua_display})", "debug")
        except Exception as e:
            if hasattr(self, 'log'):
                self.log(f"Error saving cookies to database: {e}", "warning")

    def _cookies_expired(self) -> bool:
        """
        Check if cookies are expired.

        Returns:
            True if expired, False otherwise. Without a CloudflareHandler
            there is no expiry info, so cookies are treated as expired.
        """
        if self.cf_handler:
            return self.cf_handler.cookies_expired()
        return True

    def _get_cookies_for_requests(self) -> Dict[str, str]:
        """
        Get cookies in format for requests library.

        Returns:
            Dict of cookie name -> value (empty if no CloudflareHandler)
        """
        if self.cf_handler:
            return self.cf_handler.get_cookies_dict()
        return {}


class RateLimitMixin:
    """
    Mixin providing smart rate limiting for scrapers.

    Handles delays between requests to avoid detection and rate limiting.

    Usage:
        class MyScraper(LoggingMixin, RateLimitMixin):
            def __init__(self):
                self._init_logger('MyScraper')
                self._init_rate_limiter(min_delay=5, max_delay=15, batch_delay=30)

            def download_batch(self, items):
                for i, item in enumerate(items):
                    self.download_item(item)
                    is_batch_end = (i + 1) % 10 == 0
                    self._smart_delay(is_batch_end)
    """

    min_delay: float = 5.0
    max_delay: float = 15.0
    batch_delay_min: float = 30.0
    batch_delay_max: float = 60.0
    error_delay: float = 120.0

    def _init_rate_limiter(
        self,
        min_delay: float = 5.0,
        max_delay: float = 15.0,
        batch_delay_min: float = 30.0,
        batch_delay_max: float = 60.0,
        error_delay: float = 120.0
    ):
        """
        Initialize rate limiting.

        Args:
            min_delay: Minimum delay between requests (seconds)
            max_delay: Maximum delay between requests (seconds)
            batch_delay_min: Minimum delay between batches (seconds)
            batch_delay_max: Maximum delay between batches (seconds)
            error_delay: Delay after errors (seconds)
        """
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.batch_delay_min = batch_delay_min
        self.batch_delay_max = batch_delay_max
        self.error_delay = error_delay

    def _smart_delay(self, is_batch_end: bool = False, had_error: bool = False):
        """
        Apply smart delay between requests.

        Args:
            is_batch_end: True if this is the end of a batch
            had_error: True if there was an error (uses longer delay)
        """
        # Error delay takes precedence over batch-end delay.
        if had_error:
            delay = self.error_delay
        elif is_batch_end:
            delay = random.uniform(self.batch_delay_min, self.batch_delay_max)
        else:
            delay = random.uniform(self.min_delay, self.max_delay)

        if hasattr(self, 'log'):
            self.log(f"Waiting {delay:.1f}s before next request", "debug")

        time.sleep(delay)

    def _delay_after_error(self):
        """Apply error delay."""
        self._smart_delay(had_error=True)

    def _delay_between_items(self):
        """Apply normal delay between items."""
        self._smart_delay(is_batch_end=False)

    def _delay_between_batches(self):
        """Apply batch delay."""
        self._smart_delay(is_batch_end=True)


class DeferredDownloadsMixin:
    """
    Mixin for tracking downloads to be recorded in batch.

    Allows deferring database writes for better performance.

    Usage:
        class MyScraper(LoggingMixin, DeferredDownloadsMixin):
            def __init__(self):
                self._init_logger('MyScraper')
                self._init_deferred_downloads()

            def download_file(self, url, path):
                # ... download logic ...
                self._add_pending_download({
                    'platform': 'my_platform',
                    'source': 'username',
                    'file_path': str(path),
                    # ... other fields ...
                })

            def finish_batch(self):
                downloads = self.get_pending_downloads()
                self.db.record_downloads_batch(downloads)
                self.clear_pending_downloads()
    """

    # Class-level default is None (not []) to avoid a shared mutable list
    # across instances; _init_deferred_downloads() creates the real list.
    pending_downloads: Optional[List[Dict]] = None

    def _init_deferred_downloads(self):
        """Initialize deferred downloads tracking."""
        self.pending_downloads = []

    def _add_pending_download(self, download_info: Dict[str, Any]):
        """
        Add a download to pending list.

        Args:
            download_info: Dict with download metadata
        """
        # Lazy init in case _init_deferred_downloads() was never called.
        if self.pending_downloads is None:
            self.pending_downloads = []
        self.pending_downloads.append(download_info)

    def get_pending_downloads(self) -> List[Dict[str, Any]]:
        """
        Get all pending downloads.

        Returns:
            List of pending download dicts (empty list if none)
        """
        return self.pending_downloads or []

    def clear_pending_downloads(self):
        """Clear pending downloads list."""
        self.pending_downloads = []

    def has_pending_downloads(self) -> bool:
        """Check if there are pending downloads."""
        return bool(self.pending_downloads)


class BaseDatabaseAdapter:
    """
    Base class for platform-specific database adapters.

    Provides common functionality for recording and querying downloads.
    Platform-specific adapters should inherit from this class.

    Usage:
        class MyPlatformAdapter(BaseDatabaseAdapter):
            def __init__(self, unified_db):
                super().__init__(unified_db, platform='my_platform')

            def record_download(self, content_id, username, filename, **kwargs):
                # Platform-specific URL construction
                url = f"https://my_platform.com/{username}/{content_id}"
                return self._record_download_internal(
                    url=url, source=username, filename=filename, **kwargs
                )
    """

    def __init__(self, unified_db, platform: str, method: Optional[str] = None):
        """
        Initialize base adapter.

        Args:
            unified_db: UnifiedDatabase instance
            platform: Platform name (e.g., 'instagram', 'tiktok')
            method: Optional method identifier for multi-method platforms
                    (defaults to the platform name)
        """
        self.db = unified_db
        self.unified_db = unified_db  # Alias for compatibility
        self.platform = platform
        self.method = method or platform

    def get_connection(self, for_write: bool = False):
        """Get database connection (delegates to UnifiedDatabase)."""
        return self.db.get_connection(for_write)

    def get_file_hash(self, file_path: str) -> Optional[str]:
        """Calculate SHA256 hash of a file."""
        return self.db.get_file_hash(file_path)

    def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
        """Get download record by file hash."""
        return self.db.get_download_by_file_hash(file_hash)

    def get_download_by_media_id(self, media_id: str) -> Optional[Dict]:
        """Get download record by media_id (scoped to platform AND method)."""
        return self.db.get_download_by_media_id(media_id, self.platform, self.method)

    def is_already_downloaded_by_hash(self, file_path: str) -> bool:
        """Check if file is already downloaded by comparing file hash."""
        file_hash = self.get_file_hash(file_path)
        if not file_hash:
            return False
        return self.get_download_by_file_hash(file_hash) is not None

    def is_already_downloaded_by_media_id(self, media_id: str) -> bool:
        """
        Check if content is already downloaded by media_id.

        NOTE(review): unlike get_download_by_media_id(), this query matches
        on platform only (no method filter) — presumably intentional so a
        download via any method counts; confirm against callers.
        """
        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT 1 FROM downloads
                WHERE platform = ? AND media_id = ?
                LIMIT 1
            ''', (self.platform, media_id))
            return cursor.fetchone() is not None

    def _calculate_file_hash(self, file_path: str) -> Optional[str]:
        """Helper to safely calculate file hash (None on any failure)."""
        if not file_path:
            return None
        try:
            from pathlib import Path
            if Path(file_path).exists():
                return self.get_file_hash(file_path)
        except Exception:
            # Best-effort: hashing failures must not block recording.
            pass
        return None

    def _detect_content_type(self, filename: str) -> str:
        """Detect content type from filename extension ('image' or 'video')."""
        from pathlib import Path
        ext = Path(filename).suffix.lower()
        image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif',
                      '.webp', '.bmp', '.tiff'}
        return 'image' if ext in image_exts else 'video'

    def _record_download_internal(
        self,
        url: str,
        source: str,
        filename: str,
        content_type: Optional[str] = None,
        file_path: Optional[str] = None,
        post_date=None,
        metadata: Optional[Dict] = None,
        file_hash: Optional[str] = None,
        **extra_kwargs
    ) -> bool:
        """
        Internal method to record a download.

        Args:
            url: Unique URL/identifier for the content
            source: Username or source identifier
            filename: Downloaded filename
            content_type: 'image' or 'video' (auto-detected if not provided)
            file_path: Full path to downloaded file
            post_date: Original post date
            metadata: Additional metadata dict
            file_hash: Pre-computed file hash (computed if not provided and
                       file_path exists)
            **extra_kwargs: Additional arguments passed to
                            unified_db.record_download
        """
        # Auto-detect content type if not provided
        if not content_type:
            content_type = self._detect_content_type(filename)

        # Calculate file hash if not provided
        if not file_hash and file_path:
            file_hash = self._calculate_file_hash(file_path)

        return self.db.record_download(
            url=url,
            platform=self.platform,
            source=source,
            content_type=content_type,
            filename=filename,
            file_path=file_path,
            file_hash=file_hash,
            post_date=post_date,
            metadata=metadata,
            method=self.method,
            **extra_kwargs
        )