Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

478
modules/base_module.py Normal file
View File

@@ -0,0 +1,478 @@
#!/usr/bin/env python3
"""
Base Module - Shared functionality for all media downloader modules
Provides:
- LoggingMixin: Consistent logging with universal logger and backwards-compatible callback support
- CookieManagerMixin: Centralized cookie loading/saving for scrapers
- RateLimitMixin: Smart delay handling for rate limiting
- DeferredDownloadsMixin: Track downloads for batch database recording
"""
import random
import time
from typing import Any, Dict, List, Optional
from modules.universal_logger import get_logger
class LoggingMixin:
    """
    Mixin providing consistent logging across all modules.

    Uses the universal logger for all logging, with optional callback support
    for backwards compatibility with existing code.

    Usage:
        class MyModule(LoggingMixin):
            def __init__(self, log_callback=None):
                self._init_logger('MyModule', log_callback)
                # ... rest of init

            def do_something(self):
                self.log("Starting operation", "info")
                # ...
                self.log("Operation complete", "success")
    """
    # Class-level defaults so log() degrades gracefully even if a subclass
    # forgets to call _init_logger (logger stays None in that case).
    _logger_name: str = 'Unknown'
    _default_module: str = 'Core'
    logger = None
    log_callback = None
    show_debug: bool = True

    def _init_logger(self, logger_name: str, log_callback=None, default_module: str = 'Core', show_debug: bool = True):
        """
        Initialize logging for this module.

        Args:
            logger_name: Name for the logger (e.g., 'Instagram', 'TikTok', 'Forum')
            log_callback: Optional callback function for backwards compatibility.
                Called as callback(formatted_message, level_lower).
            default_module: Default module name for log messages (default: 'Core')
            show_debug: Whether to show debug messages (default: True)
        """
        self._logger_name = logger_name
        self._default_module = default_module
        self.log_callback = log_callback
        self.show_debug = show_debug
        self.logger = get_logger(logger_name)

    def log(self, message: str, level: str = "info", module: Optional[str] = None):
        """
        Log a message using universal logger with optional callback.

        Args:
            message: The message to log
            level: Log level ('debug', 'info', 'warning', 'error', 'success', 'critical').
                Case-insensitive; normalized to lower for the callback and upper
                for the universal logger.
            module: Module name for the log entry (default: uses _default_module)
        """
        level_lower = level.lower()
        # Skip debug messages if show_debug is False
        if level_lower == "debug" and not self.show_debug:
            return
        # Use universal logger (always log here first)
        actual_module = module or self._default_module
        self.logger.log(message, level.upper(), module=actual_module)
        # Call log_callback for backwards compatibility with pre-universal-logger code
        if self.log_callback:
            self.log_callback(f"[{self._logger_name}] {message}", level_lower)
class CookieManagerMixin:
    """
    Mixin providing centralized cookie management for scrapers.

    Handles loading and saving cookies to/from the database.

    Usage:
        class MyScraper(LoggingMixin, CookieManagerMixin):
            def __init__(self, unified_db=None):
                self._init_logger('MyScraper')
                self._init_cookie_manager(unified_db, 'my_scraper')
                self._load_cookies_from_db()

            def after_auth(self, cookies):
                self._save_cookies_to_db(cookies)
    """
    # Class-level defaults so the helpers are safe to call even before
    # _init_cookie_manager runs (they just no-op / return empty values).
    unified_db = None
    scraper_id: str = ''
    cf_handler = None  # CloudflareHandler if used
    user_agent: str = ''

    def _init_cookie_manager(self, unified_db, scraper_id: str, cf_handler=None, user_agent: str = ''):
        """
        Initialize cookie management.

        Args:
            unified_db: UnifiedDatabase instance
            scraper_id: ID for this scraper in database
            cf_handler: Optional CloudflareHandler instance
            user_agent: User agent string
        """
        self.unified_db = unified_db
        self.scraper_id = scraper_id
        self.cf_handler = cf_handler
        self.user_agent = user_agent

    def _load_cookies_from_db(self) -> Optional[List[Dict]]:
        """
        Load cookies from database if available.

        Returns:
            List of cookie dicts or None if not available (no DB configured,
            no stored cookies, or a lookup error — errors are logged, not raised)
        """
        if not self.unified_db:
            return None
        try:
            cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
            if cookies:
                # Load into CloudflareHandler if available.
                # NOTE(review): writes the handler's private _cookies attribute
                # directly — consider a public setter on CloudflareHandler.
                if self.cf_handler:
                    self.cf_handler._cookies = cookies
                if hasattr(self, 'log'):
                    self.log(f"Loaded {len(cookies)} cookies from database", "debug")
                return cookies
        except Exception as e:
            # Best-effort: a cookie-load failure must not break scraper startup.
            if hasattr(self, 'log'):
                self.log(f"Error loading cookies from database: {e}", "warning")
        return None

    def _save_cookies_to_db(self, cookies: List[Dict], merge: bool = True, user_agent: Optional[str] = None):
        """
        Save cookies to database.

        Args:
            cookies: List of cookie dicts
            merge: Whether to merge with existing cookies
            user_agent: User agent to associate with cookies (important for cf_clearance).
                If not provided, uses self.user_agent as fallback.
        """
        if not self.unified_db:
            return
        try:
            # Use provided user_agent or fall back to self.user_agent
            ua = user_agent or self.user_agent
            self.unified_db.save_scraper_cookies(
                self.scraper_id,
                cookies,
                user_agent=ua,
                merge=merge
            )
            if hasattr(self, 'log'):
                self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50] if ua else 'None'}...)", "debug")
        except Exception as e:
            # Best-effort: a failed save is logged but never raised.
            if hasattr(self, 'log'):
                self.log(f"Error saving cookies to database: {e}", "warning")

    def _cookies_expired(self) -> bool:
        """
        Check if cookies are expired.

        Returns:
            True if expired (or if no CloudflareHandler is configured,
            which is treated as "needs refresh"), False otherwise
        """
        if self.cf_handler:
            return self.cf_handler.cookies_expired()
        return True

    def _get_cookies_for_requests(self) -> Dict[str, str]:
        """
        Get cookies in format for requests library.

        Returns:
            Dict of cookie name -> value (empty if no CloudflareHandler)
        """
        if self.cf_handler:
            return self.cf_handler.get_cookies_dict()
        return {}
class RateLimitMixin:
    """
    Mixin providing smart rate limiting for scrapers.

    Adds randomized delays between requests so traffic looks less
    mechanical and avoids tripping rate limits.

    Usage:
        class MyScraper(LoggingMixin, RateLimitMixin):
            def __init__(self):
                self._init_logger('MyScraper')
                self._init_rate_limiter(min_delay=5, max_delay=15, batch_delay=30)

            def download_batch(self, items):
                for i, item in enumerate(items):
                    self.download_item(item)
                    is_batch_end = (i + 1) % 10 == 0
                    self._smart_delay(is_batch_end)
    """
    # Defaults used when _init_rate_limiter is never called.
    min_delay: float = 5.0
    max_delay: float = 15.0
    batch_delay_min: float = 30.0
    batch_delay_max: float = 60.0
    error_delay: float = 120.0

    def _init_rate_limiter(
        self,
        min_delay: float = 5.0,
        max_delay: float = 15.0,
        batch_delay_min: float = 30.0,
        batch_delay_max: float = 60.0,
        error_delay: float = 120.0
    ):
        """
        Configure the rate limiter's delay windows.

        Args:
            min_delay: Minimum delay between requests (seconds)
            max_delay: Maximum delay between requests (seconds)
            batch_delay_min: Minimum delay between batches (seconds)
            batch_delay_max: Maximum delay between batches (seconds)
            error_delay: Delay after errors (seconds)
        """
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.batch_delay_min = batch_delay_min
        self.batch_delay_max = batch_delay_max
        self.error_delay = error_delay

    def _smart_delay(self, is_batch_end: bool = False, had_error: bool = False):
        """
        Sleep for an appropriate interval before the next request.

        Args:
            is_batch_end: True if this is the end of a batch (longer window)
            had_error: True if there was an error (fixed, longest delay)
        """
        if had_error:
            # Errors get a fixed, deliberately long back-off.
            delay = self.error_delay
        else:
            # Pick the window for this situation, then draw a random point in it.
            lo, hi = (
                (self.batch_delay_min, self.batch_delay_max)
                if is_batch_end
                else (self.min_delay, self.max_delay)
            )
            delay = random.uniform(lo, hi)
        if hasattr(self, 'log'):
            self.log(f"Waiting {delay:.1f}s before next request", "debug")
        time.sleep(delay)

    def _delay_after_error(self):
        """Back off after an error."""
        self._smart_delay(had_error=True)

    def _delay_between_items(self):
        """Pause between individual items."""
        self._smart_delay(is_batch_end=False)

    def _delay_between_batches(self):
        """Pause between batches."""
        self._smart_delay(is_batch_end=True)
class DeferredDownloadsMixin:
    """
    Mixin for tracking downloads to be recorded in batch.

    Allows deferring database writes for better performance.

    Usage:
        class MyScraper(LoggingMixin, DeferredDownloadsMixin):
            def __init__(self):
                self._init_logger('MyScraper')
                self._init_deferred_downloads()

            def download_file(self, url, path):
                # ... download logic ...
                self._add_pending_download({
                    'platform': 'my_platform',
                    'source': 'username',
                    'file_path': str(path),
                    # ... other fields ...
                })

            def finish_batch(self):
                downloads = self.get_pending_downloads()
                self.db.record_downloads_batch(downloads)
                self.clear_pending_downloads()
    """
    # None until _init_deferred_downloads (or the first add) creates the list;
    # annotation reflects that it is legitimately None at class level.
    pending_downloads: Optional[List[Dict]] = None

    def _init_deferred_downloads(self):
        """Initialize deferred downloads tracking."""
        self.pending_downloads = []

    def _add_pending_download(self, download_info: Dict[str, Any]):
        """
        Add a download to pending list.

        Args:
            download_info: Dict with download metadata
        """
        # Lazily create the instance list so the mixin still works if
        # _init_deferred_downloads was never called (and so we never
        # mutate the shared class-level attribute).
        if self.pending_downloads is None:
            self.pending_downloads = []
        self.pending_downloads.append(download_info)

    def get_pending_downloads(self) -> List[Dict[str, Any]]:
        """
        Get all pending downloads.

        Returns:
            List of pending download dicts (empty list if none tracked yet)
        """
        return self.pending_downloads or []

    def clear_pending_downloads(self):
        """Clear pending downloads list."""
        self.pending_downloads = []

    def has_pending_downloads(self) -> bool:
        """Check if there are pending downloads."""
        return bool(self.pending_downloads)
class BaseDatabaseAdapter:
    """
    Base class for platform-specific database adapters.

    Provides common functionality for recording and querying downloads;
    platform-specific adapters inherit from this class and add their own
    URL construction / metadata handling.

    Usage:
        class MyPlatformAdapter(BaseDatabaseAdapter):
            def __init__(self, unified_db):
                super().__init__(unified_db, platform='my_platform')

            def record_download(self, content_id, username, filename, **kwargs):
                # Platform-specific URL construction
                url = f"https://my_platform.com/{username}/{content_id}"
                return self._record_download_internal(
                    url=url,
                    source=username,
                    filename=filename,
                    **kwargs
                )
    """

    def __init__(self, unified_db, platform: str, method: str = None):
        """
        Initialize base adapter.

        Args:
            unified_db: UnifiedDatabase instance
            platform: Platform name (e.g., 'instagram', 'tiktok')
            method: Optional method identifier for multi-method platforms
        """
        self.db = unified_db
        self.unified_db = unified_db  # Alias for compatibility
        self.platform = platform
        # Fall back to the platform name when no distinct method is given.
        self.method = method if method else platform

    def get_connection(self, for_write: bool = False):
        """Get database connection (delegates to UnifiedDatabase)."""
        return self.db.get_connection(for_write)

    def get_file_hash(self, file_path: str) -> Optional[str]:
        """Calculate SHA256 hash of a file (delegates to UnifiedDatabase)."""
        return self.db.get_file_hash(file_path)

    def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
        """Get download record by file hash (delegates to UnifiedDatabase)."""
        return self.db.get_download_by_file_hash(file_hash)

    def get_download_by_media_id(self, media_id: str) -> Optional[Dict]:
        """Get download record by media_id, scoped to this platform and method."""
        return self.db.get_download_by_media_id(media_id, self.platform, self.method)

    def is_already_downloaded_by_hash(self, file_path: str) -> bool:
        """Check if file is already downloaded by comparing file hash."""
        digest = self.get_file_hash(file_path)
        if not digest:
            # No hash available (missing/unreadable file) -> treat as new.
            return False
        return self.get_download_by_file_hash(digest) is not None

    def is_already_downloaded_by_media_id(self, media_id: str) -> bool:
        """
        Check if content is already downloaded by media_id.

        NOTE(review): matches on platform + media_id only — unlike
        get_download_by_media_id it does not filter by method. Presumably
        intentional ("downloaded by any method"); confirm with callers.
        """
        with self.db.get_connection() as conn:
            cur = conn.cursor()
            cur.execute('''
                SELECT 1 FROM downloads
                WHERE platform = ?
                AND media_id = ?
                LIMIT 1
            ''', (self.platform, media_id))
            row = cur.fetchone()
        return row is not None

    def _calculate_file_hash(self, file_path: str) -> Optional[str]:
        """Helper to safely calculate file hash; returns None on any failure."""
        if not file_path:
            return None
        from pathlib import Path
        try:
            if not Path(file_path).exists():
                return None
            return self.get_file_hash(file_path)
        except Exception:
            # Best-effort: hashing problems never propagate to the caller.
            return None

    def _detect_content_type(self, filename: str) -> str:
        """Detect content type from filename extension ('image' or 'video')."""
        from pathlib import Path
        suffix = Path(filename).suffix.lower()
        # Anything not in the known image set is treated as video.
        if suffix in {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp', '.bmp', '.tiff'}:
            return 'image'
        return 'video'

    def _record_download_internal(
        self,
        url: str,
        source: str,
        filename: str,
        content_type: str = None,
        file_path: str = None,
        post_date=None,
        metadata: Dict = None,
        file_hash: str = None,
        **extra_kwargs
    ) -> bool:
        """
        Internal method to record a download.

        Args:
            url: Unique URL/identifier for the content
            source: Username or source identifier
            filename: Downloaded filename
            content_type: 'image' or 'video' (auto-detected if not provided)
            file_path: Full path to downloaded file
            post_date: Original post date
            metadata: Additional metadata dict
            file_hash: Pre-computed file hash (computed if not provided and file_path exists)
            **extra_kwargs: Additional arguments passed to unified_db.record_download
        """
        # Fill in anything the caller left out: content type from the
        # filename, file hash from the file on disk.
        resolved_type = content_type or self._detect_content_type(filename)
        resolved_hash = file_hash
        if file_path and not resolved_hash:
            resolved_hash = self._calculate_file_hash(file_path)
        return self.db.record_download(
            url=url,
            platform=self.platform,
            source=source,
            content_type=resolved_type,
            filename=filename,
            file_path=file_path,
            file_hash=resolved_hash,
            post_date=post_date,
            metadata=metadata,
            method=self.method,
            **extra_kwargs
        )