Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

491
modules/activity_status.py Normal file
View File

@@ -0,0 +1,491 @@
#!/usr/bin/env python3
"""
Activity Status Manager
Centralized module for tracking and updating real-time download activity status
Stores status in database for reliable, concurrent access
Supports:
- Single main activity (scheduler) via activity_status table
- Multiple background tasks (YouTube monitor, etc.) via background_task_status table
"""
import json
import threading
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, Any, List

from modules.universal_logger import get_logger
logger = get_logger('ActivityStatus')
class ActivityStatusManager:
    """Manage real-time activity status stored in the database.

    State lives in two tables:

    - ``activity_status``: a single row (``id = 1``) describing the main
      scheduler activity.
    - ``background_task_status``: one row per background task, keyed by
      ``task_id``.

    Every method catches exceptions, logs them through the module-level
    ``logger`` and returns a neutral value, so a status-reporting failure
    never propagates to the caller.
    """

    # Shared by both background-task readers. The column order must match
    # the indexes used in ``_row_to_task``.
    _TASK_SELECT = '''
        SELECT task_id, active, task_type, display_name, start_time,
               status, detailed_status, progress_current, progress_total,
               extra_data, updated_at
        FROM background_task_status
    '''

    def __init__(self, unified_db=None):
        """
        Initialize activity status manager

        Args:
            unified_db: UnifiedDatabase instance (optional, one is created
                when nothing truthy is supplied)
        """
        self.db = unified_db
        if not self.db:
            # Only imported when the caller did not hand in a database.
            from modules.unified_database import UnifiedDatabase
            self.db = UnifiedDatabase()
        self._ensure_table()

    # ---------------------------------------------------------------------
    # Internal helpers
    # ---------------------------------------------------------------------

    @staticmethod
    def _now() -> str:
        """Return the current local time as an ISO-8601 string."""
        return datetime.now().isoformat()

    @staticmethod
    def _idle_activity() -> Dict[str, Any]:
        """Return a fresh dict describing "no main activity"."""
        return {
            'active': False,
            'task_id': None,
            'platform': None,
            'account': None,
            'start_time': None,
            'status': None
        }

    @staticmethod
    def _row_to_task(row) -> Dict[str, Any]:
        """Convert a ``_TASK_SELECT`` row into a background-task dict.

        Optional keys (``detailed_status``, ``progress``, ``extra_data``) are
        only present when the row carries a value for them. Unparseable
        ``extra_data`` is reported as an empty dict.
        """
        task = {
            'task_id': row[0],
            'active': bool(row[1]),
            'task_type': row[2],
            'display_name': row[3],
            'start_time': row[4],
            'status': row[5],
            'updated_at': row[10]
        }
        if row[6]:  # detailed_status
            task['detailed_status'] = row[6]
        if row[7] is not None and row[8] is not None:  # progress
            task['progress'] = {
                'current': row[7],
                'total': row[8]
            }
        if row[9]:  # extra_data
            try:
                task['extra_data'] = json.loads(row[9])
            except (json.JSONDecodeError, TypeError, ValueError) as e:
                logger.debug(f"Failed to parse extra_data for task {row[0]}: {e}")
                task['extra_data'] = {}
        return task

    def _ensure_table(self):
        """Ensure activity_status and background_task_status tables exist"""
        try:
            # This method creates tables, alters them and inserts a row, so it
            # asks for a write connection like every other writer in the class.
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # Main scheduler activity table (single row)
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS activity_status (
                        id INTEGER PRIMARY KEY CHECK (id = 1),
                        active INTEGER NOT NULL DEFAULT 0,
                        task_id TEXT,
                        platform TEXT,
                        account TEXT,
                        start_time TEXT,
                        status TEXT,
                        detailed_status TEXT,
                        progress_current INTEGER,
                        progress_total INTEGER,
                        updated_at TEXT DEFAULT CURRENT_TIMESTAMP
                    )
                ''')
                # Add account progress columns if missing (tables created
                # before these columns existed)
                cursor.execute("PRAGMA table_info(activity_status)")
                existing = {col[1] for col in cursor.fetchall()}
                for column in ('account_current', 'account_total'):
                    if column not in existing:
                        cursor.execute(
                            f'ALTER TABLE activity_status ADD COLUMN {column} INTEGER'
                        )
                # Insert default row if doesn't exist
                cursor.execute('''
                    INSERT OR IGNORE INTO activity_status (id, active)
                    VALUES (1, 0)
                ''')
                # Background tasks table (multiple concurrent tasks like YouTube monitor)
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS background_task_status (
                        task_id TEXT PRIMARY KEY,
                        active INTEGER NOT NULL DEFAULT 0,
                        task_type TEXT,
                        display_name TEXT,
                        start_time TEXT,
                        status TEXT,
                        detailed_status TEXT,
                        progress_current INTEGER,
                        progress_total INTEGER,
                        extra_data TEXT,
                        updated_at TEXT DEFAULT CURRENT_TIMESTAMP
                    )
                ''')
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to create activity tables: {e}")

    # ---------------------------------------------------------------------
    # Main activity (single row)
    # ---------------------------------------------------------------------

    def start_activity(self, task_id: str, platform: str, account: str, status: str = "Running"):
        """
        Mark activity as started

        Clears detailed status and all progress counters left over from the
        previous activity.

        Args:
            task_id: Unique task identifier
            platform: Platform name (instagram, snapchat, etc)
            account: Account/username being processed
            status: Initial status message
        """
        try:
            # One timestamp so start_time and updated_at agree exactly.
            now = self._now()
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE activity_status
                    SET active = 1,
                        task_id = ?,
                        platform = ?,
                        account = ?,
                        start_time = ?,
                        status = ?,
                        detailed_status = NULL,
                        progress_current = NULL,
                        progress_total = NULL,
                        account_current = NULL,
                        account_total = NULL,
                        updated_at = ?
                    WHERE id = 1
                ''', (task_id, platform, account, now, status, now))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to start activity: {e}")

    def update_status(self, detailed_status: str, progress_current: Optional[int] = None,
                      progress_total: Optional[int] = None):
        """Update detailed status message and progress.

        Only applies while the activity is active. A progress argument left
        as None keeps the stored value (COALESCE) instead of clearing it.

        Args:
            detailed_status: New detailed status message
            progress_current: Current progress count, or None to keep
            progress_total: Total progress count, or None to keep
        """
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE activity_status
                    SET detailed_status = ?,
                        progress_current = COALESCE(?, progress_current),
                        progress_total = COALESCE(?, progress_total),
                        updated_at = ?
                    WHERE id = 1 AND active = 1
                ''', (detailed_status, progress_current, progress_total,
                      self._now()))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to update status: {e}")

    def update_account_name(self, account: str):
        """Update the current account name being processed.

        Only applies while the activity is active.
        """
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE activity_status
                    SET account = ?,
                        updated_at = ?
                    WHERE id = 1 AND active = 1
                ''', (account, self._now()))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to update account name: {e}")

    def update_account_progress(self, account_current: int, account_total: int):
        """Update account-level progress and reset file-level progress for the new account"""
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE activity_status
                    SET account_current = ?,
                        account_total = ?,
                        progress_current = NULL,
                        progress_total = NULL,
                        updated_at = ?
                    WHERE id = 1 AND active = 1
                ''', (account_current, account_total, self._now()))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to update account progress: {e}")

    def stop_activity(self):
        """Mark activity as stopped and clear detailed status and progress.

        task_id, platform, account, start_time and status are left in place.
        """
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE activity_status
                    SET active = 0,
                        detailed_status = NULL,
                        progress_current = NULL,
                        progress_total = NULL,
                        account_current = NULL,
                        account_total = NULL,
                        updated_at = ?
                    WHERE id = 1
                ''', (self._now(),))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to stop activity: {e}")

    def get_current_activity(self) -> Dict[str, Any]:
        """
        Get current activity status

        Returns:
            Dict with keys active, task_id, platform, account, start_time and
            status. detailed_status, progress and account_progress are added
            only when stored. An idle dict is returned when the row is
            missing or the query fails.
        """
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT active, task_id, platform, account, start_time, status,
                           detailed_status, progress_current, progress_total,
                           account_current, account_total
                    FROM activity_status
                    WHERE id = 1
                ''')
                row = cursor.fetchone()
                if not row:
                    return self._idle_activity()
                result = {
                    'active': bool(row[0]),
                    'task_id': row[1],
                    'platform': row[2],
                    'account': row[3],
                    'start_time': row[4],
                    'status': row[5]
                }
                # Add optional fields only if they exist
                if row[6]:  # detailed_status
                    result['detailed_status'] = row[6]
                if row[7] is not None and row[8] is not None:  # progress
                    result['progress'] = {
                        'current': row[7],
                        'total': row[8]
                    }
                if row[9] is not None and row[10] is not None:  # account_progress
                    result['account_progress'] = {
                        'current': row[9],
                        'total': row[10]
                    }
                return result
        except Exception as e:
            logger.error(f"Failed to get current activity: {e}")
            return self._idle_activity()

    # =========================================================================
    # BACKGROUND TASK METHODS (for concurrent tasks like YouTube monitor)
    # =========================================================================

    def start_background_task(self, task_id: str, task_type: str, display_name: str,
                              status: str = "Running", extra_data: Dict = None):
        """
        Start a background task (doesn't interfere with main activity).

        If a row for ``task_id`` is already active, nothing is changed.

        Args:
            task_id: Unique task identifier (e.g., 'youtube_monitor')
            task_type: Type of task (e.g., 'youtube_monitor', 'video_processor')
            display_name: Human-readable name for display
            status: Initial status message
            extra_data: Optional extra data to store as JSON (an empty or
                missing value is stored as NULL)
        """
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # Check if task is already running - don't reset if so
                cursor.execute('''
                    SELECT active FROM background_task_status WHERE task_id = ?
                ''', (task_id,))
                row = cursor.fetchone()
                if row and row[0] == 1:
                    logger.debug(f"Background task {task_id} already running, not resetting")
                    return
                # One timestamp so start_time and updated_at agree exactly.
                now = self._now()
                cursor.execute('''
                    INSERT OR REPLACE INTO background_task_status
                    (task_id, active, task_type, display_name, start_time, status,
                     detailed_status, progress_current, progress_total, extra_data, updated_at)
                    VALUES (?, 1, ?, ?, ?, ?, NULL, NULL, NULL, ?, ?)
                ''', (task_id, task_type, display_name, now,
                      status, json.dumps(extra_data) if extra_data else None,
                      now))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to start background task {task_id}: {e}")

    def update_background_task(self, task_id: str, detailed_status: str,
                               progress_current: Optional[int] = None,
                               progress_total: Optional[int] = None,
                               extra_data: Dict = None):
        """Update a background task's status.

        Only applies while the task is active. Unlike ``update_status``, the
        progress values are written exactly as given, so None clears them.
        ``extra_data`` is only overwritten when it is not None.

        Args:
            task_id: Task identifier
            detailed_status: New detailed status message
            progress_current: Current progress count (None stores NULL)
            progress_total: Total progress count (None stores NULL)
            extra_data: Optional replacement extra data, stored as JSON
        """
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                if extra_data is not None:
                    cursor.execute('''
                        UPDATE background_task_status
                        SET detailed_status = ?,
                            progress_current = ?,
                            progress_total = ?,
                            extra_data = ?,
                            updated_at = ?
                        WHERE task_id = ? AND active = 1
                    ''', (detailed_status, progress_current, progress_total,
                          json.dumps(extra_data), self._now(), task_id))
                else:
                    cursor.execute('''
                        UPDATE background_task_status
                        SET detailed_status = ?,
                            progress_current = ?,
                            progress_total = ?,
                            updated_at = ?
                        WHERE task_id = ? AND active = 1
                    ''', (detailed_status, progress_current, progress_total,
                          self._now(), task_id))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to update background task {task_id}: {e}")

    def stop_background_task(self, task_id: str):
        """Mark a background task as stopped."""
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE background_task_status
                    SET active = 0,
                        updated_at = ?
                    WHERE task_id = ?
                ''', (self._now(), task_id))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to stop background task {task_id}: {e}")

    def stop_all_background_tasks(self):
        """Mark all background tasks as stopped (used on scheduler startup to clear stale state)."""
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE background_task_status
                    SET active = 0,
                        updated_at = ?
                    WHERE active = 1
                ''', (self._now(),))
                count = cursor.rowcount
                conn.commit()
                if count > 0:
                    logger.info(f"Cleared {count} stale background task(s) from previous run")
        except Exception as e:
            logger.error(f"Failed to stop all background tasks: {e}")

    def get_background_task(self, task_id: str) -> Optional[Dict[str, Any]]:
        """
        Get a specific background task's status.

        Args:
            task_id: Task identifier

        Returns:
            Dict with task information, or None when the task is unknown or
            the query fails
        """
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(self._TASK_SELECT + 'WHERE task_id = ?', (task_id,))
                row = cursor.fetchone()
                return self._row_to_task(row) if row else None
        except Exception as e:
            logger.error(f"Failed to get background task {task_id}: {e}")
            return None

    def get_active_background_tasks(self) -> List[Dict[str, Any]]:
        """
        Get all active background tasks.

        Returns:
            List of active task dictionaries ordered by start_time, newest
            first; empty when the query fails
        """
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    self._TASK_SELECT + 'WHERE active = 1 ORDER BY start_time DESC'
                )
                return [self._row_to_task(row) for row in cursor.fetchall()]
        except Exception as e:
            logger.error(f"Failed to get active background tasks: {e}")
            return []
# Process-wide singleton and the lock that guards its creation.
_activity_manager = None
_activity_manager_lock = threading.Lock()


def get_activity_manager(unified_db=None):
    """Get or create the global activity manager instance.

    Args:
        unified_db: UnifiedDatabase instance handed to ActivityStatusManager
            when the global instance is created. It is ignored on later
            calls, once the instance already exists.

    Returns:
        The shared ActivityStatusManager instance.
    """
    global _activity_manager
    if _activity_manager is None:
        with _activity_manager_lock:
            # Check again under the lock: another thread may have created
            # the instance while this one was waiting to acquire it.
            if _activity_manager is None:
                _activity_manager = ActivityStatusManager(unified_db)
    return _activity_manager

478
modules/base_module.py Normal file
View File

@@ -0,0 +1,478 @@
#!/usr/bin/env python3
"""
Base Module - Shared functionality for all media downloader modules
Provides:
- LoggingMixin: Consistent logging with universal logger and backwards-compatible callback support
- CookieManagerMixin: Centralized cookie loading/saving for scrapers
- RateLimitMixin: Smart delay handling for rate limiting
- DeferredDownloadsMixin: Track downloads for batch database recording
"""
import random
import time
from typing import Any, Dict, List, Optional
from modules.universal_logger import get_logger
class LoggingMixin:
    """
    Mixin that routes a module's log messages through the universal logger.

    Each message is sent to the universal logger first and then, when a
    callback was supplied, forwarded to that callback as well (kept for
    backwards compatibility with existing code).

    Usage:
        class MyModule(LoggingMixin):
            def __init__(self, log_callback=None):
                self._init_logger('MyModule', log_callback)
                # ... rest of init

            def do_something(self):
                self.log("Starting operation", "info")
                # ...
                self.log("Operation complete", "success")
    """

    # Class-level defaults; _init_logger() replaces them per instance.
    _logger_name: str = 'Unknown'
    _default_module: str = 'Core'
    logger = None
    log_callback = None
    show_debug: bool = True

    def _init_logger(self, logger_name: str, log_callback=None, default_module: str = 'Core', show_debug: bool = True):
        """
        Set up logging state for this instance.

        Args:
            logger_name: Name for the logger (e.g., 'Instagram', 'TikTok', 'Forum')
            log_callback: Optional callback function for backwards compatibility
            default_module: Module name used when log() is called without one (default: 'Core')
            show_debug: Whether debug messages are emitted (default: True)
        """
        self._logger_name = logger_name
        self._default_module = default_module
        self.log_callback = log_callback
        self.show_debug = show_debug
        self.logger = get_logger(logger_name)

    def log(self, message: str, level: str = "info", module: str = None):
        """
        Emit a message via the universal logger, then via the callback if set.

        Args:
            message: The message to log
            level: Log level ('debug', 'info', 'warning', 'error', 'success', 'critical')
            module: Module name for the log entry (default: uses _default_module)
        """
        normalized = level.lower()

        # Debug output can be switched off per instance.
        if not self.show_debug and normalized == "debug":
            return

        # The universal logger gets the upper-case level name.
        target_module = module if module else self._default_module
        self.logger.log(message, level.upper(), module=target_module)

        # The callback gets the message prefixed with the logger name and
        # the lower-case level.
        callback = self.log_callback
        if callback:
            callback(f"[{self._logger_name}] {message}", normalized)
class CookieManagerMixin:
    """
    Mixin that centralizes cookie persistence for scrapers.

    Cookies are read from and written to the database through the
    ``unified_db`` object. Log messages are only emitted when the host class
    also provides a ``log`` method.

    Usage:
        class MyScraper(LoggingMixin, CookieManagerMixin):
            def __init__(self, unified_db=None):
                self._init_logger('MyScraper')
                self._init_cookie_manager(unified_db, 'my_scraper')
                self._load_cookies_from_db()

            def after_auth(self, cookies):
                self._save_cookies_to_db(cookies)
    """

    # Class-level defaults; _init_cookie_manager() replaces them per instance.
    unified_db = None
    scraper_id: str = ''
    cf_handler = None  # CloudflareHandler if used
    user_agent: str = ''

    def _init_cookie_manager(self, unified_db, scraper_id: str, cf_handler=None, user_agent: str = ''):
        """
        Store the collaborators used for cookie management.

        Args:
            unified_db: UnifiedDatabase instance
            scraper_id: ID for this scraper in database
            cf_handler: Optional CloudflareHandler instance
            user_agent: User agent string
        """
        self.unified_db = unified_db
        self.scraper_id = scraper_id
        self.cf_handler = cf_handler
        self.user_agent = user_agent

    def _load_cookies_from_db(self) -> Optional[List[Dict]]:
        """
        Fetch this scraper's cookies from the database.

        When a Cloudflare handler is attached, the loaded cookies are also
        assigned to its ``_cookies`` attribute.

        Returns:
            List of cookie dicts, or None when there is no database, no
            cookies are stored, or the lookup raised
        """
        if not self.unified_db:
            return None

        try:
            stored = self.unified_db.get_scraper_cookies(self.scraper_id)
            if not stored:
                return None

            # Load into CloudflareHandler if available
            if self.cf_handler:
                self.cf_handler._cookies = stored

            if hasattr(self, 'log'):
                self.log(f"Loaded {len(stored)} cookies from database", "debug")
            return stored
        except Exception as exc:
            if hasattr(self, 'log'):
                self.log(f"Error loading cookies from database: {exc}", "warning")
            return None

    def _save_cookies_to_db(self, cookies: List[Dict], merge: bool = True, user_agent: str = None):
        """
        Persist cookies to the database.

        Does nothing when no database is attached. Errors are reported as a
        warning (when logging is available) and not re-raised.

        Args:
            cookies: List of cookie dicts
            merge: Whether to merge with existing cookies
            user_agent: User agent to associate with cookies (important for cf_clearance).
                       Falls back to self.user_agent when not provided.
        """
        if not self.unified_db:
            return

        try:
            effective_ua = user_agent or self.user_agent
            self.unified_db.save_scraper_cookies(
                self.scraper_id,
                cookies,
                user_agent=effective_ua,
                merge=merge
            )
            if hasattr(self, 'log'):
                # Only the first 50 characters of the user agent are logged.
                self.log(f"Saved {len(cookies)} cookies to database (UA: {effective_ua[:50] if effective_ua else 'None'}...)", "debug")
        except Exception as exc:
            if hasattr(self, 'log'):
                self.log(f"Error saving cookies to database: {exc}", "warning")

    def _cookies_expired(self) -> bool:
        """
        Report whether the cookies are expired.

        Returns:
            The Cloudflare handler's answer when one is attached, otherwise True
        """
        handler = self.cf_handler
        return handler.cookies_expired() if handler else True

    def _get_cookies_for_requests(self) -> Dict[str, str]:
        """
        Get cookies in the format used by the requests library.

        Returns:
            Dict of cookie name -> value from the Cloudflare handler, or an
            empty dict when no handler is attached
        """
        handler = self.cf_handler
        return handler.get_cookies_dict() if handler else {}
class RateLimitMixin:
    """
    Mixin that spaces out requests with randomized delays.

    Three kinds of pause are supported: a normal per-item delay, a longer
    delay at the end of a batch, and a fixed delay after an error.

    Usage:
        class MyScraper(LoggingMixin, RateLimitMixin):
            def __init__(self):
                self._init_logger('MyScraper')
                self._init_rate_limiter(min_delay=5, max_delay=15, batch_delay_min=30)

            def download_batch(self, items):
                for i, item in enumerate(items):
                    self.download_item(item)
                    is_batch_end = (i + 1) % 10 == 0
                    self._smart_delay(is_batch_end)
    """

    # Class-level defaults (seconds); _init_rate_limiter() overrides them.
    min_delay: float = 5.0
    max_delay: float = 15.0
    batch_delay_min: float = 30.0
    batch_delay_max: float = 60.0
    error_delay: float = 120.0

    def _init_rate_limiter(
        self,
        min_delay: float = 5.0,
        max_delay: float = 15.0,
        batch_delay_min: float = 30.0,
        batch_delay_max: float = 60.0,
        error_delay: float = 120.0
    ):
        """
        Configure the delay ranges for this instance.

        Args:
            min_delay: Minimum delay between requests (seconds)
            max_delay: Maximum delay between requests (seconds)
            batch_delay_min: Minimum delay between batches (seconds)
            batch_delay_max: Maximum delay between batches (seconds)
            error_delay: Delay after errors (seconds)
        """
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.batch_delay_min = batch_delay_min
        self.batch_delay_max = batch_delay_max
        self.error_delay = error_delay

    def _smart_delay(self, is_batch_end: bool = False, had_error: bool = False):
        """
        Sleep for a delay chosen according to the situation.

        Args:
            is_batch_end: True if this is the end of a batch
            had_error: True if there was an error (takes precedence and uses
                the fixed error delay)
        """
        if had_error:
            pause = self.error_delay
        else:
            # Batch ends draw from the longer range, ordinary items from
            # the shorter one.
            if is_batch_end:
                low, high = self.batch_delay_min, self.batch_delay_max
            else:
                low, high = self.min_delay, self.max_delay
            pause = random.uniform(low, high)

        # Logging is optional: the host class may not provide log().
        if hasattr(self, 'log'):
            self.log(f"Waiting {pause:.1f}s before next request", "debug")
        time.sleep(pause)

    def _delay_after_error(self):
        """Sleep for the error delay."""
        self._smart_delay(had_error=True)

    def _delay_between_items(self):
        """Sleep for a normal per-item delay."""
        self._smart_delay(is_batch_end=False)

    def _delay_between_batches(self):
        """Sleep for an end-of-batch delay."""
        self._smart_delay(is_batch_end=True)
class DeferredDownloadsMixin:
    """
    Mixin for tracking downloads to be recorded in batch.

    Downloads are collected in memory so the database write can be done
    later in one go.

    Usage:
        class MyScraper(LoggingMixin, DeferredDownloadsMixin):
            def __init__(self):
                self._init_logger('MyScraper')
                self._init_deferred_downloads()

            def download_file(self, url, path):
                # ... download logic ...
                self._add_pending_download({
                    'platform': 'my_platform',
                    'source': 'username',
                    'file_path': str(path),
                    # ... other fields ...
                })

            def finish_batch(self):
                downloads = self.get_pending_downloads()
                self.db.record_downloads_batch(downloads)
                self.clear_pending_downloads()
    """

    # None until _init_deferred_downloads() or the first add creates a
    # per-instance list. A class-level list would be shared by all instances.
    pending_downloads: Optional[List[Dict]] = None

    def _init_deferred_downloads(self):
        """Initialize deferred downloads tracking."""
        self.pending_downloads = []

    def _add_pending_download(self, download_info: Dict[str, Any]):
        """
        Add a download to pending list.

        Works even if _init_deferred_downloads() was never called.

        Args:
            download_info: Dict with download metadata
        """
        if self.pending_downloads is None:
            self.pending_downloads = []
        self.pending_downloads.append(download_info)

    def get_pending_downloads(self) -> List[Dict[str, Any]]:
        """
        Get all pending downloads.

        Returns:
            A new list holding the pending download dicts. Changing the
            returned list does not change what is pending.
        """
        # Always copy: returning the internal list would let callers alter
        # the pending state by accident.
        return list(self.pending_downloads or [])

    def clear_pending_downloads(self):
        """Clear pending downloads list."""
        self.pending_downloads = []

    def has_pending_downloads(self) -> bool:
        """Check if there are pending downloads."""
        return bool(self.pending_downloads)
class BaseDatabaseAdapter:
    """
    Base class for platform-specific database adapters.

    Wraps a UnifiedDatabase instance and supplies the platform and method
    values on the calls it forwards. Platform-specific adapters should
    inherit from this class.

    Usage:
        class MyPlatformAdapter(BaseDatabaseAdapter):
            def __init__(self, unified_db):
                super().__init__(unified_db, platform='my_platform')

            def record_download(self, content_id, username, filename, **kwargs):
                # Platform-specific URL construction
                url = f"https://my_platform.com/{username}/{content_id}"
                return self._record_download_internal(
                    url=url,
                    source=username,
                    filename=filename,
                    **kwargs
                )
    """

    # Extensions classified as images; every other extension is treated as video.
    _IMAGE_EXTENSIONS = frozenset(
        {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp', '.bmp', '.tiff'}
    )

    def __init__(self, unified_db, platform: str, method: str = None):
        """
        Initialize base adapter.

        Args:
            unified_db: UnifiedDatabase instance
            platform: Platform name (e.g., 'instagram', 'tiktok')
            method: Optional method identifier for multi-method platforms;
                defaults to the platform name
        """
        self.db = unified_db
        self.unified_db = unified_db  # Alias for compatibility
        self.platform = platform
        self.method = method if method else platform

    def get_connection(self, for_write: bool = False):
        """Get database connection (delegates to UnifiedDatabase)."""
        return self.db.get_connection(for_write)

    def get_file_hash(self, file_path: str) -> Optional[str]:
        """Return the file's hash as computed by the UnifiedDatabase."""
        return self.db.get_file_hash(file_path)

    def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
        """Get download record by file hash."""
        return self.db.get_download_by_file_hash(file_hash)

    def get_download_by_media_id(self, media_id: str) -> Optional[Dict]:
        """Get download record by media_id for this adapter's platform and method."""
        return self.db.get_download_by_media_id(media_id, self.platform, self.method)

    def is_already_downloaded_by_hash(self, file_path: str) -> bool:
        """Check if file is already downloaded by comparing file hash."""
        digest = self.get_file_hash(file_path)
        if not digest:
            return False
        existing = self.get_download_by_file_hash(digest)
        return existing is not None

    def is_already_downloaded_by_media_id(self, media_id: str) -> bool:
        """Check if content is already downloaded by media_id.

        Matches on platform and media_id only (not on method).
        """
        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT 1 FROM downloads
                WHERE platform = ?
                AND media_id = ?
                LIMIT 1
            ''', (self.platform, media_id))
            match = cursor.fetchone()
            return match is not None

    def _calculate_file_hash(self, file_path: str) -> Optional[str]:
        """Return the file hash, or None if there is no such file or hashing fails."""
        if not file_path:
            return None

        from pathlib import Path

        try:
            if not Path(file_path).exists():
                return None
            return self.get_file_hash(file_path)
        except Exception:
            # Best effort: a hashing problem results in "no hash", not an error.
            return None

    def _detect_content_type(self, filename: str) -> str:
        """Classify a filename as 'image' or 'video' by its extension.

        Anything that is not a known image extension is reported as 'video'.
        """
        from pathlib import Path

        suffix = Path(filename).suffix.lower()
        if suffix in self._IMAGE_EXTENSIONS:
            return 'image'
        return 'video'

    def _record_download_internal(
        self,
        url: str,
        source: str,
        filename: str,
        content_type: str = None,
        file_path: str = None,
        post_date=None,
        metadata: Dict = None,
        file_hash: str = None,
        **extra_kwargs
    ) -> bool:
        """
        Internal method to record a download.

        Args:
            url: Unique URL/identifier for the content
            source: Username or source identifier
            filename: Downloaded filename
            content_type: 'image' or 'video' (auto-detected if not provided)
            file_path: Full path to downloaded file
            post_date: Original post date
            metadata: Additional metadata dict
            file_hash: Pre-computed file hash (computed if not provided and file_path exists)
            **extra_kwargs: Additional arguments passed to unified_db.record_download

        Returns:
            Whatever unified_db.record_download returns
        """
        # Fill in the values the caller left out.
        resolved_type = content_type or self._detect_content_type(filename)

        resolved_hash = file_hash
        if not resolved_hash and file_path:
            resolved_hash = self._calculate_file_hash(file_path)

        return self.db.record_download(
            url=url,
            platform=self.platform,
            source=source,
            content_type=resolved_type,
            filename=filename,
            file_path=file_path,
            file_hash=resolved_hash,
            post_date=post_date,
            metadata=metadata,
            method=self.method,
            **extra_kwargs
        )

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,873 @@
#!/usr/bin/env python3
"""
Coppermine Photo Gallery Downloader Module
Downloads full-resolution images from Coppermine-based galleries
"""
import os
import re
import time
import hashlib
import requests
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Set
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs
from modules.base_module import LoggingMixin
from modules.cloudflare_handler import CloudflareHandler, SiteStatus, get_flaresolverr_user_agent
class CoppermineDownloader(LoggingMixin):
"""
Coppermine Photo Gallery downloader
Example usage:
from coppermine_module import CoppermineDownloader
downloader = CoppermineDownloader()
count = downloader.download(
gallery_url="https://hqdiesel.net/thumbnails.php?album=lastup&cat=123",
output_dir="downloads/coppermine",
days_back=7
)
print(f"Downloaded {count} items")
"""
def __init__(self, show_progress=True, use_database=True,
             log_callback=None, unified_db=None, config=None):
    """
    Set up the downloader: logging, database adapter, HTTP session,
    Cloudflare handler and stored cookies.

    Args:
        show_progress: Print progress messages
        use_database: Use database to track downloads
        log_callback: Optional callback function for logging
        unified_db: Optional UnifiedDatabase instance
        config: Optional config dict; its 'cookie_file' entry is only
            consulted when no unified_db is supplied
    """
    # Logging comes first so self.log() is usable below.
    self._init_logger('Coppermine', log_callback, default_module='Download')

    self.show_progress = show_progress
    self.use_database = use_database
    self.downloaded_files = set()
    self.download_count = 0
    self.unified_db = unified_db  # Store for scraper config access
    self.scraper_id = 'coppermine'  # Scraper ID in database

    # Download tracking needs both a database and the caller's opt-in;
    # otherwise it is switched off entirely.
    if not (unified_db and use_database):
        self.db = None
        self.use_database = False
    else:
        from modules.unified_database import CoppermineDatabaseAdapter
        self.db = CoppermineDatabaseAdapter(unified_db)

    # Activity status manager for real-time updates
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)

    # Rate limiting
    self.min_delay = 1
    self.max_delay = 3

    # Downloads held back for deferred database recording
    self.pending_downloads = []

    # Scraper configuration: proxy settings come from the database, the
    # cookie file (if any) from the config dict.
    self.proxy_url = None
    self.cookie_file = None  # None means "use database"
    if unified_db:
        scraper_config = unified_db.get_scraper(self.scraper_id)
        if scraper_config:
            if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                self.proxy_url = scraper_config['proxy_url']
                self.log(f"Using proxy: {self.proxy_url}", "info")
    elif config:
        # No database available: fall back to a cookie file.
        self.cookie_file = config.get('cookie_file', '/opt/media-downloader/cookies/coppermine_cookies.json')

    # HTTP session with browser-like headers
    self.session = requests.Session()
    self.user_agent = get_flaresolverr_user_agent()
    browser_headers = {
        'User-Agent': self.user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }
    self.session.headers.update(browser_headers)

    # Route the session through the proxy if one is configured
    if self.proxy_url:
        self.session.proxies = {
            'http': self.proxy_url,
            'https': self.proxy_url
        }

    # Cloudflare handler, given the same user agent and proxy as the session
    self.cf_handler = CloudflareHandler(
        module_name="Coppermine",
        cookie_file=self.cookie_file,  # None when using database
        user_agent=self.user_agent,
        logger=self.logger,
        aggressive_expiry=False,  # Conservative mode for Coppermine
        proxy_url=self.proxy_url  # Pass proxy to FlareSolverr
    )

    # Keep for backwards compatibility
    self.flaresolverr_url = self.cf_handler.flaresolverr_url
    self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

    # Load stored cookies (from the database or the cookie file)
    self._load_cookies()
def _record_download(self, url: str, platform: str, source: str, content_type: str,
filename: str, file_path: str, file_size: int, file_hash: str,
post_date=None, metadata: dict = None, deferred: bool = False):
"""Record a download in the database
Args:
deferred: If True, don't record to database now - add to pending_downloads list
for later recording after file move is complete
"""
# If deferred, store for later recording instead of recording now
if deferred:
self.pending_downloads.append({
'url': url,
'platform': platform,
'source': source,
'content_type': content_type,
'filename': filename,
'file_path': file_path,
'file_size': file_size,
'file_hash': file_hash,
'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
'metadata': metadata
})
self.log(f"Deferred recording for {filename}", "debug")
return True
if not self.use_database or not self.db:
return
try:
self.db.add_download(
url=url,
platform=platform,
source=source,
content_type=content_type,
filename=filename,
file_path=file_path,
file_size=file_size,
file_hash=file_hash,
post_date=post_date,
metadata=metadata
)
except Exception as e:
self.log(f"Failed to record download: {e}", "debug")
def get_pending_downloads(self):
"""Get list of downloads that were deferred for later recording"""
return self.pending_downloads.copy()
def clear_pending_downloads(self):
"""Clear the pending downloads list after they've been recorded"""
self.pending_downloads = []
def _load_cookies(self):
"""Load cookies from database or file"""
# Try database first if available
if self.unified_db:
try:
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
if cookies:
cf_clearance_found = False
for cookie in cookies:
try:
self.session.cookies.set(
cookie['name'],
cookie['value'],
domain=cookie.get('domain', ''),
path=cookie.get('path', '/')
)
if cookie['name'] == 'cf_clearance':
cf_clearance_found = True
except Exception as e:
self.log(f"Error setting cookie {cookie.get('name')}: {e}", "warning")
if cf_clearance_found:
self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from database", "info")
else:
self.log(f"⚠ Loaded {len(cookies)} cookies from database but cf_clearance NOT found", "warning")
# Also load cookies into CloudflareHandler for consistency
self.cf_handler._cookies = cookies
return
else:
self.log("No cookies found in database", "debug")
except Exception as e:
self.log(f"Error loading cookies from database: {e}", "warning")
# Fall back to cookie file if no database
if not self.cookie_file:
self.log("No cookie file configured", "debug")
return
cookie_path = Path(self.cookie_file)
if not cookie_path.exists():
self.log(f"Cookie file does not exist: {self.cookie_file}", "info")
return
try:
import json
with open(cookie_path, 'r') as f:
data = json.load(f)
# Handle both old format (list) and new format (dict with 'cookies' and 'timestamp')
if isinstance(data, dict) and 'cookies' in data:
cookies = data['cookies']
elif isinstance(data, list):
cookies = data
else:
self.log(f"Invalid cookie file format", "warning")
return
# Count critical cookies
cf_clearance_found = False
for cookie in cookies:
try:
# Set cookie with basic attributes (requests.Session compatible)
self.session.cookies.set(
cookie['name'],
cookie['value'],
domain=cookie.get('domain', ''),
path=cookie.get('path', '/')
)
if cookie['name'] == 'cf_clearance':
cf_clearance_found = True
except Exception as e:
self.log(f"Error setting cookie {cookie.get('name')}: {e}", "warning")
if cf_clearance_found:
self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from {self.cookie_file}", "info")
else:
self.log(f"⚠ Loaded {len(cookies)} cookies but cf_clearance NOT found", "warning")
except Exception as e:
self.log(f"Error loading cookies: {e}", "warning")
def _cookies_expired(self):
"""Check if cookies are expired - delegates to CloudflareHandler"""
return self.cf_handler.cookies_expired()
def _save_cookies(self, cookies: list, user_agent: str = None):
"""Save cookies to database or file with timestamp
Args:
cookies: List of cookie dictionaries
user_agent: User agent to associate with cookies (important for cf_clearance).
If not provided, uses self.user_agent as fallback.
"""
# Use provided user_agent or fall back to self.user_agent
ua = user_agent or self.user_agent
# Try database first if available
if self.unified_db:
try:
self.unified_db.save_scraper_cookies(
self.scraper_id,
cookies,
user_agent=ua,
merge=True # Merge with existing cookies
)
self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50] if ua else 'None'}...)", "debug")
return
except Exception as e:
self.log(f"Error saving cookies to database: {e}", "warning")
# Fall back to file
if not self.cookie_file:
return
try:
import json
from datetime import datetime
cookie_path = Path(self.cookie_file)
cookie_path.parent.mkdir(parents=True, exist_ok=True)
storage_data = {
'cookies': cookies,
'timestamp': datetime.now().isoformat()
}
with open(cookie_path, 'w') as f:
json.dump(storage_data, f, indent=2)
self.log(f"Saved {len(cookies)} cookies to {self.cookie_file}", "debug")
except Exception as e:
self.log(f"Error saving cookies: {e}", "warning")
def _get_cookies_via_flaresolverr(self, url: str, max_retries: int = 2) -> bool:
"""Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler
Args:
url: URL to fetch
max_retries: Maximum number of retry attempts (default: 2)
Returns:
True if cookies obtained successfully, False otherwise
"""
# Delegate to CloudflareHandler
success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)
# If successful, also load cookies into the session and save to database
if success:
cookies_dict = self.cf_handler.get_cookies_dict()
for name, value in cookies_dict.items():
# Extract domain from URL
from urllib.parse import urlparse
parsed = urlparse(url)
domain = parsed.netloc
self.session.cookies.set(name, value, domain=domain, path='/')
# Save cookies to database (the handler already saved to file if configured)
if self.unified_db:
cookies_list = self.cf_handler.get_cookies_list()
if cookies_list:
# CRITICAL: Get the user_agent from FlareSolverr solution, not self.user_agent
# cf_clearance cookies are fingerprinted to the browser that solved the challenge
flaresolverr_ua = self.cf_handler.get_user_agent()
self._save_cookies(cookies_list, user_agent=flaresolverr_ua)
return success
def _request_with_retry(self, url: str, timeout: int = 30, max_attempts: int = 2):
"""Make HTTP request with automatic Cloudflare challenge retry
Args:
url: URL to fetch
timeout: Request timeout in seconds
max_attempts: Maximum number of attempts (default: 2)
Returns:
requests.Response object
Raises:
Exception if all retry attempts fail
"""
last_error = None
for attempt in range(1, max_attempts + 1):
try:
response = self.session.get(url, timeout=timeout)
# Detect Cloudflare challenges
is_cloudflare = False
if response.status_code in [403, 503]:
is_cloudflare = True
self.log(f"Cloudflare challenge detected (HTTP {response.status_code})", "warning")
elif len(response.text) < 1000:
is_cloudflare = True
self.log(f"Cloudflare challenge detected (short response: {len(response.text)} bytes)", "warning")
elif 'challenge' in response.text.lower()[:500]:
is_cloudflare = True
self.log("Cloudflare challenge detected in HTML", "warning")
# If Cloudflare detected and we have retry attempts left
if is_cloudflare and attempt < max_attempts:
if self.flaresolverr_enabled:
self.log(f"Attempt {attempt}/{max_attempts}: Refreshing cookies via FlareSolverr...", "info")
if self._get_cookies_via_flaresolverr(url):
self.log("Cookies refreshed, retrying request...", "info")
continue # Retry the request
else:
raise Exception("Failed to refresh cookies via FlareSolverr")
else:
raise Exception("Cloudflare challenge detected but FlareSolverr is disabled")
# No Cloudflare challenge or final attempt - check status and return
response.raise_for_status()
return response
except Exception as e:
last_error = e
if attempt < max_attempts:
self.log(f"Attempt {attempt}/{max_attempts} failed: {e}", "warning")
else:
self.log(f"All {max_attempts} attempts failed", "error")
# All attempts failed
raise last_error
def _parse_date(self, date_str: str) -> Optional[datetime]:
"""
Parse Coppermine date format: 'Date added=Sep 29, 2025'
Args:
date_str: Date string from Coppermine
Returns:
datetime object or None
"""
try:
# Extract date from "Date added=Sep 29, 2025" format
match = re.search(r'Date added=([A-Za-z]+ \d+, \d{4})', date_str)
if match:
date_part = match.group(1)
return datetime.strptime(date_part, '%b %d, %Y')
except Exception as e:
self.log(f"Error parsing date '{date_str}': {e}", "debug")
return None
def _extract_full_image_url(self, base_url: str, thumbnail_url: str) -> str:
"""
Convert thumbnail URL to full-resolution URL
Pattern:
Thumbnail: albums/userpics/1052219/thumb_1000523798.jpg
Normal: albums/userpics/1052219/normal_1000523798.jpg
Full: albums/userpics/1052219/1000523798.jpg
Args:
base_url: Base URL of the gallery (e.g., https://hqdiesel.net)
thumbnail_url: Relative thumbnail URL
Returns:
Full-resolution image URL
"""
# Remove thumb_ or normal_ prefix
full_path = re.sub(r'/(thumb_|normal_)', '/', thumbnail_url)
return urljoin(base_url, full_path)
def _parse_gallery_page(self, html: str, base_url: str) -> List[Dict]:
    """
    Collect image records from one Coppermine thumbnail page.

    Each ``<td class="thumbnails">`` cell that links to displayimage.php with
    a ``pid`` and contains an ``<img>`` with a ``src`` yields one record.
    Cells that fail to parse are logged and skipped.

    Args:
        html: HTML content of the page
        base_url: Base URL of the gallery

    Returns:
        List of dicts with keys pid, filename, thumbnail_url, full_url,
        upload_date, dimensions, filesize, uploader, views, title
    """

    def title_field(pattern, text):
        """Return group 1 of the first match of *pattern* in *text*, else None."""
        found = re.search(pattern, text)
        return found.group(1) if found else None

    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.find_all('td', class_='thumbnails')
    self.log(f"Found {len(cells)} thumbnail cells on page", "debug")

    records = []
    for cell in cells:
        try:
            # Link to the image detail page; its query string carries the PID
            anchor = cell.find('a', href=re.compile(r'displayimage\.php'))
            if not anchor:
                continue
            query = parse_qs(urlparse(anchor.get('href', '')).query)
            pid = query.get('pid', [None])[0]
            if not pid:
                continue

            thumb_img = anchor.find('img')
            if not thumb_img:
                continue
            thumb_src = thumb_img.get('src', '')
            if not thumb_src:
                continue

            # The title attribute holds "Key=value" metadata
            title = thumb_img.get('title', '')
            filename = title_field(r'Filename=([^\s]+)', title)

            # Prefer the dedicated date span; fall back to the title text
            date_span = cell.find('span', class_='thumb_caption_ctime')
            caption = date_span.text.strip() if date_span else ''
            if caption:
                try:
                    upload_date = datetime.strptime(caption, '%b %d, %Y')
                except Exception:
                    upload_date = self._parse_date(title)
            else:
                upload_date = self._parse_date(title)

            profile_link = cell.find('a', href=re.compile(r'profile\.php'))
            uploader = profile_link.text.strip() if profile_link else None

            dimensions = title_field(r'Dimensions=(\d+x\d+)', title)
            filesize = title_field(r'Filesize=([^\s]+)', title)

            views = None
            views_span = cell.find('span', class_='thumb_title_views')
            if views_span:
                views_text = title_field(r'(\d+)\s+views?', views_span.text)
                if views_text is not None:
                    views = int(views_text)

            records.append({
                'pid': pid,
                'filename': filename,
                'thumbnail_url': urljoin(base_url, thumb_src),
                'full_url': self._extract_full_image_url(base_url, thumb_src),
                'upload_date': upload_date,
                'dimensions': dimensions,
                'filesize': filesize,
                'uploader': uploader,
                'views': views,
                'title': title
            })
        except Exception as exc:
            self.log(f"Error parsing thumbnail cell: {exc}", "debug")
            continue

    return records
def _get_total_pages(self, html: str) -> int:
    """
    Read the page count from the gallery's pagination text.

    Looks for text such as "2005 files on 20 page(s)" and returns the second
    number.

    Args:
        html: HTML content

    Returns:
        Number of pages; 1 when the text is absent or parsing fails
    """
    page_count = 1
    try:
        page_text = BeautifulSoup(html, 'html.parser').get_text()
        found = re.search(r'(\d+)\s+files?\s+on\s+(\d+)\s+page', page_text)
        if found:
            page_count = int(found.group(2))
    except Exception as exc:
        self.log(f"Error extracting page count: {exc}", "debug")
    return page_count
def _download_image(self, image_info: Dict, output_dir: Path,
gallery_name: str) -> Optional[str]:
"""
Download a single image
Args:
image_info: Image information dict
output_dir: Output directory
gallery_name: Name of gallery for database tracking
Returns:
Path to downloaded file or None
"""
try:
url = image_info['full_url']
pid = image_info['pid']
filename = image_info['filename']
# Check if already downloaded
if self.use_database and self.db:
if self.db.is_downloaded(url, platform='coppermine'):
self.log(f"Already downloaded (database): {filename} (PID: {pid})", "info")
return None
# Create output directory
output_dir.mkdir(parents=True, exist_ok=True)
# Construct output filename
output_file = output_dir / filename
# Skip if file exists
if output_file.exists():
self.log(f"File already exists: {filename}", "info")
return str(output_file)
# Download image
self.log(f"Downloading: {filename} (PID: {pid})", "info")
response = self._request_with_retry(url, timeout=30)
# Save image
with open(output_file, 'wb') as f:
f.write(response.content)
# Check for duplicate hash before recording
if self.db and hasattr(self.db, 'unified_db'):
from pathlib import Path as PathLib
# Check for duplicate hash (hash blacklist persists even if original deleted)
file_hash_check = self.db.unified_db.get_file_hash(str(output_file))
if file_hash_check:
existing = self.db.unified_db.get_download_by_file_hash(file_hash_check)
if existing and existing.get('file_path') and str(output_file) != existing.get('file_path'):
# Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
# Delete the duplicate regardless of whether original file still exists
try:
output_file.unlink()
self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
return
except Exception as e:
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
return
# Calculate SHA256 file hash from saved file (consistent with other modules)
file_hash = None
if self.db and hasattr(self.db, 'unified_db'):
try:
file_hash = self.db.unified_db.get_file_hash(str(output_file))
except Exception as e:
self.log(f"Failed to calculate file hash: {e}", "warning")
# Track timestamp for this file
if image_info.get('upload_date'):
self.file_timestamps[filename] = image_info['upload_date']
# Record in database
self._record_download(
url=url,
platform='coppermine',
source=gallery_name,
content_type='image',
filename=filename,
file_path=str(output_file),
file_size=len(response.content),
file_hash=file_hash,
post_date=image_info.get('upload_date'),
metadata={
'pid': pid,
'dimensions': image_info.get('dimensions'),
'filesize': image_info.get('filesize')
},
deferred=getattr(self, 'defer_database', False)
)
self.download_count += 1
time.sleep(self.min_delay + (self.max_delay - self.min_delay) * __import__('random').random())
return str(output_file)
except Exception as e:
self.log(f"Error downloading {image_info.get('filename', 'unknown')}: {e}", "error")
return None
def download(self, gallery_url: str, output_dir: str,
             days_back: Optional[int] = None, max_pages: Optional[int] = None,
             gallery_name: Optional[str] = None, defer_database: bool = False) -> tuple:
    """
    Download images from a Coppermine gallery

    Flow: check site status, validate or refresh Cloudflare cookies, fetch
    the first page to learn the page count, then walk the pages and download
    every image that passes the date filter. The first page is fetched once
    and reused for page 1 of the walk.

    Args:
        gallery_url: URL to the gallery page (e.g., thumbnails.php?album=lastup&cat=123)
        output_dir: Directory to save images
        days_back: Only download images from last N days (None or 0 = all)
        max_pages: Maximum number of pages to process (None or 0 = all)
        gallery_name: Name for database tracking (extracted from URL if not provided)
        defer_database: If True, don't record to database immediately - store in
            pending_downloads for later recording after file move is complete

    Returns:
        Tuple of (file_timestamps dict, download_count)
        file_timestamps: Dict mapping filename -> upload_date
        ({}, 0) is returned when the site is unavailable, Cloudflare cannot
        be bypassed, or the first page cannot be fetched
    """
    self.defer_database = defer_database  # Read by _download_image
    # Clear downloaded_files cache between galleries to prevent memory growth
    self.downloaded_files.clear()

    # Check site status before doing anything else
    self.log("Checking Coppermine gallery site status...", "debug")
    site_status, error_msg = self.cf_handler.check_site_status(gallery_url, timeout=10)
    if self.cf_handler.should_skip_download(site_status):
        self.log(f"Skipping download - Coppermine gallery is unavailable: {error_msg}", "warning")
        return ({}, 0)
    elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
        self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

    self.download_count = 0
    self.file_timestamps = {}  # Track timestamps for each file
    output_path = Path(output_dir)

    # Extract base URL and gallery name
    parsed_url = urlparse(gallery_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    if not gallery_name:
        # Build a name from the host plus the cat/album query parameters
        query_params = parse_qs(parsed_url.query)
        cat = query_params.get('cat', ['unknown'])[0]
        album = query_params.get('album', ['unknown'])[0]
        gallery_name = f"{parsed_url.netloc}_cat{cat}_{album}"

    self.log(f"Starting download from: {gallery_url}", "info")
    self.activity_manager.update_status(f"Checking gallery: {gallery_name}")
    self.log(f"Gallery: {gallery_name}", "info")

    # Calculate cutoff date
    cutoff_date = None
    if days_back:
        self.log(f"Filtering: Last {days_back} days", "info")
        cutoff_date = datetime.now() - timedelta(days=days_back)

    # Try the existing cookies first; FlareSolverr is only used if they fail
    cookies_valid = False
    first_response = None
    cookie_count = len(self.session.cookies)
    if self.cf_handler.cookies_expired():
        self.log(f"Cookies expired, skipping test and refreshing via FlareSolverr", "info")
    else:
        self.log(f"Testing with {cookie_count} existing cookies...", "info")
        try:
            # Short timeout for fast fail
            test_response = self.session.get(gallery_url, timeout=5)
            if test_response.status_code == 403 or test_response.status_code == 503:
                self.log(f"Existing cookies failed (HTTP {test_response.status_code}), need FlareSolverr", "info")
            else:
                body = test_response.text
                if len(body) < 1000:
                    self.log(f"Response too short ({len(body)} bytes), likely Cloudflare challenge", "info")
                elif 'challenge' in body[:500].lower():
                    self.log("Cloudflare challenge detected in response", "info")
                else:
                    # Cookies work (or no challenge presented)
                    cookies_valid = True
                    self.log(f"✓ Existing cookies valid ({cookie_count} cookies, skipped FlareSolverr)", "info")
                    first_response = test_response
        except Exception as e:
            self.log(f"Test request failed ({type(e).__name__}: {e}), need FlareSolverr", "info")

    # Only call FlareSolverr if existing cookies don't work
    if not cookies_valid:
        if self.flaresolverr_enabled:
            self.log("Calling FlareSolverr to get fresh cookies...", "info")
            if not self._get_cookies_via_flaresolverr(gallery_url):
                self.log("Failed to bypass Cloudflare", "error")
                return ({}, 0)
        else:
            self.log("FlareSolverr disabled and cookies invalid", "error")
            return ({}, 0)

    # Fetch first page to get total pages (reuse the test response if cookies were valid)
    try:
        if first_response is None:
            first_response = self._request_with_retry(gallery_url, timeout=30)
        total_pages = self._get_total_pages(first_response.text)
        if max_pages:
            total_pages = min(total_pages, max_pages)
        self.log(f"Total pages to process: {total_pages}", "info")
    except Exception as e:
        self.log(f"Error fetching gallery: {e}", "error")
        return ({}, 0)

    # Set initial progress so dashboard shows 0/N immediately
    self.activity_manager.update_status(
        "Downloading images",
        progress_current=0,
        progress_total=total_pages
    )

    # Process each page
    for page_num in range(1, total_pages + 1):
        try:
            self.log(f"Processing page {page_num}/{total_pages}...", "info")

            if page_num == 1:
                # Page 1 is the gallery URL itself and was already fetched
                # above; reuse it instead of requesting it a second time.
                response = first_response
                # The cookie test request never had its status checked
                response.raise_for_status()
            else:
                separator = '&' if '?' in gallery_url else '?'
                page_url = f"{gallery_url}{separator}page={page_num}"
                # Fetch page with automatic Cloudflare retry
                response = self._request_with_retry(page_url, timeout=30)

            # Debug: Check what we received
            self.log(f"Fetched page, status: {response.status_code}, length: {len(response.text)} bytes", "debug")
            if len(response.text) < 10000:
                self.log(f"WARNING: Response seems too short! First 1000 chars: {response.text[:1000]}", "warning")

            # Parse images
            images = self._parse_gallery_page(response.text, base_url)
            self.log(f"Found {len(images)} images on page {page_num}", "info")

            # Track if we found any new images on this page
            found_new_images = False
            skipped_old_images = 0

            # Filter by date and download
            for image_info in images:
                # Images without a known upload date are never filtered out
                if cutoff_date and image_info.get('upload_date'):
                    if image_info['upload_date'] < cutoff_date:
                        skipped_old_images += 1
                        self.log(f"Skipping old image: {image_info['filename']} "
                                 f"(uploaded {image_info['upload_date'].date()})", "debug")
                        continue

                # Log image being processed
                upload_date_str = image_info.get('upload_date').strftime('%Y-%m-%d') if image_info.get('upload_date') else 'unknown'
                self.log(f"Processing image: {image_info['filename']} (uploaded {upload_date_str})", "info")

                # This image is within date range
                found_new_images = True
                self._download_image(image_info, output_path, gallery_name)

            # If using date filter and ALL images on this page were too old, stop processing
            # (assumes gallery is sorted newest-first, which is true for album=lastup)
            if cutoff_date and not found_new_images and len(images) > 0:
                self.log(f"All {skipped_old_images} images on page {page_num} are older than {days_back} days. "
                         f"Stopping pagination (assuming chronological order).", "info")
                break

            # Update activity status with page progress
            self.activity_manager.update_status(
                "Downloading images",
                progress_current=page_num,
                progress_total=total_pages
            )

            # Rate limiting between pages
            if page_num < total_pages:
                time.sleep(self.min_delay)

        except Exception as e:
            # One bad page must not abort the remaining pages
            self.log(f"Error processing page {page_num}: {e}", "error")
            continue

    self.log(f"Download complete! Total: {self.download_count} images", "info")
    return (self.file_timestamps, self.download_count)
def cleanup(self):
    """Release resources: close the HTTP session when one is set."""
    session = self.session
    if not session:
        return
    session.close()

473
modules/date_utils.py Executable file
View File

@@ -0,0 +1,473 @@
#!/usr/bin/env python3
"""
Shared date utilities module for media downloaders
Provides comprehensive date extraction and timestamp updating
Features:
- Extract dates from text/titles (multiple formats)
- Extract TV show season/episode info and lookup air dates via OMDB
- Update filesystem timestamps (mtime, atime)
- Update creation time (platform-specific)
- Update EXIF metadata for images
- Update video metadata
"""
import os
import re
import platform
import subprocess
import requests
from datetime import datetime
from pathlib import Path
from typing import Optional, Union, Tuple
from modules.universal_logger import get_logger
logger = get_logger('DateUtils')
class DateHandler:
"""Comprehensive date extraction and timestamp updating"""
# OMDB API key. None until set_omdb_api_key() is called; while it is unset,
# lookup_tv_episode_date() returns None without making a request.
OMDB_API_KEY = None
# TV show season/episode patterns, tried in order by extract_tv_info() with
# re.IGNORECASE. Group 1 is the season, group 2 the episode.
# NOTE(review): the "1x01" pattern is unanchored, so it can also match inside
# strings such as "1920x1080" - confirm this is acceptable for the inputs used.
TV_PATTERNS = [
r'S(\d{1,2})E(\d{1,2})', # S01E01
r'Season\s+(\d{1,2})\s+Episode\s+(\d{1,2})', # Season 1 Episode 1
r'(\d{1,2})x(\d{1,2})', # 1x01
r's(\d{1,2})\s*e(\d{1,2})', # s01 e01 or s01e01
]
# Year pattern for fallback. extract_date_from_text() deliberately does not
# use it (a bare year is treated as too unreliable).
YEAR_PATTERN = r'\b(19\d{2}|20\d{2})\b'
# Date patterns for extraction from text: (regex, format tag) pairs tried in
# order by extract_date_from_text(); the tag selects how the groups are read.
DATE_PATTERNS = [
# Instagram filename format: YYYYMMDD_HHMMSS (e.g., "20251027_155842")
(r'(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})', 'instagram'),
# DD.MM.YYYY or DD/MM/YYYY or DD-MM-YYYY or DD_MM_YYYY (underscore for forum titles)
(r'(\d{1,2})[\.\/\-_](\d{1,2})[\.\/\-_](\d{4})', 'dmy'),
# YYYY-MM-DD or YYYY/MM/DD or YYYY_MM_DD
(r'(\d{4})[\-\/_](\d{1,2})[\-\/_](\d{1,2})', 'ymd'),
# Month DD, YYYY (e.g., "August 15, 2025")
(r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),?\s+(\d{4})', 'mdy_name'),
# Month YYYY (e.g., "April 2025") - use first day of month
(r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{4})', 'my_name'),
# DD Mon YYYY (e.g., "15 Aug 2025")
(r'(\d{1,2})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(\d{4})', 'dmy_abbr'),
# Mon DD, YYYY (e.g., "Aug 15, 2025")
(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(\d{1,2}),?\s+(\d{4})', 'mdy_abbr'),
# Mon YYYY (e.g., "Apr 2025") - use first day of month
(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(\d{4})', 'my_abbr'),
]
# Month name -> month number, for both full and abbreviated English names.
# 'May' is listed twice because its full and abbreviated forms coincide; the
# duplicate key is harmless (both map to 5). Keys are capitalised exactly as
# written here.
MONTH_MAP = {
'January': 1, 'February': 2, 'March': 3, 'April': 4,
'May': 5, 'June': 6, 'July': 7, 'August': 8,
'September': 9, 'October': 10, 'November': 11, 'December': 12,
'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}
@classmethod
def set_omdb_api_key(cls, api_key: str):
    """Store the OMDB API key on the class for later TV-episode lookups."""
    setattr(cls, 'OMDB_API_KEY', api_key)
@classmethod
def extract_tv_info(cls, text: str) -> Optional[Tuple[str, int, int]]:
    """
    Extract TV show name, season, and episode from text

    Each entry of ``cls.TV_PATTERNS`` is tried in order (case-insensitive).
    The show name is whatever precedes the season/episode marker: the part
    after the last ' - ' when one is present, otherwise the whole prefix with
    '-', '_' and '.' turned into spaces. A trailing "Season", "Series" or "S"
    is removed in any letter case.

    Returns:
        Tuple of (show_name, season, episode) or None
    """
    for pattern in cls.TV_PATTERNS:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue

        season = int(match.group(1))
        episode = int(match.group(2))

        # Everything before the season/episode marker
        show_part = text[:match.start()].strip()

        # Common pattern: "Actor Name & Actor Name - Show Name S01E01"
        if ' - ' in show_part:
            show_name = show_part.split(' - ')[-1].strip()
        else:
            # Clean up common separators
            show_name = re.sub(r'[-_.]', ' ', show_part)
            show_name = re.sub(r'\s+', ' ', show_name).strip()

        # Remove trailing "Season" or similar words. The flag must be passed
        # by keyword: the fourth positional argument of re.sub is ``count``.
        show_name = re.sub(r'\s+(Season|Series|S)\s*$', '', show_name, flags=re.IGNORECASE)

        if show_name:
            return (show_name, season, episode)

    return None
@classmethod
def lookup_tv_episode_date(cls, show_name: str, season: int, episode: int) -> Optional[datetime]:
    """
    Lookup TV episode air date using OMDB API

    Two requests are made: one to resolve the show title to an IMDB id, one
    to fetch the episode. Requests go over HTTPS so the API key is not sent
    in clear text. Any failure is logged at debug level and yields None.

    Args:
        show_name: Name of the TV show
        season: Season number
        episode: Episode number

    Returns:
        Air date of the episode or None
    """
    if not cls.OMDB_API_KEY:
        logger.debug("OMDB API key not set")
        return None

    try:
        search_url = "https://www.omdbapi.com/"

        # First, search for the show
        params = {
            'apikey': cls.OMDB_API_KEY,
            't': show_name,
            'type': 'series'
        }
        response = requests.get(search_url, params=params, timeout=5)
        if response.status_code != 200:
            return None
        show_data = response.json()
        if show_data.get('Response') != 'True':
            return None

        # Get the IMDB ID
        imdb_id = show_data.get('imdbID')
        if not imdb_id:
            return None

        # Now get the specific episode
        episode_params = {
            'apikey': cls.OMDB_API_KEY,
            'i': imdb_id,
            'Season': season,
            'Episode': episode
        }
        episode_response = requests.get(search_url, params=episode_params, timeout=5)
        if episode_response.status_code != 200:
            return None
        episode_data = episode_response.json()
        if episode_data.get('Response') != 'True':
            return None

        # Parse the release date
        release_date = episode_data.get('Released')
        if release_date and release_date != 'N/A':
            # Try different date formats
            for fmt in ['%d %b %Y', '%Y-%m-%d', '%d %B %Y']:
                try:
                    return datetime.strptime(release_date, fmt)
                except ValueError:
                    continue
    except Exception as e:
        # Best effort: network and JSON errors must not break date extraction
        logger.debug(f"OMDB lookup failed: {e}")

    return None
@classmethod
def extract_date_from_text(cls, text: str, fallback_date: Optional[datetime] = None, use_omdb: bool = True) -> Optional[datetime]:
    """
    Extract date from text using multiple format patterns

    Order of attempts: an OMDB episode lookup (when enabled and the text
    looks like a TV episode), then each entry of ``cls.DATE_PATTERNS``.
    Month names are matched and resolved case-insensitively.

    Numeric D/M/Y-style dates are read as DD.MM.YYYY when dot-separated or
    when the first number exceeds 12; otherwise as MM/DD/YYYY.

    Args:
        text: Text to search for dates (e.g., post title, caption)
        fallback_date: Date to use if no date found in text
        use_omdb: Whether to try OMDB lookup for TV shows

    Returns:
        Extracted datetime or fallback_date if no date found
    """
    if not text:
        return fallback_date

    # First, try TV show lookup if enabled
    if use_omdb:
        tv_info = cls.extract_tv_info(text)
        if tv_info:
            show_name, season, episode = tv_info
            tv_date = cls.lookup_tv_episode_date(show_name, season, episode)
            if tv_date:
                logger.info(f"Found TV episode date via OMDB: {show_name} S{season:02d}E{episode:02d} -> {tv_date}")
                return tv_date

    def month_number(name):
        """Resolve a month name in any letter case; 0 when unknown."""
        # MONTH_MAP keys are capitalised ('August', 'Aug'), while the patterns
        # match case-insensitively, so normalise before the lookup.
        return cls.MONTH_MAP.get(name.capitalize(), 0)

    # Try standard date patterns
    for pattern, format_type in cls.DATE_PATTERNS:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue
        try:
            if format_type == 'instagram':
                # Instagram format: YYYYMMDD_HHMMSS
                year, month, day, hour, minute, second = (int(g) for g in match.groups())
                return datetime(year, month, day, hour, minute, second)

            elif format_type == 'dmy':
                first, second, year = (int(g) for g in match.groups())
                if '.' in match.group(0) or first > 12:
                    # European dotted format, or first value can only be a day
                    return datetime(year, second, first)
                # Ambiguous or month-first: assume MM/DD/YYYY (US format)
                return datetime(year, first, second)

            elif format_type == 'ymd':
                year, month, day = (int(g) for g in match.groups())
                return datetime(year, month, day)

            elif format_type in ('mdy_name', 'mdy_abbr'):
                day, year = int(match.group(2)), int(match.group(3))
                month = month_number(match.group(1))
                if month:
                    return datetime(year, month, day)

            elif format_type in ('my_name', 'my_abbr'):
                # Month YYYY (no day) - use first day of month
                year = int(match.group(2))
                month = month_number(match.group(1))
                if month:
                    return datetime(year, month, 1)

            elif format_type == 'dmy_abbr':
                day, year = int(match.group(1)), int(match.group(3))
                month = month_number(match.group(2))
                if month:
                    return datetime(year, month, day)

        except (ValueError, IndexError) as e:
            # e.g. "31/02/2024": matched the pattern but is not a real date
            logger.debug(f"Failed to parse date from pattern {pattern}: {e}")
            continue

    # Don't use year-only as fallback - it's too unreliable
    # Examples: "Moments of 2025" shouldn't default to Jan 1, 2025
    # Instead, use the actual post date from the forum
    return fallback_date
@classmethod
def update_file_timestamps(cls, filepath: Union[str, Path], date: datetime) -> bool:
    """
    Stamp a file with *date*: embedded metadata, creation time, then mtime/atime.

    Metadata and creation-time updates are best effort; only a failure to set
    the filesystem timestamps (or a missing file / missing date) makes the
    call return False.

    Args:
        filepath: Path to the file to update
        date: DateTime to set

    Returns:
        True if successful, False otherwise
    """
    target = Path(filepath)
    if not target.exists():
        logger.error(f"File not found: {target}")
        return False
    if not date:
        logger.warning(f"No date provided for {target}")
        return False

    extension = target.suffix.lower()

    # Step 1: image metadata (rewrites the file, so it must precede utime)
    if extension in ('.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif'):
        try:
            cls._update_exif_data(target, date)
        except Exception as exc:
            # Not a failure: not every image format carries EXIF
            logger.debug(f"Failed to update EXIF data: {exc}")

    # Step 2: video metadata (also rewrites the file)
    if extension in ('.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v'):
        try:
            cls._update_video_metadata(target, date)
        except Exception as exc:
            # Not a failure: depends on ffmpeg being available
            logger.debug(f"Failed to update video metadata: {exc}")

    # Step 3: creation time, where the platform offers a way to set it
    try:
        system_name = platform.system()
        if system_name == 'Darwin':  # macOS
            cls._update_macos_creation_time(target, date)
        elif system_name == 'Windows':
            cls._update_windows_creation_time(target, date)
        # Other platforms: nothing is attempted
    except Exception as exc:
        logger.debug(f"Failed to update creation time: {exc}")

    # Step 4: mtime and atime go last, because steps 1-2 modify the file
    try:
        epoch_seconds = date.timestamp()
        os.utime(target, (epoch_seconds, epoch_seconds))
        logger.debug(f"Updated filesystem timestamps for {target}")
    except Exception as exc:
        logger.error(f"Failed to update filesystem timestamps: {exc}")
        return False

    return True
@classmethod
def _update_macos_creation_time(cls, filepath: Path, date: datetime):
    """Set the file's creation date by invoking the ``SetFile -d`` command."""
    command = ['SetFile', '-d', date.strftime("%m/%d/%Y %H:%M:%S"), str(filepath)]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=False
        )
        if completed.returncode != 0:
            logger.debug(f"SetFile failed: {completed.stderr}")
        else:
            logger.debug(f"Updated macOS creation time for {filepath}")
    except FileNotFoundError:
        # The SetFile executable could not be launched
        logger.debug("SetFile not found (Xcode Command Line Tools not installed)")
@classmethod
def _update_windows_creation_time(cls, filepath: Path, date: datetime):
    """Update creation time on Windows using PowerShell

    The path and date reach PowerShell through environment variables rather
    than being pasted into the script text, so characters such as quotes,
    ``$`` or backticks in a file name cannot alter the command. The file is
    resolved with ``-LiteralPath`` so ``[`` and ``]`` are not treated as
    wildcards.
    """
    date_str = date.strftime("%Y-%m-%d %H:%M:%S")
    # Static script: all variable data is read from the environment
    ps_command = (
        '$file = Get-Item -LiteralPath $env:DATE_UTILS_TARGET_PATH; '
        '$file.CreationTime = [datetime]$env:DATE_UTILS_CREATION_TIME'
    )
    env = dict(
        os.environ,
        DATE_UTILS_TARGET_PATH=str(filepath),
        DATE_UTILS_CREATION_TIME=date_str,
    )
    try:
        result = subprocess.run(
            ['powershell', '-Command', ps_command],
            capture_output=True,
            text=True,
            check=False,
            env=env
        )
        if result.returncode == 0:
            logger.debug(f"Updated Windows creation time for {filepath}")
        else:
            logger.debug(f"PowerShell failed: {result.stderr}")
    except FileNotFoundError:
        logger.debug("PowerShell not available")
@classmethod
def _update_exif_data(cls, filepath: Path, date: datetime):
    """Rewrite the date tags of an image in place with ``exiftool``.

    One exiftool invocation (with ``-overwrite_original``) sets:
    - AllDates (DateTimeOriginal, CreateDate, ModifyDate)
    - MetadataDate
    - FileModifyDate
    and clears HistoryWhen so it cannot disagree with the new dates.

    Best effort only: a missing exiftool binary or a non-zero exit status
    is logged at debug level and otherwise ignored.

    Args:
        filepath: Image file to update.
        date: Timestamp to write (formatted as YYYY:MM:DD HH:MM:SS).
    """
    stamp = date.strftime("%Y:%m:%d %H:%M:%S")
    command = [
        'exiftool',
        '-overwrite_original',
        f'-AllDates={stamp}',
        f'-MetadataDate={stamp}',
        '-HistoryWhen=',
        f'-FileModifyDate={stamp}',
        str(filepath),
    ]
    try:
        proc = subprocess.run(command, capture_output=True, text=True, check=False)
    except FileNotFoundError:
        logger.debug("exiftool not found")
        return
    if proc.returncode != 0:
        logger.debug(f"exiftool failed: {proc.stderr}")
        return
    logger.debug(f"Updated EXIF data for {filepath}")
@classmethod
def _update_video_metadata(cls, filepath: Path, date: datetime):
    """Stamp ``creation_time`` into a video container using ffmpeg.

    ffmpeg stream-copies the input into a sibling scratch file
    (``<name>.tmp<ext>``); on success the scratch file replaces the original,
    otherwise it is removed. Every failure is logged at debug level only.

    Args:
        filepath: Video file to update.
        date: Timestamp to write (formatted as YYYY-MM-DD HH:MM:SS).
    """
    stamp = date.strftime("%Y-%m-%d %H:%M:%S")
    scratch = filepath.with_suffix('.tmp' + filepath.suffix)
    command = [
        'ffmpeg', '-i', str(filepath),
        '-c', 'copy',
        '-metadata', f'creation_time={stamp}',
        '-y', str(scratch),
    ]
    try:
        proc = subprocess.run(command, capture_output=True, text=True, check=False)
        remuxed = proc.returncode == 0 and scratch.exists()
        if not remuxed:
            # Drop any partial output before reporting the failure
            if scratch.exists():
                scratch.unlink()
            logger.debug(f"ffmpeg failed: {proc.stderr}")
        else:
            # Swap the re-muxed copy in for the original
            scratch.replace(filepath)
            logger.debug(f"Updated video metadata for {filepath}")
    except FileNotFoundError:
        logger.debug("ffmpeg not found")
    except Exception as e:
        if scratch.exists():
            scratch.unlink()
        logger.debug(f"Video metadata update failed: {e}")
# Convenience functions for direct use
def extract_date(text: str, fallback: Optional[datetime] = None) -> Optional[datetime]:
    """Module-level shortcut for ``DateHandler.extract_date_from_text``.

    Args:
        text: Text to search for a date.
        fallback: Value forwarded to the handler as its fallback argument.

    Returns:
        Whatever ``DateHandler.extract_date_from_text`` returns.
    """
    parsed = DateHandler.extract_date_from_text(text, fallback)
    return parsed
def update_timestamps(filepath: Union[str, Path], date: datetime) -> bool:
    """Module-level shortcut for ``DateHandler.update_file_timestamps``.

    Args:
        filepath: File to update.
        date: Timestamp to apply.

    Returns:
        The boolean result reported by ``DateHandler.update_file_timestamps``.
    """
    outcome = DateHandler.update_file_timestamps(filepath, date)
    return outcome
if __name__ == "__main__":
    # Manual smoke test: print what the extractor finds in a few sample titles
    samples = (
        "Eva Longoria - 15.08.2025 Event Photos",
        "Photos from 08/15/2025",
        "August 15, 2025 - Red Carpet",
        "15 Aug 2025 Photoshoot",
        "Event 2025-08-15",
    )
    print("Date extraction tests:")
    for sample in samples:
        found = extract_date(sample)
        print(f" '{sample}' -> {found}")

    # Only touch the sample image when it is present in the working directory
    sample_image = Path("test_image.jpg")
    if sample_image.exists():
        sample_date = datetime(2025, 8, 15, 18, 30, 0)
        updated = update_timestamps(sample_image, sample_date)
        if updated:
            print(f"\nSuccessfully updated timestamps for {sample_image}")

27
modules/db_bootstrap.py Normal file
View File

@@ -0,0 +1,27 @@
"""
Database Backend Bootstrap
Import this module before any other imports that use sqlite3.
When DATABASE_BACKEND=postgresql, it monkey-patches sys.modules['sqlite3']
with pg_adapter so every subsequent `import sqlite3` gets the PostgreSQL adapter.
Default is 'sqlite' (no change — original behavior preserved).
"""
import os
from pathlib import Path
# Load .env BEFORE checking DATABASE_BACKEND — systemd services don't set
# this env var, so .env is the primary source of truth.
try:
    from dotenv import load_dotenv
    # The .env file is looked up one directory above the one holding this module
    _env_path = Path(__file__).resolve().parent.parent / '.env'
    if _env_path.exists():
        load_dotenv(_env_path)
except ImportError:
    pass  # python-dotenv is not installed: rely on system env vars
# Backend selection is case-insensitive; anything other than 'postgresql'
# (including an unset variable, which defaults to 'sqlite') changes nothing.
if os.getenv('DATABASE_BACKEND', 'sqlite').lower() == 'postgresql':
    import sys
    from modules import pg_adapter
    # From here on, every `import sqlite3` resolves to the pg_adapter module
    sys.modules['sqlite3'] = pg_adapter

View File

@@ -0,0 +1,634 @@
#!/usr/bin/env python3
"""
Dependency Updater - Automatically updates critical dependencies
Only runs in scheduler mode, once per day
Version Compatibility:
- bcrypt <5.0 required for passlib 1.7.4 compatibility
- passlib 1.7.4 requires bcrypt 4.x (not 5.x)
- uvicorn <0.35.0 required (0.40.0+ has breaking loop_factory changes)
- Pinned packages are skipped during auto-updates to prevent incompatibilities
"""
import json
import subprocess
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict
from modules.universal_logger import get_logger
class DependencyUpdater:
"""Manages automatic updates for critical dependencies"""
def __init__(self,
             state_file: str = "/opt/media-downloader/database/dependency_updates.json",
             config: dict = None,
             pushover_notifier = None,
             scheduler_mode: bool = False):
    """
    Initialize dependency updater

    Creates the parent directory of ``state_file`` if needed, resolves the
    pip/python executables to use, merges ``config`` over the built-in
    defaults and loads the persisted update state.

    Args:
        state_file: Path to JSON file storing update state
        config: Configuration dict from settings.json
        pushover_notifier: Instance of PushoverNotifier for alerts
        scheduler_mode: Only run updates when True (scheduler mode)
    """
    # Set up logging first: _load_state() reports unreadable state files
    # through self.logger, so the logger must exist before state is loaded.
    self.logger = get_logger('DependencyUpdater')

    self.state_file = Path(state_file)
    self.state_file.parent.mkdir(parents=True, exist_ok=True)
    self.pushover = pushover_notifier
    self.scheduler_mode = scheduler_mode

    # Derive venv paths from module location (more portable than hardcoded path)
    import sys
    self._base_dir = Path(__file__).parent.parent
    self._venv_pip = self._base_dir / 'venv' / 'bin' / 'pip'
    self._venv_python = self._base_dir / 'venv' / 'bin' / 'python'
    # Fallback to sys.executable's directory if venv not found
    if not self._venv_pip.exists():
        self._venv_pip = Path(sys.executable).parent / 'pip'
    if not self._venv_python.exists():
        self._venv_python = Path(sys.executable)

    # Default configuration
    self.config = {
        'enabled': True,
        'check_interval_hours': 24,
        'auto_install': True,
        'components': {
            'flaresolverr': {
                'enabled': True,
                'notify_on_update': True
            },
            'playwright': {
                'enabled': True,
                'notify_on_update': False
            },
            'yt_dlp': {
                'enabled': True,
                'notify_on_update': False
            },
            'python_packages': {
                'enabled': True,
                'notify_on_update': True,
                'packages': [
                    # Core API framework
                    'fastapi',
                    'uvicorn',
                    'pydantic',
                    'python-jose',
                    'passlib',
                    'slowapi',
                    'starlette',
                    'python-multipart',
                    'websockets',
                    # Security & Auth
                    'bcrypt',
                    'cryptography',
                    'certifi',
                    '2captcha-python',
                    'duo-universal',
                    # Image processing
                    'pillow',
                    'numpy',
                    # Face recognition
                    'insightface',
                    'onnxruntime',
                    'deepface',
                    'tensorflow',
                    'face-recognition',
                    'dlib',
                    # Web scraping & downloads
                    'requests',
                    'beautifulsoup4',
                    'selenium',
                    'playwright',
                    'playwright-stealth',
                    'instaloader',
                    'yt-dlp',
                    'curl-cffi',
                    'gallery-dl',
                    # Database
                    'psycopg2-binary',
                    # Utilities
                    'python-dotenv',
                    'python-dateutil',
                    'pyotp',
                    'click',
                    'attrs',
                    'charset-normalizer',
                    'idna',
                    'websocket-client',
                    'trio',
                    'typing_extensions'
                ]
            }
        },
        'pushover': {
            'enabled': True,
            'priority': -1,
            'sound': 'magic'
        }
    }

    # Merge user config
    if config:
        self._deep_update(self.config, config)

    # Load or initialize state
    self.state = self._load_state()

    # Known version incompatibilities and constraints.
    # Maps package name -> dict with a pip 'constraint' specifier, a
    # human-readable 'reason', and optional informational keys
    # ('incompatible_with', 'requires', 'known_working').
    self.version_constraints = {
        'bcrypt': {
            'constraint': '<5.0',
            'reason': 'bcrypt 5.x is incompatible with passlib 1.7.4',
            'incompatible_with': ['passlib>=1.7.4,<2.0']
        },
        'passlib': {
            'constraint': '>=1.7.4,<2.0',
            'reason': 'passlib 1.7.4 requires bcrypt <5.0',
            'requires': ['bcrypt>=4.0.0,<5.0']
        },
        'uvicorn': {
            'constraint': '<0.35.0',
            'reason': 'uvicorn 0.40.0+ has breaking changes with loop_factory parameter that crashes on startup',
            'known_working': '0.34.0'
        }
    }

    # Packages that should not be auto-updated (name -> reason shown in the log)
    self.pinned_packages = {
        'bcrypt': 'Version constrained for passlib compatibility',
        'passlib': 'Version constrained for bcrypt compatibility',
        'uvicorn': 'Version 0.40.0+ has breaking changes with loop_factory parameter'
    }
def _deep_update(self, base: dict, update: dict):
"""Deep update dict (recursive merge)"""
for key, value in update.items():
if isinstance(value, dict) and key in base and isinstance(base[key], dict):
self._deep_update(base[key], value)
else:
base[key] = value
def _load_state(self) -> Dict:
"""Load update state from file"""
if self.state_file.exists():
try:
with open(self.state_file, 'r') as f:
return json.load(f)
except Exception as e:
self.logger.error(f"Failed to load update state: {e}")
# Initialize empty state
return {
'last_check': None,
'components': {}
}
def _save_state(self):
"""Save update state to file"""
try:
with open(self.state_file, 'w') as f:
json.dump(self.state, f, indent=2, default=str)
except Exception as e:
self.logger.error(f"Failed to save update state: {e}")
def _should_check_updates(self, force: bool = False) -> bool:
"""Check if enough time has passed since last update check
Args:
force: If True, bypass all checks and return True
Returns:
True if updates should be checked, False otherwise
"""
if force:
return True
if not self.config.get('enabled', True):
return False
# Allow manual checks even outside scheduler mode
if not self.scheduler_mode:
# In non-scheduler mode, only proceed if explicitly called
# This allows manual force_update_check() to work
return False
last_check = self.state.get('last_check')
if not last_check:
return True
try:
last_check_time = datetime.fromisoformat(last_check)
interval_hours = self.config.get('check_interval_hours', 24)
return datetime.now() - last_check_time > timedelta(hours=interval_hours)
except Exception:
return True
def check_and_update_all(self, force: bool = False) -> Dict[str, bool]:
    """
    Run the updater of every enabled component

    Nothing happens (and an empty dict is returned) when no check is due or
    when ``auto_install`` is disabled in the configuration. Otherwise the
    last-check timestamp is persisted, each enabled component is updated in
    turn and, if anything was installed and a notifier is configured, a
    summary notification is sent.

    Args:
        force: If True, bypass interval checks and update immediately

    Returns:
        Dict mapping component name to update success status
    """
    if not self._should_check_updates(force=force):
        return {}

    # auto_install defaults to True; when off, nothing is checked or installed
    if not self.config.get('auto_install', True):
        self.logger.info("Checking for dependency updates (auto_install disabled - check only)...")
        return {}
    self.logger.info("Checking for dependency updates...")

    # Record the check before doing any work
    self.state['last_check'] = datetime.now().isoformat()
    self._save_state()

    component_cfg = self.config.get('components', {})
    updaters = (
        ('flaresolverr', self._update_flaresolverr),
        ('playwright', self._update_playwright),
        ('yt_dlp', self._update_yt_dlp),
        ('python_packages', self._update_python_packages),
    )
    results = {}
    for name, run_update in updaters:
        # Components are enabled unless explicitly switched off
        if component_cfg.get(name, {}).get('enabled', True):
            results[name] = run_update()

    # Send summary notification if any updates installed
    if any(results.values()) and self.pushover:
        self._send_update_notification(results)
    return results
def _update_flaresolverr(self) -> bool:
    """
    Update FlareSolverr Docker container

    Pulls the ``latest`` image and, only when Docker reports that a newer
    image was actually downloaded, recreates the ``flaresolverr`` container
    (if one exists) from the new image.

    Returns:
        True if update was installed, False otherwise
    """
    image = 'ghcr.io/flaresolverr/flaresolverr:latest'
    try:
        self.logger.info("Checking FlareSolverr for updates...")

        # Pull latest image
        result = subprocess.run(
            ['docker', 'pull', image],
            capture_output=True,
            text=True,
            timeout=300
        )
        if result.returncode != 0:
            self.logger.error(f"Failed to pull FlareSolverr image: {result.stderr}")
            return False

        # `docker pull` always prints "Pulling from ...", even when nothing
        # changed, so only the final status line tells the two cases apart:
        # "Downloaded newer image for ..." vs "Image is up to date for ...".
        output = result.stdout + result.stderr
        updated = "Downloaded newer image" in output
        if not updated:
            self.logger.info("FlareSolverr is already up to date")
            self._update_component_state('flaresolverr', False)
            return False

        # Image was updated - restart container if one exists
        self.logger.info("FlareSolverr image updated, restarting container...")

        # Check if container exists (running or stopped)
        check_result = subprocess.run(
            ['docker', 'ps', '-a', '--filter', 'name=flaresolverr', '--format', '{{.Names}}'],
            capture_output=True,
            text=True
        )
        if 'flaresolverr' in check_result.stdout:
            # Stop and remove old container
            subprocess.run(['docker', 'stop', 'flaresolverr'], capture_output=True)
            subprocess.run(['docker', 'rm', 'flaresolverr'], capture_output=True)

            # Start new container with latest image
            run_result = subprocess.run([
                'docker', 'run', '-d',
                '--name', 'flaresolverr',
                '-p', '8191:8191',
                '-e', 'LOG_LEVEL=info',
                '--restart', 'unless-stopped',
                image
            ], capture_output=True, text=True)
            if run_result.returncode == 0:
                self.logger.info("✓ FlareSolverr updated and restarted successfully")
            else:
                # The new image is installed, but the service is down: say so
                self.logger.error(
                    f"FlareSolverr image updated but container restart failed: {run_result.stderr}"
                )
        else:
            self.logger.info("✓ FlareSolverr image updated (container not running)")

        self._update_component_state('flaresolverr', True)
        return True
    except subprocess.TimeoutExpired:
        self.logger.error("FlareSolverr update timed out")
        return False
    except Exception as e:
        self.logger.error(f"FlareSolverr update error: {e}")
        return False
def _update_playwright(self) -> bool:
    """
    Install/refresh the Playwright browsers (Chromium, then Firefox)

    Both ``playwright install`` commands are always run with the resolved
    Python interpreter. An update is reported when both succeed and their
    combined stdout mentions "Downloading" or "Installing".

    Returns:
        True if update was installed, False otherwise
    """
    try:
        self.logger.info("Checking Playwright browsers for updates...")

        interpreter = str(self._venv_python)
        all_ok = True
        stdout_parts = []
        for browser in ('chromium', 'firefox'):
            proc = subprocess.run(
                [interpreter, '-m', 'playwright', 'install', browser],
                capture_output=True,
                text=True,
                timeout=600,
                cwd=str(self._base_dir)
            )
            all_ok = all_ok and proc.returncode == 0
            stdout_parts.append(proc.stdout)

        if not all_ok:
            self.logger.error("Failed to update Playwright browsers")
            return False

        # Decide from the installer output whether anything was fetched
        combined = stdout_parts[0] + stdout_parts[1]
        if "Downloading" in combined or "Installing" in combined:
            self.logger.info("✓ Playwright browsers updated successfully")
            self._update_component_state('playwright', True)
            return True

        self.logger.info("Playwright browsers already up to date")
        self._update_component_state('playwright', False)
        return False
    except subprocess.TimeoutExpired:
        self.logger.error("Playwright update timed out")
        return False
    except Exception as e:
        self.logger.error(f"Playwright update error: {e}")
        return False
def _update_yt_dlp(self) -> bool:
    """
    Upgrade yt-dlp through pip (critical for TikTok downloads)

    An update is reported when pip exits successfully and its output
    contains both "Successfully installed" and "yt-dlp".

    Returns:
        True if update was installed, False otherwise
    """
    try:
        self.logger.info("Checking yt-dlp for updates...")

        # pip executable resolved in __init__ (venv first, then sys.executable's dir)
        command = [str(self._venv_pip), 'install', '--upgrade', 'yt-dlp']
        proc = subprocess.run(command, capture_output=True, text=True, timeout=120)
        if proc.returncode != 0:
            self.logger.error(f"Failed to update yt-dlp: {proc.stderr}")
            return False

        combined = proc.stdout + proc.stderr
        installed = "Successfully installed" in combined and "yt-dlp" in combined
        if not installed:
            self.logger.info("yt-dlp already up to date")
            self._update_component_state('yt_dlp', False)
            return False

        self.logger.info("✓ yt-dlp updated successfully")
        self._update_component_state('yt_dlp', True)
        return True
    except subprocess.TimeoutExpired:
        self.logger.error("yt-dlp update timed out")
        return False
    except Exception as e:
        self.logger.error(f"yt-dlp update error: {e}")
        return False
def _update_python_packages(self) -> bool:
"""
Update Python packages (FastAPI, Uvicorn, Pydantic, etc.)
Returns:
True if any updates were installed, False otherwise
"""
try:
self.logger.info("Checking Python packages for updates...")
# Get list of packages to update
packages = self.config.get('components', {}).get('python_packages', {}).get('packages', [])
if not packages:
self.logger.info("No Python packages configured for updates")
return False
# Use venv pip (derived from module location for portability)
venv_pip = str(self._venv_pip)
updated_packages = []
for package in packages:
try:
# Check if package is pinned (should not be auto-updated)
if package in self.pinned_packages:
self.logger.info(f"⚠ Skipping {package}: {self.pinned_packages[package]}")
continue
# Check for version constraints
if package in self.version_constraints:
constraint_info = self.version_constraints[package]
constraint = constraint_info.get('constraint', '')
reason = constraint_info.get('reason', 'Version constraint')
if constraint:
# Install with constraint instead of --upgrade
package_spec = f"{package}{constraint}"
self.logger.info(f"📌 {package}: Applying constraint {constraint} ({reason})")
result = subprocess.run(
[venv_pip, 'install', package_spec],
capture_output=True,
text=True,
timeout=120
)
else:
# No constraint, normal upgrade
result = subprocess.run(
[venv_pip, 'install', '--upgrade', package],
capture_output=True,
text=True,
timeout=120
)
else:
# Update package normally
result = subprocess.run(
[venv_pip, 'install', '--upgrade', package],
capture_output=True,
text=True,
timeout=120
)
if result.returncode == 0:
output = result.stdout + result.stderr
# Check if package was actually updated
if "Successfully installed" in output and package in output:
updated_packages.append(package)
self.logger.info(f"{package} updated")
elif "Requirement already satisfied" in output:
self.logger.debug(f" {package} already up to date")
else:
self.logger.debug(f" {package} checked")
else:
self.logger.warning(f"Failed to update {package}: {result.stderr}")
except subprocess.TimeoutExpired:
self.logger.warning(f"{package} update timed out")
except Exception as e:
self.logger.warning(f"Error updating {package}: {e}")
if updated_packages:
self.logger.info(f"✓ Updated {len(updated_packages)} Python package(s): {', '.join(updated_packages)}")
self._update_component_state('python_packages', True)
# Store list of updated packages in state
if 'components' not in self.state:
self.state['components'] = {}
if 'python_packages' not in self.state['components']:
self.state['components']['python_packages'] = {}
self.state['components']['python_packages']['updated_packages'] = updated_packages
self._save_state()
return True
else:
self.logger.info("All Python packages already up to date")
self._update_component_state('python_packages', False)
return False
except Exception as e:
self.logger.error(f"Python packages update error: {e}")
return False
def _update_component_state(self, component: str, updated: bool):
"""Update component state in JSON"""
if 'components' not in self.state:
self.state['components'] = {}
if component not in self.state['components']:
self.state['components'][component] = {}
self.state['components'][component]['last_update'] = datetime.now().isoformat() if updated else self.state['components'][component].get('last_update')
self.state['components'][component]['last_check'] = datetime.now().isoformat()
self.state['components'][component]['status'] = 'updated' if updated else 'current'
self._save_state()
def _send_update_notification(self, results: Dict[str, bool]):
"""Send Pushover notification about installed updates"""
if not self.config.get('pushover', {}).get('enabled', True):
return
# Build list of updated components
updated_components = [name for name, updated in results.items() if updated]
if not updated_components:
return
# Check which components should send notifications
notify_components = []
for component in updated_components:
component_config = self.config.get('components', {}).get(component, {})
if component_config.get('notify_on_update', True):
notify_components.append(component)
if not notify_components:
return
# Format component names
component_map = {
'flaresolverr': 'FlareSolverr',
'playwright': 'Playwright Browsers',
'yt_dlp': 'yt-dlp',
'python_packages': 'Python Packages'
}
formatted_names = [component_map.get(c, c) for c in notify_components]
title = "🔄 Dependencies Updated"
if len(formatted_names) == 1:
message = f"{formatted_names[0]} has been updated to the latest version."
else:
message = f"The following components have been updated:\n\n"
for name in formatted_names:
message += f"{name}\n"
message += f"\nUpdated at: {datetime.now().strftime('%b %d, %I:%M %p')}"
try:
priority = self.config.get('pushover', {}).get('priority', -1)
sound = self.config.get('pushover', {}).get('sound', 'magic')
self.pushover.send_notification(
title=title,
message=message,
priority=priority,
sound=sound
)
self.logger.info(f"Sent update notification for: {', '.join(formatted_names)}")
except Exception as e:
self.logger.error(f"Failed to send update notification: {e}")
def get_update_status(self) -> Dict:
    """Return a shallow copy of the current update state

    Only the top level is copied; nested per-component dicts are shared
    with the live state.
    """
    snapshot = self.state.copy()
    return snapshot
def force_update_check(self) -> Dict[str, bool]:
    """Run an update check right now, ignoring interval and scheduler mode

    Returns:
        The result dict of ``check_and_update_all(force=True)``.
    """
    outcome = self.check_and_update_all(force=True)
    return outcome

1051
modules/discovery_system.py Normal file

File diff suppressed because it is too large Load Diff

940
modules/download_manager.py Executable file
View File

@@ -0,0 +1,940 @@
#!/usr/bin/env python3
"""
Multi-threaded Download Manager
Handles concurrent downloads with rate limiting, retries, and progress tracking
Can be used by forum_downloader, fastdl_module, and other downloaders
"""
import os
import re
import time
import hashlib
import requests
import threading
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Any, Callable
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock, Semaphore
from dataclasses import dataclass
import sqlite3
from urllib.parse import urlparse
from modules.base_module import LoggingMixin
from modules.universal_logger import get_logger
logger = get_logger('DownloadManager') # For standalone/example usage
@dataclass
class DownloadItem:
    """Single download item

    Describes one file to fetch: where it comes from, where it is saved and
    optional request/bookkeeping details.
    """
    url: str  # Source URL of the file (or hosting page) to download
    save_path: Path  # Destination path of the downloaded file
    referer: Optional[str] = None  # Sent as a "Referer" header when provided
    headers: Optional[Dict[str, str]] = None  # presumably extra request headers; verify against callers
    metadata: Optional[Dict[str, Any]] = None  # presumably free-form data kept with the download; verify against callers
    post_date: Optional[datetime] = None  # Timestamp to set on downloaded file
    retry_count: int = 0  # presumably the number of retries already attempted; verify against callers
    max_retries: int = 3  # presumably the retry limit for this item; verify against callers
@dataclass
class DownloadResult:
    """Result of a download

    Pairs the attempted item with its outcome; the optional fields are only
    filled in where the producing code has the information.
    """
    success: bool  # True when the file was downloaded
    item: DownloadItem  # The item this result belongs to
    file_size: Optional[int] = None  # Size of the saved file in bytes
    download_time: Optional[float] = None  # Elapsed download time in seconds
    error: Optional[str] = None  # Error description for failed downloads
    file_hash: Optional[str] = None  # Hex digest of the saved file's content
class DownloadManager(LoggingMixin):
"""
Multi-threaded download manager with:
- Concurrent downloads
- Rate limiting
- Automatic retries
- Progress tracking
- Database tracking
- Playwright support for authenticated downloads
"""
def __init__(self,
             max_workers: int = 5,
             rate_limit: float = 0.5,
             timeout: int = 30,
             chunk_size: int = 8192,
             use_database: bool = False,
             db_path: str = None,
             show_progress: bool = True,
             show_debug: bool = False):
    """
    Set up a download manager

    Args:
        max_workers: Maximum concurrent downloads
        rate_limit: Seconds between downloads per thread
        timeout: Download timeout in seconds
        chunk_size: Chunk size for streaming downloads
        use_database: Track downloads in database
        db_path: Path to database file
        show_progress: Show download progress
        show_debug: Show debug messages
    """
    # Plain settings
    self.max_workers = max_workers
    self.rate_limit = rate_limit
    self.timeout = timeout
    self.chunk_size = chunk_size
    self.use_database = use_database
    self.db_path = db_path
    self.show_progress = show_progress

    # Logging comes from the mixin
    self._init_logger('DownloadManager', None, default_module='Download', show_debug=show_debug)

    # Thread synchronization primitives and per-thread bookkeeping
    self.download_lock = Lock()
    self.rate_limiter = Semaphore(max_workers)
    self.last_download_time = {}

    # Each thread keeps its own ImageBam session in thread-local storage
    self._imagebam_session_local = threading.local()

    # Running totals, all starting at zero
    self.stats = dict.fromkeys(
        ('total', 'successful', 'failed', 'skipped', 'total_bytes', 'total_time'),
        0
    )

    # User agent sent with requests
    self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

    # Playwright context for authenticated downloads (attached later)
    self.playwright_context = None

    # The database is only used when it is both requested AND given a path
    if self.use_database:
        if self.db_path:
            self._init_database()
        else:
            # No path: switch tracking off rather than create files in CWD
            self.use_database = False
def _init_database(self):
"""Initialize download tracking database"""
if not self.db_path:
return
conn = sqlite3.connect(self.db_path)
try:
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS downloads (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE NOT NULL,
file_path TEXT NOT NULL,
file_hash TEXT,
file_size INTEGER,
download_date DATETIME DEFAULT CURRENT_TIMESTAMP,
metadata TEXT
)
''')
cursor.execute('''
CREATE INDEX IF NOT EXISTS idx_downloads_url ON downloads(url)
''')
cursor.execute('''
CREATE INDEX IF NOT EXISTS idx_downloads_hash ON downloads(file_hash)
''')
conn.commit()
finally:
conn.close()
def set_playwright_context(self, context):
    """Set Playwright context for authenticated downloads

    Also mirrors the context's cookies into ``self.cookies`` (a plain
    name -> value dict) for use with the requests library. The cookie dict
    is always reset first, so clearing the context (passing None) also
    drops cookies taken from a previous context instead of leaving them
    behind.

    Args:
        context: Playwright browser context, or None to detach.
    """
    self.playwright_context = context
    # Start from a clean slate so cookies never outlive their context
    self.cookies = {}
    if not context:
        return

    try:
        self.cookies = {cookie['name']: cookie['value'] for cookie in context.cookies()}
    except Exception as e:
        # Keep downloads working without cookies, but leave a trace
        self.cookies = {}
        self.log(f"Failed to read cookies from Playwright context: {e}", "warning")
def _is_already_downloaded(self, url: str, file_path: Path) -> bool:
"""Check if file was already downloaded"""
if not self.use_database:
return file_path.exists() and file_path.stat().st_size > 0
conn = sqlite3.connect(self.db_path)
try:
cursor = conn.cursor()
cursor.execute(
"SELECT file_path, file_size FROM downloads WHERE url = ?",
(url,)
)
result = cursor.fetchone()
finally:
conn.close()
if result:
# Check if file still exists and has expected size
saved_path = Path(result[0])
if saved_path.exists() and saved_path.stat().st_size == result[1]:
return True
return False
def _apply_rate_limit(self, thread_id: int):
"""Apply rate limiting per thread"""
with self.download_lock:
if thread_id in self.last_download_time:
elapsed = time.time() - self.last_download_time[thread_id]
if elapsed < self.rate_limit:
time.sleep(self.rate_limit - elapsed)
self.last_download_time[thread_id] = time.time()
def _extract_pixhost_direct_url(self, show_url: str) -> Optional[str]:
"""Extract direct image URL from pixhost show URL"""
try:
# Pattern to extract ID and filename from show URL
show_pattern = re.compile(r"https?://(?:www\.)?pixhost\.to/show/(\d+)/([^/]+)$", re.IGNORECASE)
match = show_pattern.match(show_url)
if not match:
return None
img_id = match.group(1)
filename = match.group(2)
# Try common hosts in order
common_hosts = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100]
for host_num in common_hosts:
test_url = f"https://img{host_num}.pixhost.to/images/{img_id}/{filename}"
try:
# Quick HEAD request to check if URL exists
response = requests.head(test_url, timeout=2, allow_redirects=False)
if response.status_code == 200:
return test_url
except requests.RequestException:
continue
# Try sequential scan if common hosts don't work
for host_num in range(1, 121):
if host_num in common_hosts:
continue
test_url = f"https://img{host_num}.pixhost.to/images/{img_id}/{filename}"
try:
response = requests.head(test_url, timeout=1, allow_redirects=False)
if response.status_code == 200:
return test_url
except requests.RequestException:
continue
return None
except Exception as e:
self.log(f"Error extracting pixhost URL: {e}", "error")
return None
def _extract_imagebam_direct_url(self, imagebam_url: str) -> Optional[str]:
"""Extract direct image URL from ImageBam page"""
try:
# Get or create thread-local ImageBam session (thread-safe)
session = getattr(self._imagebam_session_local, 'session', None)
if session is None:
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})
# Set cookies to bypass the interstitial ad page (both old and new cookies)
session.cookies.set('nsfw_inter', '1', domain='.imagebam.com')
session.cookies.set('sfw_inter', '1', domain='.imagebam.com')
self._imagebam_session_local.session = session
# ImageBam now requires two requests - first to get session cookies, second to get image
# First request sets up the session
response = session.get(imagebam_url, timeout=5)
if response.status_code != 200:
self.log(f"ImageBam page returned {response.status_code}", "warning")
return None
# Check if we got the interstitial page (contains "Continue to your image")
if 'Continue to your image' in response.text or 'Please wait' in response.text:
# Make sure bypass cookies are set and request again
session.cookies.set('sfw_inter', '1', domain='.imagebam.com')
session.cookies.set('nsfw_inter', '1', domain='.imagebam.com')
response = session.get(imagebam_url, timeout=5)
# Look for the direct image URL in the HTML
# ImageBam stores the full image with _o suffix
# First try to find the full resolution image
full_img_pattern = r'(https?://images\d*\.imagebam\.com/[a-f0-9/]+/[A-Z0-9]+_o\.\w+)'
matches = re.findall(full_img_pattern, response.text, re.IGNORECASE)
if matches:
# Return the first full resolution image found
direct_url = matches[0]
self.log(f"Extracted ImageBam direct URL: {direct_url}", "debug")
return direct_url
# Fallback: look for any image on images*.imagebam.com
fallback_patterns = [
r'<img[^>]+src="(https?://images\d*\.imagebam\.com/[^"]+)"',
r'"(https?://images\d*\.imagebam\.com/[^"]+\.(?:jpg|jpeg|png|gif))"',
]
for pattern in fallback_patterns:
matches = re.findall(pattern, response.text, re.IGNORECASE)
if matches:
direct_url = matches[0]
self.log(f"Extracted ImageBam direct URL (fallback): {direct_url}", "debug")
return direct_url
self.log("No direct image URL found in ImageBam HTML", "warning")
return None
except requests.Timeout:
self.log(f"ImageBam extraction timed out for {imagebam_url}", "warning")
return None
except Exception as e:
self.log(f"Error extracting ImageBam URL: {e}", "error")
return None
def _download_with_gallery_dl(self, item: DownloadItem) -> DownloadResult:
    """Download using gallery-dl for supported hosts (ImageTwist, etc.)

    Runs the ``gallery-dl`` command with a 60 second timeout. The download
    counts as successful when the command exits with 0 and the expected
    file exists; the file is then hashed with SHA256 (read in chunks, so
    large files are never held in memory as a whole) and, when the item
    carries a post date, its access/modification times are set to it.

    Args:
        item: The download to perform.

    Returns:
        A DownloadResult; failures (non-zero exit, missing file, timeout or
        any other exception) are reported via ``success=False`` and
        ``error`` instead of being raised.
    """
    import subprocess
    start_time = time.time()
    try:
        # Ensure parent directory exists
        item.save_path.parent.mkdir(parents=True, exist_ok=True)

        # Build gallery-dl command
        # NOTE(review): gallery-dl may add extractor sub-directories below the
        # --dest base directory; confirm the file really lands at save_path.
        cmd = [
            "gallery-dl",
            "--dest", str(item.save_path.parent),
            "--filename", item.save_path.name,
            "--no-skip",
            "--no-part",
            "--quiet"
        ]
        # Add referer if provided
        if item.referer:
            cmd.extend(["--header", f"Referer: {item.referer}"])
        cmd.append(item.url)

        # Run gallery-dl with timeout
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=60
        )

        if result.returncode == 0 and item.save_path.exists():
            file_size = item.save_path.stat().st_size
            download_time = time.time() - start_time

            # Calculate hash (SHA256 for consistency with unified database),
            # streaming the file in 1 MiB chunks to keep memory use flat
            digest = hashlib.sha256()
            with open(item.save_path, 'rb') as f:
                for chunk in iter(lambda: f.read(1024 * 1024), b''):
                    digest.update(chunk)
            file_hash = digest.hexdigest()

            # Set file timestamp if we have a date
            if item.post_date:
                try:
                    timestamp_unix = item.post_date.timestamp()
                    os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                except Exception as e:
                    self.log(f"Failed to set timestamp: {e}", "warning")

            self.log(f"Downloaded via gallery-dl: {item.save_path.name}", "success")
            return DownloadResult(
                success=True,
                item=item,
                file_size=file_size,
                download_time=download_time,
                file_hash=file_hash
            )
        else:
            error_msg = result.stderr or "Unknown error"
            return DownloadResult(
                success=False,
                item=item,
                error=f"gallery-dl failed: {error_msg}"
            )
    except subprocess.TimeoutExpired:
        return DownloadResult(
            success=False,
            item=item,
            error="gallery-dl timed out"
        )
    except Exception as e:
        return DownloadResult(
            success=False,
            item=item,
            error=str(e)
        )
def _download_from_imagetwist(self, item: DownloadItem) -> DownloadResult:
    """Download an image from ImageTwist, resolving the real URL with gallery-dl.

    ``gallery-dl -g`` is asked for the direct image URL. If that fails, or
    the resolved URL does not contain ``imagetwist``, the work is handed to
    ``_download_from_imagetwist_fallback``. Requests are spaced at least two
    seconds apart because ImageTwist returns an error image when hit too
    quickly.

    Args:
        item: Download request. ``url`` is the ImageTwist page URL;
            ``save_path`` and ``post_date`` are also read.

    Returns:
        A successful DownloadResult with size, elapsed seconds and SHA256
        digest, or a failed DownloadResult describing the problem.
        Exceptions raised during resolution or download are converted into
        failed results.
    """
    import subprocess

    start_time = time.time()

    if not hasattr(self, '_imagetwist_last_request'):
        self._imagetwist_last_request = 0

    def throttle():
        # The sleep happens while holding download_lock, so every worker
        # thread is spaced out, not just this one.
        with self.download_lock:
            elapsed = time.time() - self._imagetwist_last_request
            if elapsed < 2.0:  # Minimum 2 seconds between ImageTwist requests
                time.sleep(2.0 - elapsed)
            self._imagetwist_last_request = time.time()

    throttle()

    try:
        # Use gallery-dl to get the actual image URL
        result = subprocess.run(
            ['/opt/media-downloader/venv/bin/gallery-dl', '-g', item.url],
            capture_output=True, text=True, timeout=30
        )
        if result.returncode != 0 or not result.stdout.strip():
            return self._download_from_imagetwist_fallback(item, start_time)

        img_url = result.stdout.strip().split('\n')[0]
        if not img_url or 'imagetwist' not in img_url:
            return self._download_from_imagetwist_fallback(item, start_time)

        # Rate limit again before the actual download
        throttle()

        item.save_path.parent.mkdir(parents=True, exist_ok=True)
        headers = {
            'User-Agent': self.user_agent,
            'Referer': item.url  # Use imagetwist page URL as Referer
        }

        # The context manager releases the streamed connection on every exit
        # path, including the early returns below.
        with requests.get(img_url, headers=headers, timeout=30, stream=True) as img_response:
            img_response.raise_for_status()

            # ImageTwist error placeholder (8346 bytes - rate limited or deleted)
            content_length = img_response.headers.get('Content-Length', '')
            if content_length == '8346':
                self.log(f"ImageTwist rate limited or unavailable: {item.url}", "warning")
                return DownloadResult(success=False, item=item, error="ImageTwist error image (rate limited)")

            # Buffer in memory first so an HTML page is never written to disk
            chunks = []
            for chunk in img_response.iter_content(chunk_size=8192):
                if not chunks:  # First chunk: make sure it is not HTML
                    head = chunk[:100].lower()
                    if b'<html' in head or b'<!doctype' in head:
                        return DownloadResult(
                            success=False,
                            item=item,
                            error="Got HTML instead of image"
                        )
                chunks.append(chunk)

        # Write and hash in one pass (SHA256 for consistency with unified
        # database) instead of reading the file back from disk.
        digest = hashlib.sha256()
        with open(item.save_path, 'wb') as f:
            for chunk in chunks:
                f.write(chunk)
                digest.update(chunk)
        file_hash = digest.hexdigest()

        file_size = item.save_path.stat().st_size
        download_time = time.time() - start_time

        # Set file timestamp if we have a date (best effort, but report it)
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
            except Exception as e:
                self.log(f"Failed to set timestamp: {e}", "warning")

        self.log(f"Downloaded ImageTwist: {item.save_path.name}", "success")
        return DownloadResult(
            success=True,
            item=item,
            file_size=file_size,
            download_time=download_time,
            file_hash=file_hash
        )
    except Exception as e:
        return DownloadResult(
            success=False,
            item=item,
            error=f"ImageTwist download failed: {e}"
        )
def _download_from_imagetwist_fallback(self, item: DownloadItem, start_time: float) -> DownloadResult:
    """Download an ImageTwist image by parsing the page HTML directly.

    Used when gallery-dl could not resolve the image URL. The direct URL is
    taken from ``<img class="pic">`` or, failing that, from a regex match on
    the page source. The download applies the same checks as the primary
    ImageTwist path: error-placeholder detection, HTML detection, and
    restoring the post date as the file timestamp.

    Args:
        item: Download request. ``url``, ``referer``, ``save_path`` and
            ``post_date`` are read.
        start_time: ``time.time()`` value captured by the caller, so the
            reported duration covers the whole attempt.

    Returns:
        A successful DownloadResult with size, elapsed seconds and SHA256
        digest, or a failed DownloadResult describing the problem.
        Exceptions are converted into failed results.
    """
    from bs4 import BeautifulSoup
    import re

    try:
        headers = {
            'User-Agent': self.user_agent,
            'Referer': item.referer or 'https://forum.phun.org/'
        }
        response = requests.get(item.url, headers=headers, timeout=30)
        response.raise_for_status()
        page_content = response.text

        img_url = None

        # Method 1: Look for pic class
        soup = BeautifulSoup(page_content, 'html.parser')
        pic_img = soup.find('img', class_='pic')
        if pic_img and pic_img.get('src'):
            img_url = pic_img['src']

        # Method 2: Regex for i*.imagetwist.com/i/ pattern
        if not img_url:
            match = re.search(r'(https?://i\d*(?:phun)?\.imagetwist\.com/i/[^"\'>\s]+)', page_content)
            if match:
                img_url = match.group(1)

        if not img_url:
            return DownloadResult(
                success=False,
                item=item,
                error="Could not find direct image URL on ImageTwist page"
            )

        item.save_path.parent.mkdir(parents=True, exist_ok=True)

        # The context manager releases the streamed connection on every exit
        # path, including the early returns below.
        with requests.get(img_url, headers=headers, timeout=30, stream=True) as img_response:
            img_response.raise_for_status()

            # ImageTwist error placeholder (8346 bytes - rate limited or deleted)
            if img_response.headers.get('Content-Length', '') == '8346':
                self.log(f"ImageTwist rate limited or unavailable: {item.url}", "warning")
                return DownloadResult(success=False, item=item, error="ImageTwist error image (rate limited)")

            chunks = []
            for chunk in img_response.iter_content(chunk_size=8192):
                if not chunks:  # First chunk: make sure it is not HTML
                    head = chunk[:100].lower()
                    if b'<html' in head or b'<!doctype' in head:
                        return DownloadResult(success=False, item=item, error="Got HTML instead of image")
                chunks.append(chunk)

        # Write and hash in one pass (SHA256 for consistency with unified database)
        digest = hashlib.sha256()
        with open(item.save_path, 'wb') as f:
            for chunk in chunks:
                f.write(chunk)
                digest.update(chunk)
        file_hash = digest.hexdigest()

        file_size = item.save_path.stat().st_size
        download_time = time.time() - start_time

        # Set file timestamp if we have a date (best effort)
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
            except Exception as e:
                self.log(f"Failed to set timestamp: {e}", "warning")

        self.log(f"Downloaded ImageTwist (fallback): {item.save_path.name}", "success")
        return DownloadResult(success=True, item=item, file_size=file_size, download_time=download_time, file_hash=file_hash)
    except Exception as e:
        return DownloadResult(success=False, item=item, error=f"ImageTwist fallback failed: {e}")
def _download_with_playwright(self, item: DownloadItem) -> DownloadResult:
    """Download a file through the Playwright browser context.

    Falls back to ``_download_with_requests`` when no Playwright context is
    available. The URL is opened in a fresh page, which is always closed
    again, and the response body is written to ``item.save_path``.

    Args:
        item: Download request. ``url``, ``headers``, ``referer``,
            ``save_path`` and ``post_date`` are read. ``item.headers`` is
            copied, never modified.

    Returns:
        A successful DownloadResult with size, elapsed seconds and SHA256
        digest, or a failed DownloadResult (HTML body, non-OK response, or
        any exception). Exceptions are not propagated.
    """
    if not self.playwright_context:
        return self._download_with_requests(item)

    start_time = time.time()
    try:
        page = self.playwright_context.new_page()
        try:
            # Work on a copy: writing Referer into the caller's dict would
            # leak it into every other item sharing that dict.
            headers = dict(item.headers or {})
            if item.referer:
                headers['Referer'] = item.referer
            if headers:
                page.set_extra_http_headers(headers)

            # Direct download (pixhost should already be processed)
            response = page.goto(item.url, wait_until='networkidle',
                                 timeout=self.timeout * 1000)

            if not (response and response.ok):
                return DownloadResult(
                    success=False,
                    item=item,
                    error=f"HTTP {response.status if response else 'No response'}"
                )

            content = response.body()

            # Check for HTML error pages
            head = content[:1000].lower()
            if b'<!doctype' in head or b'<html' in head:
                return DownloadResult(
                    success=False,
                    item=item,
                    error="Got HTML instead of expected file"
                )

            # Save file
            item.save_path.parent.mkdir(parents=True, exist_ok=True)
            item.save_path.write_bytes(content)

            # Calculate hash (SHA256 for consistency with unified database)
            file_hash = hashlib.sha256(content).hexdigest()

            # Update timestamps if we have a date (best effort)
            if item.post_date:
                try:
                    timestamp_unix = item.post_date.timestamp()
                    os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                    self.log(f"Set timestamp to {item.post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
                except Exception as e:
                    self.log(f"Failed to set timestamp: {e}", "warning")

            download_time = time.time() - start_time
            return DownloadResult(
                success=True,
                item=item,
                file_size=len(content),
                download_time=download_time,
                file_hash=file_hash
            )
        finally:
            page.close()
    except Exception as e:
        return DownloadResult(
            success=False,
            item=item,
            error=str(e)
        )
def _download_with_requests(self, item: DownloadItem) -> DownloadResult:
    """Download a file with the ``requests`` library.

    The body is streamed into memory and checked for an HTML error page
    before anything is written, so a rejected download never touches
    ``item.save_path``.

    Args:
        item: Download request. ``url``, ``headers``, ``referer``,
            ``save_path`` and ``post_date`` are read. ``item.headers`` is
            copied, never modified.

    Returns:
        A successful DownloadResult with size, elapsed seconds and SHA256
        digest, or a failed DownloadResult describing the problem.
        Exceptions are converted into failed results.
    """
    start_time = time.time()
    # True once this call has opened save_path for writing; only then is the
    # file ours to delete on failure.
    file_written = False
    try:
        # Work on a copy: writing User-Agent/Referer into the caller's dict
        # would leak them into every other item sharing that dict.
        headers = dict(item.headers or {})
        headers['User-Agent'] = self.user_agent
        if item.referer:
            headers['Referer'] = item.referer

        # Use cookies if available
        cookies = getattr(self, 'cookies', {})

        # The context manager releases the streamed connection on every exit
        # path, including the HTML early return.
        with requests.get(
            item.url,
            headers=headers,
            cookies=cookies if cookies else None,
            timeout=self.timeout,
            stream=True
        ) as response:
            response.raise_for_status()

            item.save_path.parent.mkdir(parents=True, exist_ok=True)

            # Collect chunks and join once; repeated bytes += copies the
            # whole buffer on every chunk.
            chunks = []
            for chunk in response.iter_content(chunk_size=self.chunk_size):
                if not chunk:
                    continue
                if not chunks:  # First non-empty chunk: reject HTML error pages
                    head = chunk[:100].lower()
                    if b'<html' in head or b'<!doctype' in head or b'<head>' in head:
                        return DownloadResult(
                            success=False,
                            item=item,
                            error="Got HTML instead of image"
                        )
                chunks.append(chunk)

        content = b''.join(chunks)

        # Save to file only after validation
        file_written = True
        with open(item.save_path, 'wb') as f:
            f.write(content)

        # Calculate hash (SHA256 for consistency with unified database)
        file_hash = hashlib.sha256(content).hexdigest()

        # Set file timestamp if we have a date (best effort)
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                self.log(f"Set timestamp to {item.post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
            except Exception as e:
                self.log(f"Failed to set timestamp: {e}", "warning")

        download_time = time.time() - start_time
        return DownloadResult(
            success=True,
            item=item,
            file_size=len(content),
            download_time=download_time,
            file_hash=file_hash
        )
    except Exception as e:
        # Remove a partial file, but only one this call created. A file that
        # was already at save_path before a failed request is left alone.
        if file_written:
            try:
                if item.save_path.exists():
                    item.save_path.unlink()
            except OSError as cleanup_error:
                self.log(f"Failed to remove partial download: {cleanup_error}", "warning")
        return DownloadResult(
            success=False,
            item=item,
            error=str(e)
        )
def _download_worker(self, item: DownloadItem, thread_id: int) -> DownloadResult:
    """Download a single item, including host handling, retries and bookkeeping.

    Steps:
      1. pixhost / ImageBam page URLs are replaced in ``item.url`` by their
         direct image URL when extraction succeeds.
      2. Items that are already downloaded are reported as a success without
         touching the network, the database or the statistics.
      3. ImageTwist URLs are tried through ``_download_from_imagetwist``;
         everything else, and a failed ImageTwist attempt, goes through
         ``_download_with_requests`` after the per-thread rate limit.
      4. A failed attempt is retried up to ``item.max_retries`` times.
      5. The final result is saved to the database (if enabled) and counted
         in ``self.stats``. This now includes ImageTwist successes, which
         previously returned early and were never recorded or counted.

    Args:
        item: Download request; ``url`` and ``retry_count`` may be modified.
        thread_id: Worker slot number, passed to ``_apply_rate_limit``.

    Returns:
        The DownloadResult of the final attempt.
    """
    uses_imagetwist = False

    # Process image hosting URLs to get direct URLs
    if 'pixhost.to/show/' in item.url:
        direct_url = self._extract_pixhost_direct_url(item.url)
        if direct_url:
            self.log(f"Converted pixhost URL to direct: {direct_url.split('/')[-1]}", "debug")
            item.url = direct_url
        else:
            self.log(f"Failed to extract pixhost direct URL: {item.url}", "warning")
    elif 'imagebam.com' in item.url:
        direct_url = self._extract_imagebam_direct_url(item.url)
        if direct_url:
            self.log(f"Converted ImageBam URL to direct: {direct_url.split('/')[-1]}", "debug")
            item.url = direct_url
        else:
            self.log(f"Failed to extract ImageBam direct URL: {item.url}", "warning")
    elif 'imagetwist.com' in item.url:
        # ImageTwist requires parsing the page to get the direct image URL
        uses_imagetwist = True

    # Check if already downloaded. This runs before the ImageTwist attempt so
    # known files are not fetched again.
    if self._is_already_downloaded(item.url, item.save_path):
        self.log(f"Already downloaded: {item.save_path.name}", "skip")
        return DownloadResult(
            success=True,
            item=item,
            file_size=item.save_path.stat().st_size if item.save_path.exists() else 0
        )

    result = None
    if uses_imagetwist:
        result = self._download_from_imagetwist(item)
        if not result.success:
            self.log(f"ImageTwist download failed: {item.url}", "warning")
            result = None  # fall through to the plain requests download

    if result is None:
        # Apply rate limiting
        self._apply_rate_limit(thread_id)
        # Always use requests for direct image downloads (faster)
        result = self._download_with_requests(item)

    # Handle retries. The recursive call does its own saving and counting.
    if not result.success and item.retry_count < item.max_retries:
        item.retry_count += 1
        self.log(f"Retrying {item.url} ({item.retry_count}/{item.max_retries})", "warning")
        time.sleep(self.rate_limit * 2)  # Extra delay before retry
        return self._download_worker(item, thread_id)

    # Save to database if successful
    if result.success and self.use_database:
        self._save_to_database(result)

    # Update statistics
    with self.download_lock:
        if result.success:
            self.stats['successful'] += 1
            if result.file_size:
                self.stats['total_bytes'] += result.file_size
            if result.download_time:
                self.stats['total_time'] += result.download_time
        else:
            self.stats['failed'] += 1

    return result
def _save_to_database(self, result: DownloadResult):
    """Record a finished download in the ``downloads`` table.

    A row with the same key is replaced (``INSERT OR REPLACE``). The
    connection is opened for this call only and is always closed again.

    Args:
        result: Outcome to store. The URL, save path and metadata come from
            ``result.item``; hash and size come from ``result`` itself.
            Non-empty metadata is stored as a JSON string, otherwise NULL.
    """
    item = result.item

    serialized_metadata = None
    if item.metadata:
        import json
        serialized_metadata = json.dumps(item.metadata)

    row = (
        item.url,
        str(item.save_path),
        result.file_hash,
        result.file_size,
        serialized_metadata,
    )

    connection = sqlite3.connect(self.db_path)
    try:
        connection.cursor().execute('''
            INSERT OR REPLACE INTO downloads
            (url, file_path, file_hash, file_size, metadata)
            VALUES (?, ?, ?, ?, ?)
        ''', row)
        connection.commit()
    finally:
        connection.close()
def download_batch(self, items: List[DownloadItem],
                   progress_callback: Optional[Callable] = None) -> List[DownloadResult]:
    """Download multiple items concurrently.

    Each item is handed to ``_download_worker`` on a thread pool of
    ``self.max_workers`` threads. A worker that raises no longer aborts the
    batch: the exception is logged and turned into a failed result for that
    item.

    Args:
        items: List of DownloadItem objects.
        progress_callback: Optional callable invoked after every finished
            item as ``progress_callback(completed, total, result)``.

    Returns:
        List of DownloadResult objects in completion order, which is not
        necessarily the order of ``items``.
    """
    self.stats['total'] = len(items)
    results = []

    self.log(f"Starting batch download of {len(items)} items with {self.max_workers} workers", "info")

    with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
        # Submit all downloads, remembering which item each future belongs to
        futures = {
            executor.submit(self._download_worker, item, i % self.max_workers): item
            for i, item in enumerate(items)
        }

        # Process completed downloads
        completed = 0
        for future in as_completed(futures):
            try:
                result = future.result()
            except Exception as e:
                # The worker raised before it could update the statistics,
                # so record the failure here.
                failed_item = futures[future]
                self.log(f"Download worker crashed for {failed_item.url}: {e}", "error")
                result = DownloadResult(success=False, item=failed_item, error=str(e))
                with self.download_lock:
                    self.stats['failed'] += 1

            results.append(result)
            completed += 1

            # Progress update
            if progress_callback:
                progress_callback(completed, len(items), result)

            if self.show_progress:
                pct = (completed / len(items)) * 100
                # NOTE(review): both markers are empty strings in this copy of
                # the file; presumably they were symbols lost in transit - confirm.
                status = "" if result.success else ""
                self.log(
                    f"[{completed}/{len(items)}] {pct:.1f}% - {status} {result.item.save_path.name}",
                    "success" if result.success else "error"
                )

    # Summary
    self.log(f"Batch complete: {self.stats['successful']} successful, {self.stats['failed']} failed", "info")
    # Guard the division: total_time can still be 0 when successes exist.
    if self.stats['successful'] > 0 and self.stats['total_time'] > 0:
        avg_speed = self.stats['total_bytes'] / self.stats['total_time'] / 1024 / 1024
        self.log(f"Average speed: {avg_speed:.2f} MB/s", "info")

    return results
def download_urls(self, urls: List[str], base_path: Path,
                  referer: Optional[str] = None,
                  metadata: Optional[Dict] = None) -> List[DownloadResult]:
    """Convenience method to download URLs into one directory.

    The file name is the last path segment of each URL, or
    ``download_<8 hex chars of the URL's SHA256>`` when the path has none.
    When two *different* URLs would get the same file name, the later one
    gets the URL hash inserted before the extension so they do not
    overwrite each other.

    Args:
        urls: List of URLs to download.
        base_path: Directory to save files in (``Path`` or ``str``).
        referer: Optional referer header used for every item.
        metadata: Optional metadata attached to every item. The same object
            is shared by all items.

    Returns:
        List of DownloadResult objects, as returned by ``download_batch``.
    """
    from pathlib import Path as _Path

    target_dir = _Path(base_path)
    items = []
    first_owner = {}  # file name -> first URL that claimed it

    for url in urls:
        url_digest = hashlib.sha256(url.encode()).hexdigest()[:8]
        filename = os.path.basename(urlparse(url).path) or f"download_{url_digest}"

        if first_owner.setdefault(filename, url) != url:
            # Name already taken by another URL in this batch
            stem, suffix = os.path.splitext(filename)
            filename = f"{stem}_{url_digest}{suffix}"

        items.append(DownloadItem(
            url=url,
            save_path=target_dir / filename,
            referer=referer,
            metadata=metadata
        ))

    return self.download_batch(items)
def get_statistics(self) -> Dict:
    """Return a shallow copy of the download counters.

    Returns:
        A new mapping with the contents of ``self.stats``; adding or
        replacing keys in it does not affect the manager's own counters.
    """
    snapshot = self.stats.copy()
    return snapshot
def cleanup_old_downloads(self, days: int = 30):
    """Delete download records older than ``days`` days.

    Does nothing and returns 0 when database use is disabled. Otherwise
    rows whose ``download_date`` is earlier than "now minus ``days`` days"
    are removed and the number of removed rows is logged.

    Args:
        days: Age threshold in days.

    Returns:
        Number of deleted rows (0 when the database is disabled).
    """
    if not self.use_database:
        return 0

    connection = sqlite3.connect(self.db_path)
    try:
        # SQLite concatenates the bound number with ' days', giving a
        # modifier such as '-30 days'.
        statement = connection.cursor()
        statement.execute('''
            DELETE FROM downloads
            WHERE download_date < datetime('now', ? || ' days')
        ''', (-days,))
        removed = statement.rowcount
        connection.commit()
    finally:
        connection.close()

    self.log(f"Cleaned up {removed} old download records", "info")
    return removed
# Example usage
if __name__ == "__main__":
    from pathlib import Path

    # Manual smoke test: fetch a few public sample files into /tmp and
    # report the totals through the module logger.
    manager = DownloadManager(max_workers=3, rate_limit=0.5, show_progress=True)

    sample_urls = [
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
        "https://sample-videos.com/img/Sample-jpg-image-50kb.jpg",
        "https://www.w3schools.com/html/img_girl.jpg",
    ]

    outcomes = manager.download_urls(sample_urls, Path("/tmp/test-downloads"))

    succeeded = sum(1 for outcome in outcomes if outcome.success)
    logger.info(f"Downloaded {succeeded} of {len(outcomes)} files")
    logger.info(f"Total bytes: {manager.stats['total_bytes'] / 1024:.1f} KB")
    logger.info(f"Total time: {manager.stats['total_time']:.2f} seconds")

View File

@@ -0,0 +1,375 @@
#!/usr/bin/env python3
"""
Downloader Monitoring Module
Tracks download success/failure and sends alerts when downloaders are consistently failing
"""
import sqlite3
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, List
from modules.universal_logger import get_logger
class DownloaderMonitor:
    """Monitor downloader health and send alerts on persistent failures.

    Every download attempt is written to the ``download_monitor`` table.
    After a failed attempt the recent history for that downloader/username
    pair is inspected, and a Pushover alert is sent once the number of
    consecutive failures reaches the configured threshold.
    """

    def __init__(self, unified_db=None, settings_manager=None):
        """
        Initialize monitor

        Args:
            unified_db: UnifiedDatabase instance
            settings_manager: SettingsManager instance for config
        """
        self.db = unified_db
        self.settings_manager = settings_manager
        self.logger = get_logger('DownloaderMonitor')

        # Default config
        self.config = {
            'enabled': True,
            'failure_window_hours': 3,
            'min_consecutive_failures': 2,
            'pushover': {
                'enabled': True,
                'priority': 1  # High priority
            },
            'downloaders': {
                'fastdl': True,
                'imginn': True,
                'toolzu': True,
                'instagram': True,
                'snapchat': True,
                'tiktok': True,
                'forums': True
            }
        }

        # Load config from settings manager. Top-level keys found under
        # 'monitoring' replace the defaults above.
        if self.settings_manager:
            try:
                monitoring_config = self.settings_manager.get('monitoring', {})
                if monitoring_config:
                    self.config.update(monitoring_config)
            except Exception as e:
                self.logger.warning(f"Could not load monitoring config: {e}")

    def log_download_attempt(self, downloader: str, username: str, success: bool,
                             file_count: int = 0, error_message: Optional[str] = None):
        """
        Log a download attempt and, after a failure, check whether to alert.

        Nothing is recorded when monitoring is disabled, either globally or
        for this downloader. Errors are logged, never raised.

        Args:
            downloader: Downloader name (fastdl, imginn, toolzu, etc.)
            username: Username being downloaded
            success: Whether download succeeded
            file_count: Number of files downloaded
            error_message: Error message if failed
        """
        if not self.config.get('enabled', True):
            return

        # Check if this downloader is being monitored
        if not self.config.get('downloaders', {}).get(downloader, True):
            return

        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    INSERT INTO download_monitor
                    (downloader, username, timestamp, success, file_count, error_message, alert_sent)
                    VALUES (?, ?, ?, ?, ?, ?, 0)
                """, (
                    downloader,
                    username,
                    datetime.now().isoformat(),
                    1 if success else 0,
                    file_count,
                    error_message
                ))
                conn.commit()

            self.logger.debug(f"Logged {downloader}/{username}: {'success' if success else 'failure'} ({file_count} files)")

            # Check if we should send an alert
            if not success:
                self._check_and_alert(downloader, username)
        except Exception as e:
            self.logger.error(f"Failed to log download attempt: {e}")

    def _check_and_alert(self, downloader: str, username: str):
        """
        Check if downloader has been failing consistently and send alert.

        Looks at up to 10 attempts inside the failure window, newest first,
        and counts failures until the first success. An alert goes out when
        that count reaches ``min_consecutive_failures`` and no row in the
        window is already marked as alerted. The newest row is marked as
        alerted only when the notification was actually sent, so a disabled
        or failed notification does not silence later alerts.

        Args:
            downloader: Downloader name
            username: Username
        """
        try:
            window_hours = self.config.get('failure_window_hours', 3)
            min_failures = self.config.get('min_consecutive_failures', 2)
            cutoff_time = datetime.now() - timedelta(hours=window_hours)

            with self.db.get_connection() as conn:
                cursor = conn.cursor()

                # Get recent attempts within the window
                cursor.execute("""
                    SELECT timestamp, success, file_count, error_message, alert_sent
                    FROM download_monitor
                    WHERE downloader = ? AND username = ?
                    AND timestamp > ?
                    ORDER BY timestamp DESC
                    LIMIT 10
                """, (downloader, username, cutoff_time.isoformat()))
                attempts = cursor.fetchall()
                if not attempts:
                    return

                # Count consecutive failures from most recent
                consecutive_failures = 0
                latest_error = None
                last_success_time = None
                for attempt in attempts:
                    if attempt['success'] == 0:
                        consecutive_failures += 1
                        if latest_error is None and attempt['error_message']:
                            latest_error = attempt['error_message']
                    else:
                        last_success_time = attempt['timestamp']
                        break

                if consecutive_failures < min_failures:
                    return

                # Check if we already sent an alert recently
                cursor.execute("""
                    SELECT COUNT(*) FROM download_monitor
                    WHERE downloader = ? AND username = ?
                    AND alert_sent = 1
                    AND timestamp > ?
                """, (downloader, username, cutoff_time.isoformat()))
                result = cursor.fetchone()
                alert_count = result[0] if result else 0
                if alert_count != 0:
                    return

                if last_success_time is None:
                    # No success inside the window: look through the whole
                    # history so the alert does not claim "Never" when an
                    # older success exists.
                    cursor.execute("""
                        SELECT MAX(timestamp) FROM download_monitor
                        WHERE downloader = ? AND username = ?
                        AND success = 1
                    """, (downloader, username))
                    row = cursor.fetchone()
                    last_success_time = row[0] if row else None

                sent = self._send_alert(
                    downloader,
                    username,
                    consecutive_failures,
                    last_success_time,
                    latest_error
                )

                if sent:
                    # Mark most recent failure as alerted
                    cursor.execute("""
                        UPDATE download_monitor
                        SET alert_sent = 1
                        WHERE id = (
                            SELECT id FROM download_monitor
                            WHERE downloader = ? AND username = ?
                            ORDER BY timestamp DESC
                            LIMIT 1
                        )
                    """, (downloader, username))
                    conn.commit()
        except Exception as e:
            self.logger.error(f"Failed to check for alerts: {e}")

    def _send_alert(self, downloader: str, username: str, failure_count: int,
                    last_success_time: Optional[str], error_message: Optional[str]) -> bool:
        """
        Send Pushover alert for persistent failures.

        Args:
            downloader: Downloader name
            username: Username
            failure_count: Number of consecutive failures
            last_success_time: Timestamp of last success (ISO format), or
                None when no success is known
            error_message: Latest error message, or None

        Returns:
            True when the notifier was called without raising. False when
            Pushover is disabled in the monitoring or global settings, or
            when sending failed.
        """
        if not self.config.get('pushover', {}).get('enabled', True):
            return False

        try:
            from modules.pushover_notifier import PushoverNotifier

            # Get pushover config from settings
            pushover_config = {}
            if self.settings_manager:
                pushover_config = self.settings_manager.get('pushover', {})
            if not pushover_config.get('enabled'):
                return False

            notifier = PushoverNotifier(
                api_token=pushover_config.get('api_token'),
                user_key=pushover_config.get('user_key')
            )

            # Calculate time since last success
            time_since_success = "Never"
            if last_success_time:
                try:
                    last_success = datetime.fromisoformat(last_success_time)
                    delta = datetime.now() - last_success
                    hours = int(delta.total_seconds() / 3600)
                    if hours < 24:
                        time_since_success = f"{hours} hours ago"
                    else:
                        days = hours // 24
                        time_since_success = f"{days} days ago"
                except (ValueError, TypeError) as e:
                    self.logger.warning(f"Failed to parse last_success_time '{last_success_time}': {e}")
                    time_since_success = "Unknown (parse error)"

            # Format downloader name nicely
            downloader_display = downloader.replace('_', ' ').title()

            # Build message
            title = f"🚨 {downloader_display} Failing"
            message = "\n".join([
                f"Downloader has been failing for {self.config.get('failure_window_hours', 3)}+ hours",
                f"Username: {username}",
                f"Consecutive Failures: {failure_count}",
                f"Last Success: {time_since_success}",
                f"Latest Error: {error_message or 'Unknown'}",
                "Check logs for details.",
            ])

            # Send notification with the configured priority (default: high)
            notifier.send_notification(
                title=title,
                message=message,
                priority=self.config.get('pushover', {}).get('priority', 1)
            )
            self.logger.warning(f"Sent alert for {downloader}/{username} ({failure_count} failures)")
            return True
        except Exception as e:
            self.logger.error(f"Failed to send alert: {e}")
            return False

    def get_downloader_status(self, downloader: Optional[str] = None, hours: int = 24) -> List[Dict]:
        """
        Get recent status for downloader(s).

        Args:
            downloader: Specific downloader (None = all)
            hours: How many hours to look back

        Returns:
            List of status dicts with stats per downloader, ordered by
            downloader name. An empty list is returned when the query fails.
        """
        try:
            cutoff = datetime.now() - timedelta(hours=hours)

            # One query for both cases; only the WHERE clause differs. The
            # fragments are fixed strings and all values are bound parameters.
            conditions = ["timestamp > ?"]
            params = [cutoff.isoformat()]
            if downloader:
                conditions.insert(0, "downloader = ?")
                params.insert(0, downloader)

            query = f"""
                SELECT
                    downloader,
                    COUNT(*) as total_attempts,
                    SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful,
                    SUM(CASE WHEN success = 0 THEN 1 ELSE 0 END) as failed,
                    SUM(file_count) as total_files,
                    MAX(CASE WHEN success = 1 THEN timestamp END) as last_success,
                    MAX(timestamp) as last_attempt
                FROM download_monitor
                WHERE {' AND '.join(conditions)}
                GROUP BY downloader
                ORDER BY downloader
            """

            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(query, params)

                results = []
                for row in cursor.fetchall():
                    total_attempts = row['total_attempts']
                    successful = row['successful'] or 0
                    results.append({
                        'downloader': row['downloader'],
                        'total_attempts': total_attempts,
                        'successful': successful,
                        'failed': row['failed'] or 0,
                        'total_files': row['total_files'] or 0,
                        'success_rate': round(successful / total_attempts * 100, 1) if total_attempts > 0 else 0,
                        'last_success': row['last_success'],
                        'last_attempt': row['last_attempt']
                    })
                return results
        except Exception as e:
            self.logger.error(f"Failed to get downloader status: {e}")
            return []

    def clear_old_logs(self, days: int = 30):
        """
        Clear monitoring logs older than specified days.

        Errors are logged, never raised.

        Args:
            days: How many days to keep
        """
        try:
            cutoff = datetime.now() - timedelta(days=days)
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    DELETE FROM download_monitor
                    WHERE timestamp < ?
                """, (cutoff.isoformat(),))
                deleted = cursor.rowcount
                conn.commit()
            self.logger.info(f"Cleared {deleted} old monitoring logs (older than {days} days)")
        except Exception as e:
            self.logger.error(f"Failed to clear old logs: {e}")
# Process-wide monitor instance and the lock that guards its creation
_monitor_instance = None
_monitor_instance_lock = __import__('threading').Lock()


def get_monitor(unified_db=None, settings_manager=None):
    """Return the shared DownloaderMonitor, creating it on first use.

    The arguments only matter for the call that creates the instance;
    later calls return the existing monitor and ignore them.

    Args:
        unified_db: Database to use. When None, a new UnifiedDatabase is
            created.
        settings_manager: Settings source. When None, a SettingsManager for
            the default database file is created.

    Returns:
        The single DownloaderMonitor instance for this process.
    """
    global _monitor_instance

    # Fast path: already built, no lock needed
    if _monitor_instance is not None:
        return _monitor_instance

    with _monitor_instance_lock:
        # Re-check under the lock: another thread may have built the
        # instance while this one was waiting.
        if _monitor_instance is None:
            database = unified_db
            if database is None:
                from modules.unified_database import UnifiedDatabase
                database = UnifiedDatabase()

            settings = settings_manager
            if settings is None:
                from modules.settings_manager import SettingsManager
                settings = SettingsManager('/opt/media-downloader/database/media_downloader.db')

            _monitor_instance = DownloaderMonitor(database, settings)

    return _monitor_instance

502
modules/easynews_client.py Normal file
View File

@@ -0,0 +1,502 @@
"""
Easynews Client Module
Provides a client for interacting with the Easynews API to search for and download files.
All connections use HTTPS with HTTP Basic Auth.
"""
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import quote, urljoin
import requests
from requests.auth import HTTPBasicAuth
from modules.universal_logger import get_logger
logger = get_logger('EasynewsClient')
@dataclass
class EasynewsResult:
    """One file returned by an Easynews search."""

    filename: str
    download_url: str
    size_bytes: int
    post_date: Optional[str]
    subject: Optional[str]
    poster: Optional[str]
    newsgroup: Optional[str]
    extension: Optional[str]

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict keyed by field name.

        Keys appear in field declaration order and values are not copied.
        """
        exported = (
            'filename',
            'download_url',
            'size_bytes',
            'post_date',
            'subject',
            'poster',
            'newsgroup',
            'extension',
        )
        return {field_name: getattr(self, field_name) for field_name in exported}
class EasynewsClient:
"""
Client for interacting with Easynews search and download APIs.
All connections use HTTPS with HTTP Basic Auth.
Supports HTTP, HTTPS, SOCKS4, and SOCKS5 proxies.
"""
# Root of the members site: test_connection() requests "<BASE_URL>/" and
# relative download URLs are joined onto it.
BASE_URL = "https://members.easynews.com"
# Endpoint queried by search().
SEARCH_URL = "https://members.easynews.com/2.0/search/solr-search/advanced"
# Quality patterns for parsing
# Each entry is a (regex, label) pair; alternatives in one regex share a label.
# NOTE(review): the code that applies these tables is not visible here.
# Presumably the first matching entry wins - confirm. If the regexes are
# applied unanchored, short alternatives such as 'hd' or 'sd' would also
# match inside longer tokens (e.g. 'hdtv'); verify that is intended.
QUALITY_PATTERNS = [
(r'2160p|4k|uhd', '2160p'),
(r'1080p|fhd', '1080p'),
(r'720p|hd', '720p'),
(r'480p|sd', '480p'),
(r'360p', '360p'),
]
# Audio codec patterns (order matters - check combinations first)
# More specific codecs are listed before the shorter patterns they contain
# (e.g. the 'dts-hd' forms before plain 'dts').
AUDIO_PATTERNS = [
(r'truehd.*atmos|atmos.*truehd', 'Atmos'),
(r'atmos', 'Atmos'),
(r'truehd', 'TrueHD'),
(r'dts[\.\-]?hd[\.\-]?ma', 'DTS-HD'),
(r'dts[\.\-]?hd', 'DTS-HD'),
(r'dts[\.\-]?x', 'DTS:X'),
(r'dts', 'DTS'),
(r'7[\.\-]?1', '7.1'),
(r'ddp[\.\-\s]?5[\.\-]?1|eac3|e[\.\-]?ac[\.\-]?3|dd[\.\-]?5[\.\-]?1|ac3|5[\.\-]?1', '5.1'),
(r'ddp|dd\+', '5.1'),
(r'aac[\.\-]?5[\.\-]?1', '5.1'),
(r'aac', 'AAC'),
(r'flac', 'FLAC'),
(r'mp3', 'MP3'),
]
# Source/release type patterns
# Same (regex, label) layout; specific sources ('web-dl', 'webrip') come
# before the generic 'web' entry.
SOURCE_PATTERNS = [
(r'remux', 'Remux'),
(r'blu[\.\-]?ray|bdrip|brrip', 'BluRay'),
(r'web[\.\-]?dl', 'WEB-DL'),
(r'webrip', 'WEBRip'),
(r'web', 'WEB'),
(r'hdtv', 'HDTV'),
(r'dvdrip', 'DVDRip'),
(r'dvd', 'DVD'),
(r'hdcam|cam', 'CAM'),
]
def __init__(
    self,
    username: str,
    password: str,
    proxy_enabled: bool = False,
    proxy_type: str = 'http',
    proxy_host: Optional[str] = None,
    proxy_port: Optional[int] = None,
    proxy_username: Optional[str] = None,
    proxy_password: Optional[str] = None,
):
    """
    Create a client with an authenticated session and optional proxy.

    The credentials are stored and attached to a ``requests.Session`` as
    HTTP Basic Auth. A proxy is configured only when ``proxy_enabled`` is
    true and both host and port are given; otherwise ``self.proxies``
    stays empty.

    Args:
        username: Easynews username
        password: Easynews password
        proxy_enabled: Whether to use a proxy
        proxy_type: Proxy type (http, https, socks4, socks5)
        proxy_host: Proxy hostname/IP
        proxy_port: Proxy port
        proxy_username: Proxy auth username (optional)
        proxy_password: Proxy auth password (optional)
    """
    self.username = username
    self.password = password
    self.auth = HTTPBasicAuth(username, password)

    # One shared session so every request carries the credentials
    session = requests.Session()
    session.auth = self.auth
    self.session = session

    self.proxies = {}
    wants_proxy = proxy_enabled and proxy_host and proxy_port
    if not wants_proxy:
        return

    # The same proxy URL handles both plain and TLS traffic
    proxy_url = self._build_proxy_url(
        proxy_type, proxy_host, proxy_port,
        proxy_username, proxy_password
    )
    self.proxies = dict.fromkeys(('http', 'https'), proxy_url)
    self.session.proxies.update(self.proxies)
    logger.info(f"Easynews client configured with {proxy_type} proxy: {proxy_host}:{proxy_port}")
def _build_proxy_url(
self,
proxy_type: str,
host: str,
port: int,
username: Optional[str] = None,
password: Optional[str] = None,
) -> str:
"""Build a proxy URL with optional authentication."""
scheme = proxy_type.lower()
if scheme not in ('http', 'https', 'socks4', 'socks5'):
scheme = 'http'
if username and password:
return f"{scheme}://{quote(username)}:{quote(password)}@{host}:{port}"
return f"{scheme}://{host}:{port}"
def test_connection(self) -> Dict[str, Any]:
    """
    Check that Easynews is reachable with the current credentials.

    Requests the members root page. HTTP 200 counts as success unless the
    final URL contains "login" or the page text contains "sign in". Any
    other status, and every request error, is reported as a failure.

    Returns:
        Dict with 'success' bool and 'message' string
    """
    def report(ok: bool, text: str) -> Dict[str, Any]:
        return {'success': ok, 'message': text}

    try:
        # Try to access the members area
        response = self.session.get(f"{self.BASE_URL}/", timeout=30)
        code = response.status_code

        if code == 401:
            return report(False, 'Invalid credentials - authentication failed')
        if code != 200:
            return report(False, f'Unexpected response: HTTP {code}')

        # A 200 can still be the login page we were redirected to
        sent_to_login = 'login' in response.url.lower() or 'sign in' in response.text.lower()
        if sent_to_login:
            return report(False, 'Invalid credentials - authentication failed')
        return report(True, 'Successfully connected to Easynews')
    except requests.exceptions.ProxyError as e:
        return report(False, f'Proxy connection failed: {str(e)}')
    except requests.exceptions.ConnectionError as e:
        return report(False, f'Connection failed: {str(e)}')
    except requests.exceptions.Timeout:
        return report(False, 'Connection timed out')
    except Exception as e:
        logger.error(f"Easynews connection test failed: {e}")
        return report(False, f'Connection test failed: {str(e)}')
def search(
    self,
    query: str,
    page: int = 1,
    results_per_page: int = 50,
    file_types: Optional[List[str]] = None,
) -> List[EasynewsResult]:
    """
    Search Easynews for files matching the query.

    Failures never raise: request errors, empty bodies, invalid JSON and
    parsing errors are logged and produce an empty list.

    Args:
        query: Search query string
        page: Page number (1-indexed)
        results_per_page: Number of results per page (capped at 250)
        file_types: Optional list sent as the ``fty[]`` parameter; when
            omitted, ``['VIDEO']`` is sent.

    Returns:
        List of EasynewsResult objects
    """
    try:
        # Parameter meanings are the original author's annotations; they are
        # not verifiable from this file.
        # NOTE(review): 'fex' stays 'mkv,mp4' even when file_types is given -
        # confirm that is intended.
        params = {
            'gps': query,
            'pby': min(results_per_page, 250),
            'pno': page,
            'sS': 1,  # Safe search off
            'saession': '',  # Session
            'sb': 1,  # Sort by date
            'sbj': 1,  # Subject search
            'fly': 2,  # File type filter mode
            'fex': 'mkv,mp4',  # Only mkv and mp4 files
        }
        # Caller-supplied filter, or the video default
        params['fty[]'] = file_types if file_types else ['VIDEO']

        response = self.session.get(self.SEARCH_URL, params=params, timeout=60)
        response.raise_for_status()

        # Check for empty response
        body = response.content
        if not body or not body.strip():
            logger.warning(f"Easynews search for '{query}' returned empty response (HTTP {response.status_code})")
            return []

        try:
            data = response.json()
        except Exception as json_err:
            logger.warning(f"Easynews search for '{query}' returned invalid JSON (HTTP {response.status_code}, body: {response.text[:200]}): {json_err}")
            return []

        # Only a list under 'data' is parsed; entries that fail to parse
        # are dropped.
        results = []
        if 'data' in data and isinstance(data['data'], list):
            candidates = (self._parse_search_result(entry) for entry in data['data'])
            results = [candidate for candidate in candidates if candidate]

        logger.info(f"Easynews search for '{query}' returned {len(results)} results")
        return results
    except requests.exceptions.RequestException as e:
        logger.error(f"Easynews search failed: {e}")
        return []
    except Exception as e:
        logger.error(f"Error parsing Easynews search results: {e}")
        return []
def _parse_search_result(self, item: Dict[str, Any]) -> Optional[EasynewsResult]:
    """
    Parse a single search result from the API response.

    Args:
        item: One entry of the response's ``data`` list

    Returns:
        An EasynewsResult, or None when the entry has no filename, no usable
        download URL, or parsing raises (the error is logged at debug level).
    """
    try:
        # NOTE(review): filename and hash both fall back to key '0'; confirm
        # against the API which positional key really carries the filename.
        filename = item.get('fn', '') or item.get('0', '')
        if not filename:
            return None

        # Build download URL
        # Format: https://username:password@members.easynews.com/dl/{hash}/{filename}
        file_hash = item.get('hash', '') or item.get('0', '')
        sig = item.get('sig', '')
        if file_hash and sig:
            download_path = f"/dl/{file_hash}/{quote(filename)}?sig={sig}"
            # safe='' so that '/' (left unescaped by quote()'s default) cannot
            # terminate the userinfo part early and corrupt the URL.
            # NOTE(review): credentials embedded in the URL can leak into logs.
            userinfo = f"{quote(self.username, safe='')}:{quote(self.password, safe='')}"
            download_url = f"https://{userinfo}@members.easynews.com{download_path}"
        else:
            # Fallback to whatever URL the API supplied, made absolute if needed
            download_url = item.get('url', '') or item.get('rawURL', '')
            if download_url and not download_url.startswith('http'):
                download_url = urljoin(self.BASE_URL, download_url)
        if not download_url:
            return None

        # Size may arrive as a number or as a human-readable string
        size_bytes = 0
        size_value = item.get('rawSize', '') or item.get('size', '')
        if isinstance(size_value, (int, float)):
            size_bytes = int(size_value)
        elif isinstance(size_value, str):
            size_bytes = self._parse_size(size_value)

        # Date is passed through as-is; string values are only trimmed
        post_date = item.get('date', '') or item.get('d', '')
        if isinstance(post_date, str):
            post_date = post_date.strip()

        # Prefer the extension reported by the API over parsing the filename
        extension = item.get('extension', '') or item.get('11', '') or item.get('2', '')
        if extension and not extension.startswith('.'):
            extension = '.' + extension

        # NOTE(review): the API branch yields a dotted extension ('.mkv') while
        # the _get_extension fallback yields an undotted one ('mkv'); confirm
        # which form consumers expect.
        return EasynewsResult(
            filename=filename,
            download_url=download_url,
            size_bytes=size_bytes,
            post_date=post_date if post_date else None,
            subject=item.get('subject', '') or item.get('s', ''),
            poster=item.get('poster', '') or item.get('p', ''),
            newsgroup=item.get('newsgroup', '') or item.get('ng', ''),
            extension=extension if extension else self._get_extension(filename),
        )
    except Exception as e:
        logger.debug(f"Failed to parse search result: {e}")
        return None
def _parse_size(self, size_str: str) -> int:
"""Parse a size string like '1.5 GB' to bytes."""
try:
size_str = size_str.strip().upper()
multipliers = {
'B': 1,
'KB': 1024,
'MB': 1024 ** 2,
'GB': 1024 ** 3,
'TB': 1024 ** 4,
}
for suffix, multiplier in multipliers.items():
if size_str.endswith(suffix):
value = float(size_str[:-len(suffix)].strip())
return int(value * multiplier)
# Try to parse as plain number
return int(float(size_str))
except Exception:
return 0
def _get_extension(self, filename: str) -> Optional[str]:
"""Extract file extension from filename."""
if '.' in filename:
return filename.rsplit('.', 1)[-1].lower()
return None
@staticmethod
def detect_quality(filename: str) -> Optional[str]:
    """Return the quality label of the first QUALITY_PATTERNS regex found in the lower-cased filename, else None."""
    lowered = filename.lower()
    return next(
        (label for regex, label in EasynewsClient.QUALITY_PATTERNS if re.search(regex, lowered)),
        None,
    )
@staticmethod
def detect_audio(filename: str) -> Optional[str]:
    """Return the audio label of the first AUDIO_PATTERNS regex found in the lower-cased filename, else None."""
    lowered = filename.lower()
    return next(
        (label for regex, label in EasynewsClient.AUDIO_PATTERNS if re.search(regex, lowered)),
        None,
    )
@staticmethod
def detect_source(filename: str) -> Optional[str]:
    """Return the source/release label of the first SOURCE_PATTERNS regex found in the lower-cased filename, else None."""
    lowered = filename.lower()
    return next(
        (label for regex, label in EasynewsClient.SOURCE_PATTERNS if re.search(regex, lowered)),
        None,
    )
def download_file(
    self,
    url: str,
    dest_path: str,
    progress_callback: Optional[Callable[[int, int], None]] = None,
    chunk_size: int = 8192,
) -> Dict[str, Any]:
    """
    Download a file from Easynews.

    The body is streamed to ``dest_path`` in chunks. Errors are logged and
    reported in the returned dict rather than raised.

    Args:
        url: Download URL (with authentication embedded or using session)
        dest_path: Destination file path
        progress_callback: Optional callback(downloaded_bytes, total_bytes);
            total_bytes is 0 when the server sends no Content-Length
        chunk_size: Download chunk size in bytes

    Returns:
        On success: {'success': True, 'path': dest_path, 'size': bytes_written}
        On failure: {'success': False, 'message': <description>}
    """
    # NOTE(review): a partially written file is left at dest_path on failure;
    # confirm whether callers expect to clean it up or resume from it.
    try:
        # Start the download with streaming
        response = self.session.get(
            url,
            stream=True,
            timeout=30,
        )
        try:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0
            with open(dest_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if progress_callback:
                            progress_callback(downloaded, total_size)
        finally:
            # A streamed response holds its connection until it is fully
            # consumed or closed; always release it, including on errors.
            response.close()

        logger.info(f"Downloaded file to {dest_path} ({downloaded} bytes)")
        return {
            'success': True,
            'path': dest_path,
            'size': downloaded,
        }
    except requests.exceptions.RequestException as e:
        logger.error(f"Download failed: {e}")
        return {
            'success': False,
            'message': f'Download failed: {str(e)}'
        }
    except IOError as e:
        logger.error(f"Failed to write file: {e}")
        return {
            'success': False,
            'message': f'Failed to write file: {str(e)}'
        }
    except Exception as e:
        logger.error(f"Unexpected error during download: {e}")
        return {
            'success': False,
            'message': f'Download error: {str(e)}'
        }
def get_file_info(self, url: str) -> Dict[str, Any]:
    """
    Fetch a file's metadata with a HEAD request, without downloading it.

    Args:
        url: File URL

    Returns:
        On success: dict with 'success' (True), 'size' (Content-Length as int,
        0 if absent), 'content_type' and 'last_modified' (empty string if absent).
        On any error: {'success': False, 'message': <error text>}.
    """
    # NOTE(review): if self.session is a requests.Session, head() does not
    # follow redirects by default, so a 3xx answer would report the
    # redirect's own headers - confirm whether that is intended.
    try:
        response = self.session.head(url, timeout=30)
        response.raise_for_status()
        headers = response.headers
        info = {'success': True}
        info['size'] = int(headers.get('content-length', 0))
        info['content_type'] = headers.get('content-type', '')
        info['last_modified'] = headers.get('last-modified', '')
        return info
    except Exception as e:
        logger.error(f"Failed to get file info: {e}")
        return {
            'success': False,
            'message': str(e)
        }

1650
modules/easynews_monitor.py Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

3776
modules/fastdl_module.py Executable file

File diff suppressed because it is too large Load Diff

382
modules/filename_parser.py Normal file
View File

@@ -0,0 +1,382 @@
#!/usr/bin/env python3
"""
Filename Parser Module for Manual Import
Parses filenames based on configurable patterns to extract metadata
"""
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
class FilenameParser:
    """
    Parse filenames using configurable patterns to extract metadata.

    Supported pattern tokens:
    - {username} - Username/source (alphanumeric, underscores, periods)
    - {YYYYMMDD} - Date as 8 digits (20251127)
    - {HHMMSS} - Time as 6 digits (172753)
    - {YYYYMMDD_HHMMSS} - Combined date_time with underscore
    - {id} - Media ID (any characters until next separator)
    - {description} - Text content (any characters until next separator)
    - {num} - Sequence number (digits)
    - {ext} - File extension (optional, auto-handled)

    Example patterns:
    - Instagram Stories: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - Instagram Posts: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - TikTok: "{YYYYMMDD}_{description}_{id}_{num}"
    """

    # Token definitions: token_name -> (regex_pattern, is_greedy)
    # Tokens flagged is_greedy are compiled as a lazy '.+?' so that they stop
    # at the next literal separator in the pattern.
    TOKEN_PATTERNS = {
        'username': (r'[a-zA-Z0-9_.]+', False),
        'YYYYMMDD': (r'\d{8}', False),
        'HHMMSS': (r'\d{6}', False),
        'YYYYMMDD_HHMMSS': (r'\d{8}_\d{6}', False),
        'id': (r'.+', True),  # Greedy - matches everything until separator
        'description': (r'.+', True),  # Greedy
        'num': (r'\d+', False),
        'ext': (r'\.[a-zA-Z0-9]+', False),
    }

    # Matches one "{token}" placeholder inside a pattern string
    _TOKEN_RE = re.compile(r'\{(\w+)\}')

    def __init__(self, pattern: str):
        """
        Initialize parser with a filename pattern.

        Args:
            pattern: Pattern string like "{username}-{YYYYMMDD}_{HHMMSS}-{id}"

        Raises:
            ValueError: If the pattern cannot be compiled to a regex
        """
        self.pattern = pattern
        self.regex, self.token_order = self._compile_pattern(pattern)

    def _compile_pattern(self, pattern: str) -> tuple:
        """
        Convert pattern string to compiled regex.

        The pattern is split into literal text and tokens; literal text is
        regex-escaped and each token becomes one capture group, so literal
        text can never be mistaken for a token. The regex is anchored at both
        ends and tolerates one trailing ".ext" after the pattern.

        Returns:
            Tuple of (compiled_regex, list_of_token_names)

        Raises:
            ValueError: If the resulting regex is invalid
        """
        # re.split with one capture group alternates literal, token, literal, ...
        pieces = self._TOKEN_RE.split(pattern)
        tokens = pieces[1::2]

        regex_parts = ['^']
        for index, piece in enumerate(pieces):
            if index % 2 == 0:
                # Literal text between tokens
                regex_parts.append(re.escape(piece))
                continue
            # Unknown tokens are treated as "any characters" (lazy)
            token_pattern, is_greedy = self.TOKEN_PATTERNS.get(piece, (r'.+?', True))
            if is_greedy:
                # Lazy match so the token stops at the next separator; the
                # trailing '$' anchor still makes the last token take the rest.
                token_pattern = r'.+?'
            regex_parts.append(f'({token_pattern})')

        # Allow (but do not capture) a file extension at the end
        regex_parts.append(r'(?:\.[a-zA-Z0-9]+)?$')

        try:
            compiled = re.compile(''.join(regex_parts))
        except re.error as e:
            raise ValueError(f"Invalid pattern '{pattern}': {e}") from e
        return compiled, tokens

    def parse(self, filename: str) -> Dict[str, Any]:
        """
        Parse a filename and extract metadata.

        The name without its extension is tried first, then the full filename.

        Args:
            filename: Filename to parse (with or without extension)

        Returns:
            Dictionary with extracted metadata:
            - username: str or None (lower-cased)
            - datetime: datetime object or None
            - media_id: str or None
            - description: str or None
            - num: int or None
            - extension: str or None (lower-cased, with leading dot)
            - valid: bool
            - error: str or None (if valid is False)
            - raw_values: dict of token name -> matched text
        """
        result = {
            'username': None,
            'datetime': None,
            'media_id': None,
            'description': None,
            'num': None,
            'extension': None,
            'valid': False,
            'error': None,
            'raw_values': {}
        }

        # Extract extension
        path = Path(filename)
        result['extension'] = path.suffix.lower() if path.suffix else None

        # NOTE(review): when the stem itself contains a dot (e.g. "id.part"),
        # the optional-extension group absorbs the text after that dot, so the
        # last token is captured without it - confirm this is acceptable.
        match = self.regex.match(path.stem) or self.regex.match(filename)
        if not match:
            result['error'] = f"Filename doesn't match pattern: {self.pattern}"
            return result

        # Capture groups are positional: one group per token, in pattern order
        for token, value in zip(self.token_order, match.groups()):
            result['raw_values'][token] = value
            # Map tokens to result fields
            if token == 'username':
                result['username'] = value.lower()
            elif token == 'id':
                result['media_id'] = value
            elif token == 'description':
                result['description'] = value
            elif token == 'num':
                try:
                    result['num'] = int(value)
                except ValueError:
                    result['num'] = value

        # Parse datetime from date/time tokens
        result['datetime'] = self._parse_datetime(result['raw_values'])
        result['valid'] = True
        return result

    def _parse_datetime(self, raw_values: Dict[str, str]) -> Optional[datetime]:
        """
        Parse datetime from extracted raw values.

        Supports:
        - YYYYMMDD_HHMMSS combined
        - YYYYMMDD + HHMMSS separate
        - YYYYMMDD only (time defaults to 00:00:00)

        Returns:
            A naive datetime, or None when no date token is present or the
            digits do not form a valid date/time.
        """
        try:
            if 'YYYYMMDD_HHMMSS' in raw_values:
                return datetime.strptime(raw_values['YYYYMMDD_HHMMSS'], '%Y%m%d_%H%M%S')
            if 'YYYYMMDD' in raw_values:
                date_str = raw_values['YYYYMMDD']
                if 'HHMMSS' in raw_values:
                    time_str = raw_values['HHMMSS']
                    return datetime.strptime(f'{date_str}_{time_str}', '%Y%m%d_%H%M%S')
                # Date only, no time
                return datetime.strptime(date_str, '%Y%m%d')
            return None
        except ValueError:
            return None

    def validate_pattern(self) -> tuple:
        """
        Validate the pattern is properly formed.

        Returns:
            Tuple of (is_valid: bool, error_message: str or None)
        """
        try:
            # Check for at least one token
            tokens = self._TOKEN_RE.findall(self.pattern)
            if not tokens:
                return False, "Pattern must contain at least one token"
            # Check all tokens are recognized
            unknown_tokens = [t for t in tokens if t not in self.TOKEN_PATTERNS]
            if unknown_tokens:
                return False, f"Unknown tokens: {', '.join(unknown_tokens)}"
            return True, None
        except Exception as e:
            return False, str(e)
def create_parser(pattern: str) -> FilenameParser:
    """
    Build a FilenameParser for the given pattern.

    Args:
        pattern: Pattern string, e.g. "{username}_{YYYYMMDD}_{HHMMSS}_{id}"

    Returns:
        A new FilenameParser compiled from ``pattern``
    """
    parser = FilenameParser(pattern)
    return parser
def parse_with_fallbacks(filename: str, patterns: List[str]) -> Dict[str, Any]:
    """
    Parse a filename against several patterns and return the first valid match.

    Args:
        filename: Filename to parse
        patterns: Pattern strings, tried in the order given

    Returns:
        The result dict of the first pattern that matches (same shape as
        FilenameParser.parse, plus a 'matched_pattern' key). If none match, a
        failure dict with valid=False and the most recent error message.
    """
    failure_reason = None
    for candidate in patterns:
        try:
            outcome = FilenameParser(candidate).parse(filename)
            if outcome['valid']:
                outcome['matched_pattern'] = candidate
                return outcome
            failure_reason = outcome.get('error')
        except Exception as e:
            # A pattern that fails to compile counts as a miss
            failure_reason = str(e)

    # Nothing matched: build the failure result, keeping the last error seen
    suffix = Path(filename).suffix
    failure = {
        'username': None,
        'datetime': None,
        'media_id': None,
        'description': None,
        'num': None,
        'extension': suffix.lower() if suffix else None,
        'valid': False,
        'error': failure_reason or f"Filename doesn't match any of {len(patterns)} patterns",
        'raw_values': {}
    }
    return failure
# Instagram has many filename formats from different download sources.
# Each entry is a FilenameParser pattern string; the list is referenced as
# 'alt_patterns' by the Instagram presets in PRESET_PATTERNS.
# NOTE(review): parse_with_fallbacks returns the FIRST pattern that matches, and
# {id} matches any trailing text while {username} accepts underscores. The
# "_n suffix" and "leading underscore" entries therefore look shadowed by the
# more general entries listed before them - confirm the intended order.
INSTAGRAM_PATTERNS = [
    # Standard gallery-dl formats
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}',  # gallery-dl default (underscores)
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}',  # alternative format (dashes around date)
    # Formats with _n suffix (common from some scrapers)
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}_n',  # with _n suffix
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}_n',  # dashes + _n suffix
    # Formats with hl=en language parameter (imginn/instaloader variants)
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n',  # language tag + _n suffix
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}',  # language tag, no _n suffix
    # Formats with leading underscore (some scrapers prefix underscore)
    '_{username}_{YYYYMMDD}_{HHMMSS}_{id}_n',  # leading underscore + _n suffix
    '_{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n',  # leading underscore + lang + _n
    # Formats with media shortcode before date (some browser extensions / save tools)
    '{username}-video-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-video-shortcode-date_hash
    '{username}-photo-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-photo-shortcode-date_hash
    '{username}-{id}-{YYYYMMDD}_{HHMMSS}_{description}',  # username-shortcode-date_hash (no type prefix, must be last)
]
# Predefined patterns for common platforms, keyed by preset id.
# Each preset carries:
#   name         - display name
#   pattern      - primary FilenameParser pattern string
#   alt_patterns - (Instagram presets only) additional patterns; all three
#                  Instagram presets reference the same INSTAGRAM_PATTERNS list
#   example      - sample filename for the pattern
#   platform / content_type - identifiers for the source and kind of media
#   use_ytdlp    - (YouTube only) flag; presumably tells the importer to use
#                  yt-dlp for metadata - TODO confirm against the consumer
PRESET_PATTERNS = {
    'instagram_stories': {
        'name': 'Instagram Stories',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_AQOGOcCUbrMy...',
        'platform': 'instagram',
        'content_type': 'stories'
    },
    'instagram_posts': {
        'name': 'Instagram Posts',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_18538674661006538',
        'platform': 'instagram',
        'content_type': 'posts'
    },
    'instagram_reels': {
        'name': 'Instagram Reels',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'alt_patterns': INSTAGRAM_PATTERNS,
        'example': 'evalongoria_20251127_172753_18538674661006538',
        'platform': 'instagram',
        'content_type': 'reels'
    },
    'tiktok_videos': {
        'name': 'TikTok Videos',
        'pattern': '{YYYYMMDD}_{description}_{id}_{num}',
        'example': '20251127_beautiful_sunset_1234567890_1',
        'platform': 'tiktok',
        'content_type': 'videos'
    },
    'snapchat_stories': {
        'name': 'Snapchat Stories',
        'pattern': '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        'example': 'username_20251127_172753_story123',
        'platform': 'snapchat',
        'content_type': 'stories'
    },
    'youtube_videos': {
        'name': 'YouTube Videos',
        'pattern': '{id}',
        'example': 'dQw4w9WgXcQ',
        'platform': 'youtube',
        'content_type': 'videos',
        'use_ytdlp': True
    }
}
def get_preset_patterns() -> Dict[str, Dict]:
    """
    Get all predefined filename patterns.

    Returns:
        An independent deep copy of PRESET_PATTERNS. A shallow copy would
        still share the nested preset dicts and the 'alt_patterns' list with
        the module-level constant, so callers editing a preset would silently
        change it for everyone else.
    """
    import copy  # local import: keeps this edit independent of the file header
    return copy.deepcopy(PRESET_PATTERNS)
# Test/demo: run this module directly to print the parse result for two
# sample Instagram filenames.
if __name__ == '__main__':
    demo_cases = [
        # Dash-separated story format
        (
            '{username}-{YYYYMMDD}_{HHMMSS}-{id}',
            'tiannahcgarcia-20251127_172753-AQOGOcCUbrMyAL0VXcQjnpHr6aY6U25C1SbaREqFJv7_MVXNVUvBd290MwlNFmwOTK5PuLx6DtK9cYoot0c5Y6a4vuDtOaug2heLank.jpg',
        ),
        # Instagram post format
        (
            '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
            'evalongoria_20251027_155842_18538674661006538.jpg',
        ),
    ]
    for position, (demo_pattern, demo_filename) in enumerate(demo_cases):
        if position:
            # Blank line between cases (none after the last one)
            print()
        demo_result = FilenameParser(demo_pattern).parse(demo_filename)
        print(f"Pattern: {demo_pattern}")
        print(f"Filename: {demo_filename}")
        print(f"Result: {demo_result}")

485
modules/forum_db_adapter.py Executable file
View File

@@ -0,0 +1,485 @@
#!/usr/bin/env python3
"""
Forum Database Adapter for Unified Database
Provides compatibility layer for forum_downloader to use UnifiedDatabase
"""
import sqlite3
import json
import hashlib
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
import time
from modules.universal_logger import get_logger
logger = get_logger('ForumAdapter')
class ForumDatabaseAdapter:
    """
    Adapter to allow forum_downloader to use UnifiedDatabase.

    Mimics the original forum database interface. All persistence goes
    through the wrapped ``unified_db`` object, which must provide
    ``get_connection()`` (a context manager yielding a DB-API connection),
    ``get_url_hash()``, ``is_downloaded()``, ``record_download()`` and
    ``get_download_by_file_hash()``.
    """

    def __init__(self, unified_db, db_path=None):
        """
        Initialize the adapter.

        Args:
            unified_db: UnifiedDatabase instance
            db_path: Ignored - kept for compatibility
        """
        self.unified_db = unified_db
        self.db_path = db_path  # Keep for compatibility but not used

    def get_file_hash(self, file_path: str) -> Optional[str]:
        """Calculate SHA256 hash of a file (delegates to UnifiedDatabase)"""
        from modules.unified_database import UnifiedDatabase
        return UnifiedDatabase.get_file_hash(file_path)

    def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
        """Get download record by file hash (delegates to UnifiedDatabase)"""
        return self.unified_db.get_download_by_file_hash(file_hash)

    def __enter__(self):
        """Support ``with`` usage; the adapter itself holds no resource."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Nothing to release; exceptions are not suppressed."""
        pass

    def _get_connection(self):
        """Get a write connection from unified database"""
        return self.unified_db.get_connection(for_write=True)

    def _execute_with_retry(self, operation, retries: int = 3, for_write: bool = False):
        """
        Execute a database operation with retry logic for lock/deadlock errors.

        Args:
            operation: A callable that takes a connection and returns a result
            retries: Number of attempts in total
            for_write: Whether this is a write operation

        Returns:
            The result of the operation

        Raises:
            sqlite3.OperationalError: If operation fails after all retries, or
                fails with an error that is not a lock/deadlock
        """
        for attempt in range(retries):
            try:
                with self.unified_db.get_connection(for_write=for_write) as conn:
                    return operation(conn)
            except sqlite3.OperationalError as e:
                message = str(e).lower()
                transient = "locked" in message or "deadlock" in message
                if transient and attempt < retries - 1:
                    delay = 1 + attempt * 2  # Linear backoff: 1s, 3s, 5s, ...
                    logger.debug(f"Database locked, retrying in {delay} seconds...")
                    time.sleep(delay)
                    continue
                logger.error(f"Database operation failed after {attempt + 1} attempts: {e}")
                raise
        # Only reachable when retries <= 0 (the loop body never ran)
        raise sqlite3.OperationalError("Database operation failed after all retries")

    def db_add_thread(self, thread_id: str, forum_name: str, thread_url: str,
                      thread_title: Optional[str] = None,
                      monitor_until: Optional[datetime] = None) -> bool:
        """
        Add a forum thread to tracking.

        Returns:
            True if a row was inserted; False if the thread already existed
            (INSERT OR IGNORE) or the operation failed.
        """
        def operation(conn):
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR IGNORE INTO forum_threads
                (thread_id, forum_name, thread_url, thread_title,
                created_date, last_checked, status, monitor_until)
                VALUES (?, ?, ?, ?, ?, ?, 'active', ?)
            ''', (thread_id, forum_name, thread_url, thread_title,
                  datetime.now(), datetime.now(), monitor_until))
            conn.commit()
            return cursor.rowcount > 0

        try:
            return self._execute_with_retry(operation, for_write=True)
        except Exception as e:
            logger.error(f"Error adding thread: {e}")
            return False

    def db_update_thread(self, thread_id: str, last_post_date: Optional[datetime] = None,
                         post_count: Optional[int] = None) -> bool:
        """
        Update thread information.

        ``last_checked`` is always refreshed; ``last_post_date`` and
        ``post_count`` are only written when provided.

        Returns:
            True if a row was updated, False otherwise (including on error).
        """
        # Column names are fixed literals; only values are bound as parameters
        updates = ["last_checked = ?"]
        params = [datetime.now()]
        if last_post_date:
            updates.append("last_post_date = ?")
            params.append(last_post_date)
        if post_count is not None:
            updates.append("post_count = ?")
            params.append(post_count)
        params.append(thread_id)
        sql = f'UPDATE forum_threads SET {", ".join(updates)} WHERE thread_id = ?'

        def operation(conn):
            cursor = conn.cursor()
            cursor.execute(sql, params)
            conn.commit()
            return cursor.rowcount > 0

        try:
            return self._execute_with_retry(operation, for_write=True)
        except Exception as e:
            logger.error(f"Error updating thread {thread_id}: {e}")
            return False

    def db_update_thread_last_checked(self, thread_id: str) -> bool:
        """Update the last_checked timestamp for a forum thread"""
        def operation(conn):
            cursor = conn.cursor()
            cursor.execute('''
                UPDATE forum_threads
                SET last_checked = ?
                WHERE thread_id = ?
            ''', (datetime.now(), thread_id))
            conn.commit()
            return cursor.rowcount > 0

        try:
            return self._execute_with_retry(operation, for_write=True)
        except Exception as e:
            logger.error(f"Error updating last_checked for thread {thread_id}: {e}")
            return False

    def db_get_thread(self, thread_id: str) -> Optional[Dict]:
        """Get thread information as a dict, or None if not found or on error"""
        def operation(conn):
            cursor = conn.cursor()
            cursor.execute(
                "SELECT * FROM forum_threads WHERE thread_id = ?",
                (thread_id,)
            )
            row = cursor.fetchone()
            return dict(row) if row else None

        try:
            return self._execute_with_retry(operation, for_write=False)
        except Exception as e:
            logger.error(f"Error getting thread {thread_id}: {e}")
            return None

    def db_add_post(self, post_id: str, thread_id: str, post_url: Optional[str] = None,
                    author: Optional[str] = None, post_date: Optional[datetime] = None,
                    has_images: bool = False) -> bool:
        """
        Add a forum post, or overwrite the existing row with the same post_id.

        Returns:
            True on success, False if the statement failed.
        """
        with self._get_connection() as conn:
            cursor = conn.cursor()
            try:
                # Hash of the thread/post identity (not of the post body)
                content_hash = hashlib.sha256(f"{thread_id}:{post_id}".encode()).hexdigest()
                cursor.execute('''
                    INSERT INTO forum_posts
                    (post_id, thread_id, post_url, author, post_date,
                    content_hash, has_images)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT (post_id) DO UPDATE SET
                    thread_id = EXCLUDED.thread_id,
                    post_url = EXCLUDED.post_url,
                    author = EXCLUDED.author,
                    post_date = EXCLUDED.post_date,
                    content_hash = EXCLUDED.content_hash,
                    has_images = EXCLUDED.has_images
                ''', (post_id, thread_id, post_url, author, post_date,
                      content_hash, has_images))
                conn.commit()
                return True
            except Exception as e:
                logger.error(f"Error adding post: {e}")
                return False

    def db_get_image_id(self, img_url: str) -> Optional[int]:
        """Return the downloads.id of an already-recorded forum image, or None"""
        url_hash = self.unified_db.get_url_hash(img_url)

        def operation(conn):
            cursor = conn.cursor()
            cursor.execute(
                "SELECT id FROM downloads WHERE url_hash = ? AND platform = 'forums'",
                (url_hash,)
            )
            row = cursor.fetchone()
            return row[0] if row else None

        try:
            return self._execute_with_retry(operation, for_write=False)
        except Exception as e:
            logger.error(f"Error checking image existence: {e}")
            return None

    def db_add_image(self, img_url: str, thread_id: str, post_id: str,
                     filename: str, file_path: str, forum_name: str) -> bool:
        """Add image to downloads (delegates to UnifiedDatabase.record_download)"""
        metadata = {
            'thread_id': thread_id,
            'post_id': post_id,
            'forum_name': forum_name
        }
        return self.unified_db.record_download(
            url=img_url,
            platform='forums',
            source=forum_name,
            content_type='image',
            filename=filename,
            file_path=file_path,
            metadata=metadata
        )

    def db_search_exists(self, search_id: str) -> bool:
        """Check if search already exists (False on error)"""
        def operation(conn):
            cursor = conn.cursor()
            cursor.execute(
                "SELECT 1 FROM search_monitors WHERE search_id = ?",
                (search_id,)
            )
            return cursor.fetchone() is not None

        try:
            return self._execute_with_retry(operation, for_write=False)
        except Exception as e:
            logger.error(f"Error checking search existence: {e}")
            return False

    def db_add_search(self, search_id: str, forum_name: str, search_query: str,
                      search_url: Optional[str] = None, check_frequency_hours: int = 24) -> bool:
        """Add or update search monitor; returns False if the statement failed"""
        with self._get_connection() as conn:
            cursor = conn.cursor()
            try:
                cursor.execute('''
                    INSERT OR REPLACE INTO search_monitors
                    (search_id, platform, source, search_query, search_url,
                    last_checked, check_frequency_hours, active)
                    VALUES (?, 'forums', ?, ?, ?, ?, ?, 1)
                ''', (search_id, forum_name, search_query, search_url,
                      datetime.now(), check_frequency_hours))
                conn.commit()
                return True
            except Exception as e:
                logger.error(f"Error adding search: {e}")
                return False

    def db_update_search_results(self, search_id: str, results_count: int) -> bool:
        """Update search results count; True if a row was updated (errors propagate)"""
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                UPDATE search_monitors
                SET last_checked = ?, results_found = ?
                WHERE search_id = ?
            ''', (datetime.now(), results_count, search_id))
            conn.commit()
            return cursor.rowcount > 0

    def add_to_download_queue(self, url: str, referer: Optional[str] = None,
                              save_path: Optional[str] = None,
                              thread_id: Optional[str] = None, post_id: Optional[str] = None,
                              forum_name: Optional[str] = None,
                              metadata: Optional[Dict] = None) -> bool:
        """
        Add item to download queue.

        Returns:
            True if a new pending queue row was inserted; False if the URL is
            already downloaded, already queued, or the insert failed.
        """
        # Check if already downloaded
        if self.unified_db.is_downloaded(url, platform='forums'):
            return False

        # Check if already in queue (with retry logic)
        def check_queue(conn):
            cursor = conn.cursor()
            cursor.execute(
                "SELECT status FROM download_queue WHERE url = ?",
                (url,)
            )
            return cursor.fetchone()

        try:
            existing = self._execute_with_retry(check_queue, for_write=False)
            if existing:
                if existing[0] == 'completed':
                    return False  # Already downloaded
                elif existing[0] == 'pending':
                    return False  # Already in queue
        except Exception as e:
            logger.error(f"Error checking download queue: {e}")
            return False

        # Work on a copy so the caller's metadata dict is never modified
        queue_metadata = dict(metadata) if metadata else {}
        queue_metadata.update({
            'thread_id': thread_id,
            'post_id': post_id,
            'forum_name': forum_name
        })

        with self._get_connection() as conn:
            cursor = conn.cursor()
            try:
                cursor.execute('''
                    INSERT INTO download_queue
                    (url, platform, source, referer, save_path, status, metadata)
                    VALUES (?, 'forums', ?, ?, ?, 'pending', ?)
                ''', (url, forum_name, referer, str(save_path) if save_path else None, json.dumps(queue_metadata)))
                conn.commit()
                return True
            except sqlite3.IntegrityError:
                return False  # URL already in queue
            except Exception as e:
                logger.error(f"Error adding to queue: {e}")
                return False

    def is_in_download_queue(self, url: str) -> bool:
        """Check if URL is in download queue with status 'pending'"""
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT 1 FROM download_queue WHERE url = ? AND status = 'pending'",
                (url,)
            )
            return cursor.fetchone() is not None

    def is_already_downloaded(self, url: str, forum_name: Optional[str] = None) -> bool:
        """
        Check if thread URL is already being tracked.

        A thread matches by its URL or by thread_id equal to the SHA-256 hex
        digest of the URL; ``forum_name`` optionally restricts the lookup.
        """
        thread_id = hashlib.sha256(url.encode()).hexdigest()
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            if forum_name:
                # Check for specific forum
                cursor.execute('''
                    SELECT 1 FROM forum_threads
                    WHERE forum_name = ? AND (thread_url = ? OR thread_id = ?)
                    LIMIT 1
                ''', (forum_name, url, thread_id))
            else:
                # Check any forum
                cursor.execute('''
                    SELECT 1 FROM forum_threads
                    WHERE thread_url = ? OR thread_id = ?
                    LIMIT 1
                ''', (url, thread_id))
            return cursor.fetchone() is not None

    def mark_download_complete(self, url: str, filename: Optional[str] = None,
                               file_path: Optional[str] = None) -> bool:
        """
        Mark download as complete in queue.

        ``filename`` and ``file_path`` are accepted but not stored.

        Returns:
            True if a queue row was updated.
        """
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                UPDATE download_queue
                SET status = 'completed', download_date = ?
                WHERE url = ?
            ''', (datetime.now(), url))
            conn.commit()
            return cursor.rowcount > 0

    def mark_download_failed(self, url: str, error_message: Optional[str] = None) -> bool:
        """Mark download as failed in queue and increment its attempt counter"""
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                UPDATE download_queue
                SET status = 'failed', attempts = attempts + 1, error_message = ?
                WHERE url = ?
            ''', (error_message, url))
            conn.commit()
            return cursor.rowcount > 0

    def record_download(self, url: str, thread_id: Optional[str] = None,
                        post_id: Optional[str] = None, filename: Optional[str] = None,
                        metadata: Optional[Dict] = None, file_path: Optional[str] = None,
                        post_date=None) -> bool:
        """Record a download in the unified database

        Args:
            url: URL of the downloaded content
            thread_id: Forum thread ID
            post_id: Forum post ID
            filename: Name of downloaded file
            metadata: Additional metadata dict (not modified); its
                'forum_name' entry, if any, becomes the record's source
            file_path: Full path to downloaded file
            post_date: Date of the forum post (datetime or None)

        Returns:
            Whatever UnifiedDatabase.record_download returns
        """
        # Extract forum name from metadata if available
        forum_name = metadata.get('forum_name') if metadata else None

        # Work on a copy so the caller's metadata dict is never modified
        full_metadata = dict(metadata) if metadata else {}
        if thread_id:
            full_metadata['thread_id'] = thread_id
        if post_id:
            full_metadata['post_id'] = post_id

        # Calculate file hash if file_path provided (best effort)
        file_hash = None
        if file_path:
            try:
                from modules.unified_database import UnifiedDatabase
                file_hash = UnifiedDatabase.get_file_hash(file_path)
            except Exception as e:
                # If hashing fails, continue without a hash
                logger.debug(f"Could not hash {file_path}: {e}")

        # Record in unified database
        return self.unified_db.record_download(
            url=url,
            platform='forums',
            source=forum_name or 'unknown',
            content_type='image',
            filename=filename,
            file_path=file_path,
            file_hash=file_hash,
            post_date=post_date,
            metadata=full_metadata
        )

    def get_pending_downloads(self, limit: int = 100) -> List[Dict]:
        """Get up to ``limit`` pending forum downloads, ordered by priority then created_date"""
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT * FROM download_queue
                WHERE platform = 'forums' AND status = 'pending'
                ORDER BY priority, created_date
                LIMIT ?
            ''', (limit,))
            return [dict(row) for row in cursor.fetchall()]

    def cleanup_old_data(self, days: int = 180):
        """
        Clean up old data.

        Deletes completed forum downloads and completed/failed forum queue
        items older than ``days`` days, and marks active threads whose
        ``monitor_until`` has passed as 'expired'.
        """
        with self._get_connection() as conn:
            cursor = conn.cursor()
            # Clean old downloads; the bound value builds a '-N days' modifier
            cursor.execute('''
                DELETE FROM downloads
                WHERE platform = 'forums'
                AND download_date < datetime('now', ? || ' days')
                AND status = 'completed'
            ''', (-days,))
            # Clean old queue items
            cursor.execute('''
                DELETE FROM download_queue
                WHERE platform = 'forums'
                AND created_date < datetime('now', ? || ' days')
                AND status IN ('completed', 'failed')
            ''', (-days,))
            # Expire old monitors
            cursor.execute('''
                UPDATE forum_threads
                SET status = 'expired'
                WHERE monitor_until < datetime('now')
                AND status = 'active'
            ''')
            conn.commit()

5029
modules/forum_downloader.py Executable file

File diff suppressed because it is too large Load Diff

2019
modules/imginn_api_module.py Normal file

File diff suppressed because it is too large Load Diff

3775
modules/imginn_module.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,410 @@
#!/usr/bin/env python3
"""
Immich Face Integration Module
Integrates with Immich's face recognition system to leverage its existing
face clustering and recognition data for media-downloader files.
Immich uses:
- InsightFace with buffalo_l model (same as media-downloader)
- DBSCAN clustering for face grouping
- 512-dimensional face embeddings
- PostgreSQL for storage
Path mapping:
- Media-downloader: /opt/immich/md/...
- Immich sees: /mnt/media/md/...
"""
import os
import json
from pathlib import Path
from typing import Optional, List, Dict, Any, Tuple
from datetime import datetime
import httpx
from modules.universal_logger import get_logger
logger = get_logger('ImmichFace')
class ImmichFaceIntegration:
"""Interface with Immich's face recognition system."""
# Path mapping between systems
LOCAL_BASE = '/opt/immich'
IMMICH_BASE = '/mnt/media'
def __init__(self, api_url: str = None, api_key: str = None):
"""
Initialize Immich face integration.
Args:
api_url: Immich API URL (default: http://localhost:2283/api)
api_key: Immich API key
"""
self.api_url = (api_url or os.getenv('IMMICH_API_URL', 'http://localhost:2283/api')).rstrip('/')
self.api_key = api_key or os.getenv('IMMICH_API_KEY', '')
self._client = None
self._people_cache = None
self._people_cache_time = None
self._cache_ttl = 300 # 5 minutes
@property
def is_configured(self) -> bool:
    """Return True when a non-empty API key is set, False otherwise."""
    return True if self.api_key else False
def _get_client(self) -> httpx.Client:
    """Return the shared HTTP client, creating it on first call."""
    if self._client is not None:
        return self._client

    default_headers = {
        'x-api-key': self.api_key,
        'Accept': 'application/json'
    }
    self._client = httpx.Client(
        base_url=self.api_url,
        headers=default_headers,
        timeout=30.0
    )
    return self._client
def _local_to_immich_path(self, local_path: str) -> str:
"""
Convert local path to Immich's path format.
Example:
/opt/immich/md/instagram/user/image.jpg
-> /mnt/media/md/instagram/user/image.jpg
"""
return local_path.replace(self.LOCAL_BASE, self.IMMICH_BASE)
def _immich_to_local_path(self, immich_path: str) -> str:
"""
Convert Immich's path to local path format.
Example:
/mnt/media/md/instagram/user/image.jpg
-> /opt/immich/md/instagram/user/image.jpg
"""
return immich_path.replace(self.IMMICH_BASE, self.LOCAL_BASE)
def test_connection(self) -> Dict[str, Any]:
    """
    Check that the Immich API answers with the configured key.

    Requests ``/server/ping`` and, if that returns 200, ``/server/version``.

    Returns:
        Dict with 'success' and 'message'; on success also 'server_info'
        (the decoded version response, or {} when the version request did
        not return 200)
    """
    if not self.is_configured:
        return {'success': False, 'message': 'Immich API key not configured'}

    try:
        client = self._get_client()
        ping = client.get('/server/ping')
        if ping.status_code != 200:
            return {
                'success': False,
                'message': f'Immich API returned status {ping.status_code}'
            }

        # Ping succeeded - fetch version details as well
        version = client.get('/server/version')
        server_info = {}
        if version.status_code == 200:
            server_info = version.json()
        return {
            'success': True,
            'message': 'Connected to Immich',
            'server_info': server_info
        }
    except httpx.ConnectError as e:
        return {
            'success': False,
            'message': f'Cannot connect to Immich at {self.api_url}: {e}'
        }
    except Exception as e:
        return {
            'success': False,
            'message': f'Immich API error: {e}'
        }
def get_all_people(self, force_refresh: bool = False) -> List[Dict[str, Any]]:
    """
    Fetch the people list from Immich, using a time-limited cache.

    Args:
        force_refresh: When True, bypass the cache and query the API

    Returns:
        The people list from the ``/people`` response (the 'people' entry
        when the response is a dict that has one). Returns [] when not
        configured, on a non-200 status, or on any error.
    """
    if not self.is_configured:
        return []

    # Serve from cache while it is younger than the TTL
    cache_usable = not force_refresh and self._people_cache is not None and self._people_cache_time
    if cache_usable:
        elapsed = (datetime.now() - self._people_cache_time).total_seconds()
        if elapsed < self._cache_ttl:
            return self._people_cache

    try:
        client = self._get_client()
        response = client.get('/people')
        if response.status_code != 200:
            logger.error(f"Failed to get people: {response.status_code}")
            return []

        payload = response.json()
        # Response may be {'people': [...], ...} or a bare list
        if isinstance(payload, dict):
            people = payload.get('people', payload)
        else:
            people = payload

        self._people_cache = people
        self._people_cache_time = datetime.now()
        logger.info(f"Fetched {len(people)} people from Immich")
        return people
    except Exception as e:
        logger.error(f"Error getting people from Immich: {e}")
        return []
def get_named_people(self) -> List[Dict[str, Any]]:
    """
    Return the people from get_all_people() that have a non-empty name.

    Returns:
        List of named people
    """
    everyone = self.get_all_people()
    return list(filter(lambda person: person.get('name'), everyone))
def get_asset_by_path(self, local_path: str) -> Optional[Dict[str, Any]]:
    """
    Look up the Immich asset whose original path matches a local file.

    Args:
        local_path: Local file path (e.g., /opt/immich/md/...)

    Returns:
        First asset from the search result, or None when not configured,
        nothing matched, the status was not 200, or an error occurred
    """
    if not self.is_configured:
        return None

    immich_path = self._local_to_immich_path(local_path)
    try:
        client = self._get_client()
        # Metadata search filtered by the path as Immich sees it
        response = client.post('/search/metadata', json={'originalPath': immich_path})
        if response.status_code != 200:
            return None
        items = response.json().get('assets', {}).get('items', [])
        return items[0] if items else None
    except Exception as e:
        logger.error(f"Error searching asset by path: {e}")
        return None
def get_faces_for_asset(self, asset_id: str) -> List[Dict[str, Any]]:
    """
    Fetch the faces Immich has recorded for one asset.

    Args:
        asset_id: Immich asset ID

    Returns:
        Decoded ``/faces`` response for the asset; [] when not configured,
        on a non-200 status, or on any error
    """
    if not self.is_configured:
        return []

    try:
        client = self._get_client()
        response = client.get('/faces', params={'id': asset_id})
        if response.status_code != 200:
            logger.warning(f"Failed to get faces for asset {asset_id}: {response.status_code}")
            return []
        return response.json()
    except Exception as e:
        logger.error(f"Error getting faces for asset: {e}")
        return []
def get_faces_for_file(self, local_path: str) -> Dict[str, Any]:
    """
    Get face recognition results for a local file using Immich.

    Resolves the file to an Immich asset via get_asset_by_path(), fetches
    its faces via get_faces_for_asset(), and flattens each face into a
    simple dict.

    Args:
        local_path: Local file path (e.g., /opt/immich/md/...)

    Returns:
        When the asset is found, a dict with:
        - found: True
        - asset_id: Immich asset ID
        - faces: all detected faces (face_id, person_id, person_name,
          bounding_box, image_width, image_height)
        - named_faces: subset of faces with a non-empty person_name
        - face_count / named_count: sizes of the two lists
        Otherwise a dict with found=False, an 'error' message and an
        empty 'faces' list.
    """
    if not self.is_configured:
        return {
            'found': False,
            'error': 'Immich not configured',
            'faces': []
        }

    # Find the asset
    asset = self.get_asset_by_path(local_path)
    if not asset:
        return {
            'found': False,
            'error': 'File not found in Immich',
            'faces': []
        }

    asset_id = asset.get('id')

    # Process faces into a more usable format
    faces = []
    for face in self.get_faces_for_asset(asset_id):
        # The 'person' key may be present with a null value (face not
        # assigned to anyone); dict.get's default does not cover that case.
        person = face.get('person') or {}
        faces.append({
            'face_id': face.get('id'),
            'person_id': person.get('id'),
            'person_name': person.get('name') or '',
            'bounding_box': {
                'x1': face.get('boundingBoxX1'),
                'y1': face.get('boundingBoxY1'),
                'x2': face.get('boundingBoxX2'),
                'y2': face.get('boundingBoxY2')
            },
            'image_width': face.get('imageWidth'),
            'image_height': face.get('imageHeight')
        })

    # Filter to only named faces
    named_faces = [f for f in faces if f['person_name']]

    return {
        'found': True,
        'asset_id': asset_id,
        'faces': faces,
        'named_faces': named_faces,
        'face_count': len(faces),
        'named_count': len(named_faces)
    }
def get_person_by_name(self, name: str) -> Optional[Dict[str, Any]]:
    """
    Find a person in Immich by name (case-insensitive exact match).

    People whose 'name' is missing or null are treated as having an empty
    name instead of raising.

    Args:
        name: Person name to search for

    Returns:
        First matching person dict, or None
    """
    people = self.get_all_people()
    if not people:
        return None

    wanted = name.lower()
    for person in people:
        if (person.get('name') or '').lower() == wanted:
            return person
    return None
def get_person_assets(self, person_id: str, limit: int = 1000) -> List[Dict[str, Any]]:
    """
    List assets that contain a given person via the metadata search API.

    Args:
        person_id: Immich person ID
        limit: Value sent as the search 'size'

    Returns:
        Asset items from the search response; [] when not configured, on a
        non-200 status, or on any error
    """
    if not self.is_configured:
        return []

    try:
        client = self._get_client()
        search_filter = {
            'personIds': [person_id],
            'size': limit
        }
        response = client.post('/search/metadata', json=search_filter)
        if response.status_code != 200:
            logger.warning(f"Failed to get assets for person {person_id}: {response.status_code}")
            return []
        payload = response.json()
        return payload.get('assets', {}).get('items', [])
    except Exception as e:
        logger.error(f"Error getting person assets: {e}")
        return []
def get_statistics(self) -> Dict[str, Any]:
    """
    Summarize the people known to Immich.

    Returns:
        Dict with total_people, named_people, unnamed_people and
        people_by_face_count (up to 20 named people as {'name', 'count'},
        ordered by their 'faces' value, highest first)
    """
    people = self.get_all_people()
    named = [p for p in people if p.get('name')]

    ranking = [
        {'name': p.get('name', 'Unnamed'), 'count': p.get('faces', 0)}
        for p in named
    ]
    ranking.sort(key=lambda entry: entry['count'], reverse=True)

    total = len(people)
    named_total = len(named)
    return {
        'total_people': total,
        'named_people': named_total,
        'unnamed_people': total - named_total,
        'people_by_face_count': ranking[:20]
    }
def close(self):
    """Close the HTTP client, if one was created, and forget it."""
    client = self._client
    if not client:
        return
    client.close()
    self._client = None
# Singleton instance
_immich_integration = None


def get_immich_integration(api_url: str = None, api_key: str = None) -> ImmichFaceIntegration:
    """
    Get or create the Immich face integration instance.

    The shared instance is rebuilt when an explicit api_key or api_url is
    passed that differs from what the current instance uses. Calls without
    overrides always return the existing instance.

    Args:
        api_url: Optional API URL override
        api_key: Optional API key override

    Returns:
        ImmichFaceIntegration instance
    """
    global _immich_integration

    if _immich_integration is None:
        _immich_integration = ImmichFaceIntegration(api_url, api_key)
        return _immich_integration

    current = _immich_integration
    key_changed = bool(api_key) and api_key != current.api_key
    # The instance stores its URL without trailing slashes, so compare alike
    url_changed = bool(api_url) and api_url.rstrip('/') != current.api_url

    if key_changed or url_changed:
        # Release the old HTTP client before replacing the instance
        current.close()
        # Keep the current key when only the URL was overridden
        _immich_integration = ImmichFaceIntegration(api_url, api_key or current.api_key)

    return _immich_integration

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,868 @@
"""
Instagram Perceptual Duplicate Detector
Detects visually similar Instagram content (even with text overlays, stickers, etc.)
and keeps the cleanest + highest quality version.
Priority: Clean (no overlays) > Quality (resolution/size)
"""
import os
import gc
import json
import uuid
from pathlib import Path
from typing import Optional, Dict, Tuple, List, TYPE_CHECKING
from datetime import datetime
from modules.universal_logger import get_logger
if TYPE_CHECKING:
import numpy as np
try:
import cv2
import numpy as np
OPENCV_AVAILABLE = True
except ImportError:
OPENCV_AVAILABLE = False
np = None # Define np as None when not available
try:
import imagehash
from PIL import Image
IMAGEHASH_AVAILABLE = True
except ImportError:
IMAGEHASH_AVAILABLE = False
# OCR disabled — not currently needed
EASYOCR_AVAILABLE = False
TESSERACT_AVAILABLE = False
class InstagramPerceptualDuplicateDetector:
"""
Detects perceptual duplicates in Instagram content and keeps cleanest + best quality
"""
def __init__(self, unified_db, log_callback=None):
    """
    Set up the detector and report which optional dependencies are missing.

    Args:
        unified_db: UnifiedDatabase instance
        log_callback: Optional legacy callback (deprecated, uses universal logger);
            not used by this constructor
    """
    self.db = unified_db
    self.logger = get_logger('Perceptual_Duplicate_Detector')
    self.easyocr_reader = None

    # EasyOCR reader is only built when the module flag says it is available
    if EASYOCR_AVAILABLE:
        try:
            # Suppress PyTorch pin_memory warning (we're using CPU anyway)
            import warnings
            warnings.filterwarnings('ignore', category=UserWarning, module='torch.utils.data.dataloader')
            self.easyocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
            self.logger.debug("EasyOCR initialized for text overlay detection", module="Perceptual")
        except Exception as e:
            self.logger.warning(f"Failed to initialize EasyOCR: {e}, will use Tesseract fallback", module="Perceptual")
            self.easyocr_reader = None

    # Warn once per missing hard dependency
    for present, package in ((OPENCV_AVAILABLE, "OpenCV"), (IMAGEHASH_AVAILABLE, "imagehash")):
        if not present:
            self.logger.warning(f"{package} not available - perceptual duplicate detection disabled", module="Perceptual")

    if not (EASYOCR_AVAILABLE or TESSERACT_AVAILABLE):
        self.logger.debug("No OCR available (EasyOCR or pytesseract) - text overlay detection disabled", module="Perceptual")

    # Hashing needs both OpenCV and imagehash; OCR is optional
    self.dependencies_available = OPENCV_AVAILABLE and IMAGEHASH_AVAILABLE
def check_and_handle_duplicate(self, file_path: str, platform: str, source: str, content_type: str = None) -> Optional[str]:
    """
    Hash a file and decide whether it duplicates already-recorded content.

    The perceptual hash and the clean/quality scores are computed for every
    file. The hash is stored when detection is disabled, when the platform
    is not enabled, or when no similar file exists. When a similar file
    exists the two are compared and the loser is sent to the recycle bin.

    Args:
        file_path: Path of the new file
        platform: Platform name, matched case-insensitively against settings
        source: Source/account the file came from
        content_type: Optional content type stored with the hash

    Returns:
        - None if dependencies are missing, hashing fails, detection is
          disabled (globally or for this platform), or an exception occurs
        - "skip" if the new file was recycled in favour of an existing one
        - file_path if the new file should be kept
    """
    def trace(message):
        self.logger.debug(message, module="Perceptual")

    filename = Path(file_path).name
    trace("[PERCEPTUAL] ENTRY: check_and_handle_duplicate called")
    trace(f"[PERCEPTUAL] File: {filename}")
    trace(f"[PERCEPTUAL] Platform: {platform}")
    trace(f"[PERCEPTUAL] Source: {source}")
    trace(f"[PERCEPTUAL] Content Type: {content_type}")

    if not self.dependencies_available:
        self.logger.warning("[PERCEPTUAL] SKIP: Dependencies not available (OpenCV/ImageHash missing)", module="Perceptual")
        return None

    settings = self._get_settings()
    detection_enabled = settings.get('enabled', False)
    trace("[PERCEPTUAL] Settings loaded:")
    trace(f"[PERCEPTUAL] Enabled: {detection_enabled}")
    trace(f"[PERCEPTUAL] Platforms: {settings.get('platforms', [])}")
    trace(f"[PERCEPTUAL] Threshold: {settings.get('perceptual_hash_threshold', 12)}")

    try:
        # Hash and scores are computed even when detection is off so the
        # hash table keeps growing for later use
        trace(f"[PERCEPTUAL] Calculating perceptual hash for {filename}...")
        phash = self._calculate_perceptual_hash(file_path)
        if not phash:
            self.logger.error(f"[PERCEPTUAL] FAILED: Could not calculate perceptual hash for {filename}", module="Perceptual")
            return None
        trace(f"[PERCEPTUAL] Hash calculated: {phash[:32]}...")

        if settings.get('text_detection_enabled', True):
            text_count, text_chars = self._detect_text_overlays(file_path)
        else:
            text_count, text_chars = (0, 0)
        quality_metrics = self._get_quality_metrics(file_path)
        clean_score = self._calculate_clean_score(text_count, text_chars)
        quality_score = self._calculate_quality_score(quality_metrics)

        trace("[PERCEPTUAL] Scores calculated:")
        trace(f"[PERCEPTUAL] Clean Score: {clean_score:.2f}")
        trace(f"[PERCEPTUAL] Quality Score: {quality_score:.2f}")
        trace(f"[PERCEPTUAL] Text Overlays: {text_count} ({text_chars} chars)")

        def remember_hash():
            self._store_perceptual_hash(
                file_path, platform, source, content_type,
                phash, text_count, text_chars, quality_score, clean_score, quality_metrics
            )

        # Detection switched off: record the hash and let the file through
        if not detection_enabled:
            trace(f"[PERCEPTUAL] SKIP: Detection disabled - storing hash only for {filename}")
            remember_hash()
            return None

        # Detection on, but maybe not for this platform
        wanted_platform = platform.lower()
        platform_enabled = wanted_platform in [p.lower() for p in settings.get('platforms', ['instagram'])]
        trace(f"[PERCEPTUAL] Platform check: {platform} enabled = {platform_enabled}")
        if not platform_enabled:
            trace(f"[PERCEPTUAL] SKIP: Platform '{platform}' not in enabled list - storing hash only")
            remember_hash()
            return None

        trace(f"[PERCEPTUAL] CHECKING FOR DUPLICATES: {filename}")
        trace(f"[PERCEPTUAL] Platform: {platform}, Source: {source}")
        self.logger.log(
            f"[PERCEPTUAL] New file: {Path(file_path).name} | "
            f"Hash: {phash[:16]}... | Clean: {clean_score:.2f} | Quality: {quality_score:.2f}",
            "info"
        )

        # Look for stored hashes close to this one
        threshold = settings.get('perceptual_hash_threshold', 12)
        trace(f"[PERCEPTUAL] Searching for similar files (threshold: {threshold})...")
        similar_files = self._find_similar_files(phash, platform, source, threshold)
        trace(f"[PERCEPTUAL] Similar files found: {len(similar_files)}")

        if similar_files:
            for position, candidate in enumerate(similar_files, 1):
                trace(
                    f"[PERCEPTUAL] #{position}: {candidate['filename']} | "
                    f"Distance: {candidate['hamming_distance']} | "
                    f"Clean: {candidate['clean_score']:.2f} | "
                    f"Quality: {candidate['quality_score']:.2f}"
                )

        if not similar_files:
            # Nothing similar on record - keep the file and record its hash
            trace(f"[PERCEPTUAL] NO DUPLICATES FOUND - keeping {filename}")
            remember_hash()
            return file_path

        # At least one similar file: compare against the best of them
        best_existing = self._get_best_existing_file(similar_files)
        trace("[PERCEPTUAL] DUPLICATE DETECTED!")
        trace(f"[PERCEPTUAL] Best existing file: {best_existing['filename']}")
        trace(f"[PERCEPTUAL] Clean: {best_existing['clean_score']:.2f}")
        trace(f"[PERCEPTUAL] Quality: {best_existing['quality_score']:.2f}")
        trace(f"[PERCEPTUAL] Path: {best_existing['file_path']}")

        trace("[PERCEPTUAL] Comparing new vs existing...")
        comparison = self._compare_files(
            new_clean=clean_score,
            new_quality=quality_score,
            existing_clean=best_existing['clean_score'],
            existing_quality=best_existing['quality_score'],
            settings=settings
        )
        trace(f"[PERCEPTUAL] Comparison result: {comparison}")

        if comparison == "new_better":
            # New file wins: recycle the old one and reuse its DB row
            self.logger.info(
                f"[PERCEPTUAL] Replacing {best_existing['filename']} with cleaner version: {filename}",
                module="Perceptual"
            )
            self._move_to_recycle(
                best_existing['file_path'],
                reason='replaced_with_cleaner_duplicate',
                new_file=file_path
            )
            self._replace_perceptual_hash_entry(
                old_id=best_existing['id'],
                new_file_path=file_path,
                new_phash=phash,
                new_text_count=text_count,
                new_text_chars=text_chars,
                new_quality_score=quality_score,
                new_clean_score=clean_score,
                new_quality_metrics=quality_metrics
            )
            return file_path

        # Existing file wins, or it is a tie (ties keep the existing file)
        if comparison == "existing_better":
            skip_note = f"[PERCEPTUAL] Skipping {filename} (duplicate of {best_existing['filename']})"
            recycle_reason = 'duplicate_lower_quality_or_has_overlays'
        else:
            skip_note = f"[PERCEPTUAL] Skipping {filename} (same quality as {best_existing['filename']})"
            recycle_reason = 'duplicate_same_quality'

        self.logger.info(skip_note, module="Perceptual")
        self._move_to_recycle(
            file_path,
            reason=recycle_reason,
            kept_file=best_existing['file_path']
        )
        return "skip"

    except Exception as e:
        self.logger.error(f"[PERCEPTUAL] EXCEPTION: {e}", module="Perceptual")
        import traceback
        self.logger.error(f"[PERCEPTUAL] Traceback:\n{traceback.format_exc()}", module="Perceptual")
        return None
def _get_settings(self) -> dict:
"""Get Instagram perceptual duplicate settings from database"""
try:
with self.db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT value FROM settings WHERE key = 'instagram_perceptual_duplicates'")
result = cursor.fetchone()
if result:
return json.loads(result[0])
except Exception as e:
self.logger.debug(f"Failed to get perceptual duplicate settings: {e}", module="Perceptual")
return {'enabled': False}
def _calculate_perceptual_hash(self, file_path: str) -> Optional[str]:
    """
    Compute a difference hash (dHash, hash_size=16) for an image or video.

    Videos (.mp4/.mov/.avi/.mkv) are hashed from the frame returned by
    _extract_video_frame(); other files are opened directly with PIL.

    Returns:
        String form of the hash, or None when imagehash is unavailable,
        no video frame could be read, or hashing fails
    """
    if not IMAGEHASH_AVAILABLE:
        return None

    video_suffixes = ('.mp4', '.mov', '.avi', '.mkv')
    frame = None
    frame_rgb = None
    pil_image = None
    try:
        if file_path.lower().endswith(video_suffixes):
            frame = self._extract_video_frame(file_path)
            if frame is None:
                return None
            # OpenCV frames are BGR; PIL expects RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame_rgb)
        else:
            pil_image = Image.open(file_path)

        return str(imagehash.dhash(pil_image, hash_size=16))
    except Exception as e:
        self.logger.debug(f"Failed to calculate perceptual hash: {e}", module="Perceptual")
        return None
    finally:
        # Drop the image buffers before forcing a collection
        if pil_image is not None:
            pil_image.close()
        pil_image = None
        frame_rgb = None
        frame = None
        gc.collect()
def _extract_video_frame(self, video_path: str, position: float = 0.5) -> Optional['np.ndarray']:
    """
    Extract a frame from video at given position (0.0 to 1.0).

    The capture handle is always released, including when the video cannot
    be opened or an error occurs. The target index is clamped to the valid
    frame range so position=1.0 reads the last frame instead of seeking
    past the end. When the reported frame count is not positive, no seek is
    done and the next frame from the start is read.

    Args:
        video_path: Path of the video file
        position: Relative position in the video (0.0 = start, 1.0 = end)

    Returns:
        The frame as returned by OpenCV, or None when OpenCV is unavailable,
        the video cannot be opened, or no frame could be read
    """
    if not OPENCV_AVAILABLE:
        return None

    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return None

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames > 0:
            target_frame = min(max(int(total_frames * position), 0), total_frames - 1)
            cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame)

        ret, frame = cap.read()
        return frame if ret else None
    except Exception as e:
        self.logger.debug(f"Failed to extract video frame: {e}", module="Perceptual")
        return None
    finally:
        if cap is not None:
            cap.release()
def _detect_text_overlays(self, file_path: str) -> Tuple[int, int]:
    """
    Count text overlays in an image or video frame.

    EasyOCR is tried first when a reader exists; Tesseract is used when it
    is flagged available and EasyOCR found nothing or raised.

    Returns:
        (text_region_count, total_text_characters); (0, 0) when no OCR
        backend or OpenCV is available, the image cannot be loaded, or
        detection fails
    """
    no_ocr_backend = not self.easyocr_reader and not TESSERACT_AVAILABLE
    if no_ocr_backend or not OPENCV_AVAILABLE:
        return (0, 0)

    image = None
    gray = None
    try:
        region_count = 0
        char_count = 0

        # Videos contribute one extracted frame; images are read directly
        if file_path.lower().endswith(('.mp4', '.mov', '.avi', '.mkv')):
            image = self._extract_video_frame(file_path)
        else:
            image = cv2.imread(file_path)
        if image is None:
            return (0, 0)

        if self.easyocr_reader:
            try:
                # readtext yields (bbox, text, confidence) triples
                for _bbox, raw_text, confidence in self.easyocr_reader.readtext(image):
                    if confidence > 0.5:  # Only use detections with >50% confidence
                        cleaned = raw_text.strip()
                        if cleaned:
                            region_count += 1
                            char_count += len(cleaned)
                if region_count > 0:
                    self.logger.log(
                        f"[OVERLAY] EasyOCR detected {region_count} text regions, {char_count} chars in {Path(file_path).name}",
                        "debug"
                    )
                    return (region_count, char_count)
            except Exception as e:
                self.logger.debug(f"EasyOCR failed: {e}, falling back to Tesseract", module="Perceptual")

        if TESSERACT_AVAILABLE:
            try:
                gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
                ocr_data = pytesseract.image_to_data(
                    gray,
                    output_type=pytesseract.Output.DICT,
                    config='--psm 11'  # Sparse text mode
                )
                # Start counting afresh for the Tesseract pass
                region_count = 0
                char_count = 0
                confidence_threshold = 30
                for index, confidence in enumerate(ocr_data['conf']):
                    if int(confidence) > confidence_threshold:
                        word = ocr_data['text'][index].strip()
                        if word:
                            region_count += 1
                            char_count += len(word)
                self.logger.log(
                    f"[OVERLAY] Tesseract (fallback) detected {region_count} text regions, {char_count} chars in {Path(file_path).name}",
                    "debug"
                )
            except Exception as e:
                self.logger.debug(f"Tesseract OCR failed: {e}", module="Perceptual")

        return (region_count, char_count)
    except Exception as e:
        self.logger.debug(f"Text overlay detection failed: {e}", module="Perceptual")
        return (0, 0)
    finally:
        # Drop the image arrays before forcing a collection
        gray = None
        image = None
        gc.collect()
def _get_quality_metrics(self, file_path: str) -> dict:
"""Get quality metrics for file"""
import subprocess
metrics = {
'resolution': 0,
'width': 0,
'height': 0,
'file_size': 0,
'bitrate': 0
}
try:
# Get file size
metrics['file_size'] = Path(file_path).stat().st_size
# Use ffprobe for video metadata
cmd = [
'ffprobe',
'-v', 'quiet',
'-print_format', 'json',
'-show_format',
'-show_streams',
file_path
]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
if result.returncode == 0:
data = json.loads(result.stdout)
# Get video stream
video_stream = next((s for s in data.get('streams', []) if s.get('codec_type') == 'video'), None)
if video_stream:
metrics['width'] = int(video_stream.get('width', 0))
metrics['height'] = int(video_stream.get('height', 0))
metrics['resolution'] = metrics['width'] * metrics['height']
# Get bitrate
format_info = data.get('format', {})
if 'bit_rate' in format_info:
metrics['bitrate'] = int(format_info['bit_rate']) // 1000
except Exception as e:
self.logger.debug(f"Failed to get quality metrics: {e}", module="Perceptual")
return metrics
def _calculate_clean_score(self, text_count: int, text_chars: int) -> float:
"""
Calculate cleanliness score (0-100)
Higher score = cleaner (less text/overlays)
"""
# Base score starts at 100 (perfectly clean)
score = 100.0
# Penalize for text regions (each region -10 points, max -50)
text_penalty = min(text_count * 10, 50)
score -= text_penalty
# Penalize for character count (each 10 chars -5 points, max -40)
char_penalty = min((text_chars // 10) * 5, 40)
score -= char_penalty
# Ensure score is between 0-100
return max(0.0, min(100.0, score))
def _calculate_quality_score(self, metrics: dict) -> float:
"""
Calculate quality score (0-100)
Based on resolution and file size
"""
score = 0.0
# Resolution score (0-60 points)
# 1080p = 2,073,600 pixels = 60 points
# 720p = 921,600 pixels = 27 points
resolution = metrics.get('resolution', 0)
if resolution > 0:
resolution_score = min((resolution / 2_073_600) * 60, 60)
score += resolution_score
# File size score (0-40 points)
# 10MB = 40 points
# 5MB = 20 points
file_size = metrics.get('file_size', 0)
if file_size > 0:
size_mb = file_size / (1024 * 1024)
size_score = min((size_mb / 10) * 40, 40)
score += size_score
return min(100.0, score)
def _find_similar_files(self, phash: str, platform: str, source: str, threshold: int) -> List[dict]:
"""Find files with similar perceptual hash"""
try:
with self.db.get_connection() as conn:
cursor = conn.cursor()
# Get all Instagram files (all methods now use platform='instagram')
self.logger.debug(f"[PERCEPTUAL_SEARCH] Querying database:", module="Perceptual")
self.logger.debug(f"[PERCEPTUAL_SEARCH] Platform: instagram", module="Perceptual")
self.logger.debug(f"[PERCEPTUAL_SEARCH] Source: {source}", module="Perceptual")
self.logger.debug(f"[PERCEPTUAL_SEARCH] Threshold: {threshold}", module="Perceptual")
# Search all Instagram content (regardless of method)
# This catches reposts/duplicates from different accounts
cursor.execute("""
SELECT id, file_path, filename, perceptual_hash,
text_overlay_count, text_overlay_chars,
quality_score, clean_score, resolution, file_size
FROM instagram_perceptual_hashes
WHERE platform = 'instagram'
""")
all_rows = cursor.fetchall()
self.logger.debug(f"[PERCEPTUAL_SEARCH] Database returned {len(all_rows)} existing files (checking across all sources)", module="Perceptual")
results = []
checked_count = 0
within_threshold = 0
missing_files = 0
for row in all_rows:
existing_hash = row[3]
existing_filename = row[2]
# Calculate Hamming distance
distance = self._hamming_distance(phash, existing_hash)
checked_count += 1
if distance <= threshold:
within_threshold += 1
# Check if file still exists
if Path(row[1]).exists():
self.logger.debug(
f"[PERCEPTUAL_SEARCH] MATCH: {existing_filename} (distance: {distance})",
module="Perceptual"
)
results.append({
'id': row[0],
'file_path': row[1],
'filename': row[2],
'perceptual_hash': row[3],
'text_overlay_count': row[4],
'text_overlay_chars': row[5],
'quality_score': row[6],
'clean_score': row[7],
'resolution': row[8],
'file_size': row[9],
'hamming_distance': distance
})
else:
missing_files += 1
self.logger.debug(
f"[PERCEPTUAL_SEARCH] MATCH but file missing: {existing_filename} (distance: {distance})",
module="Perceptual"
)
self.logger.debug(
f"[PERCEPTUAL_SEARCH] Checked {checked_count} hashes, "
f"{within_threshold} within threshold, "
f"{missing_files} missing files, "
f"{len(results)} valid matches",
module="Perceptual"
)
return results
except Exception as e:
self.logger.error(f"Failed to find similar files: {e}", module="Perceptual")
return []
def _hamming_distance(self, hash1: str, hash2: str) -> int:
"""Calculate Hamming distance between two hashes"""
if len(hash1) != len(hash2):
return 999 # Invalid comparison
return sum(c1 != c2 for c1, c2 in zip(hash1, hash2))
def _get_best_existing_file(self, similar_files: List[dict]) -> dict:
"""Get the best existing file from similar files (highest clean + quality score)"""
if not similar_files:
return None
# Sort by clean score (primary), then quality score (secondary)
sorted_files = sorted(
similar_files,
key=lambda f: (f['clean_score'], f['quality_score']),
reverse=True
)
return sorted_files[0]
def _compare_files(self, new_clean: float, new_quality: float,
existing_clean: float, existing_quality: float,
settings: dict) -> str:
"""
Compare new file vs existing file
Returns: "new_better", "existing_better", or "same"
"""
clean_weight = settings.get('clean_score_weight', 3)
quality_weight = settings.get('quality_score_weight', 1)
min_difference = settings.get('min_text_difference', 5)
# IMPORTANT: Check for extreme quality differences first
# If one file has significantly higher quality, prefer it unless clean score is terrible
# This prevents low-resolution files from winning just because they have less detected text
min_acceptable_clean = settings.get('min_acceptable_clean', 30)
quality_ratio_threshold = settings.get('quality_ratio_threshold', 2.0)
# Check if new file has dramatically better quality
if new_quality > 0 and existing_quality > 0:
quality_ratio = new_quality / existing_quality
reverse_ratio = existing_quality / new_quality
# New file has 2x+ better quality and acceptable clean score
if quality_ratio >= quality_ratio_threshold and new_clean >= min_acceptable_clean:
self.logger.debug(
f"[PERCEPTUAL] New file wins: {quality_ratio:.1f}x better quality "
f"(new: Q={new_quality:.1f}/C={new_clean:.1f}, existing: Q={existing_quality:.1f}/C={existing_clean:.1f})",
module="Perceptual"
)
return "new_better"
# Existing file has 2x+ better quality and acceptable clean score
if reverse_ratio >= quality_ratio_threshold and existing_clean >= min_acceptable_clean:
self.logger.debug(
f"[PERCEPTUAL] Existing file wins: {reverse_ratio:.1f}x better quality "
f"(existing: Q={existing_quality:.1f}/C={existing_clean:.1f}, new: Q={new_quality:.1f}/C={new_clean:.1f})",
module="Perceptual"
)
return "existing_better"
# Standard weighted comparison for cases without extreme quality differences
new_score = (new_clean * clean_weight) + (new_quality * quality_weight)
existing_score = (existing_clean * clean_weight) + (existing_quality * quality_weight)
# Check if difference is significant
score_diff = abs(new_score - existing_score)
min_score_diff = min_difference * clean_weight # Scale by weight
if new_score > existing_score and score_diff >= min_score_diff:
return "new_better"
elif existing_score > new_score and score_diff >= min_score_diff:
return "existing_better"
else:
return "same"
def _store_perceptual_hash(self, file_path: str, platform: str, source: str, content_type: str,
phash: str, text_count: int, text_chars: int,
quality_score: float, clean_score: float, quality_metrics: dict):
"""Store perceptual hash and metadata in database (or update if exists)"""
try:
with self.db.get_connection() as conn:
cursor = conn.cursor()
# Check if hash already exists for this file
cursor.execute("""
SELECT id FROM instagram_perceptual_hashes
WHERE file_path = ?
""", (str(file_path),))
existing = cursor.fetchone()
if existing:
# Update existing entry
cursor.execute("""
UPDATE instagram_perceptual_hashes
SET filename = ?,
platform = ?,
source = ?,
content_type = ?,
perceptual_hash = ?,
text_overlay_count = ?,
text_overlay_chars = ?,
quality_score = ?,
clean_score = ?,
resolution = ?,
file_size = ?,
width = ?,
height = ?,
created_at = CURRENT_TIMESTAMP
WHERE id = ?
""", (
Path(file_path).name,
platform,
source,
content_type or 'unknown',
phash,
text_count,
text_chars,
quality_score,
clean_score,
quality_metrics.get('resolution', 0),
quality_metrics.get('file_size', 0),
quality_metrics.get('width', 0),
quality_metrics.get('height', 0),
existing[0]
))
self.logger.debug(f"[PERCEPTUAL] Updated hash for {Path(file_path).name}", module="Perceptual")
else:
# Insert new entry
entry_id = str(uuid.uuid4())
cursor.execute("""
INSERT INTO instagram_perceptual_hashes
(id, file_path, filename, platform, source, content_type,
perceptual_hash, text_overlay_count, text_overlay_chars,
quality_score, clean_score, resolution, file_size, width, height)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
entry_id,
str(file_path),
Path(file_path).name,
platform,
source,
content_type or 'unknown',
phash,
text_count,
text_chars,
quality_score,
clean_score,
quality_metrics.get('resolution', 0),
quality_metrics.get('file_size', 0),
quality_metrics.get('width', 0),
quality_metrics.get('height', 0)
))
self.logger.debug(f"[PERCEPTUAL] Stored hash for {Path(file_path).name}", module="Perceptual")
conn.commit()
except Exception as e:
self.logger.error(f"Failed to store perceptual hash: {e}", module="Perceptual")
# Note: Connection context manager handles rollback automatically on exception
def _replace_perceptual_hash_entry(self, old_id: str, new_file_path: str,
                                   new_phash: str, new_text_count: int, new_text_chars: int,
                                   new_quality_score: float, new_clean_score: float,
                                   new_quality_metrics: dict):
    """Point an existing hash row at a new file and refresh its measurements.

    The row of ``instagram_perceptual_hashes`` whose ``id`` equals ``old_id``
    is updated in place: path, filename, perceptual hash, text-overlay
    figures, scores and the size/dimension metrics are overwritten and
    ``created_at`` is reset to the current timestamp.

    Args:
        old_id: Primary key of the row to overwrite.
        new_file_path: Path of the file that now owns the row.
        new_phash: Perceptual hash of the new file.
        new_text_count: Number of text overlays detected in the new file.
        new_text_chars: Number of overlay characters detected.
        new_quality_score: Quality score of the new file.
        new_clean_score: Clean score of the new file.
        new_quality_metrics: Mapping read for ``resolution``, ``file_size``,
            ``width`` and ``height`` (each defaults to 0 when absent).

    Any exception is logged through ``self.logger.error`` and swallowed.
    """
    update_sql = """
        UPDATE instagram_perceptual_hashes
        SET file_path = ?,
            filename = ?,
            perceptual_hash = ?,
            text_overlay_count = ?,
            text_overlay_chars = ?,
            quality_score = ?,
            clean_score = ?,
            resolution = ?,
            file_size = ?,
            width = ?,
            height = ?,
            created_at = CURRENT_TIMESTAMP
        WHERE id = ?
    """
    try:
        metric = new_quality_metrics.get
        values = (
            str(new_file_path),
            Path(new_file_path).name,
            new_phash,
            new_text_count,
            new_text_chars,
            new_quality_score,
            new_clean_score,
            metric('resolution', 0),
            metric('file_size', 0),
            metric('width', 0),
            metric('height', 0),
            old_id,
        )
        with self.db.get_connection() as conn:
            conn.cursor().execute(update_sql, values)
            conn.commit()
    except Exception as e:
        self.logger.error(f"Failed to replace perceptual hash entry: {e}", module="Perceptual")
def _move_to_recycle(self, file_path: str, reason: str, **metadata):
    """Move a file to the recycle bin, deleting it if recycling fails.

    Args:
        file_path: Path of the file to remove from its current location.
        reason: Stored under the ``reason`` key of the recycle metadata.
        **metadata: Extra key/value pairs merged into the recycle metadata.

    ``self.db.move_to_recycle_bin`` is treated as failed both when it raises
    and when it returns a falsy value; in either case the file is unlinked
    as a fallback. Nothing is raised to the caller.
    """
    name = Path(file_path).name
    try:
        recycle_id = self.db.move_to_recycle_bin(
            file_path=file_path,
            deleted_from='instagram_perceptual_duplicate_detection',
            deleted_by='system',
            metadata={
                'reason': reason,
                **metadata
            }
        )
    except Exception as e:
        self.logger.warning(f"Failed to move file to recycle: {e}", module="Perceptual")
    else:
        if recycle_id:
            self.logger.debug(f"[PERCEPTUAL] Moved to recycle: {name}", module="Perceptual")
            return
        # A falsy return means the recycle bin did not take the file.
        self.logger.warning(f"Failed to move file to recycle: {name}", module="Perceptual")

    # Fallback: delete the file so the duplicate does not linger.
    try:
        Path(file_path).unlink()
    except FileNotFoundError:
        pass  # Already gone - nothing left to clean up
    except OSError as e:
        self.logger.warning(f"Failed to delete file after recycle failure: {e}", module="Perceptual")

View File

@@ -0,0 +1,163 @@
"""
Shared Instagram API rate limiter.
Tracks authenticated API calls in a rolling 1-hour window and enforces
a configurable max rate. Both the main scraper and paid content modules
use this to avoid exceeding Instagram's rate threshold.
"""
import logging
import os
import threading
import time
from collections import deque
logger = logging.getLogger('media_downloader')

# File holding the unix timestamp until which all Instagram calls are paused.
_PAUSE_FILE = '/opt/media-downloader/data/.ig_paused_until'


class InstagramBlockedError(Exception):
    """Raised when Instagram API calls are paused due to account restriction."""


class _InstagramRateLimiter:
    """Rolling-window limiter for authenticated Instagram API calls.

    Call timestamps are kept in a deque; ``wait_if_needed`` sleeps when the
    number of calls inside the window reaches ``max_calls`` (and adds a
    progressive delay from 80% upwards). A separate "pause until" timestamp,
    persisted to ``_PAUSE_FILE``, blocks every call by raising
    ``InstagramBlockedError``.
    """

    def __init__(self, max_calls_per_hour=180, window_seconds=3600):
        """Create a limiter.

        Args:
            max_calls_per_hour: Maximum calls allowed inside one window.
            window_seconds: Length of the rolling window in seconds.
        """
        self.max_calls = max_calls_per_hour
        self.window = window_seconds
        self._timestamps = deque()
        self._lock = threading.Lock()
        self._operation_lock = threading.Lock()  # Cross-module mutex
        self._paused_until = 0  # Unix timestamp — block all calls until this time
        self._load_pause_state()

    def _load_pause_state(self):
        """Load pause state from disk (survives restarts).

        A timestamp in the future is adopted; an expired one causes the pause
        file to be removed. A missing file is the normal case and is silent.
        """
        try:
            with open(_PAUSE_FILE) as f:
                ts = float(f.read().strip())
        except FileNotFoundError:
            return
        except (OSError, ValueError) as e:
            # Best-effort: an unreadable or corrupt pause file must not block calls.
            logger.debug(f"[IG-RateLimit] Could not read pause file: {e}")
            return

        if ts > time.time():
            self._paused_until = ts
            logger.warning(
                f"[IG-RateLimit] Loaded pause state — blocked until "
                f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))}"
            )
        else:
            # Expired, clean up
            self._remove_pause_file()

    def _remove_pause_file(self):
        """Delete the pause file if present; failures are logged, not raised."""
        try:
            os.remove(_PAUSE_FILE)
        except FileNotFoundError:
            pass
        except OSError as e:
            logger.debug(f"[IG-RateLimit] Could not remove pause file: {e}")

    def pause_until(self, timestamp: float):
        """Block all Instagram API calls until the given unix timestamp."""
        self._paused_until = timestamp
        try:
            os.makedirs(os.path.dirname(_PAUSE_FILE), exist_ok=True)
            with open(_PAUSE_FILE, 'w') as f:
                f.write(str(timestamp))
        except OSError as e:
            # The in-memory pause still applies, but it will not be visible
            # after a restart or to code that only reads the file.
            logger.warning(f"[IG-RateLimit] Could not persist pause state: {e}")
        logger.warning(
            f"[IG-RateLimit] All Instagram API calls PAUSED until "
            f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))}"
        )

    def resume(self):
        """Resume Instagram API calls."""
        self._paused_until = 0
        self._remove_pause_file()
        logger.info("[IG-RateLimit] Instagram API calls RESUMED")

    @property
    def is_paused(self):
        """True while the in-memory pause timestamp lies in the future."""
        return bool(self._paused_until and time.time() < self._paused_until)

    def track_call(self):
        """Record an API call timestamp."""
        with self._lock:
            self._timestamps.append(time.time())

    def _purge_expired(self, now):
        """Drop timestamps older than the window. Caller must hold ``_lock``."""
        cutoff = now - self.window
        while self._timestamps and self._timestamps[0] < cutoff:
            self._timestamps.popleft()

    def _raise_if_paused(self):
        """Raise ``InstagramBlockedError`` while paused; auto-resume once expired."""
        # Re-check file if not paused in memory (the file may have been written elsewhere)
        if not self._paused_until:
            self._load_pause_state()
        if not self._paused_until:
            return
        remaining = self._paused_until - time.time()
        if remaining > 0:
            hours = remaining / 3600
            raise InstagramBlockedError(
                f"Instagram API paused — account restricted. "
                f"Resuming in {hours:.1f}h"
            )
        # Restriction expired, auto-resume
        self.resume()

    def wait_if_needed(self):
        """Block if approaching rate limit. Call before each authenticated API request.

        The call is recorded in the window before returning.

        Raises:
            InstagramBlockedError: if calls are paused due to account restriction.
        """
        self._raise_if_paused()

        already_delayed = False
        while True:
            with self._lock:
                now = time.time()
                self._purge_expired(now)
                count = len(self._timestamps)

                if count >= self.max_calls:
                    # At limit — wait until the oldest call in window expires
                    delay = self._timestamps[0] - (now - self.window) + 1
                    logger.warning(
                        f"[IG-RateLimit] At limit ({count}/{self.max_calls} calls/hr) — "
                        f"waiting {delay:.0f}s"
                    )
                elif count >= self.max_calls * 0.8 and not already_delayed:
                    # Approaching limit (80%+) — add progressive delay
                    ratio = count / self.max_calls
                    delay = 2 + (ratio - 0.8) * 40  # 2s at 80%, 10s at 100%
                    logger.info(
                        f"[IG-RateLimit] Approaching limit ({count}/{self.max_calls} calls/hr) — "
                        f"adding {delay:.1f}s delay"
                    )
                else:
                    self._timestamps.append(now)
                    return

            # Sleep outside the lock, then loop to re-check the window: other
            # threads may have recorded calls while this one was asleep.
            time.sleep(delay)
            already_delayed = True

    @property
    def operation_lock(self):
        """Lock for serializing Instagram operations (main scraper vs paid content)."""
        return self._operation_lock

    @property
    def calls_in_window(self):
        """Number of recorded calls that fall inside the current window."""
        with self._lock:
            self._purge_expired(time.time())
            return len(self._timestamps)


# Module-level singleton
rate_limiter = _InstagramRateLimiter(max_calls_per_hour=180)

View File

@@ -0,0 +1,782 @@
#!/usr/bin/env python3
"""
Instagram Story Repost Detector Module
Detects when Instagram stories are reposts/screenshots of other users' content,
then replaces low-quality reposts with high-quality originals from the source.
Features:
- OCR-based repost detection (@username extraction)
- ImgInn for downloading both stories and posts
- Perceptual hash matching for content identification
- Smart account filtering (monitored vs non-monitored)
- Automatic cleanup of temporary downloads
- Database tracking of all replacements
"""
import os
import re
import shutil
from pathlib import Path
from typing import Optional, Dict, List, Tuple
from datetime import datetime, timedelta
from modules.base_module import LoggingMixin
from modules.universal_logger import get_logger
# Module-level logger for import-time messages
_module_logger = get_logger('RepostDetector')
# Optional imports - fail gracefully if not available
from PIL import Image # Always needed
# OCR disabled — not currently needed
EASYOCR_AVAILABLE = False
TESSERACT_AVAILABLE = False
try:
import cv2
import numpy as np
CV2_AVAILABLE = True
except ImportError:
CV2_AVAILABLE = False
_module_logger.warning("opencv-python not available - video processing disabled", module='RepostDetector')
try:
import imagehash
IMAGEHASH_AVAILABLE = True
except ImportError:
IMAGEHASH_AVAILABLE = False
_module_logger.warning("imagehash not available - perceptual hashing disabled", module='RepostDetector')
class InstagramRepostDetector(LoggingMixin):
"""
Detects and replaces Instagram story reposts with original content
"""
def __init__(self, unified_db, log_callback=None):
    """
    Set up the detector.

    Args:
        unified_db: UnifiedDatabase instance
        log_callback: Optional logging callback function(message, level)
    """
    # Logging comes from the mixin
    self._init_logger('RepostDetector', log_callback, default_module='RepostDetector')

    self.db = unified_db
    self.last_original_username = None
    self.easyocr_reader = None

    # Scratch directory; created up front so later steps can assume it exists
    self.temp_download_path = Path("/tmp/repost_detection")
    self.temp_download_path.mkdir(parents=True, exist_ok=True)

    # The EasyOCR reader is only built when the module-level flag enables it
    if EASYOCR_AVAILABLE:
        try:
            # Suppress PyTorch pin_memory warning (we're using CPU anyway)
            import warnings
            warnings.filterwarnings('ignore', category=UserWarning, module='torch.utils.data.dataloader')
            self.easyocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
            self.log("EasyOCR initialized for text detection", "info")
        except Exception as e:
            self.log(f"Failed to initialize EasyOCR: {e}", "warning")
            self.easyocr_reader = None

    # Report anything that is missing
    self._check_dependencies()
def _check_dependencies(self):
    """Log a warning listing every optional dependency whose flag is off."""
    requirements = (
        (TESSERACT_AVAILABLE, "pytesseract/PIL (pip3 install pytesseract pillow)"),
        (CV2_AVAILABLE, "opencv-python (pip3 install opencv-python)"),
        (IMAGEHASH_AVAILABLE, "imagehash (pip3 install imagehash)"),
    )
    missing = [hint for available, hint in requirements if not available]
    if not missing:
        return

    self.log(f"Missing dependencies: {', '.join(missing)}", "warning")
    self.log("Repost detection will be disabled until dependencies are installed", "warning")
def check_and_replace_repost(self, file_path: str, source_username: str) -> Optional[str]:
    """
    Detect whether a story is a repost and, if so, swap in the original.

    Args:
        file_path: Path to potential repost file
        source_username: Username who posted this story (e.g., evalongoria)

    Returns:
        Path to replacement file if found, None otherwise
    """
    # Nothing can be done unless every dependency flag is set
    if not (TESSERACT_AVAILABLE and CV2_AVAILABLE and IMAGEHASH_AVAILABLE):
        self.log("Cannot process - missing dependencies", "debug")
        return None

    story = Path(file_path)
    if not story.exists():
        self.log(f"File not found: {story}", "error")
        return None

    self.log(f"Checking for repost: {story.name}", "info")

    # Step 1: OCR the overlay for the credited @username
    credited = self._extract_username_from_repost(str(story))
    if not credited:
        self.log("No @username detected - not a repost", "debug")
        return None

    # A user crediting themselves is not a repost
    if credited.lower() == source_username.lower():
        self.log(f"@{source_username} is reposting their own content - skipping", "debug")
        return None

    self.log(f"Detected repost from @{credited} in @{source_username}'s story", "info")
    self.last_original_username = credited

    # Steps 2-3: monitored or not, the source's content is saved permanently
    # and recorded in the database; only the log line differs.
    monitored = self._is_monitored_account(credited)
    download_path = Path("/opt/immich/md/instagram") / credited
    if monitored:
        self.log(f"@{credited} is monitored - checking existing content", "info")
    else:
        self.log(f"@{credited} NOT monitored - but saving permanently (discovered via repost)", "info")

    # Steps 4-5: download stories + recent posts unless fetched recently
    if self._already_fetched_today(credited):
        self.log(f"Content from @{credited} already fetched today - using cache", "info")
    else:
        self.log(f"Downloading content from @{credited} via ImgInn...", "info")
        downloaded = self._download_content_via_imginn(
            username=credited,
            destination=download_path,
            add_to_database=True
        )
        if not downloaded:
            self.log(f"Failed to download content from @{credited}", "error")
            return None

    # Step 6: look for the original by perceptual hash
    match = self._find_matching_original(
        repost_path=str(story),
        search_dir=download_path
    )
    if not match:
        self.log(f"No matching original found for {story.name}", "warning")
        # Downloaded files stay in place for later processing
        self.log(f"Keeping all downloaded content from @{credited} for processing", "info")
        return None

    # Step 7: swap the repost for the original
    replacement = self._replace_repost_with_original(
        repost_path=str(story),
        original_path=match
    )

    # Downloaded files are kept
    self.log(f"All content from @{credited} saved to {download_path}", "info")
    return replacement
def _extract_username_region(self, img: Image.Image) -> Image.Image:
    """Return the top 8% strip of ``img``, prepared for OCR.

    With OpenCV available the strip is converted to grayscale and enlarged
    4x (cubic interpolation); without it the strip is simply cropped with
    PIL. If the OpenCV path raises, the unmodified input image is returned.
    """
    if CV2_AVAILABLE:
        try:
            # PIL (RGB) -> OpenCV (BGR)
            bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

            # Keep only the band at the top where the username is expected
            rows, _cols = bgr.shape[:2]
            band = bgr[0:int(rows * 0.08), :]

            # Grayscale, then 4x upscale because the text is small
            gray = cv2.cvtColor(band, cv2.COLOR_BGR2GRAY)
            enlarged = cv2.resize(gray, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)

            return Image.fromarray(enlarged)
        except Exception as e:
            self.log(f"Username region extraction failed: {e}", "debug")
            return img

    # No OpenCV: plain PIL crop of the same band
    width, height = img.size
    return img.crop((0, 0, width, int(height * 0.08)))
def _extract_username_from_repost(self, file_path: str) -> Optional[str]:
    """
    Extract @username from repost overlay using OCR (EasyOCR primary, Tesseract fallback)

    Images are OCR'd directly; for videos the frames at 0%, 10% and 50% of
    the frame count are OCR'd and their text concatenated. File type is
    decided by extension, compared case-insensitively.

    Args:
        file_path: Path to the image or video file.

    Returns:
        The username without the leading '@', or None when no OCR engine is
        available, the format is unsupported, the video cannot be read, OCR
        fails, or no username-like text is found.
    """
    # Check if we have any OCR available
    if not self.easyocr_reader and not TESSERACT_AVAILABLE:
        self.log("No OCR engine available", "warning")
        return None

    image_exts = ('.jpg', '.jpeg', '.png', '.webp', '.heic')
    video_exts = ('.mp4', '.mov', '.avi', '.mkv', '.webm')
    tesseract_configs = ('--psm 7', '--psm 11', '--psm 6')
    # Lower-cased copy used only for the extension test, so "IMG.JPG" is accepted
    lowered_path = str(file_path).lower()

    try:
        if lowered_path.endswith(image_exts):
            text = ""
            # Try EasyOCR first
            if self.easyocr_reader:
                try:
                    results = self.easyocr_reader.readtext(file_path)
                    # EasyOCR returns (bbox, text, confidence); keep >50% confidence
                    text = " ".join(found for _bbox, found, conf in results if conf > 0.5)
                    if text.strip():
                        self.log(f"EasyOCR detected text: {text[:100]}", "debug")
                except Exception as e:
                    self.log(f"EasyOCR failed: {e}, falling back to Tesseract", "debug")
                    text = ""

            # Fallback to Tesseract if EasyOCR didn't find anything
            if not text.strip() and TESSERACT_AVAILABLE:
                with Image.open(file_path) as img:
                    username_region = self._extract_username_region(img)
                    for config in tesseract_configs:
                        try:
                            ocr_result = pytesseract.image_to_string(username_region, config=config)
                            if ocr_result and len(ocr_result.strip()) > 2:
                                text = ocr_result
                                self.log(f"Tesseract (fallback) text: {text[:100]}", "debug")
                                break
                        except Exception:
                            continue

        elif lowered_path.endswith(video_exts):
            if not CV2_AVAILABLE:
                self.log("OpenCV not available - cannot process video", "warning")
                return None

            video = cv2.VideoCapture(file_path)
            try:
                if not video.isOpened():
                    self.log(f"Failed to open video: {file_path}", "warning")
                    return None

                frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
                if frame_count == 0:
                    self.log(f"Video has no frames: {file_path}", "warning")
                    return None

                # Frames at 0%, 10% and 50%; dict.fromkeys drops duplicates
                # (short videos) while keeping order.
                frames_to_check = list(dict.fromkeys([
                    0,
                    max(0, int(frame_count * 0.1)),
                    max(0, int(frame_count * 0.5)),
                ]))

                text = ""
                for frame_num in frames_to_check:
                    video.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
                    ret, frame = video.read()
                    if not ret:
                        continue

                    # Try EasyOCR first
                    if self.easyocr_reader:
                        try:
                            results = self.easyocr_reader.readtext(frame)
                            for _bbox, frame_text, conf in results:
                                if conf > 0.5:
                                    text += frame_text + " "
                        except Exception as e:
                            self.log(f"EasyOCR video frame failed: {e}", "debug")

                    # Fallback to Tesseract if needed
                    if not text.strip() and TESSERACT_AVAILABLE:
                        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        img = Image.fromarray(frame_rgb)
                        username_region = self._extract_username_region(img)
                        for config in tesseract_configs:
                            try:
                                ocr_result = pytesseract.image_to_string(username_region, config=config)
                                if ocr_result and len(ocr_result.strip()) > 2:
                                    text += ocr_result + "\n"
                                    break
                            except Exception:
                                continue
            finally:
                # Release the capture on every exit path, including early returns
                video.release()

            self.log(f"OCR text (video, {len(frames_to_check)} frames): {text[:100]}...", "debug")

        else:
            self.log(f"Unsupported file format: {file_path}", "debug")
            return None

        return self._parse_username_from_ocr_text(text)

    except Exception as e:
        self.log(f"OCR extraction failed: {e}", "warning")

    return None

def _parse_username_from_ocr_text(self, text: str) -> Optional[str]:
    """Pick an Instagram-style username out of raw OCR text, or return None."""
    # Pattern 1: '@' followed by username characters. Spaces are accepted
    # because OCR sometimes reads underscores as spaces.
    at_match = re.search(r'@([a-zA-Z0-9._ ]+)', text)
    if at_match:
        # Clean up: remove surrounding spaces, convert spaces to underscores
        username = at_match.group(1).strip().replace(' ', '_')
        # Remove any characters that aren't valid in Instagram usernames
        username = re.sub(r'[^a-zA-Z0-9._]', '', username)
        # Remove trailing dots/underscores
        username = username.rstrip('._')
        if len(username) >= 3:
            self.log(f"Extracted username (with @): @{username}", "info")
            return username

    # Pattern 2: a whole line that looks like a username without '@':
    # 3-30 chars of lowercase alphanumerics, dots and underscores, not ending
    # in a dot and containing at least one letter.
    for line in text.split('\n'):
        line = line.strip().lower()
        if re.match(r'^[a-z0-9._]{3,30}$', line):
            if not line.endswith('.') and re.search(r'[a-z]', line):
                self.log(f"Extracted username (without @): @{line}", "info")
                return line

    return None
def _is_monitored_account(self, username: str) -> bool:
    """
    Report whether ``username`` has an active Instagram-family search monitor.

    A match is a row in ``search_monitors`` with ``active = 1``, ``source``
    equal to ``username`` and a platform of instagram, instaloader, fastdl
    or imginn. Database errors are logged and reported as False.
    """
    lookup_sql = """
        SELECT 1 FROM search_monitors
        WHERE platform IN ('instagram', 'instaloader', 'fastdl', 'imginn')
        AND source = ?
        AND active = 1
        LIMIT 1
    """
    try:
        with self.db.get_connection() as conn:
            cur = conn.cursor()
            cur.execute(lookup_sql, (username,))
            row = cur.fetchone()
        return row is not None
    except Exception as e:
        self.log(f"Error checking monitored status: {e}", "error")
        return False
def _already_fetched_today(self, username: str) -> bool:
    """
    Check if this user's content was downloaded within the last 12 hours.

    Uses the ``repost_fetch_cache`` table (created here if missing). The
    cutoff is computed with ``datetime.now()`` so that it uses the same
    naive local clock and ISO format as the ``last_fetched`` values written
    by ``_mark_fetched``; SQLite's ``datetime('now')`` is UTC and would skew
    the window by the local UTC offset.

    Returns:
        True if a cache row newer than 12 hours exists, False otherwise or
        when the database check fails (the error is logged).
    """
    try:
        with self.db.get_connection() as conn:
            cursor = conn.cursor()

            # Create cache table if doesn't exist
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS repost_fetch_cache (
                    username TEXT PRIMARY KEY,
                    last_fetched TEXT NOT NULL,
                    content_count INTEGER DEFAULT 0
                )
            """)

            # ISO-8601 strings from the same clock order chronologically
            # under plain string comparison.
            cutoff = (datetime.now() - timedelta(hours=12)).isoformat()
            cursor.execute("""
                SELECT last_fetched FROM repost_fetch_cache
                WHERE username = ?
                AND last_fetched > ?
            """, (username, cutoff))

            return cursor.fetchone() is not None

    except Exception as e:
        self.log(f"Error checking fetch cache: {e}", "error")
        return False
def _mark_fetched(self, username: str, content_count: int = 0):
    """Record in ``repost_fetch_cache`` that this user's content was just fetched.

    Inserts or replaces the user's row with the current ``datetime.now()``
    in ISO format and the given item count. Errors are logged, not raised.
    """
    upsert_sql = """
        INSERT OR REPLACE INTO repost_fetch_cache
        (username, last_fetched, content_count)
        VALUES (?, ?, ?)
    """
    try:
        with self.db.get_connection(for_write=True) as conn:
            fetched_at = datetime.now().isoformat()
            conn.cursor().execute(upsert_sql, (username, fetched_at, content_count))
        self.log(f"Marked @{username} as fetched ({content_count} items)", "debug")
    except Exception as e:
        self.log(f"Error marking fetch: {e}", "error")
def _download_content_via_imginn(self, username: str, destination: Path, add_to_database: bool) -> bool:
    """
    Fetch a user's stories and recent posts through ImgInn.

    Files go to ``destination/stories`` and ``destination/posts``. The fetch
    is recorded with ``_mark_fetched`` regardless of how many items arrived.

    Args:
        username: Instagram username
        destination: Directory under which ``stories/`` and ``posts/`` are created
        add_to_database: If False, the downloader gets no database and is
            told to skip database recording

    Returns:
        True if at least one item was downloaded, False otherwise (including
        when any step raises; the error is logged)
    """
    try:
        from modules.imginn_module import ImgInnDownloader

        skip_db = not add_to_database
        downloader = ImgInnDownloader(
            unified_db=self.db if add_to_database else None,
            log_callback=lambda msg, lvl: self.log(msg, lvl)
        )

        # Prepare target folders
        stories_dir = destination / "stories"
        posts_dir = destination / "posts"
        for folder in (stories_dir, posts_dir):
            folder.mkdir(parents=True, exist_ok=True)

        # Stories
        self.log(f"Downloading stories from @{username} via ImgInn...", "info")
        stories_files = downloader.download_stories(
            username=username,
            max_stories=50,
            output_dir=stories_dir,
            skip_database=skip_db
        )
        # Anything other than a list counts as zero items
        stories_count = len(stories_files) if isinstance(stories_files, list) else 0

        # Posts, limited to max_age_hours=24
        self.log(f"Downloading recent posts from @{username} via ImgInn...", "info")
        posts_files = downloader.download_posts(
            username=username,
            max_posts=50,
            output_dir=posts_dir,
            max_age_hours=24,
            skip_database=skip_db
        )
        posts_count = len(posts_files) if isinstance(posts_files, list) else 0

        total_count = stories_count + posts_count
        self.log(f"Downloaded {total_count} items ({stories_count} stories, {posts_count} posts)", "info")

        # Remember this fetch so it is not repeated within the cache window
        self._mark_fetched(username, total_count)

        return total_count > 0

    except Exception as e:
        self.log(f"ImgInn download failed: {e}", "error")
        return False
def _find_matching_original(self, repost_path: str, search_dir: Path) -> Optional[str]:
    """
    Locate the downloaded file whose perceptual hash is closest to the repost.

    Media files under ``search_dir/stories`` and ``search_dir/posts`` are
    hashed and compared with the repost; the candidate with the smallest
    distance wins, provided that distance is below 10.

    Args:
        repost_path: Path to the repost file
        search_dir: Directory holding the ``stories``/``posts`` subfolders

    Returns:
        Path to best matching original, or None
    """
    if not IMAGEHASH_AVAILABLE:
        self.log("imagehash not available - cannot match", "warning")
        return None

    repost_hash = self._get_perceptual_hash(repost_path)
    if not repost_hash:
        self.log(f"Failed to calculate hash for repost: {repost_path}", "warning")
        return None

    self.log(f"Repost hash: {repost_hash}", "debug")

    media_suffixes = ['.jpg', '.jpeg', '.png', '.mp4', '.mov', '.avi', '.webp']
    max_distance = 10  # Hamming distance threshold (0-64 scale)
    winner = None
    winner_distance = 999

    for folder_name in ("stories", "posts"):
        folder = search_dir / folder_name
        if not folder.exists():
            self.log(f"Directory not found: {folder}", "debug")
            continue

        entries = list(folder.rglob("*"))
        self.log(f"Checking {len(entries)} files in {folder}", "debug")

        for entry in entries:
            # Only regular media files are candidates
            if not entry.is_file() or entry.suffix.lower() not in media_suffixes:
                continue

            other_hash = self._get_perceptual_hash(str(entry))
            if not other_hash:
                continue

            # Subtracting two hashes yields their Hamming distance
            distance = repost_hash - other_hash
            self.log(f" {entry.name}: distance={distance}", "debug")

            if distance < max_distance and distance < winner_distance:
                winner_distance = distance
                winner = str(entry)
                self.log(f"Better match found: {entry.name} (distance: {distance})", "info")

    if winner is None:
        self.log(f"✗ No matching original found for {Path(repost_path).name}", "warning")
        return None

    self.log(f"✓ Found original: {Path(winner).name} (distance: {winner_distance})", "success")
    return winner
def _get_perceptual_hash(self, file_path: str):
    """Calculate a difference hash (dhash) for an image or a video.

    Images are hashed directly; for videos the middle frame is hashed. The
    file type is decided by extension, compared case-insensitively.

    Returns:
        The ``imagehash`` hash object, or None when imagehash (or OpenCV,
        for videos) is unavailable, the extension is not recognised, the
        video cannot be read, or hashing raises (the error is logged).
    """
    if not IMAGEHASH_AVAILABLE:
        return None

    # Lower-cased copy used only for the extension test, so "IMG.JPG" is accepted
    lowered_path = str(file_path).lower()

    try:
        # Image: direct hash
        if lowered_path.endswith(('.jpg', '.jpeg', '.png', '.webp', '.heic')):
            with Image.open(file_path) as img:
                return imagehash.dhash(img)

        # Video: hash middle frame
        if lowered_path.endswith(('.mp4', '.mov', '.avi', '.mkv', '.webm')):
            if not CV2_AVAILABLE:
                return None

            video = cv2.VideoCapture(file_path)
            try:
                if not video.isOpened():
                    return None

                frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
                if frame_count <= 0:
                    return None

                video.set(cv2.CAP_PROP_POS_FRAMES, frame_count // 2)
                ret, frame = video.read()
            finally:
                # Release the capture on every exit path, including errors
                video.release()

            if ret:
                # Convert BGR to RGB before handing the frame to PIL
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                img = Image.fromarray(frame_rgb)
                try:
                    return imagehash.dhash(img)
                finally:
                    img.close()

    except Exception as e:
        self.log(f"Hash calculation failed for {Path(file_path).name}: {e}", "debug")

    return None
def _replace_repost_with_original(self, repost_path: str, original_path: str) -> str:
    """
    Replace repost file with original high-quality file

    Workflow:
    1. Move the repost to the recycle bin when a database is available;
       if that fails (or there is no database) delete the repost instead.
    2. Record the replacement via ``_record_repost_replacement``.
    3. Return the path of the ORIGINAL file, unchanged.

    Args:
        repost_path: Path to repost (e.g., evalongoria_story6.mp4)
        original_path: Path to original (e.g., globalgiftfoundation_20251109_100000.jpg)

    Returns:
        Path to original file (keeps original filename and metadata)
    """
    repost_file = Path(repost_path)
    original_file = Path(original_path)

    # Prefer the recycle bin (recoverable) over deletion
    recycled = False
    if self.db:
        try:
            recycle_id = self.db.move_to_recycle_bin(
                file_path=str(repost_file),
                deleted_from='repost_detection',
                deleted_by='system',
                metadata={
                    'reason': 'replaced_with_original',
                    'original_source': str(original_file),
                    'original_username': self.last_original_username
                }
            )
        except Exception as e:
            self.log(f"Recycle bin failed: {e}, deleting repost", "warning")
        else:
            if recycle_id:
                # str() so a non-string identifier cannot break the log line
                self.log(f"Moved repost to recycle bin: {repost_file.name} (ID: {str(recycle_id)[:8]}...)", "info")
                recycled = True
            else:
                self.log("Failed to move repost to recycle bin, will delete instead", "warning")

    # Fallback (or no database): delete the repost
    if not recycled:
        try:
            repost_file.unlink()
            self.log(f"Deleted repost: {repost_file.name}", "debug")
        except FileNotFoundError:
            pass  # Already gone - nothing to delete
        except OSError as e:
            self.log(f"Failed to delete repost: {e}", "warning")

    self.log(f"Replacing repost with original: {repost_file.name} → {original_file.name}", "info")

    # Update database to track replacement
    self._record_repost_replacement(
        repost_path=str(repost_file),
        original_path=str(original_file),
        replacement_path=str(original_file)  # Same as original - keeps original filename
    )

    return str(original_file)
def _record_repost_replacement(self, repost_path: str, original_path: str, replacement_path: str):
    """
    Log a repost replacement in the ``repost_replacements`` table.

    The table is created on first use. ``repost_source`` is taken from the
    name of the repost's parent directory and ``original_username`` from
    ``self.last_original_username`` (``"unknown"`` when unset). The
    ``hash_distance`` column is not filled in here. Errors are logged and
    not raised.
    """
    create_sql = """
        CREATE TABLE IF NOT EXISTS repost_replacements (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            repost_path TEXT NOT NULL,
            repost_filename TEXT NOT NULL,
            repost_source TEXT NOT NULL,
            original_path TEXT NOT NULL,
            original_username TEXT NOT NULL,
            replacement_path TEXT NOT NULL,
            detected_at TEXT DEFAULT CURRENT_TIMESTAMP,
            hash_distance INTEGER
        )
    """
    insert_sql = """
        INSERT INTO repost_replacements
        (repost_path, repost_filename, repost_source, original_path, original_username, replacement_path)
        VALUES (?, ?, ?, ?, ?, ?)
    """
    try:
        with self.db.get_connection(for_write=True) as conn:
            cur = conn.cursor()
            cur.execute(create_sql)

            repost = Path(repost_path)
            repost_source = repost.parent.name
            original_username = self.last_original_username or "unknown"

            row = (
                repost_path,
                repost.name,
                repost_source,
                original_path,
                original_username,
                replacement_path,
            )
            cur.execute(insert_sql, row)

        self.log(f"Recorded replacement: {repost_source} → @{original_username}", "debug")
    except Exception as e:
        self.log(f"Failed to record replacement: {e}", "error")
def _cleanup_temp_downloads(self, temp_dir: Path, keep_file: str = None):
    """
    Delete the files under a temporary download directory.

    Every file below ``temp_dir`` is removed except ``keep_file``; then the
    ``stories``/``posts`` subfolders and ``temp_dir`` itself are removed if
    they ended up empty. Failures are logged at debug level, never raised.

    Args:
        temp_dir: Directory to clean (e.g., /tmp/repost_detection/username/)
        keep_file: Optional file to preserve (the matched original)
    """
    if not temp_dir.exists():
        return

    preserved = Path(keep_file) if keep_file else None
    removed = 0

    try:
        for entry in temp_dir.rglob("*"):
            if not entry.is_file():
                continue
            if preserved is not None and entry == preserved:
                continue  # the matched file stays

            try:
                entry.unlink()
            except Exception as e:
                self.log(f"Failed to delete temp file {entry.name}: {e}", "debug")
            else:
                removed += 1

        # Drop the known subfolders, then the root, when nothing is left in them
        for folder in (temp_dir / "stories", temp_dir / "posts"):
            if folder.exists() and not any(folder.iterdir()):
                folder.rmdir()

        if not any(temp_dir.iterdir()):
            temp_dir.rmdir()

        self.log(f"Cleaned up {removed} temporary files", "info")

    except Exception as e:
        self.log(f"Failed to cleanup directories: {e}", "debug")
if __name__ == "__main__":
    # Running the module directly only prints a dependency report.
    print("Instagram Repost Detector Module")
    print("This module should be imported, not run directly")
    print("\nDependencies:")
    for label, available in (
        ("pytesseract/PIL", TESSERACT_AVAILABLE),
        ("opencv-python", CV2_AVAILABLE),
        ("imagehash", IMAGEHASH_AVAILABLE),
    ):
        # Both branches used to be empty strings, so the status never showed.
        print(f" - {label}: {'available' if available else 'missing'}")

461
modules/instagram_utils.py Normal file
View File

@@ -0,0 +1,461 @@
#!/usr/bin/env python3
"""
Instagram Utilities Module
Shared utility functions for Instagram downloaders (imginn, fastdl, toolzu, instaloader).
Centralizes common functionality like media ID extraction to avoid code duplication.
"""
import re
from datetime import datetime
from pathlib import Path
from typing import Optional, Set, Dict, Any
def extract_instagram_media_id(filename_or_id: str) -> str:
    """Extract the actual Instagram media ID from a filename or ID string.

    Instagram image filenames follow the pattern:
        {user_id}_{media_id}_{post_id}_n.ext

    The media ID is recognised as a 17-19 digit number starting with ``18``
    that is delimited by underscores or the start/end of the string. Anything
    else - story keys (``AQ...``), short post codes, arbitrary strings - is
    returned unchanged.

    Args:
        filename_or_id: A filename like '591164014_18551181784006538_2284814566270897032_n'
            or just a media ID string

    Returns:
        The extracted Instagram media ID, or the original value if no
        media ID is found (including empty/None input)

    Examples:
        >>> extract_instagram_media_id('591164014_18551181784006538_2284814566270897032_n')
        '18551181784006538'
        >>> extract_instagram_media_id('18551181784006538')
        '18551181784006538'
        >>> extract_instagram_media_id('AQOOlj6M4PlGHBuYl02KzwUXefsdiou9q3ooFiNF4cUy3DEY6QKxROoUe9BKCeVJA4UF5BiVPIuqXheCU')
        'AQOOlj6M4PlGHBuYl02KzwUXefsdiou9q3ooFiNF4cUy3DEY6QKxROoUe9BKCeVJA4UF5BiVPIuqXheCU'
    """
    if not filename_or_id:
        return filename_or_id

    # Underscore or start/end act as boundaries (\b does not work here because
    # '_' is a word character). This also matches a bare media ID.
    match = re.search(r'(?:^|_)(18\d{15,17})(?:_|$)', filename_or_id)
    if match:
        return match.group(1)

    # Story keys, short post codes and unrecognised strings are used as-is.
    return filename_or_id
def extract_media_id_from_url(url: str) -> Optional[str]:
    """Pull the first Instagram media ID out of a CDN URL.

    CDN filenames look like
        561378837_18538674661006538_479694548187839800_n.jpg
    and the middle number (17-19 digits) is the media ID.

    Args:
        url: Instagram CDN URL string

    Returns:
        Media ID string, or None for an empty URL or when nothing matches
    """
    if not url:
        return None

    # number_MEDIAID_number_n.<ext>; group 2 is the media ID
    found = re.search(r'(\d+)_(\d{17,19})_\d+_n\.(jpg|mp4|jpeg|png)', url)
    return found.group(2) if found else None
def extract_media_ids_from_url(url: str) -> list:
    """Collect every Instagram media ID embedded in a URL.

    Uses the same file-name shape as the single-ID extractor
    (``<digits>_<MEDIA ID>_<digits>_n.<ext>``) but reports all occurrences.

    Args:
        url: URL string that may contain Instagram media IDs

    Returns:
        List of media IDs in order of appearance (empty when none are found)
    """
    if not url:
        return []

    id_pattern = r'(\d+)_(\d{17,19})_\d+_n\.(jpg|mp4|jpeg|png)'
    # Group 2 is the media ID sitting between the two surrounding numbers
    return [hit.group(2) for hit in re.finditer(id_pattern, url)]
def extract_post_shortcode(url: str) -> Optional[str]:
    """Extract Instagram post shortcode from a URL.

    Args:
        url: Instagram URL like https://www.instagram.com/p/ABC123/

    Returns:
        Shortcode string or None if not found
    """
    if not url:
        return None

    # Stop at '/', '?' or '#' so that share links without a trailing slash
    # (e.g. /p/ABC123?igshid=...) do not drag the query string or fragment
    # into the shortcode.
    match = re.search(r'/p/([^/?#]+)', url)
    return match.group(1) if match else None
def media_id_to_shortcode(media_id: str) -> str:
    """Encode a numeric Instagram media ID as its shortcode.

    The ID is written in base 64 using the alphabet ``A-Z a-z 0-9 - _``.

    Args:
        media_id: Numeric media ID string

    Returns:
        The shortcode string; the input unchanged when it cannot be parsed
        as an integer; 'A' when the parsed value is zero or negative
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'

    try:
        remaining = int(media_id)
    except (ValueError, TypeError):
        return media_id  # Not a number - hand it back untouched

    # Collect base-64 digits least-significant first, then flip them
    digits = []
    while remaining > 0:
        remaining, index = divmod(remaining, 64)
        digits.append(alphabet[index])

    return ''.join(reversed(digits)) or 'A'
def scan_existing_files_for_media_ids(output_dir: Path, profile_name: str = None,
                                      min_file_size: int = 0, recursive: bool = True) -> Set[str]:
    """Scan existing files and extract media IDs for duplicate detection.

    Scans image and video files in the output directory, extracts both the
    full media ID string and the normalized Instagram media ID (18-digit number).

    File names are expected to look like ``profile_YYYYMMDD_HHMMSS_mediaid``.
    The profile part may itself contain underscores (common in Instagram
    usernames), so the date/time stamp is located by its fixed-width digit
    groups rather than by position.

    Args:
        output_dir: Directory to scan for existing files
        profile_name: Optional profile name to filter files
        min_file_size: Minimum file size in bytes (skip smaller files as corrupted)
        recursive: If True, search subdirectories (rglob), otherwise only top level (glob)

    Returns:
        Set of media IDs (both full and normalized) found in existing files
    """
    media_ids = set()

    if not output_dir.exists():
        return media_ids

    # profile (non-greedy, may contain '_') + 8-digit date + 6-digit time + media id
    name_pattern = re.compile(r'^(.+?)_(\d{8})_(\d{6})_(.+)$')

    glob_func = output_dir.rglob if recursive else output_dir.glob

    for pattern in ["*.jpg", "*.jpeg", "*.png", "*.heic", "*.mp4", "*.mov"]:
        for filepath in glob_func(pattern):
            # Skip files smaller than min_file_size (likely corrupted/incomplete)
            if min_file_size > 0:
                try:
                    if filepath.stat().st_size < min_file_size:
                        continue
                except OSError:
                    continue

            filename = filepath.stem

            structured = name_pattern.match(filename)
            if structured:
                # Check profile name if provided
                if profile_name and structured.group(1) != profile_name:
                    continue
                media_id_full = structured.group(4)
            else:
                # No recognizable date/time stamp: keep the positional
                # split used for non-standard names
                parts = filename.split('_', 3)
                if len(parts) >= 4:
                    if profile_name and parts[0] != profile_name:
                        continue
                    media_id_full = parts[3]
                elif len(parts) > 1:
                    media_id_full = parts[-1]
                else:
                    media_id_full = filename

            if media_id_full:
                # Add the full media ID string
                media_ids.add(media_id_full)

                # Also add the normalized Instagram media ID (18-digit number)
                normalized_id = extract_instagram_media_id(media_id_full)
                if normalized_id and normalized_id != media_id_full:
                    media_ids.add(normalized_id)

    return media_ids
def parse_instagram_filename(filename: str) -> dict:
    """Parse an Instagram filename into its components.

    The expected layout is ``username_YYYYMMDD_HHMMSS_mediaid[_storyN].ext``.
    Usernames may contain underscores, so the date/time stamp is located by
    its fixed-width digit groups; names without such a stamp fall back to a
    plain positional split.

    Args:
        filename: Filename like 'evalongoria_20251205_120406_591164014_18551181784006538_2284814566270897032_n_story1.jpg'

    Returns:
        Dictionary with parsed components:
        - username: str or None
        - date: str or None (YYYYMMDD format)
        - time: str or None (HHMMSS format)
        - media_id_full: str or None (full ID after date/time)
        - media_id: str or None (normalized 18-digit Instagram media ID)
        - suffix: str or None (e.g., 'story1')
        - extension: str or None
    """
    result = {
        'username': None,
        'date': None,
        'time': None,
        'media_id_full': None,
        'media_id': None,
        'suffix': None,
        'extension': None
    }

    if not filename:
        return result

    # Get extension
    path = Path(filename)
    result['extension'] = path.suffix.lower() if path.suffix else None
    basename = path.stem

    # Preferred: anchor on the date/time stamp so 'eva_longoria_2025...'
    # keeps its full username instead of being cut at the first underscore
    structured = re.match(r'^(.+?)_(\d{8})_(\d{6})_(.+)$', basename)
    if structured:
        result['username'] = structured.group(1)
        result['date'] = structured.group(2)
        result['time'] = structured.group(3)
        media_id_full = structured.group(4)
    else:
        # Fallback: positional split for names without a date/time stamp
        parts = basename.split('_')
        if len(parts) < 4:
            return result

        result['username'] = parts[0]
        if len(parts[1]) == 8 and parts[1].isdigit():
            result['date'] = parts[1]
        if len(parts[2]) == 6 and parts[2].isdigit():
            result['time'] = parts[2]
        media_id_full = '_'.join(parts[3:])

    # Everything after date/time is the media ID (possibly with suffix)
    result['media_id_full'] = media_id_full

    # Check for story suffix
    if '_story' in media_id_full:
        media_part, suffix_part = media_id_full.rsplit('_story', 1)
        result['media_id_full'] = media_part
        result['suffix'] = f'story{suffix_part}'

    # Extract normalized media ID
    result['media_id'] = extract_instagram_media_id(result['media_id_full'])

    return result
def record_instagram_download(db, media_id: str, username: str, content_type: str,
                              filename: str, url: str = None, download_url: str = None,
                              post_date: datetime = None, file_path: str = None,
                              method: str = None, extra_metadata: Dict[str, Any] = None) -> bool:
    """Store an Instagram download in the database under a normalized media_id.

    Shared entry point for the Instagram downloader modules. The media_id is
    normalized before it is written into the metadata so that different
    modules can detect each other's downloads.

    Args:
        db: Database object exposing ``record_download`` or, failing that,
            ``mark_downloaded``
        media_id: The media ID (normalized automatically)
        username: Instagram username
        content_type: Type of content (posts, stories, reels, highlights)
        filename: Filename of the downloaded file
        url: Original Instagram URL (e.g., https://instagram.com/p/ABC123/)
        download_url: Direct download URL (CDN URL)
        post_date: Post date/time
        file_path: Full file path on disk
        method: Download method (imginn, fastdl, toolzu, instaloader)
        extra_metadata: Additional metadata merged on top of the media ID
            fields (entries whose value is None are dropped)

    Returns:
        Whatever the database method returns on success; False when no
        database is given, it offers neither method, or the call raises
    """
    if not db:
        return False

    # Normalize the media_id for consistent cross-module detection
    normalized_media_id = extract_instagram_media_id(media_id) if media_id else media_id

    # The raw ID is only kept when normalization actually changed it
    raw_id_differs = media_id != normalized_media_id
    candidate_fields = {
        'media_id': normalized_media_id,
        'original_media_id': media_id if raw_id_differs else None,
    }
    if extra_metadata:
        candidate_fields.update(extra_metadata)
    metadata = {key: value for key, value in candidate_fields.items() if value is not None}

    # URL stored in the database: post URL, else CDN URL, else a synthetic one
    if url:
        db_url = url
    elif download_url:
        db_url = download_url
    else:
        db_url = f"instagram://{normalized_media_id}"

    # Hashing is best effort - any failure simply leaves the hash unset
    file_hash = None
    if file_path:
        try:
            from modules.unified_database import UnifiedDatabase
            file_hash = UnifiedDatabase.get_file_hash(file_path)
        except Exception:
            pass

    try:
        # Preferred interface: record_download
        if hasattr(db, 'record_download'):
            full_record = {
                'url': db_url,
                'platform': 'instagram',
                'source': username,
                'content_type': content_type,
                'filename': filename,
                'file_path': file_path,
                'file_hash': file_hash,
                'post_date': post_date,
                'metadata': metadata,
                'method': method,
            }
            return db.record_download(**full_record)

        # Adapter-style databases only offer mark_downloaded
        if hasattr(db, 'mark_downloaded'):
            adapter_record = {
                'username': username,
                'url': db_url,
                'filename': filename,
                'post_date': post_date,
                'metadata': metadata,
                'file_path': file_path,
                'content_type': content_type,
            }
            return db.mark_downloaded(**adapter_record)

        return False
    except Exception:
        return False
def is_instagram_downloaded(db, media_id: str, username: str = None) -> bool:
    """Check if Instagram content is already downloaded by media_id.

    Checks for both the original and normalized media_id to ensure cross-module
    duplicate detection works correctly.

    Args:
        db: Database instance (UnifiedDatabase or adapter)
        media_id: The media ID to check (will check both original and normalized)
        username: Accepted for interface compatibility; the checks below do
            not currently filter by it

    Returns:
        True if already downloaded (or found in the recycle bin), False
        otherwise - including when the database offers no supported
        interface or a lookup raises
    """
    if not db or not media_id:
        return False

    # Normalize the media_id
    normalized_media_id = extract_instagram_media_id(media_id)

    # Check if this looks like a shortcode (10-15 alphanumeric chars, no 18xxx pattern)
    is_shortcode = bool(normalized_media_id == media_id and
                        re.match(r'^[A-Za-z0-9_-]{10,15}$', media_id) and
                        not re.match(r'^18\d{15,17}$', media_id))

    try:
        # Check if db has get_connection (UnifiedDatabase) - query directly
        if hasattr(db, 'get_connection'):
            with db.get_connection() as conn:
                cursor = conn.cursor()

                # Check both normalized and original media_id
                # Also verify file_path is set (download was actually completed)
                if normalized_media_id != media_id:
                    cursor.execute('''
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND (media_id = ? OR media_id = ?)
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    ''', (normalized_media_id, media_id))
                else:
                    cursor.execute('''
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND media_id = ?
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    ''', (normalized_media_id,))

                if cursor.fetchone() is not None:
                    return True

                # For shortcodes, also check the metadata JSON column.
                # '_' is a legal shortcode character AND a LIKE wildcard, so
                # the ID is escaped to be matched literally.
                if is_shortcode:
                    cursor.execute("""
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND metadata LIKE ? ESCAPE '\\'
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    """, (f'%"shortcode": "{_escape_like(media_id)}"%',))
                    if cursor.fetchone() is not None:
                        return True

                # Check recycle bin — files previously downloaded then deleted
                # should not be re-downloaded
                cursor.execute("""
                    SELECT 1 FROM recycle_bin
                    WHERE original_filename LIKE ? ESCAPE '\\'
                    LIMIT 1
                """, (f'%{_escape_like(normalized_media_id)}%',))
                if cursor.fetchone() is not None:
                    return True

                return False

        # Fallback for adapters with is_already_downloaded method
        elif hasattr(db, 'is_already_downloaded'):
            if db.is_already_downloaded(normalized_media_id):
                return True
            # Also check original if different
            if normalized_media_id != media_id and db.is_already_downloaded(media_id):
                return True
            return False

        # Neither interface available: report "not downloaded" explicitly
        return False
    except Exception:
        return False


def _escape_like(value: str) -> str:
    """Escape LIKE wildcards so *value* matches literally under ESCAPE '\\'."""
    return str(value).replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')

1259
modules/instaloader_module.py Executable file

File diff suppressed because it is too large Load Diff

535
modules/media_identifier.py Normal file
View File

@@ -0,0 +1,535 @@
"""
Media Identifier Module
Parses media filenames using guessit and matches them against TMDB for metadata enrichment.
Generates organized file paths for TV Shows and Movies.
"""
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import requests
from modules.universal_logger import get_logger
logger = get_logger('MediaIdentifier')
# Try to import guessit, but gracefully handle if not installed.
# GUESSIT_AVAILABLE records the outcome so filename parsing can choose
# between the guessit parser and the regex fallback parser.
try:
    import guessit
    GUESSIT_AVAILABLE = True
except ImportError:
    GUESSIT_AVAILABLE = False
    logger.warning("guessit not installed - filename parsing will be limited")
@dataclass
class ParsedMedia:
    """Metadata extracted from a media filename."""
    title: str
    media_type: str  # 'movie' or 'episode' (TV)
    year: Optional[int] = None
    season: Optional[int] = None
    episode: Optional[int] = None
    quality: Optional[str] = None
    source: Optional[str] = None
    codec: Optional[str] = None
    release_group: Optional[str] = None
    original_filename: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict, keyed by field name in declaration order."""
        exported = (
            'title', 'media_type', 'year', 'season', 'episode',
            'quality', 'source', 'codec', 'release_group', 'original_filename',
        )
        return {field_name: getattr(self, field_name) for field_name in exported}
@dataclass
class TMDBMatch:
    """A TMDB search hit for a piece of parsed media."""
    tmdb_id: int
    title: str
    original_title: Optional[str]
    media_type: str  # 'movie' or 'tv'
    year: Optional[int] = None
    poster_path: Optional[str] = None
    overview: Optional[str] = None
    # Only populated for TV episodes
    season_number: Optional[int] = None
    episode_number: Optional[int] = None
    episode_title: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict, keyed by field name in declaration order."""
        exported = (
            'tmdb_id', 'title', 'original_title', 'media_type', 'year',
            'poster_path', 'overview', 'season_number', 'episode_number',
            'episode_title',
        )
        return {field_name: getattr(self, field_name) for field_name in exported}
class MediaIdentifier:
    """
    Identifies media from filenames and matches against TMDB.

    Uses guessit for filename parsing (with a regex fallback when guessit is
    missing or fails) and the TMDB API for metadata enrichment.
    """

    TMDB_BASE_URL = "https://api.themoviedb.org/3"
    TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w500"

    # Quality normalization: filename token -> canonical resolution.
    # Order matters: _extract_quality returns the first entry that matches.
    QUALITY_MAP = {
        '2160p': '2160p',
        '4k': '2160p',
        'uhd': '2160p',
        '1080p': '1080p',
        'fullhd': '1080p',
        'fhd': '1080p',
        '720p': '720p',
        'hd': '720p',
        '480p': '480p',
        'sd': '480p',
        '360p': '360p',
    }

    # Fallback TV patterns, tried in order: "Show S01E02", then "Show 1x02"
    _TV_PATTERNS = (
        r'^(.+?)[\s\.]+[Ss](\d{1,2})[Ee](\d{1,2})',
        r'^(.+?)[\s\.]+(\d{1,2})x(\d{1,2})',
    )

    # "Title (2023)" / "Title 2023". The year must be 19xx/20xx and must not
    # be glued to more characters, so "1080p" or "1920x1080" is never
    # mistaken for a release year.
    _MOVIE_YEAR_PATTERN = r'^(.+?)[\s\.]+\(?((?:19|20)\d{2})\)?(?![0-9A-Za-z])'

    def __init__(self, tmdb_api_key: str):
        """
        Initialize the MediaIdentifier.

        Args:
            tmdb_api_key: TMDB API key for lookups
        """
        self.api_key = tmdb_api_key
        self.session = requests.Session()

    def parse_filename(self, filename: str) -> Optional[ParsedMedia]:
        """
        Parse a media filename to extract metadata.

        Args:
            filename: The filename to parse (any directory part is ignored)

        Returns:
            ParsedMedia object with extracted information, or None if parsing fails
        """
        if not filename:
            return None

        # Strip path if present
        filename = Path(filename).name

        if GUESSIT_AVAILABLE:
            return self._parse_with_guessit(filename)
        return self._parse_fallback(filename)

    def _parse_with_guessit(self, filename: str) -> Optional[ParsedMedia]:
        """Parse filename using guessit; falls back to the regex parser if guessit raises."""
        try:
            result = guessit.guessit(filename)

            # Anything guessit does not call an episode is treated as a movie
            media_type = 'episode' if result.get('type', 'movie') == 'episode' else 'movie'

            title = result.get('title', '')
            if not title:
                return None

            # Normalize the reported screen size through QUALITY_MAP
            quality = None
            screen_size = result.get('screen_size')
            if screen_size:
                quality = self.QUALITY_MAP.get(str(screen_size).lower(), str(screen_size))

            return ParsedMedia(
                title=title,
                media_type=media_type,
                year=result.get('year'),
                season=result.get('season'),
                episode=result.get('episode'),
                quality=quality,
                source=result.get('source'),
                codec=result.get('video_codec'),
                release_group=result.get('release_group'),
                original_filename=filename,
            )

        except Exception as e:
            logger.error(f"guessit parsing failed for '{filename}': {e}")
            return self._parse_fallback(filename)

    def _parse_fallback(self, filename: str) -> Optional[ParsedMedia]:
        """
        Fallback parser when guessit is not available.

        Uses regex patterns to extract common media info. Returns None when
        no usable title can be derived or parsing raises.
        """
        try:
            # Remove extension, then turn '.'/'_' separators into spaces
            name = re.sub(r'[._]', ' ', Path(filename).stem)

            # TV first: "Show Name S01E02" or "Show Name 1x02"
            for tv_pattern in self._TV_PATTERNS:
                tv_match = re.match(tv_pattern, name)
                if tv_match:
                    return ParsedMedia(
                        title=tv_match.group(1).strip(),
                        media_type='episode',
                        season=int(tv_match.group(2)),
                        episode=int(tv_match.group(3)),
                        quality=self._extract_quality(name),
                        original_filename=filename,
                    )

            # Otherwise assume a movie: "Movie Title (2023)" or "Movie Title 2023"
            movie_match = re.match(self._MOVIE_YEAR_PATTERN, name)
            if movie_match:
                title = movie_match.group(1).strip()
                year = int(movie_match.group(2))
            else:
                # No year: use the whole name (minus trailing quality tags)
                # as the title, not just its first word
                title = self._strip_quality_suffix(name)
                year = None

            if not title:
                return None

            return ParsedMedia(
                title=title,
                media_type='movie',
                year=year,
                quality=self._extract_quality(name),
                original_filename=filename,
            )

        except Exception as e:
            logger.error(f"Fallback parsing failed for '{filename}': {e}")
            return None

    def _strip_quality_suffix(self, name: str) -> str:
        """Return *name* cut before its first stand-alone quality token (e.g. '1080p')."""
        tokens = '|'.join(re.escape(token) for token in self.QUALITY_MAP)
        cut = re.search(rf'\s+(?:{tokens})(?![0-9a-z])', name, re.IGNORECASE)
        title = name[:cut.start()] if cut else name
        return title.strip() or name.strip()

    def _extract_quality(self, text: str) -> Optional[str]:
        """Extract quality from text.

        Tokens must stand alone (not be embedded in a longer alphanumeric
        run), so 'sd' inside "Wednesday" or 'hd' inside "hdtv" is ignored.
        """
        text_lower = text.lower()
        for token, quality in self.QUALITY_MAP.items():
            if re.search(rf'(?<![a-z0-9]){re.escape(token)}(?![a-z0-9])', text_lower):
                return quality
        return None

    def _redact(self, message: Any) -> str:
        """Return str(message) with the TMDB API key masked, for safe logging."""
        text = str(message)
        if self.api_key:
            text = text.replace(str(self.api_key), '***')
        return text

    def match_tmdb(self, parsed: ParsedMedia) -> Optional[TMDBMatch]:
        """
        Match parsed media against TMDB.

        Args:
            parsed: ParsedMedia object from parse_filename

        Returns:
            TMDBMatch object if found, None otherwise
        """
        if not parsed:
            return None

        try:
            if parsed.media_type == 'episode':
                return self._match_tv_show(parsed)
            return self._match_movie(parsed)
        except Exception as e:
            logger.error(f"TMDB matching failed for '{parsed.title}': {self._redact(e)}")
            return None

    def _match_tv_show(self, parsed: ParsedMedia) -> Optional[TMDBMatch]:
        """Match a TV show episode against TMDB (first search result wins)."""
        try:
            # Search for the TV show
            search_url = f"{self.TMDB_BASE_URL}/search/tv"
            params = {
                'api_key': self.api_key,
                'query': parsed.title,
                'page': 1,
            }
            if parsed.year:
                params['first_air_date_year'] = parsed.year

            response = self.session.get(search_url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()

            results = data.get('results', [])
            if not results:
                logger.debug(f"No TMDB results for TV show: {parsed.title}")
                return None

            # Use the first (best) result
            show = results[0]
            show_id = show['id']

            # Get episode details if we have season/episode. Compare against
            # None so season 0 (specials) and episode 0 are still looked up.
            episode_title = None
            if parsed.season is not None and parsed.episode is not None:
                episode_url = f"{self.TMDB_BASE_URL}/tv/{show_id}/season/{parsed.season}/episode/{parsed.episode}"
                ep_params = {'api_key': self.api_key}

                try:
                    ep_response = self.session.get(episode_url, params=ep_params, timeout=30)
                    if ep_response.status_code == 200:
                        ep_data = ep_response.json()
                        episode_title = ep_data.get('name')
                except Exception as ep_error:
                    # Best effort: the match is still usable without an episode title
                    logger.debug(f"TMDB episode lookup failed: {self._redact(ep_error)}")

            # Parse year from first_air_date
            year = None
            first_air_date = show.get('first_air_date', '')
            if first_air_date and len(first_air_date) >= 4:
                try:
                    year = int(first_air_date[:4])
                except ValueError:
                    pass

            return TMDBMatch(
                tmdb_id=show_id,
                title=show.get('name', parsed.title),
                original_title=show.get('original_name'),
                media_type='tv',
                year=year,
                poster_path=show.get('poster_path'),
                overview=show.get('overview'),
                season_number=parsed.season,
                episode_number=parsed.episode,
                episode_title=episode_title,
            )

        except Exception as e:
            # Request errors embed the full URL (including api_key) - mask it
            logger.error(f"TMDB TV show matching failed: {self._redact(e)}")
            return None

    def _match_movie(self, parsed: ParsedMedia) -> Optional[TMDBMatch]:
        """Match a movie against TMDB (first search result wins)."""
        try:
            # Search for the movie
            search_url = f"{self.TMDB_BASE_URL}/search/movie"
            params = {
                'api_key': self.api_key,
                'query': parsed.title,
                'page': 1,
            }
            if parsed.year:
                params['year'] = parsed.year

            response = self.session.get(search_url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()

            results = data.get('results', [])
            if not results:
                logger.debug(f"No TMDB results for movie: {parsed.title}")
                return None

            # Use the first (best) result
            movie = results[0]

            # Parse year from release_date
            year = None
            release_date = movie.get('release_date', '')
            if release_date and len(release_date) >= 4:
                try:
                    year = int(release_date[:4])
                except ValueError:
                    pass

            return TMDBMatch(
                tmdb_id=movie['id'],
                title=movie.get('title', parsed.title),
                original_title=movie.get('original_title'),
                media_type='movie',
                year=year,
                poster_path=movie.get('poster_path'),
                overview=movie.get('overview'),
            )

        except Exception as e:
            # Request errors embed the full URL (including api_key) - mask it
            logger.error(f"TMDB movie matching failed: {self._redact(e)}")
            return None

    def get_organized_path(
        self,
        match: TMDBMatch,
        base_path: str,
        original_filename: str,
    ) -> str:
        """
        Generate an organized file path for the matched media.

        Args:
            match: TMDBMatch object with TMDB metadata
            base_path: Base directory for media storage
            original_filename: Original filename (for extension)

        Returns:
            Full organized path for the file
        """
        base = Path(base_path)

        # Get extension from original filename
        ext = Path(original_filename).suffix

        # Sanitize title for filesystem
        safe_title = self._sanitize_filename(match.title)

        if match.media_type == 'tv':
            # TV: {base}/TV Shows/{Show}/Season {XX}/{Show} - S{XX}E{XX} - {Episode Title}.{ext}
            show_dir = base / "TV Shows" / safe_title

            if match.season_number is not None:
                season_dir = show_dir / f"Season {match.season_number:02d}"
            else:
                season_dir = show_dir / "Season 01"

            # Build filename
            if match.season_number is not None and match.episode_number is not None:
                ep_part = f"S{match.season_number:02d}E{match.episode_number:02d}"
            else:
                ep_part = "S01E01"

            if match.episode_title:
                safe_ep_title = self._sanitize_filename(match.episode_title)
                filename = f"{safe_title} - {ep_part} - {safe_ep_title}{ext}"
            else:
                filename = f"{safe_title} - {ep_part}{ext}"

            return str(season_dir / filename)

        # Movie: {base}/Movies/{Title} ({Year})/{Title} ({Year}).{ext}
        if match.year:
            movie_folder = f"{safe_title} ({match.year})"
        else:
            movie_folder = safe_title

        movie_dir = base / "Movies" / movie_folder
        filename = f"{movie_folder}{ext}"

        return str(movie_dir / filename)

    def _sanitize_filename(self, name: str) -> str:
        """
        Sanitize a string for use as a filename.

        Removes characters that are invalid in filenames, collapses
        whitespace, caps the length at 100 characters and returns "Unknown"
        for empty results or the special directory names '.' and '..'.
        """
        if not name:
            return "Unknown"

        # Replace problematic characters
        name = re.sub(r'[<>:"/\\|?*]', '', name)
        name = re.sub(r'\s+', ' ', name)
        name = name.strip()

        # Limit length
        if len(name) > 100:
            name = name[:100].strip()

        # '.' and '..' would resolve to the current/parent directory
        if name in ('.', '..'):
            return "Unknown"

        return name if name else "Unknown"

    def identify_and_match(
        self,
        filename: str,
        base_path: str = "/media",
    ) -> Dict[str, Any]:
        """
        Convenience method to parse, match, and get organized path in one call.

        Args:
            filename: The media filename to process
            base_path: Base directory for organized media

        Returns:
            Dict with keys 'success', 'filename', 'parsed', 'match',
            'organized_path' and 'error'
        """
        result = {
            'success': False,
            'filename': filename,
            'parsed': None,
            'match': None,
            'organized_path': None,
            'error': None,
        }

        try:
            # Parse filename
            parsed = self.parse_filename(filename)
            if not parsed:
                result['error'] = 'Failed to parse filename'
                return result

            result['parsed'] = parsed.to_dict()

            # Match against TMDB
            match = self.match_tmdb(parsed)
            if match:
                result['match'] = match.to_dict()

                # Get organized path
                organized_path = self.get_organized_path(match, base_path, filename)
                result['organized_path'] = organized_path
                result['success'] = True
            else:
                result['error'] = 'No TMDB match found'

            return result

        except Exception as e:
            result['error'] = str(e)
            logger.error(f"identify_and_match failed for '{filename}': {e}")
            return result

View File

@@ -0,0 +1,86 @@
#!/usr/bin/env python3
"""
Helper wrapper to integrate monitoring with downloaders
"""
from functools import wraps
from modules.downloader_monitor import get_monitor
def monitor_download(downloader_name):
    """
    Decorator to monitor download attempts

    The wrapped function's outcome is reported to the monitor exactly once:
    as a failure when it raises (the exception is re-raised), otherwise as a
    success/failure derived from its return value (int file count, or dict
    with 'count' / 'success').

    Usage:
        @monitor_download('fastdl')
        def download_function(username, ...):
            ...
            return count
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Extract username from args or kwargs
            username = kwargs.get('username') or (args[0] if args else 'unknown')

            try:
                # Only the download itself is guarded, so a problem while
                # logging a SUCCESS is never re-reported as a download failure
                result = func(*args, **kwargs)
            except Exception as e:
                # Log failure
                monitor = get_monitor()
                monitor.log_download_attempt(
                    downloader=downloader_name,
                    username=username,
                    success=False,
                    file_count=0,
                    error_message=str(e)
                )
                raise

            # Determine success based on result
            if isinstance(result, int):
                count = int(result)  # also turns a bool result into 0/1
                success = count > 0
            elif isinstance(result, dict):
                count = result.get('count') or 0  # tolerate a missing/None count
                success = result.get('success', count > 0)
            else:
                count = 0
                success = False

            # Log to monitor
            monitor = get_monitor()
            monitor.log_download_attempt(
                downloader=downloader_name,
                username=username,
                success=success,
                file_count=count,
                error_message=None
            )

            return result

        return wrapper
    return decorator
def log_download_result(downloader: str, username: str, count: int, error: str = None):
    """
    Report the outcome of one download run to the monitor

    The attempt counts as successful exactly when no error message is given.

    Args:
        downloader: Downloader name (fastdl, imginn, etc.)
        username: Username
        count: Number of files downloaded
        error: Error message if failed
    """
    succeeded = error is None
    attempt = {
        'downloader': downloader,
        'username': username,
        'success': succeeded,
        'file_count': count,
        'error_message': error,
    }
    get_monitor().log_download_attempt(**attempt)

1714
modules/move_module.py Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,36 @@
"""
Paid Content Module
Downloads and organizes content from subscription-based creator platforms
(OnlyFans, Fansly, Patreon, Fanbox, etc.) via the Coomer.party and Kemono.party archival APIs.
Also supports YouTube channels and Twitch clips via yt-dlp.
"""
from .scraper import PaidContentScraper
from .api_client import PaidContentAPIClient
from .db_adapter import PaidContentDBAdapter
from .file_host_downloader import FileHostDownloader
from .embed_downloader import EmbedDownloader
from .youtube_client import YouTubeClient
from .twitch_client import TwitchClient, TwitchThumbnailCache
from .fansly_direct_client import FanslyDirectClient
from .onlyfans_client import OnlyFansClient
from .xhamster_client import XHamsterClient
from .tiktok_client import TikTokClient
from .instagram_adapter import InstagramAdapter
# Names exported by `from <package> import *`; mirrors the imports above.
__all__ = [
    'PaidContentScraper',
    'PaidContentAPIClient',
    'PaidContentDBAdapter',
    'FileHostDownloader',
    'EmbedDownloader',
    'YouTubeClient',
    'TwitchClient',
    'TwitchThumbnailCache',
    'FanslyDirectClient',
    'OnlyFansClient',
    'XHamsterClient',
    'TikTokClient',
    'InstagramAdapter',
]

View File

@@ -0,0 +1,311 @@
"""
Unified API client for Coomer.party and Kemono.party
Both services share the same API structure (Kemono fork)
"""
import aiohttp
import asyncio
from typing import List, Optional, Dict, Any
from modules.base_module import LoggingMixin, RateLimitMixin
from .models import Creator, Post, Attachment
class PaidContentAPIClient(LoggingMixin, RateLimitMixin):
    """
    API client for Coomer and Kemono archival services

    API Endpoints (all relative to {base_url}/api/v1):
    - GET /creators - List all creators
    - GET /{platform}/user/{creator_id}/profile - Get creator info
    - GET /{platform}/user/{creator_id}/posts - Get creator's posts (paginated with ?o=offset)
    - GET /{platform}/user/{creator_id}/post/{post_id} - Get single post
    """

    # Fallback URLs if database doesn't have them configured
    DEFAULT_SERVICE_URLS = {
        'coomer': 'https://coomer.party',
        'kemono': 'https://kemono.party'
    }

    # Platforms archived by each service, keyed by service id
    SUPPORTED_PLATFORMS = {
        'coomer': ['onlyfans', 'fansly', 'candfans'],
        'kemono': ['patreon', 'fanbox', 'gumroad', 'subscribestar', 'discord']
    }
def __init__(self, service_id: str, session_cookie: str = None, base_url: str = None, log_callback=None):
    """Set up logging, rate limiting and the service URLs.

    Args:
        service_id: Key into DEFAULT_SERVICE_URLS; used when no base_url is
            given (an unknown id then leaves base_url as None)
        session_cookie: Optional value for the 'session' cookie
        base_url: Optional service URL; a '/api/v1' part and trailing
            slashes are removed
        log_callback: Forwarded to the logger setup
    """
    self._init_logger('PaidContent', log_callback, default_module='API')
    self._init_rate_limiter(min_delay=0.5, max_delay=2.0, batch_delay_min=1, batch_delay_max=3)

    self.service_id = service_id

    if not base_url:
        # Nothing configured - fall back to the built-in default
        resolved_url = self.DEFAULT_SERVICE_URLS.get(service_id)
    else:
        # Reduce to the bare site URL; removing '/api/v1' is a no-op when
        # the caller already passed the plain base
        resolved_url = base_url.replace('/api/v1', '').rstrip('/')

    self.base_url = resolved_url
    self.api_url = f"{self.base_url}/api/v1"
    self.session_cookie = session_cookie
    self._session: Optional[aiohttp.ClientSession] = None
async def _get_session(self) -> aiohttp.ClientSession:
    """Return the shared aiohttp session, creating one if none is open"""
    existing = self._session
    if existing is not None and not existing.closed:
        return existing

    # Note: Coomer/Kemono require 'Accept: text/css' header as anti-scraping measure
    # Despite this, they still return JSON responses
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/css',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': self.base_url
    }

    # The session cookie is only sent when one was configured
    jar = {'session': self.session_cookie} if self.session_cookie else {}

    self._session = aiohttp.ClientSession(
        headers=request_headers,
        cookies=jar,
        timeout=aiohttp.ClientTimeout(total=30),
    )
    return self._session
async def close(self):
    """Close the aiohttp session if one is open, then forget it"""
    session = self._session
    if not session or session.closed:
        # Nothing open - leave the stored reference untouched
        return

    await session.close()
    self._session = None
async def __aenter__(self):
    """Enter ``async with``: returns the client itself (no session is opened here)"""
    return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave ``async with``: close the session; exceptions are not suppressed"""
    await self.close()
async def check_health(self) -> Dict[str, Any]:
    """Check API health status

    Requests the creators list with a 10 second timeout.

    Returns:
        Dict with 'status' set to one of 'healthy' (plus 'response_time' in
        seconds), 'rate_limited' or 'degraded' (plus 'response_code'),
        'timeout' or 'down' (plus 'error')
    """
    import time

    try:
        session = await self._get_session()
        # perf_counter is monotonic, so the measured latency cannot be
        # distorted by system clock adjustments
        start = time.perf_counter()

        async with session.get(f"{self.api_url}/creators", timeout=aiohttp.ClientTimeout(total=10)) as resp:
            elapsed = time.perf_counter() - start

            if resp.status == 200:
                # content_type=None allows parsing JSON regardless of response content-type
                await resp.json(content_type=None)
                return {'status': 'healthy', 'response_time': round(elapsed, 3)}
            elif resp.status == 429:
                return {'status': 'rate_limited', 'response_code': 429}
            else:
                return {'status': 'degraded', 'response_code': resp.status}

    except asyncio.TimeoutError:
        return {'status': 'timeout', 'error': 'Request timed out'}
    except Exception as e:
        return {'status': 'down', 'error': str(e)}
async def get_all_creators(self) -> List[Dict]:
    """Fetch the full creators list (used for searching)

    Returns:
        The parsed JSON response, or an empty list on a non-200 status or
        any error
    """
    self._delay_between_items()

    try:
        session = await self._get_session()
        async with session.get(f"{self.api_url}/creators") as resp:
            if resp.status != 200:
                self.log(f"Failed to get creators list: HTTP {resp.status}", 'warning')
                return []
            # content_type=None: parse as JSON whatever the response header says
            return await resp.json(content_type=None)

    except Exception as e:
        self.log(f"Error getting creators list: {e}", 'error')
        return []
async def get_creator(self, platform: str, creator_id: str) -> Optional[Creator]:
    """Look up a creator, via the profile endpoint or else the first post

    Returns:
        Creator built from the profile response; otherwise a Creator
        assembled from the first post of the posts listing; None when
        neither source yields data or an error occurs
    """
    self._delay_between_items()

    try:
        session = await self._get_session()

        # Preferred source: the dedicated profile endpoint
        profile_url = f"{self.api_url}/{platform}/user/{creator_id}/profile"
        async with session.get(profile_url) as resp:
            if resp.status == 200:
                profile = await resp.json(content_type=None)
                return Creator.from_api(profile, self.service_id, platform, self.base_url)

        # Fallback: derive the creator from the first post
        posts_url = f"{self.api_url}/{platform}/user/{creator_id}/posts"
        async with session.get(posts_url) as resp:
            if resp.status == 200:
                posts = await resp.json(content_type=None)
                if posts and len(posts) > 0:
                    first_post = posts[0]

                    # Image URLs live on img.<host> with .party swapped for .st
                    from urllib.parse import urlparse
                    host = urlparse(self.base_url).netloc.replace('.party', '.st')
                    img_domain = f"img.{host}"
                    poster_name = first_post.get('user', creator_id)

                    return Creator(
                        creator_id=creator_id,
                        service_id=self.service_id,
                        platform=platform,
                        username=poster_name,
                        display_name=poster_name,
                        profile_image_url=f"https://{img_domain}/icons/{platform}/{creator_id}",
                        banner_image_url=f"https://{img_domain}/banners/{platform}/{creator_id}"
                    )

        self.log(f"Creator not found: {platform}/{creator_id}", 'warning')
        return None

    except Exception as e:
        self.log(f"Error getting creator {platform}/{creator_id}: {e}", 'error')
        return None
async def get_creator_posts(self, platform: str, creator_id: str, offset: int = 0) -> List[Post]:
    """Fetch one page of a creator's posts

    Args:
        platform: Platform segment of the URL
        creator_id: Creator identifier
        offset: Sent as the 'o' query parameter when greater than zero

    Returns:
        Posts parsed from the response; empty list on a non-200 status or
        any error
    """
    self._delay_between_items()

    try:
        session = await self._get_session()
        url = f"{self.api_url}/{platform}/user/{creator_id}/posts"
        query = {'o': offset} if offset > 0 else {}

        async with session.get(url, params=query) as resp:
            status = resp.status
            if status == 200:
                payload = await resp.json(content_type=None)
                return [
                    Post.from_api(entry, self.service_id, platform, creator_id, self.base_url)
                    for entry in payload
                ]

            if status == 404:
                self.log(f"Creator not found: {platform}/{creator_id}", 'warning')
            else:
                self.log(f"Failed to get posts: HTTP {resp.status}", 'warning')
            return []

    except Exception as e:
        self.log(f"Error getting posts for {platform}/{creator_id}: {e}", 'error')
        return []
async def get_all_creator_posts(self, platform: str, creator_id: str,
                                since_date: str = None, max_posts: int = None,
                                progress_callback=None) -> List[Post]:
    """Page through a creator's posts until a stop condition is hit.

    Paging stops when a page comes back empty, when a post's
    ``published_at`` is at or before ``since_date``, or when ``max_posts``
    posts have been collected.

    Args:
        platform: Platform segment passed to ``get_creator_posts``.
        creator_id: Creator identifier passed to ``get_creator_posts``.
        since_date: Optional cutoff compared directly against each post's
            ``published_at``.
        max_posts: Optional cap on the number of posts returned.
        progress_callback: Optional callable invoked after every fully
            consumed page with ``(pages_done, posts_so_far)``.

    Returns:
        The collected posts, in the order they were fetched.
    """
    collected = []
    pages_done = 0
    self.log(f"Fetching posts for {platform}/{creator_id}", 'info')
    batch = await self.get_creator_posts(platform, creator_id, 0)
    while batch:
        for item in batch:
            if since_date:
                stamp = item.published_at
                # Everything at or before the cutoff has been seen already
                if stamp and stamp <= since_date:
                    self.log(f"Reached already-seen post date: {stamp}", 'debug')
                    return collected
            collected.append(item)
            if max_posts and len(collected) >= max_posts:
                self.log(f"Reached max posts limit: {max_posts}", 'debug')
                return collected
        pages_done += 1
        if progress_callback:
            progress_callback(pages_done, len(collected))
        self._delay_between_batches()
        # The offset advances in steps of 50 per consumed page
        batch = await self.get_creator_posts(platform, creator_id, pages_done * 50)
    self.log(f"Fetched {len(collected)} posts for {platform}/{creator_id}", 'info')
    return collected
async def get_post(self, platform: str, creator_id: str, post_id: str) -> Optional[Post]:
    """Fetch one post by its ID.

    Returns:
        The parsed post on a 200 response; ``None`` for any other status
        or when an exception occurs (the exception is logged).
    """
    self._delay_between_items()
    try:
        session = await self._get_session()
        endpoint = f"{self.api_url}/{platform}/user/{creator_id}/post/{post_id}"
        async with session.get(endpoint) as resp:
            if resp.status != 200:
                return None
            payload = await resp.json(content_type=None)
            # The single-post endpoint may wrap the post as {"post": {...}}
            wrapped = isinstance(payload, dict) and 'post' in payload
            if wrapped:
                payload = payload['post']
            return Post.from_api(payload, self.service_id, platform, creator_id, self.base_url)
    except Exception as e:
        self.log(f"Error getting post {post_id}: {e}", 'error')
        return None
async def search_creators(self, query: str, platform: str = None) -> List[Dict]:
    """Find creators whose name contains ``query`` (case-insensitive).

    The full creator list is loaded through ``get_all_creators`` and
    filtered locally.

    Args:
        query: Substring to look for in the creator name.
        platform: When given, only creators whose ``service`` equals this
            value are considered.

    Returns:
        Up to 50 summary dicts, most-favorited first. An empty list is
        returned (and the error logged) if anything goes wrong.
    """
    self._delay_between_items()
    try:
        # No search endpoint is used: fetch everything, then filter here
        everyone = await self.get_all_creators()
        needle = query.lower()
        matches = [
            {
                'id': entry.get('id'),
                'name': entry.get('name'),
                'service': entry.get('service'),
                'indexed': entry.get('indexed'),
                'updated': entry.get('updated'),
                'favorited': entry.get('favorited', 0)
            }
            for entry in everyone
            if (not platform or entry.get('service') == platform)
            and needle in (entry.get('name') or '').lower()
        ]
        # Popularity ordering: highest favorited count first
        matches.sort(key=lambda row: row.get('favorited', 0), reverse=True)
        return matches[:50]  # Limit results
    except Exception as e:
        self.log(f"Error searching creators: {e}", 'error')
        return []
def get_attachment_url(self, server_path: str) -> str:
    """Return the absolute download URL for ``server_path``.

    Empty input gives ``''``; a value already starting with ``http`` is
    returned as-is; anything else is appended to ``<base_url>/data``.
    """
    if server_path and not server_path.startswith('http'):
        return f"{self.base_url}/data{server_path}"
    return server_path or ''
def get_thumbnail_url(self, server_path: str) -> str:
    """Return the absolute thumbnail URL for ``server_path``.

    Empty input gives ``''``; a value already starting with ``http`` is
    returned as-is; anything else is appended to
    ``<base_url>/thumbnail/data``.
    """
    if server_path and not server_path.startswith('http'):
        return f"{self.base_url}/thumbnail/data{server_path}"
    return server_path or ''
@classmethod
def get_supported_platforms(cls, service_id: str) -> List[str]:
    """Return the platforms registered for ``service_id`` (empty list if none)."""
    platforms_by_service = cls.SUPPORTED_PLATFORMS
    return platforms_by_service.get(service_id, [])
@classmethod
def is_valid_service(cls, service_id: str) -> bool:
    """Return True when ``service_id`` is a key of ``SERVICE_URLS``."""
    known_services = cls.SERVICE_URLS
    return service_id in known_services
@classmethod
def get_service_ids(cls) -> List[str]:
    """Return every key of ``SERVICE_URLS`` as a new list."""
    return [service_id for service_id in cls.SERVICE_URLS.keys()]

View File

@@ -0,0 +1,389 @@
"""
Bellazon Forum Thread Client for Paid Content
Scrapes Bellazon forum threads (Invision Power Suite) treating each thread
as a "creator" and each reply with media as a post.
Only bellazon-hosted uploads are captured (external image host links are
unreliable/ephemeral). Video attachments (attachment.php) are also captured.
"""
import asyncio
import html
import json
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class BellazonClient(LoggingMixin):
"""Client for scraping Bellazon forum threads."""
SERVICE_ID = 'bellazon'
PLATFORM = 'bellazon'
BASE_URL = 'https://www.bellazon.com/main'
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
}
# Extensions considered images
IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
# Extensions considered videos
VIDEO_EXTS = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
def __init__(self, log_callback=None):
    """Set up logging for the Bellazon client.

    Args:
        log_callback: Optional callback forwarded to ``_init_logger``.
    """
    # Logger name 'PaidContent', default module label 'Bellazon'
    self._init_logger('PaidContent', log_callback, default_module='Bellazon')
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
async def get_profile_info(self, topic_id: str) -> Optional[Dict]:
    """Load the first page of a thread and summarise it as a profile.

    Args:
        topic_id: Numeric topic identifier used in the thread URL.

    Returns:
        A dict with ``username`` (thread slug), ``display_name`` (thread
        title, falling back to the slug), ``post_count`` (estimate:
        comments on page one times the page count), ``page_count`` and
        ``topic_url`` (final URL without query string or trailing slash);
        ``None`` on a non-200 response or a request error.
    """
    # A placeholder slug is enough: the request follows the redirect to the real URL
    request_url = f'{self.BASE_URL}/topic/{topic_id}-x/'
    client_timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=client_timeout) as session:
            async with session.get(request_url, headers=self.HEADERS, allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"Bellazon topic {topic_id} returned HTTP {resp.status}", 'warning')
                    return None
                final_url = str(resp.url)
                page_html = await resp.text()
    except Exception as e:
        self.log(f"Failed to fetch Bellazon topic {topic_id}: {e}", 'error')
        return None

    slug = self._extract_slug(final_url, topic_id)
    title = self._extract_title(page_html)
    page_count = self._extract_page_count(page_html)

    # Comments seen on page one stand in for the per-page size (20 if none found)
    comments_on_page = len(re.findall(r'data-commentid="(\d+)"', page_html))
    if not comments_on_page:
        comments_on_page = 20

    canonical_url = final_url.split('?')[0].rstrip('/')
    return {
        'username': slug,
        'display_name': title or slug,
        'post_count': comments_on_page * page_count,
        'page_count': page_count,
        'topic_url': canonical_url,
    }
async def get_posts(self, topic_id: str, topic_url: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> List[Post]:
    """Scrape every page of a thread and return the replies that carry media.

    Args:
        topic_id: Topic identifier; stored as ``creator_id`` on each post.
        topic_url: Thread URL without the ``/page/N/`` suffix. A trailing
            slash is tolerated.
        known_post_ids: Post IDs (``comment_<id>``) to skip. A non-empty
            set passed here is extended in place by ``_parse_page`` with
            the IDs of newly returned posts.
        progress_callback: Optional callable invoked after each parsed
            page with the running number of collected posts.

    Returns:
        The posts gathered. If a page fails to load or an unexpected error
        occurs, whatever was collected up to that point is returned.
    """
    known = known_post_ids or set()
    posts: List[Post] = []
    # Bound up front so the summary log below can never hit an unbound
    # name when the session fails before page 1 has been parsed.
    page_count = 0
    # Avoid producing ".../topic//page/1/" when the caller's URL ends in "/"
    base_url = topic_url.rstrip('/')
    timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # Page 1 also tells us how many pages the thread has
            page_html = await self._fetch_page(session, f'{base_url}/page/1/')
            if page_html is None:
                return posts
            page_count = self._extract_page_count(page_html)
            self.log(f"Thread has {page_count} pages", 'info')

            posts.extend(self._parse_page(page_html, topic_id, known))
            if progress_callback:
                progress_callback(len(posts))

            for page_num in range(2, page_count + 1):
                await asyncio.sleep(1)  # Rate limit
                page_html = await self._fetch_page(session, f'{base_url}/page/{page_num}/')
                if page_html is None:
                    self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                    break
                page_posts = self._parse_page(page_html, topic_id, known)
                posts.extend(page_posts)
                if progress_callback:
                    progress_callback(len(posts))
                self.log(f"Page {page_num}/{page_count}: {len(page_posts)} posts with media", 'debug')
    except Exception as e:
        # Best effort: keep what was scraped before the failure
        self.log(f"Error scraping Bellazon thread: {e}", 'error')
    self.log(f"Total: {len(posts)} posts with media from {page_count} pages", 'info')
    return posts
# ------------------------------------------------------------------
# HTML parsing helpers
# ------------------------------------------------------------------
def _parse_page(self, page_html: str, topic_id: str, known: Set[str]) -> List[Post]:
"""Parse a single page of HTML and return Post objects for comments with media."""
posts: List[Post] = []
# Split HTML into comment blocks using data-commentid markers
# Each comment starts with data-commentid="..." and contains a content block
comment_pattern = re.compile(
r'data-commentid="(\d+)"\s+data-quotedata="([^"]*)"',
re.DOTALL
)
matches = list(comment_pattern.finditer(page_html))
if not matches:
return posts
for i, match in enumerate(matches):
comment_id = match.group(1)
post_id = f"comment_{comment_id}"
if post_id in known:
continue
quotedata_raw = match.group(2)
# Parse quote data for username and timestamp
username, timestamp = self._parse_quotedata(quotedata_raw)
# Extract the content block for this comment
start = match.end()
end = matches[i + 1].start() if i + 1 < len(matches) else len(page_html)
content_block = page_html[start:end]
# Find the actual content within data-role="commentContent"
# The closing pattern is </div> followed by blank lines then </div>
content_match = re.search(
r'data-role="commentContent"[^>]*>(.*?)</div>\s*\n\s*\n\s*</div>',
content_block, re.DOTALL
)
if not content_match:
# Fallback: grab everything from commentContent to ipsEntry__foot
content_match = re.search(
r'data-role="commentContent"[^>]*>(.*?)(?=ipsEntry__foot)',
content_block, re.DOTALL
)
if not content_match:
continue
content_html = content_match.group(1)
# Extract media from content
attachments = self._extract_media(content_html)
if not attachments:
continue # Skip text-only replies
# Build published_at from timestamp
published_at = None
if timestamp:
try:
dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
published_at = dt.isoformat()
except (ValueError, OSError):
pass
post = Post(
post_id=post_id,
service_id=self.SERVICE_ID,
platform=self.PLATFORM,
creator_id=topic_id,
title='',
content=f"Posted by {username}" if username else '',
published_at=published_at,
attachments=attachments,
)
posts.append(post)
known.add(post_id)
return posts
def _extract_media(self, content_html: str) -> List[Attachment]:
"""Extract image and video attachments from a comment's HTML content."""
attachments: List[Attachment] = []
seen_urls: set = set()
# 1. Bellazon-hosted images: <a class="ipsAttachLink ipsAttachLink_image" href="...full..."><img src="...thumb...">
for m in re.finditer(
r'ipsAttachLink_image"\s+href="([^"]+)"[^>]*><img[^>]*src="([^"]+)"',
content_html
):
full_url = self._normalize_url(m.group(1))
if full_url in seen_urls:
continue
# Skip thumbnails as the full URL
if '_thumb.' in full_url or '.thumb.' in full_url:
continue
seen_urls.add(full_url)
attachments.append(self._make_attachment(full_url, 'image'))
# 2. Direct image/video links from bellazon uploads not caught by pattern 1
for m in re.finditer(
r'href="([^"]*bellazon\.com/main/uploads/[^"]+)"',
content_html
):
url = self._normalize_url(m.group(1))
if url in seen_urls:
continue
if '_thumb.' in url or '.thumb.' in url:
continue
ext = self._get_extension(url)
if ext in self.IMAGE_EXTS or ext in self.VIDEO_EXTS:
seen_urls.add(url)
file_type = 'image' if ext in self.IMAGE_EXTS else 'video'
attachments.append(self._make_attachment(url, file_type))
# 3. Video <source> tags: <source src="//www.bellazon.com/main/uploads/...MP4" type="video/mp4">
for m in re.finditer(
r'<source\s+src="([^"]+)"[^>]*type="video/',
content_html
):
url = self._normalize_url(m.group(1))
if url in seen_urls:
continue
seen_urls.add(url)
name = self._filename_from_url(url)
attachments.append(self._make_attachment(url, 'video', name=name))
# 4. Video/file attachments: <a href="...attachment.php?id=XXX">filename.MP4</a>
# These are protocol-relative URLs like //www.bellazon.com/main/applications/...
for m in re.finditer(
r'href="([^"]*attachment\.php\?id=\d+[^"]*)"[^>]*>([^<]+)',
content_html
):
att_url = self._normalize_url(m.group(1))
filename = m.group(2).strip()
if att_url in seen_urls:
continue
ext = self._get_extension(filename)
if ext in self.VIDEO_EXTS or ext in self.IMAGE_EXTS:
seen_urls.add(att_url)
file_type = 'video' if ext in self.VIDEO_EXTS else 'image'
attachments.append(self._make_attachment(att_url, file_type, name=filename))
return attachments
def _make_attachment(self, url: str, file_type: str, name: str = None) -> Attachment:
    """Build an ``Attachment`` for ``url``.

    Args:
        url: Absolute media URL; used for both ``server_path`` and
            ``download_url``.
        file_type: Value stored as the attachment's ``file_type``.
        name: File name to record; derived from the URL path when omitted.
    """
    resolved_name = self._filename_from_url(url) if name is None else name
    suffix = self._get_extension(resolved_name)
    return Attachment(
        name=resolved_name,
        file_type=file_type,
        extension=suffix or None,
        server_path=url,  # Used as dedup key
        download_url=url,
    )
# ------------------------------------------------------------------
# Utility helpers
# ------------------------------------------------------------------
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """GET ``url`` with the client's headers and return the body text.

    Returns ``None`` (after logging a warning) on any non-200 status or
    request error.
    """
    try:
        async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
            if resp.status == 200:
                return await resp.text()
            self.log(f"HTTP {resp.status} for {url}", 'warning')
            return None
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
        return None
@staticmethod
def _extract_slug(url: str, topic_id: str) -> str:
"""Extract slug from URL like /topic/39089-india-reynolds/"""
m = re.search(rf'/topic/{re.escape(topic_id)}-([^/?#]+)', url)
if m:
return m.group(1).strip('/')
return topic_id
@staticmethod
def _extract_title(page_html: str) -> Optional[str]:
"""Extract thread title from <h1>."""
m = re.search(r'<h1[^>]*>([^<]+)</h1>', page_html)
if m:
return html.unescape(m.group(1).strip())
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
if m:
title = html.unescape(m.group(1).strip())
# Remove site suffix
title = re.sub(r'\s*[-–—]\s*Bellazon.*$', '', title, flags=re.IGNORECASE).strip()
return title
return None
@staticmethod
def _extract_page_count(page_html: str) -> int:
"""Extract total page count from 'Page X of Y'."""
m = re.search(r'Page\s+\d+\s+of\s+(\d+)', page_html)
if m:
return int(m.group(1))
return 1
@staticmethod
def _parse_quotedata(raw: str) -> tuple:
"""Parse HTML-encoded JSON quotedata, return (username, unix_timestamp)."""
try:
decoded = html.unescape(raw)
data = json.loads(decoded)
return data.get('username', ''), data.get('timestamp')
except (json.JSONDecodeError, ValueError):
return '', None
@staticmethod
def _normalize_url(url: str) -> str:
"""Normalize a URL: handle protocol-relative, decode HTML entities, make absolute."""
url = html.unescape(url) # &amp; → &
if url.startswith('//'):
url = 'https:' + url
elif url.startswith('/'):
url = 'https://www.bellazon.com' + url
elif not url.startswith('http'):
url = 'https://www.bellazon.com/main/' + url
return url
@staticmethod
def _get_extension(filename_or_url: str) -> str:
"""Get lowercase file extension from a filename or URL."""
# Strip query params
clean = filename_or_url.split('?')[0].split('#')[0]
if '.' in clean.split('/')[-1]:
return clean.rsplit('.', 1)[-1].lower()
return ''
@staticmethod
def _filename_from_url(url: str) -> str:
"""Extract filename from URL path."""
path = urlparse(url).path
name = path.rstrip('/').split('/')[-1]
return name if name else 'unnamed'

View File

@@ -0,0 +1,468 @@
"""
BestEyeCandy.com Client for Paid Content
Scrapes celebrity photo galleries from BestEyeCandy.com.
Each celeb has a unique CID and paginated photo listings.
Optimization: Full-res URLs follow a predictable pattern. We visit ONE
detail page to determine the pattern (server hostname + name format),
then construct all remaining URLs from photo IDs found on listing pages.
"""
import asyncio
import html
import json
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class BestEyeCandyClient(LoggingMixin):
"""Client for scraping BestEyeCandy.com celebrity photo galleries."""
SERVICE_ID = 'besteyecandy'
PLATFORM = 'besteyecandy'
BASE_URL = 'https://besteyecandy.com'
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
}
def __init__(self, unified_db=None, log_callback=None):
    """Set up logging and keep the optional database handle.

    Args:
        unified_db: Optional database object. ``_get_cookies`` calls its
            ``get_connection()`` to read stored cookies; without it no
            cookies are loaded.
        log_callback: Optional callback forwarded to ``_init_logger``.
    """
    # Logger name 'PaidContent', default module label 'BestEyeCandy'
    self._init_logger('PaidContent', log_callback, default_module='BestEyeCandy')
    self.unified_db = unified_db
# ------------------------------------------------------------------
# Cookie support
# ------------------------------------------------------------------
def _get_cookies(self) -> Optional[list]:
"""Load cookies from the scrapers table for besteyecandy."""
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?",
(self.SERVICE_ID,))
row = cursor.fetchone()
if row and row[0]:
data = json.loads(row[0])
if isinstance(data, dict) and 'cookies' in data:
return data['cookies']
elif isinstance(data, list):
return data
except Exception as e:
self.log(f"Could not load cookies: {e}", 'debug')
return None
def _build_cookie_jar(self, cookies_list: list) -> aiohttp.CookieJar:
    """Build an aiohttp ``CookieJar`` from a list of cookie dicts.

    Args:
        cookies_list: Dicts with ``name`` and ``value`` and optionally
            ``domain``, ``path`` (default ``/``) and ``secure``.

    Returns:
        A jar (created with ``unsafe=True``) holding every usable cookie.
        Entries without a name, or with a name the standard cookie parser
        rejects, are skipped.
    """
    from http.cookies import CookieError, SimpleCookie

    jar = aiohttp.CookieJar(unsafe=True)
    for cookie in cookies_list:
        name = cookie.get('name', '')
        if not name:
            # SimpleCookie rejects an empty key; nothing useful to store
            continue
        morsels = SimpleCookie()
        try:
            morsels[name] = cookie.get('value', '')
        except CookieError as e:
            self.log(f"Skipping invalid cookie {name!r}: {e}", 'debug')
            continue
        morsels[name]['domain'] = cookie.get('domain', '')
        morsels[name]['path'] = cookie.get('path', '/')
        if cookie.get('secure'):
            morsels[name]['secure'] = True
        # No response URL is passed: update_cookies() expects a yarl.URL
        # there (it reads .raw_host), so a urllib ParseResult raises
        # AttributeError. The domain set on the morsel scopes the cookie.
        jar.update_cookies(morsels)
    return jar
def _create_session(self, timeout: aiohttp.ClientTimeout = None) -> aiohttp.ClientSession:
    """Create a client session, attaching stored cookies when there are any.

    Args:
        timeout: Session timeout; a 60-second total timeout is used when
            omitted.
    """
    effective_timeout = timeout if timeout is not None else aiohttp.ClientTimeout(total=60)
    cookies_list = self._get_cookies()
    if not cookies_list:
        self.log("No cookies found for besteyecandy, requests may fail", 'warning')
        return aiohttp.ClientSession(timeout=effective_timeout)
    jar = self._build_cookie_jar(cookies_list)
    self.log(f"Loaded {len(cookies_list)} cookies for session", 'debug')
    return aiohttp.ClientSession(timeout=effective_timeout, cookie_jar=jar)
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
async def get_profile_info(self, cid: str, celeb_slug: str) -> Optional[Dict]:
    """Load page 1 of a celeb's listing and summarise it as a profile.

    Args:
        cid: Celebrity ID used in the listing URL.
        celeb_slug: URL slug of the celebrity.

    Returns:
        A dict with ``username`` (the slug), ``display_name``,
        ``post_count`` (total photos), ``page_count`` and ``celeb_url``
        (the page-1 listing URL); ``None`` on a non-200 response or a
        request error.
    """
    listing_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                   f'sortedby-age/page-1/{celeb_slug}.html')
    try:
        async with self._create_session() as session:
            async with session.get(listing_url, headers=self.HEADERS,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"BestEyeCandy cid {cid} returned HTTP {resp.status}",
                             'warning')
                    return None
                page_html = await resp.text()
    except Exception as e:
        self.log(f"Failed to fetch BestEyeCandy cid {cid}: {e}", 'error')
        return None

    # Name from the page, else a readable form of the slug
    display_name = self._extract_celeb_name(page_html)
    if not display_name:
        display_name = celeb_slug.replace('-', ' ')

    photo_total = self._extract_total_photos(page_html)
    # Photos seen on page 1 stand in for the page size (48 if none found)
    per_page = len(self._extract_photo_ids(page_html)) or 48
    pages = self._extract_page_count(page_html, photos_per_page=per_page)

    return {
        'username': celeb_slug,
        'display_name': display_name,
        'post_count': photo_total,
        'page_count': pages,
        'celeb_url': listing_url,
    }
async def get_posts(self, cid: str, celeb_slug: str,
known_post_ids: Optional[Set[str]] = None,
progress_callback=None) -> List[Post]:
"""Scrape all listing pages and return posts with full-res image URLs.
Each listing page becomes one Post with ~48 Attachments (one per photo).
Post IDs are "page_N" (e.g. "page_1", "page_2", ...).
Phase 1: Fetch page 1, get first photo ID, visit detail page to learn
the full-res URL pattern.
Phase 2: Paginate all listing pages, build one Post per page.
"""
known = known_post_ids or set()
posts: List[Post] = []
total_photos = 0
url_pattern = None
try:
async with self._create_session() as session:
# -- Phase 1: Fetch page 1 and determine full-res URL pattern --
page1_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
f'sortedby-age/page-1/{celeb_slug}.html')
page_html = await self._fetch_page(session, page1_url)
if page_html is None:
return []
# Estimate page count for progress display
photos_per_page = len(self._extract_photo_ids(page_html)) or 48
estimated_pages = self._extract_page_count(
page_html, photos_per_page=photos_per_page)
self.log(f"Estimated {estimated_pages} pages of photos "
f"({photos_per_page}/page)", 'info')
# Discover full-res URL pattern from first photo
first_page_ids = self._extract_photo_ids(page_html)
if first_page_ids:
url_pattern = await self._discover_url_pattern(
session, first_page_ids[0], cid, celeb_slug)
if not url_pattern:
self.log("Could not determine full-res URL pattern", 'error')
return []
self.log(f"URL pattern: server={url_pattern['server']}, "
f"name_format={url_pattern['name_format']}, "
f"ext={url_pattern['ext']}", 'info')
# -- Phase 2: Paginate all pages, one Post per page --
page_num = 0
has_next = True # start with page 1
while has_next:
page_num += 1
if page_num == 1:
# Already fetched page 1
pass
else:
await asyncio.sleep(2) # Rate limit
page_url = (
f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
f'sortedby-age/page-{page_num}/{celeb_slug}.html')
page_html = await self._fetch_page(session, page_url)
if page_html is None:
self.log(f"Failed to fetch page {page_num}, stopping",
'warning')
break
page_ids = self._extract_photo_ids(page_html)
if not page_ids:
self.log(f"Page {page_num}: no photos, stopping", 'info')
break
total_photos += len(page_ids)
has_next = self._has_next_page(page_html)
# Check if this page-post is already known
post_id = f"page_{page_num}"
if post_id in known:
self.log(f"Page {page_num}: already known, skipping",
'debug')
if progress_callback:
progress_callback(
f"Page {page_num}/~{estimated_pages}"
f"{total_photos} photos (skipped known)")
continue
# Build attachments for all photos on this page
attachments = []
for photo_id in page_ids:
dl_url = self._construct_full_res_url(url_pattern, photo_id)
filename = dl_url.rsplit('/', 1)[-1]
attachments.append(Attachment(
name=filename,
file_type='image',
extension=url_pattern.get('ext', 'jpg'),
server_path=dl_url,
download_url=dl_url,
))
post = Post(
post_id=post_id,
service_id=self.SERVICE_ID,
platform=self.PLATFORM,
creator_id=cid,
title=f"Page {page_num}",
content=f"{len(page_ids)} photos",
published_at=datetime.now(tz=timezone.utc).isoformat(),
attachments=attachments,
)
posts.append(post)
if progress_callback:
progress_callback(
f"Page {page_num}/~{estimated_pages}"
f"{total_photos} photos")
self.log(f"Page {page_num}/~{estimated_pages}: "
f"{len(page_ids)} photos", 'debug')
except Exception as e:
self.log(f"Error scraping BestEyeCandy: {e}", 'error')
self.log(f"Total: {len(posts)} new page-posts with "
f"{total_photos} photos across all pages", 'info')
return posts
# ------------------------------------------------------------------
# URL pattern discovery
# ------------------------------------------------------------------
async def _discover_url_pattern(self, session: aiohttp.ClientSession,
                                photo_id: str, cid: str,
                                celeb_slug: str) -> Optional[Dict]:
    """Visit one photo's detail page to learn the full-res URL pattern.

    The page is searched for a full-res image URL that contains
    ``photo_id``; that URL is then split into a template in which the ID
    is replaced by ``{ID}``.

    Args:
        session: Open client session used for the request.
        photo_id: ID of the photo whose detail page is probed.
        cid: Celebrity ID used in the detail URL.
        celeb_slug: URL slug of the celebrity.

    Returns:
        Dict with keys ``server``, ``dir_pattern``, ``name_format``,
        ``ext`` and ``example_url``; ``None`` if the page cannot be
        fetched or holds no full-res URL for this photo.
    """
    detail_url = (f'{self.BASE_URL}/section/celeb-photogallery/'
                  f'cid-{cid}/{celeb_slug}/photo-{photo_id}.html')
    await asyncio.sleep(2)  # Rate limit
    page_html = await self._fetch_page(session, detail_url)
    if page_html is None:
        return None

    # Expected form:
    # https://euX.besteyecandy.com/section/large-photos/area-female/besteyecandy-{ID}/{Name}_{ID}_BestEyeCandyCOM.jpg
    patterns = [
        r'(https?://[a-z0-9]+\.besteyecandy\.com/section/large-photos/[^"\'>\s]+)',
        r'(https?://[a-z0-9]+\.besteyecandy\.com/[^"\'>\s]*besteyecandy-' + re.escape(photo_id) + r'[^"\'>\s]*)',
    ]
    full_res_url = None
    for pattern in patterns:
        # Only a URL that contains this photo's ID can be templated. A
        # large-photos URL belonging to some other photo would give a
        # pattern without {ID}, and every constructed URL would then be
        # the same image.
        full_res_url = next(
            (candidate for candidate in re.findall(pattern, page_html)
             if photo_id in candidate),
            None,
        )
        if full_res_url:
            break
    if not full_res_url:
        self.log(f"Could not find full-res URL on detail page for photo {photo_id}",
                 'error')
        return None
    self.log(f"Found full-res URL: {full_res_url}", 'debug')

    parsed = urlparse(full_res_url)
    # e.g. /section/large-photos/area-female/besteyecandy-7727820 and
    #      Myleene_Klass_7727820_BestEyeCandyCOM.jpg
    path_dir, _, filename = parsed.path.rpartition('/')
    if '.' in filename:
        name_without_ext, ext = filename.rsplit('.', 1)
    else:
        name_without_ext, ext = filename, 'jpg'
    return {
        'server': parsed.netloc,  # e.g. eu4.besteyecandy.com
        'dir_pattern': path_dir.replace(photo_id, '{ID}'),
        'name_format': name_without_ext.replace(photo_id, '{ID}'),
        'ext': ext,
        'example_url': full_res_url,
    }
def _construct_full_res_url(self, url_pattern: Dict, photo_id: str) -> str:
"""Construct the full-res URL for a photo ID using the discovered pattern."""
dir_path = url_pattern['dir_pattern'].replace('{ID}', photo_id)
filename = url_pattern['name_format'].replace('{ID}', photo_id) + '.' + url_pattern['ext']
return f"https://{url_pattern['server']}{dir_path}/{filename}"
# ------------------------------------------------------------------
# HTML parsing helpers
# ------------------------------------------------------------------
def _extract_photo_ids(self, page_html: str) -> List[str]:
"""Extract photo IDs from a listing page.
Photo links look like: href="...photo-12345.html"
"""
ids = re.findall(r'href="[^"]*photo-(\d+)\.html"', page_html)
# Deduplicate while preserving order
seen = set()
unique_ids = []
for pid in ids:
if pid not in seen:
seen.add(pid)
unique_ids.append(pid)
return unique_ids
@staticmethod
def _extract_celeb_name(page_html: str) -> Optional[str]:
"""Extract celebrity name from the page."""
# Try <title> tag: "Myleene Klass Photo Collection @ ...::: BestEyeCandy.com :::..."
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
if m:
title = html.unescape(m.group(1).strip())
# Remove everything from "Photo Collection" or "@" onwards
title = re.sub(r'\s*Photo\s+Collection.*$', '', title,
flags=re.IGNORECASE).strip()
title = re.sub(r'\s*@.*$', '', title).strip()
# Fallback: remove BestEyeCandy suffix
title = re.sub(r'\s*[-\u2013\u2014|]?\s*\.{0,3}:{0,3}\s*BestEyeCandy.*$', '',
title, flags=re.IGNORECASE).strip()
if title:
return title
# Try <h1> or <h2>
m = re.search(r'<h[12][^>]*>([^<]+)</h[12]>', page_html)
if m:
return html.unescape(m.group(1).strip())
return None
@staticmethod
def _extract_total_photos(page_html: str) -> int:
"""Extract total photo count from the page.
Handles European format (15.660) and US format (15,660).
"""
# Look for "N.NNN photos" or "N,NNN photos" or "NNN photos"
# Require leading digit to avoid matching ", photo" from keywords
m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
if m:
num_str = m.group(1)
# European format uses dots as thousands separators: 15.660
# US format uses commas: 15,660
# Remove both dots and commas (they're thousands separators)
num_str = num_str.replace('.', '').replace(',', '')
try:
return int(num_str)
except ValueError:
pass
return 0
@staticmethod
def _extract_page_count(page_html: str, photos_per_page: int = 48) -> int:
"""Extract total page count from the listing page.
Uses total photo count divided by photos per page, or falls back
to finding the maximum page number in pagination links.
"""
# Method 1: Calculate from total photos
m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
if m:
num_str = m.group(1).replace('.', '').replace(',', '')
try:
total = int(num_str)
if total > 0:
return (total + photos_per_page - 1) // photos_per_page
except ValueError:
pass
# Method 2: Find max page-N in pagination links for same celeb
page_nums = [int(x) for x in re.findall(r'/page-(\d+)/', page_html)]
if page_nums:
return max(page_nums)
return 1
@staticmethod
def _has_next_page(page_html: str) -> bool:
"""Check if there's a 'Next Page' link on the current page."""
return 'alt="Next Page"' in page_html
# ------------------------------------------------------------------
# Utility helpers
# ------------------------------------------------------------------
async def _fetch_page(self, session: aiohttp.ClientSession,
                      url: str) -> Optional[str]:
    """GET ``url`` with the client's headers and return the body text.

    Returns ``None`` (after logging a warning) on any non-200 status or
    request error.
    """
    try:
        async with session.get(url, headers=self.HEADERS,
                               allow_redirects=True) as resp:
            if resp.status == 200:
                return await resp.text()
            self.log(f"HTTP {resp.status} for {url}", 'warning')
            return None
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
        return None

View File

@@ -0,0 +1,622 @@
"""
Coppermine Gallery scraper client.
Coppermine is a PHP photo gallery with a nested structure:
categories > sub-categories > albums > photos
One album maps to one Post with N Attachments.
Full-res URLs are derived from thumbnails by stripping the `thumb_` prefix.
"""
import asyncio
import re
from datetime import datetime
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse, parse_qs
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class CoppermineClient(LoggingMixin):
SERVICE_ID = 'coppermine'
PLATFORM = 'coppermine'
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
}
IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
def __init__(self, log_callback=None):
    """Set up logging for the Coppermine client.

    Args:
        log_callback: Optional callback forwarded to ``_init_logger``.
    """
    # Logger name 'PaidContent', default module label 'Coppermine'
    self._init_logger('PaidContent', log_callback, default_module='Coppermine')
async def get_profile_info(self, gallery_url: str) -> Optional[Dict]:
    """Fetch the gallery root page and extract profile metadata.

    Args:
        gallery_url: Base gallery URL (e.g. https://kylie-jenner.org/gallery)

    Returns:
        Dict with ``username`` (host name without a leading ``www.``),
        ``display_name`` (page title with HTML entities decoded),
        ``post_count`` (album total from the "N files in M albums" text,
        0 when absent) and ``gallery_url``; ``None`` on failure.
    """
    # Local import: this module does not import ``html`` at file level.
    from html import unescape

    root_url = self._build_url(gallery_url, 'index.php')
    timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            page_html = await self._fetch_page(session, root_url)
        if not page_html:
            return None

        # Site title from the <title> tag. Entities are decoded rather than
        # deleted, so "Kylie&#039;s Gallery" keeps its apostrophe.
        title_match = re.search(r'<title[^>]*>(.*?)</title>', page_html,
                                re.DOTALL | re.IGNORECASE)
        if title_match:
            site_title = unescape(title_match.group(1)).strip()
        else:
            site_title = 'Coppermine Gallery'

        # Stats text: "N files in M albums" (only the album count is reported)
        total_albums = 0
        stats_match = re.search(
            r'(\d[\d,]*)\s+files?\s+in\s+(\d[\d,]*)\s+albums?',
            page_html, re.IGNORECASE
        )
        if stats_match:
            total_albums = int(stats_match.group(2).replace(',', ''))

        # Use the host as username; drop only a leading "www."
        domain = urlparse(gallery_url).netloc
        if domain.startswith('www.'):
            domain = domain[len('www.'):]

        return {
            'username': domain,
            'display_name': site_title,
            'post_count': total_albums,
            'gallery_url': gallery_url,
        }
    except Exception as e:
        self.log(f"Error fetching profile info from {gallery_url}: {e}", 'error')
        return None
async def get_posts(self, gallery_url: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None,
                    post_callback=None):
    """Crawl the gallery and deliver new albums as Post objects.

    Phase 1: Fetch root, extract top-level category links
    Phase 2: Recursively crawl categories until album links found
    Phase 3: For each new album (ascending numeric ID), fetch its
             thumbnails and deliver the Post immediately

    Args:
        gallery_url: Base gallery URL
        known_post_ids: Set of post IDs already in DB (album_NNN)
        progress_callback: Called with status message strings
        post_callback: async callable(post) — awaited for each album as it
            is fetched. If provided, posts are streamed instead of collected.

    Returns:
        List of Post objects when ``post_callback`` is None (empty list on
        failure); None when ``post_callback`` is provided.
    """
    known = known_post_ids or set()
    # No overall deadline: a full crawl can take arbitrarily long, so only
    # per-connection and per-read timeouts are enforced.
    timeout = aiohttp.ClientTimeout(total=None, sock_connect=30, sock_read=60)
    posts_collected = [] if post_callback is None else None
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # Phase 1: Get all category links from root
            root_url = self._build_url(gallery_url, 'index.php')
            root_html = await self._fetch_page(session, root_url)
            if not root_html:
                self.log("Failed to fetch gallery root", 'error')
                return [] if post_callback is None else None
            category_ids = self._extract_category_ids(root_html)
            self.log(f"Found {len(category_ids)} top-level categories", 'info')
            if progress_callback:
                progress_callback(f'Found {len(category_ids)} categories, crawling...')

            # Phase 2: Recursively crawl categories to find album IDs
            album_ids = set()
            visited_cats = set()
            for cat_id in category_ids:
                new_albums = await self._crawl_category(
                    session, gallery_url, cat_id, visited_cats, known, progress_callback
                )
                album_ids.update(new_albums)

            # Filter out known albums
            new_album_ids = {aid for aid in album_ids
                             if f"album_{aid}" not in known}
            self.log(f"Found {len(new_album_ids)} new albums "
                     f"({len(album_ids)} total, {len(album_ids) - len(new_album_ids)} known)",
                     'info')
            if progress_callback:
                progress_callback(f'Found {len(new_album_ids)} new albums, fetching photos...')

            # Phase 3: Fetch each new album and deliver Post objects
            parsed = urlparse(gallery_url)
            domain = parsed.netloc.replace('www.', '')
            fetched = 0
            # IDs are digit strings; sort numerically so "9" precedes "10"
            # (a plain string sort would process them lexicographically).
            for i, album_id in enumerate(sorted(new_album_ids, key=int)):
                if progress_callback and (i + 1) % 5 == 0:
                    progress_callback(
                        f'Fetching album {i + 1}/{len(new_album_ids)}...'
                    )
                post = await self._fetch_album(session, gallery_url, album_id, domain)
                if post and post.attachments:
                    fetched += 1
                    if post_callback:
                        await post_callback(post)
                    else:
                        posts_collected.append(post)
                # Rate limit: 2s pause between albums
                await asyncio.sleep(2)
            self.log(f"Fetched {fetched} albums with attachments", 'info')
            return posts_collected
    except Exception as e:
        # Any failure (including one raised by post_callback) aborts the crawl.
        self.log(f"Error crawling gallery {gallery_url}: {e}", 'error')
        return [] if post_callback is None else None
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
def _build_url(self, gallery_url: str, page: str) -> str:
"""Build a full URL from the gallery base and a page name."""
base = gallery_url.rstrip('/')
return f"{base}/{page}"
async def _fetch_page(self, session: aiohttp.ClientSession, url: str,
                      max_retries: int = 3) -> Optional[str]:
    """Fetch a page and return its HTML text, or None on failure.

    Retries with a linearly growing delay on HTTP 429 (5s, 10s, ...) and on
    connection errors, server disconnects and timeouts (3s, 6s, ...).
    Any other non-200 status or unexpected error fails immediately.

    Args:
        session: aiohttp session used for the request
        url: Absolute URL to fetch
        max_retries: Maximum number of attempts

    Returns:
        Response body as text, or None when all attempts fail.
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            async with session.get(url, headers=self.HEADERS) as resp:
                if resp.status == 429:
                    if is_last_attempt:
                        # No further attempt will follow, so waiting is pointless.
                        self.log(f"Rate limited on {url}, giving up after "
                                 f"{max_retries} attempts", 'warning')
                        return None
                    wait = 5 * (attempt + 1)
                    self.log(f"Rate limited on {url}, waiting {wait}s", 'warning')
                    await asyncio.sleep(wait)
                    continue
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} fetching {url}", 'warning')
                    return None
                return await resp.text()
        except (aiohttp.ServerDisconnectedError, aiohttp.ClientOSError,
                aiohttp.ClientPayloadError, ConnectionResetError,
                asyncio.TimeoutError) as e:
            # Transient network problems (now including timeouts) are retried.
            wait = 3 * (attempt + 1)
            if not is_last_attempt:
                self.log(f"Connection error on {url}, retry {attempt + 1} in {wait}s: {e}",
                         'warning')
                await asyncio.sleep(wait)
            else:
                self.log(f"Failed after {max_retries} attempts: {url}: {e}", 'warning')
                return None
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None
    return None
def _extract_category_ids(self, html: str) -> List[str]:
"""Extract category IDs from index.php page.
Looks for links like: index.php?cat=N
"""
cat_ids = []
seen = set()
for match in re.finditer(r'index\.php\?cat=(\d+)', html):
cat_id = match.group(1)
if cat_id not in seen:
seen.add(cat_id)
cat_ids.append(cat_id)
return cat_ids
def _extract_album_ids(self, html: str) -> List[str]:
"""Extract album IDs from a category page.
Looks for links like: thumbnails.php?album=N
"""
album_ids = []
seen = set()
for match in re.finditer(r'thumbnails\.php\?album=(\d+)', html):
album_id = match.group(1)
if album_id not in seen:
seen.add(album_id)
album_ids.append(album_id)
return album_ids
def _extract_page_count(self, html: str) -> int:
"""Extract total page count from Coppermine pagination text.
Looks for patterns like "53 albums on 2 page(s)" or "N files on M page(s)".
"""
match = re.search(r'on\s+(\d+)\s+page\(s\)', html, re.IGNORECASE)
if match:
return int(match.group(1))
return 1
async def _crawl_category(self, session: aiohttp.ClientSession,
                          gallery_url: str, cat_id: str,
                          visited: Set[str], known: Set[str],
                          progress_callback=None,
                          depth: int = 0) -> Set[str]:
    """Collect every album ID reachable from one category.

    A category page may list albums (thumbnails.php?album=N), further
    sub-categories, or both. All pages of the category are read
    (index.php?cat=N&page=M) and each unseen sub-category is crawled
    recursively.

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        cat_id: Category ID to crawl
        visited: Category IDs already crawled; updated in place (loop guard)
        known: Set of known post_ids; only passed down to recursive calls
        progress_callback: Status callback
        depth: Current recursion depth; crawling stops beyond 10

    Returns:
        Set of album ID strings
    """
    if depth > 10 or cat_id in visited:
        return set()
    visited.add(cat_id)

    # First page of the category
    first_html = await self._fetch_page(
        session, self._build_url(gallery_url, f'index.php?cat={cat_id}')
    )
    if not first_html:
        return set()
    await asyncio.sleep(2)

    found_albums = set(self._extract_album_ids(first_html))
    child_cats = self._extract_category_ids(first_html)

    # Remaining pages (the range is empty when there is only one page)
    page_total = self._extract_page_count(first_html)
    for page_no in range(2, page_total + 1):
        next_html = await self._fetch_page(
            session,
            self._build_url(gallery_url, f'index.php?cat={cat_id}&page={page_no}'),
        )
        if next_html:
            # Only albums are collected here; sub-categories were already
            # taken from the first page.
            found_albums.update(self._extract_album_ids(next_html))
        await asyncio.sleep(2)

    # Drop the category itself and anything already crawled
    child_cats = [c for c in child_cats if not (c == cat_id or c in visited)]

    if progress_callback:
        pages_note = f' ({page_total} pages)' if page_total > 1 else ''
        progress_callback(
            f'Category {cat_id}: {len(found_albums)} albums, '
            f'{len(child_cats)} sub-categories'
            + pages_note
        )

    # Descend into each sub-category
    for child_id in child_cats:
        found_albums.update(
            await self._crawl_category(
                session, gallery_url, child_id, visited, known,
                progress_callback, depth + 1
            )
        )
    return found_albums
async def _fetch_album(self, session: aiohttp.ClientSession,
                       gallery_url: str, album_id: str,
                       domain: str) -> Optional[Post]:
    """Download every page of an album and turn it into a Post.

    Reads thumbnails.php?album=N and, when the pagination text reports more
    pages, thumbnails.php?album=N&page=M for each of them.

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        album_id: Album ID to fetch
        domain: Domain name stored as the Post's creator_id

    Returns:
        Post whose ``content`` is the album title and whose ``title`` is
        None, or None when the first page cannot be fetched or no
        attachments are found.
    """
    first_html = await self._fetch_page(
        session, self._build_url(gallery_url, f'thumbnails.php?album={album_id}')
    )
    if not first_html:
        return None

    # Title and first batch of photos come from page one
    album_title = self._extract_album_title(first_html) or f"Album {album_id}"
    photos = self._extract_attachments(first_html, gallery_url)

    # Remaining pages (the range is empty for single-page albums)
    page_total = self._extract_page_count(first_html)
    for page_no in range(2, page_total + 1):
        next_html = await self._fetch_page(
            session,
            self._build_url(
                gallery_url, f'thumbnails.php?album={album_id}&page={page_no}'
            ),
        )
        if next_html:
            photos.extend(self._extract_attachments(next_html, gallery_url))
        await asyncio.sleep(2)

    if not photos:
        return None

    # Date is derived from the first page (breadcrumb year + title month/day)
    published = self._extract_album_date(first_html, album_title)
    return Post(
        post_id=f"album_{album_id}",
        service_id=self.SERVICE_ID,
        platform=self.PLATFORM,
        creator_id=domain,
        title=None,
        content=album_title,
        published_at=published,
        attachments=photos,
    )
def _extract_album_title(self, html: str) -> Optional[str]:
    """Work out the album title from an album page.

    Sources are tried in order: the last breadcrumb segment, the first
    <h1>/<h2>/<h3> heading, then the longest " - "-separated part of the
    <title> tag. Returns None when none of them yields a title.
    """
    flags = re.DOTALL | re.IGNORECASE

    # 1) Breadcrumb, e.g. "Home > Category > Sub > Album Title"
    crumb = re.search(
        r'class="[^"]*breadcrumb[^"]*"[^>]*>(.*?)</(?:div|span|td|p)',
        html, flags
    )
    if crumb:
        # Replace tags by spaces, then keep the non-empty pieces between ">"
        flattened = re.sub(r'<[^>]+>', ' ', crumb.group(1))
        segments = [seg.strip() for seg in flattened.split('>') if seg.strip()]
        if segments:
            candidate = self._clean_text(segments[-1])
            # Generic root labels are not album titles
            if candidate and candidate.lower() not in ('home', 'index', 'gallery'):
                return candidate

    # 2) Headings, highest level first
    for level in ('h1', 'h2', 'h3'):
        heading = re.search(rf'<{level}[^>]*>(.*?)</{level}>', html, flags)
        if not heading:
            continue
        candidate = self._clean_text(heading.group(1))
        if candidate and len(candidate) > 2:
            return candidate

    # 3) <title> tag: usually "Site Name - Album Title" or the reverse;
    #    heuristic is that the longest part is the album name
    page_title = re.search(r'<title[^>]*>(.*?)</title>', html, flags)
    if page_title:
        text = page_title.group(1).strip()
        if ' - ' in text:
            text = max((piece.strip() for piece in text.split(' - ')), key=len)
        if text:
            return self._clean_text(text)
    return None
def _extract_album_date(self, html: str, title: str) -> str:
"""Extract album date from breadcrumb year + title month/day.
Breadcrumb: "Home > Candids > 2026 > January 11 - Leaving..."
Title: "January 11 - Leaving Golden Globes afterparty..."
Returns ISO date string, or current datetime as fallback.
"""
MONTHS = {
'january': 1, 'february': 2, 'march': 3, 'april': 4,
'may': 5, 'june': 6, 'july': 7, 'august': 8,
'september': 9, 'october': 10, 'november': 11, 'december': 12,
}
# Extract year from breadcrumb path (look for 4-digit year in links)
year = None
# Breadcrumb links: index.php?cat=155">2026</a>
for m in re.finditer(r'>\s*((?:19|20)\d{2})\s*</', html):
year = int(m.group(1))
# Also try path segments in albums/ URLs for year
if not year:
path_match = re.search(r'albums/[^/]+/(20\d{2})/', html)
if path_match:
year = int(path_match.group(1))
# Extract month and day from album title
month, day = None, None
if title:
# "January 11 - ..." or "March 3 - ..."
date_match = re.match(
r'(\w+)\s+(\d{1,2})\b', title
)
if date_match:
month_name = date_match.group(1).lower()
if month_name in MONTHS:
month = MONTHS[month_name]
day = int(date_match.group(2))
# Build date from breadcrumb year + title month/day
if year and month and day:
try:
return datetime(year, month, day).isoformat()
except ValueError:
pass
if year and month:
try:
return datetime(year, month, 1).isoformat()
except ValueError:
pass
if year:
return datetime(year, 1, 1).isoformat()
# Fallback: parse "Date added=Jan 13, 2026" from thumbnail tooltips
MONTH_ABBR = {
'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}
added_match = re.search(
r'Date added\s*=\s*(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
)
if added_match:
m_abbr = added_match.group(1).lower()
if m_abbr in MONTH_ABBR:
try:
return datetime(
int(added_match.group(3)),
MONTH_ABBR[m_abbr],
int(added_match.group(2))
).isoformat()
except ValueError:
pass
# Also try "last one added on Jan 13, 2026" from album_stat
stat_match = re.search(
r'last one added on\s+(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
)
if stat_match:
m_abbr = stat_match.group(1).lower()
if m_abbr in MONTH_ABBR:
try:
return datetime(
int(stat_match.group(3)),
MONTH_ABBR[m_abbr],
int(stat_match.group(2))
).isoformat()
except ValueError:
pass
return datetime.now().isoformat()
def _extract_attachments(self, html: str, gallery_url: str) -> List[Attachment]:
    """Build one Attachment per distinct photo found on an album page.

    Thumbnail ``<img>`` sources under ``albums/`` carrying a ``thumb_`` or
    ``normal_`` prefix are converted to full-resolution URLs. If none are
    found, images wrapped in ``displayimage.php`` links are used instead.
    """
    collected = []
    seen_urls = set()

    def _collect(thumb_src):
        # Convert, de-duplicate on the full-res URL, then record the file.
        full_url = self._thumb_to_fullres(thumb_src, gallery_url)
        if not full_url or full_url in seen_urls:
            return
        seen_urls.add(full_url)
        file_name = full_url.rsplit('/', 1)[-1]
        _, dot, suffix = file_name.rpartition('.')
        ext = suffix.lower() if dot else ''
        collected.append(Attachment(
            name=file_name,
            server_path=full_url,  # use as dedup key
            file_type='image' if ext in self.IMAGE_EXTS else 'unknown',
            extension=ext or None,
            download_url=full_url,
        ))

    # Primary pattern:
    #   <img src="albums/path/thumb_filename.jpg" ...>
    #   <img src="albums/path/normal_filename.jpg" ...>
    primary = re.finditer(
        r'<img[^>]+src=["\']([^"\']*?albums/[^"\']*?(?:thumb_|normal_)[^"\']+)["\']',
        html, re.IGNORECASE
    )
    for hit in primary:
        _collect(hit.group(1))

    # Fallback for themes that wrap thumbnails in links:
    #   <a href="displayimage.php?..."><img src="albums/...">
    if not collected:
        wrapped = re.finditer(
            r'<a[^>]+href=["\'][^"\']*displayimage\.php[^"\']*["\'][^>]*>'
            r'\s*<img[^>]+src=["\']([^"\']+)["\']',
            html, re.IGNORECASE | re.DOTALL
        )
        for hit in wrapped:
            _collect(hit.group(1))
    return collected
def _thumb_to_fullres(self, thumb_src: str, gallery_url: str) -> Optional[str]:
"""Convert a thumbnail URL to a full-resolution URL.
Strips `thumb_` or `normal_` prefix from the filename and
prepends the gallery base URL if needed.
Args:
thumb_src: Thumbnail src attribute value
gallery_url: Base gallery URL
Returns:
Full-resolution image URL, or None if conversion fails
"""
if not thumb_src:
return None
# Strip thumb_ or normal_ prefix from filename
# e.g. albums/candids/2026/0111/thumb_001.jpg → albums/candids/2026/0111/001.jpg
fullres_path = re.sub(r'(/)(?:thumb_|normal_)', r'\1', thumb_src)
# If the path is already absolute (starts with http), return as-is
if fullres_path.startswith(('http://', 'https://')):
return fullres_path
# Otherwise, make it absolute relative to gallery URL
base = gallery_url.rstrip('/')
fullres_path = fullres_path.lstrip('./')
return f"{base}/{fullres_path}"
def _clean_text(self, text: str) -> str:
"""Clean HTML entities and whitespace from text."""
text = re.sub(r'&amp;', '&', text)
text = re.sub(r'&lt;', '<', text)
text = re.sub(r'&gt;', '>', text)
text = re.sub(r'&quot;', '"', text)
text = re.sub(r'&#\d+;', '', text)
text = re.sub(r'&\w+;', '', text)
text = re.sub(r'<[^>]+>', '', text)
return text.strip()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,297 @@
"""
Embed Downloader - Downloads embedded videos from posts using yt-dlp
Supports: YouTube, Vimeo, Dailymotion, Twitch, and many other platforms
"""
import asyncio
import json
import os
import subprocess
from pathlib import Path
from typing import Dict, Optional
from modules.base_module import LoggingMixin
class EmbedDownloader(LoggingMixin):
"""
Download embedded videos from posts using yt-dlp
Wrapper around yt-dlp for downloading videos from various platforms
embedded in creator posts.
"""
# Quality presets for yt-dlp
QUALITY_PRESETS = {
'best': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
'1080p': 'bestvideo[height<=1080][ext=mp4]+bestaudio[ext=m4a]/best[height<=1080][ext=mp4]/best',
'720p': 'bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/best[height<=720][ext=mp4]/best',
'480p': 'bestvideo[height<=480][ext=mp4]+bestaudio[ext=m4a]/best[height<=480][ext=mp4]/best',
'audio': 'bestaudio[ext=m4a]/bestaudio/best',
}
def __init__(self, ytdlp_path: Optional[str] = None, log_callback=None):
    """Set up logging and locate the yt-dlp executable.

    Args:
        ytdlp_path: Explicit path to yt-dlp. When falsy, ``_find_ytdlp()``
            is used to search for it.
        log_callback: Optional callback handed to ``_init_logger``.
    """
    self._init_logger('PaidContent', log_callback, default_module='Embed')
    # Find yt-dlp executable
    self.ytdlp_path = ytdlp_path or self._find_ytdlp()
    if not self.ytdlp_path:
        # Construction still succeeds; callers can check is_available().
        self.log("yt-dlp not found, embed downloading will be disabled", 'warning')
def _find_ytdlp(self) -> Optional[str]:
"""Find yt-dlp executable"""
# Check common locations
common_paths = [
'/usr/local/bin/yt-dlp',
'/usr/bin/yt-dlp',
'/opt/homebrew/bin/yt-dlp',
os.path.expanduser('~/.local/bin/yt-dlp'),
]
for path in common_paths:
if os.path.isfile(path) and os.access(path, os.X_OK):
return path
# Try to find via which
try:
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def is_available(self) -> bool:
    """Check if yt-dlp is available.

    Returns:
        True when ``self.ytdlp_path`` is not None.
    """
    return self.ytdlp_path is not None
async def download(self, url: str, output_dir: Path, quality: str = 'best',
                   filename_template: str = None,
                   timeout: Optional[float] = None) -> Dict:
    """
    Download video from URL

    Args:
        url: Video URL to download
        output_dir: Directory to save the video (created if missing)
        quality: Quality preset ('best', '1080p', '720p', '480p', 'audio');
            unknown values fall back to 'best'
        filename_template: Optional custom yt-dlp output template,
            relative to ``output_dir``
        timeout: Optional limit in seconds for the whole yt-dlp run.
            None (default) waits indefinitely. On expiry the process is
            killed and a 'Download timed out' failure is returned.

    Returns:
        Dict with ``success`` and, on success, file info (``file_path``,
        ``filename``, ``file_size`` and, when yt-dlp's JSON could be parsed,
        ``title``, ``duration``, ``uploader``, ``upload_date``, ``video_id``,
        ``platform``); on failure an ``error`` message.
    """
    if not self.is_available():
        return {
            'success': False,
            'error': 'yt-dlp not available'
        }
    try:
        # Create output directory
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Build output template
        if filename_template:
            output_template = str(output_dir / filename_template)
        else:
            output_template = str(output_dir / 'embed_%(title).50s_%(id)s.%(ext)s')

        # Get format string
        format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])

        # Build command
        cmd = [
            self.ytdlp_path,
            '--no-playlist',
            '--no-warnings',
            '-f', format_str,
            '--merge-output-format', 'mp4',
            '-o', output_template,
            '--print-json',  # Output JSON with video info
            '--',  # End of options: a URL starting with "-" must not be parsed as a flag
            url,
        ]
        self.log(f"Downloading embed: {url}", 'debug')

        # Run yt-dlp
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        except asyncio.TimeoutError:
            # Do not leave an orphaned yt-dlp process behind.
            try:
                proc.kill()
            except ProcessLookupError:
                pass
            await proc.wait()
            self.log(f"yt-dlp timed out after {timeout}s: {url}", 'warning')
            return {
                'success': False,
                'error': 'Download timed out'
            }

        if proc.returncode != 0:
            error_msg = stderr.decode('utf-8', errors='replace').strip()
            # Try to extract useful error message
            if 'Video unavailable' in error_msg:
                error_msg = 'Video unavailable or private'
            elif 'age-restricted' in error_msg.lower():
                error_msg = 'Video is age-restricted'
            elif 'members only' in error_msg.lower():
                error_msg = 'Video is members-only'
            elif len(error_msg) > 200:
                error_msg = error_msg[:200] + '...'
            self.log(f"yt-dlp failed: {error_msg}", 'warning')
            return {
                'success': False,
                'error': error_msg or f'yt-dlp exited with code {proc.returncode}'
            }

        # Parse output JSON: first stdout line that decodes to a JSON object
        stdout_text = stdout.decode('utf-8', errors='replace')
        video_info = None
        for line in stdout_text.strip().split('\n'):
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                video_info = parsed
                break

        if not video_info:
            # No metadata: fall back to the most recently written default-named file
            files = [p for p in output_dir.glob('embed_*') if p.is_file()]
            if files:
                file_path = max(files, key=lambda p: p.stat().st_mtime)
                return {
                    'success': True,
                    'file_path': str(file_path),
                    'filename': file_path.name,
                    'file_size': file_path.stat().st_size
                }
            return {
                'success': False,
                'error': 'Could not parse yt-dlp output'
            }

        # Extract file info
        file_path = video_info.get('_filename') or video_info.get('filename')
        video_id = video_info.get('id')

        # The reported name can differ from the final file (e.g. after
        # merging to mp4); in that case search the output dir by video id.
        if file_path:
            file_path = Path(file_path)
            if not file_path.exists() and video_id:
                possible_files = list(output_dir.glob(f"*{video_id}*"))
                if possible_files:
                    file_path = possible_files[0]

        # extractor_key/extractor may be present but None; never call .lower() on None
        platform = (video_info.get('extractor_key')
                    or video_info.get('extractor')
                    or 'unknown')
        return {
            'success': True,
            'file_path': str(file_path) if file_path else None,
            'filename': file_path.name if file_path else None,
            'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'),
            'title': video_info.get('title'),
            'duration': video_info.get('duration'),
            'uploader': video_info.get('uploader'),
            'upload_date': video_info.get('upload_date'),
            'video_id': video_id,
            'platform': str(platform).lower()
        }
    except Exception as e:
        self.log(f"Error downloading embed: {e}", 'error')
        return {
            'success': False,
            'error': str(e)
        }
async def get_video_info(self, url: str) -> Dict:
    """
    Get video information without downloading

    Args:
        url: Video URL

    Returns:
        Dict with ``success`` plus video metadata (title, duration,
        uploader, upload_date, view_count, like_count, description,
        thumbnail, video_id, platform, number of formats) on success,
        or ``success=False`` with an ``error`` message.
    """
    if not self.is_available():
        return {'success': False, 'error': 'yt-dlp not available'}
    try:
        cmd = [
            self.ytdlp_path,
            '--no-playlist',
            '--no-warnings',
            '-j',  # Output JSON
            '--no-download',
            '--',  # End of options: a URL starting with "-" must not be parsed as a flag
            url,
        ]
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            error_msg = stderr.decode('utf-8', errors='replace').strip()
            return {
                'success': False,
                'error': error_msg or f'yt-dlp exited with code {proc.returncode}'
            }

        # Tolerate stray invalid bytes in metadata instead of failing the lookup
        video_info = json.loads(stdout.decode('utf-8', errors='replace'))

        # extractor_key/extractor may be present but None; never call .lower() on None
        platform = (video_info.get('extractor_key')
                    or video_info.get('extractor')
                    or 'unknown')
        return {
            'success': True,
            'title': video_info.get('title'),
            'duration': video_info.get('duration'),
            'uploader': video_info.get('uploader'),
            'upload_date': video_info.get('upload_date'),
            'view_count': video_info.get('view_count'),
            'like_count': video_info.get('like_count'),
            'description': video_info.get('description'),
            'thumbnail': video_info.get('thumbnail'),
            'video_id': video_info.get('id'),
            'platform': str(platform).lower(),
            'formats': len(video_info.get('formats') or [])
        }
    except Exception as e:
        self.log(f"Error getting video info: {e}", 'error')
        return {
            'success': False,
            'error': str(e)
        }
@staticmethod
def detect_platform(url: str) -> Optional[str]:
"""Detect video platform from URL"""
url_lower = url.lower()
if 'youtube.com' in url_lower or 'youtu.be' in url_lower:
return 'youtube'
elif 'vimeo.com' in url_lower:
return 'vimeo'
elif 'dailymotion.com' in url_lower:
return 'dailymotion'
elif 'twitch.tv' in url_lower:
return 'twitch'
elif 'twitter.com' in url_lower or 'x.com' in url_lower:
return 'twitter'
elif 'tiktok.com' in url_lower:
return 'tiktok'
elif 'instagram.com' in url_lower:
return 'instagram'
elif 'reddit.com' in url_lower:
return 'reddit'
return None
@staticmethod
def is_supported_url(url: str) -> bool:
    """Check if URL is from a supported platform.

    Returns:
        True exactly when ``EmbedDownloader.detect_platform(url)`` returns
        something other than None.
    """
    return EmbedDownloader.detect_platform(url) is not None

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,529 @@
"""
Download files from external file hosting services
Supports: Bunkr, Pixeldrain, Gofile, Cyberdrop
"""
import asyncio
import re
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse, parse_qs
import aiohttp
from modules.base_module import LoggingMixin, RateLimitMixin
class FileHostDownloader(LoggingMixin, RateLimitMixin):
"""
Download files from various file hosting services
Used for manual import of PPV content
"""
SUPPORTED_HOSTS = {
'bunkr': ['bunkr.sk', 'bunkr.si', 'bunkr.la', 'bunkrr.ru', 'bunkr.ph', 'bunkr.is', 'bunkr.ac', 'bunkr.cr'],
'pixeldrain': ['pixeldrain.com'],
'gofile': ['gofile.io'],
'cyberdrop': ['cyberdrop.me', 'cyberdrop.to', 'cyberdrop.cc'],
'fileditch': ['fileditchfiles.me', 'fileditch.me'],
}
# Bunkr CDN servers (food-themed) - try in order
BUNKR_CDNS = [
'i-soup.bunkr.ru',
'i-burger.bunkr.ru',
'i-pizza.bunkr.ru',
'i-taco.bunkr.ru',
'i-fries.bunkr.ru',
'i-hotdog.bunkr.ru',
'i-nachos.bunkr.ru',
'i-sushi.bunkr.ru',
'i-ramen.bunkr.ru',
'i-curry.bunkr.ru',
'i-kebab.bunkr.ru',
'i-pasta.bunkr.ru',
'i-steak.bunkr.ru',
'i-salad.bunkr.ru',
'i-sandwich.bunkr.ru',
'i-waffle.bunkr.ru',
'i-pancake.bunkr.ru',
'i-donut.bunkr.ru',
'i-cookie.bunkr.ru',
'i-cake.bunkr.ru',
'i-bacon.bunkr.ru',
'i-cheese.bunkr.ru',
'i-chicken.bunkr.ru',
'i-fish.bunkr.ru',
'i-noodle.bunkr.ru',
'i-rice.bunkr.ru',
'i-bread.bunkr.ru',
'burger.bunkr.ru',
'pizza.bunkr.ru',
'milkshake.bunkr.ru',
]
def __init__(self, log_callback=None, progress_callback=None):
    """Set up logging, rate limiting and the progress callback.

    Args:
        log_callback: Optional callback handed to ``_init_logger``.
        progress_callback: Optional callable stored on the instance;
            called with (downloaded_bytes, total_bytes, filename).
    """
    self._init_logger('PaidContent', log_callback, default_module='FileHost')
    # NOTE(review): delay bounds are presumably seconds — confirm in RateLimitMixin.
    self._init_rate_limiter(min_delay=1, max_delay=3)
    self.progress_callback = progress_callback  # Called with (downloaded_bytes, total_bytes, filename)
def detect_host(self, url: str) -> Optional[str]:
    """Detect which file host a URL belongs to.

    The URL's host name (without port or credentials, and without a
    leading ``www.``) is compared against ``SUPPORTED_HOSTS``.

    Args:
        url: URL to classify

    Returns:
        Key of ``SUPPORTED_HOSTS`` (e.g. 'bunkr', 'gofile'), or None when
        the URL is empty, cannot be parsed, or the host is not listed.
    """
    if not url:
        return None
    try:
        # .hostname is lower-cased and excludes ":port" / "user:pass@"
        hostname = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        return None
    if not hostname:
        return None

    domain = hostname.lower()
    if domain.startswith('www.'):
        # Strip only a leading "www." — never one inside the name
        domain = domain[len('www.'):]

    for host, domains in self.SUPPORTED_HOSTS.items():
        if domain in domains:
            return host
    return None
def is_supported_url(self, url: str) -> bool:
    """Check if URL is from a supported file host.

    Returns:
        True exactly when ``detect_host(url)`` returns a host key.
    """
    return self.detect_host(url) is not None
async def download_url(self, url: str, save_dir: Path) -> Dict:
    """
    Download the file(s) behind a file-host URL into ``save_dir``.

    The host is detected from the URL and the work is delegated to the
    matching ``_download_<host>`` method. The target directory is created
    when missing.

    Returns: {'success': bool, 'files': [paths], 'error': str}
    """
    def _failure(message):
        # Uniform failure result
        return {'success': False, 'files': [], 'error': message}

    host = self.detect_host(url)
    if not host:
        return _failure('Unsupported host')

    handler = getattr(self, f'_download_{host}', None)
    if not handler:
        return _failure(f'No handler for {host}')

    try:
        target_dir = Path(save_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        return await handler(url, target_dir)
    except Exception as exc:
        # Handler errors are reported in the result instead of propagating
        self.log(f"Error downloading from {host}: {exc}", 'error')
        return _failure(str(exc))
async def _download_pixeldrain(self, url: str, save_dir: Path) -> Dict:
    """Download a single file or a list (album) from Pixeldrain.

    Supported URL formats:
        https://pixeldrain.com/u/FILEID  (single file)
        https://pixeldrain.com/l/LISTID  (list of files)

    Args:
        url: Pixeldrain URL
        save_dir: Existing directory the files are written to

    Returns:
        {'success': bool, 'files': [paths], 'error': str or None}.
        Any other URL shape is rejected as invalid. For lists, individual
        file failures are logged and skipped.
    """
    parsed = urlparse(url)
    path_parts = parsed.path.strip('/').split('/')
    # Only /u/<id> and /l/<id> are understood; previously any other type
    # fell through and reported success without downloading anything.
    if len(path_parts) < 2 or path_parts[0] not in ('u', 'l') or not path_parts[1]:
        return {'success': False, 'files': [], 'error': 'Invalid Pixeldrain URL'}
    url_type, file_id = path_parts[0], path_parts[1]

    files = []
    timeout = aiohttp.ClientTimeout(total=300)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        if url_type == 'u':
            # Single file: fetch metadata for the name, then the content
            api_url = f"https://pixeldrain.com/api/file/{file_id}/info"
            async with session.get(api_url) as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'API error: {resp.status}'}
                info = await resp.json()
            download_url = f"https://pixeldrain.com/api/file/{file_id}"
            filename = info.get('name', f'{file_id}.bin')
            save_path = save_dir / self._sanitize_filename(filename)
            await self._download_file(session, download_url, save_path)
            files.append(str(save_path))
        else:
            # List (album)
            api_url = f"https://pixeldrain.com/api/list/{file_id}"
            async with session.get(api_url) as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'API error: {resp.status}'}
                data = await resp.json()
            for i, item in enumerate(data.get('files', [])):
                item_id = item.get('id')
                if not item_id:
                    # A malformed entry must not abort the rest of the list
                    self.log(f"Skipping Pixeldrain list entry {i} without id", 'warning')
                    continue
                self._delay_between_items()
                filename = item.get('name', f'{i:03d}_{item_id}.bin')
                download_url = f"https://pixeldrain.com/api/file/{item_id}"
                save_path = save_dir / self._sanitize_filename(filename)
                try:
                    await self._download_file(session, download_url, save_path)
                    files.append(str(save_path))
                except Exception as e:
                    self.log(f"Failed to download {filename}: {e}", 'warning')
    return {'success': True, 'files': files, 'error': None}
async def _download_gofile(self, url: str, save_dir: Path) -> Dict:
    """Download every file of a Gofile folder.

    Expected URL format: https://gofile.io/d/CONTENTID

    A guest account token is requested first, then the folder's contents
    are listed and each entry of type 'file' is downloaded. Individual
    file failures are logged and skipped.

    Returns: {'success': bool, 'files': [paths], 'error': str or None}
    """
    def _fail(message):
        # Uniform failure result
        return {'success': False, 'files': [], 'error': message}

    segments = urlparse(url).path.strip('/').split('/')
    if not (len(segments) >= 2 and segments[0] == 'd'):
        return _fail('Invalid Gofile URL')
    content_id = segments[1]

    downloaded = []
    client_timeout = aiohttp.ClientTimeout(total=300)
    async with aiohttp.ClientSession(timeout=client_timeout) as session:
        # Guest account token (the API requires a POST for this)
        async with session.post('https://api.gofile.io/accounts') as resp:
            if resp.status != 200:
                return _fail('Failed to get Gofile token')
            account_data = await resp.json()
            if account_data.get('status') != 'ok':
                return _fail(f"Gofile API error: {account_data.get('status')}")
            token = account_data.get('data', {}).get('token')
            if not token:
                return _fail('No Gofile token received')

        # Folder listing; Gofile expects the website token as a header
        headers = {
            'Authorization': f'Bearer {token}',
            'x-website-token': '4fd6sg89d7s6',
        }
        listing_url = f"https://api.gofile.io/contents/{content_id}"
        async with session.get(listing_url, headers=headers) as resp:
            if resp.status == 401:
                return _fail('Gofile authentication failed - websiteToken may have changed')
            if resp.status != 200:
                return _fail(f'Failed to get content: {resp.status}')
            content_data = await resp.json()
            listing_status = content_data.get('status')
            if listing_status == 'error-notPremium':
                return _fail('Gofile requires premium account for API access - try direct download')
            if listing_status != 'ok':
                return _fail(
                    content_data.get('data', {}).get(
                        'message', content_data.get('status', 'Unknown error')
                    )
                )

        children = content_data.get('data', {}).get('children', {})
        for child_id, child in children.items():
            # Sub-folders and other non-file entries are ignored
            if child.get('type') != 'file':
                continue
            self._delay_between_items()
            link = child.get('link')
            filename = child.get('name', f'{child_id}.bin')
            target = save_dir / self._sanitize_filename(filename)
            try:
                await self._download_file(session, link, target, headers=headers)
                downloaded.append(str(target))
            except Exception as exc:
                self.log(f"Failed to download {filename}: {exc}", 'warning')
    return {'success': True, 'files': downloaded, 'error': None}
async def _download_cyberdrop(self, url: str, save_dir: Path) -> Dict:
    """Download an album or a single file from Cyberdrop.

    URL formats:
        Albums: https://cyberdrop.me/a/ALBUMID
        Single files: https://cyberdrop.me/f/FILEID or direct CDN links

    For albums the page HTML is scanned for CDN links; each distinct link
    is downloaded once. Individual file failures are logged and skipped.

    Returns: {'success': bool, 'files': [paths], 'error': str or None}
    """
    files = []
    timeout = aiohttp.ClientTimeout(total=300)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        parsed = urlparse(url)
        path_parts = parsed.path.strip('/').split('/')
        if len(path_parts) >= 2 and path_parts[0] == 'a':
            # Album
            async with session.get(url) as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'Failed to fetch album: {resp.status}'}
                html = await resp.text()

            # Parse file links from HTML
            # Pattern: href="https://fs-XXX.cyberdrop.to/FILE"
            cdn_pattern = r'href="(https://[a-z0-9-]+\.cyberdrop\.[a-z]+/[^"]+)"'
            # The same link can appear more than once on the page;
            # de-duplicate (keeping page order) so each file is fetched once.
            file_urls = list(dict.fromkeys(re.findall(cdn_pattern, html)))
            for i, file_url in enumerate(file_urls):
                self._delay_between_items()
                filename = file_url.split('/')[-1].split('?')[0]
                if not filename:
                    filename = f'{i:03d}.bin'
                save_path = save_dir / self._sanitize_filename(filename)
                try:
                    await self._download_file(session, file_url, save_path)
                    files.append(str(save_path))
                except Exception as e:
                    self.log(f"Failed to download {filename}: {e}", 'warning')
        else:
            # Single file or direct CDN link
            filename = parsed.path.split('/')[-1] or 'download.bin'
            save_path = save_dir / self._sanitize_filename(filename)
            await self._download_file(session, url, save_path)
            files.append(str(save_path))
    return {'success': True, 'files': files, 'error': None}
async def _download_bunkr(self, url: str, save_dir: Path) -> Dict:
    """
    Download a Bunkr album or single file, with CDN fallback per file.

    Album URLs have the form https://bunkr.sk/a/ALBUMID; single files are
    https://bunkr.sk/f/FILEID or https://bunkr.sk/v/VIDEOID.

    Args:
        url: Album or file page URL
        save_dir: Directory the downloaded files are written to

    Returns:
        Dict with 'success' (True when at least one file was saved), 'files'
        and 'error'. When album items fail, 'failed' lists them (page URL if
        no direct link was found, otherwise the filename) and 'error' carries
        the failure count.
    """
    saved = []
    failed = []
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    client_timeout = aiohttp.ClientTimeout(total=600)  # Increased for large files

    async with aiohttp.ClientSession(timeout=client_timeout, headers=browser_headers) as session:
        location = urlparse(url)
        segments = location.path.strip('/').split('/')

        if len(segments) < 2 or segments[0] != 'a':
            # Single file page
            direct_url, file_uuid = await self._get_bunkr_direct_url_with_uuid(session, url)
            if not direct_url:
                return {'success': False, 'files': [], 'error': 'Could not get direct download URL'}
            leaf = direct_url.split('/')[-1].split('?')[0] or 'download.bin'
            target = save_dir / self._sanitize_filename(leaf)
            await self._download_file(session, direct_url, target,
                                      try_cdn_fallback=True, file_uuid=file_uuid)
            saved.append(str(target))
        else:
            # Album page
            async with session.get(url) as page:
                if page.status != 200:
                    return {'success': False, 'files': [], 'error': f'Failed to fetch album: {page.status}'}
                markup = await page.text()

            # Album items are relative /f/ links
            relative_links = re.findall(r'href="(/f/[^"]+)"', markup)
            self.log(f"Found {len(relative_links)} files in Bunkr album", 'info')

            for index, relative in enumerate(relative_links):
                self._delay_between_items()
                file_url = f"https://{location.netloc}{relative}"

                direct_url, file_uuid = await self._get_bunkr_direct_url_with_uuid(session, file_url)
                if not direct_url:
                    self.log(f"Could not get direct URL for {file_url}", 'warning')
                    failed.append(file_url)
                    continue

                leaf = direct_url.split('/')[-1].split('?')[0] or f'{index:03d}.bin'
                target = save_dir / self._sanitize_filename(leaf)
                try:
                    await self._download_file(session, direct_url, target,
                                              try_cdn_fallback=True, file_uuid=file_uuid)
                    saved.append(str(target))
                    self.log(f"Downloaded: {leaf}", 'info')
                except Exception as e:
                    self.log(f"Failed to download {leaf}: {e}", 'warning')
                    failed.append(leaf)

    outcome = {'success': bool(saved), 'files': saved, 'error': None}
    if failed:
        outcome['failed'] = failed
        outcome['error'] = f'{len(failed)} files failed to download'
    return outcome
async def _get_bunkr_direct_url_with_uuid(self, session: aiohttp.ClientSession, page_url: str) -> tuple:
    """
    Resolve a Bunkr file page to a direct download URL plus its file UUID.

    The page is scanned for a UUID-style filename and for CDN links; every
    candidate URL is probed with _check_url_accessible before it is returned.
    If no link from the page works but a UUID was found, each host in
    self.BUNKR_CDNS is tried with that UUID.

    Args:
        session: Open aiohttp session used for the page fetch and the probes
        page_url: URL of the Bunkr file page

    Returns:
        (direct_url, file_uuid); direct_url is None when nothing was reachable,
        and both are None when the page fetch fails or an exception occurs.
    """
    try:
        async with session.get(page_url) as page:
            if page.status != 200:
                return None, None
            markup = await page.text()

            # File UUID: prefer the data-v attribute, else any UUID-looking filename
            uuid_regexes = (
                r'data-v="([a-f0-9-]{36}\.[a-z0-9]+)"',
                r'([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}\.[a-z0-9]+)',
            )
            uuid_hit = next(
                (found for found in (re.search(rx, markup) for rx in uuid_regexes) if found),
                None,
            )
            file_uuid = uuid_hit.group(1) if uuid_hit else None

            # CDN URL already present in the page
            link_regexes = (
                r'href="(https://[^"]*\.bunkr\.ru/[^"]+)"',
                r'src="(https://[^"]*\.bunkr\.ru/[^"]+)"',
                r'data-src="(https://[^"]*\.bunkr\.ru/[^"]+)"',
            )
            for rx in link_regexes:
                link_hit = re.search(rx, markup)
                if not link_hit:
                    continue
                candidate = link_hit.group(1)
                if await self._check_url_accessible(session, candidate):
                    return candidate, file_uuid

            # Fall back to probing the known CDN hosts with the UUID
            if file_uuid:
                self.log(f"Found file UUID: {file_uuid}, trying CDNs...", 'debug')
                for cdn in self.BUNKR_CDNS:
                    candidate = f"https://{cdn}/{file_uuid}"
                    if await self._check_url_accessible(session, candidate):
                        self.log(f"Found working CDN: {cdn}", 'debug')
                        return candidate, file_uuid

            return None, file_uuid
    except Exception as e:
        self.log(f"Error getting Bunkr direct URL: {e}", 'warning')
        return None, None
async def _check_url_accessible(self, session: aiohttp.ClientSession, url: str) -> bool:
    """
    Probe a URL with a HEAD request (following redirects, 10s timeout).

    Returns:
        True only when the final response status is 200; any other status
        or any exception yields False.
    """
    probe_timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with session.head(url, allow_redirects=True, timeout=probe_timeout) as reply:
            reachable = reply.status == 200
    except Exception:
        reachable = False
    return reachable
async def _download_fileditch(self, url: str, save_dir: Path) -> Dict:
    """
    Download a single file from FileDitch (Cloudflare-protected).

    The filename is taken from the 'f' query parameter
    (e.g. file.php?f=/b74/tLyJWGrzvSyRlJvBVDBa.mp4). Cloudflare clearance
    cookies and the matching User-Agent are obtained through
    CloudflareHandler / FlareSolverr and then used for the actual download.

    Args:
        url: FileDitch URL containing the 'f' query parameter
        save_dir: Directory the file is written to

    Returns:
        Dict with 'success', 'files' and 'error'. A malformed URL or a failed
        Cloudflare bypass returns success=False; a failure of the download
        itself propagates as an exception from _download_file.
    """
    from modules.cloudflare_handler import CloudflareHandler

    # Extract filename from URL: file.php?f=/b74/tLyJWGrzvSyRlJvBVDBa.mp4
    parsed = urlparse(url)
    params = parse_qs(parsed.query)
    file_path = params.get('f', [''])[0]
    if not file_path:
        return {'success': False, 'files': [], 'error': 'Invalid FileDitch URL - no file parameter'}
    filename = file_path.rsplit('/', 1)[-1]
    if not filename:
        return {'success': False, 'files': [], 'error': 'Could not extract filename from URL'}
    save_path = save_dir / self._sanitize_filename(filename)

    # Use CloudflareHandler to get cookies via FlareSolverr
    cf_handler = CloudflareHandler(
        module_name='FileDitch',
        flaresolverr_url='http://localhost:8191/v1',
        flaresolverr_enabled=True,
    )
    self.log('Bypassing Cloudflare for FileDitch via FlareSolverr...', 'info')
    if not cf_handler.get_cookies_via_flaresolverr(url):
        return {'success': False, 'files': [], 'error': 'Failed to bypass Cloudflare for FileDitch'}
    cookies = cf_handler.get_cookies_dict()
    user_agent = cf_handler.get_user_agent()

    # Download with the obtained cookies.
    # The cookies are handed to the session constructor instead of calling
    # CookieJar.update_cookies(..., response_url=url): that method expects a
    # yarl.URL and reads response_url.raw_host, so passing the plain string
    # URL raised AttributeError before any download started.
    timeout = aiohttp.ClientTimeout(total=3600)
    headers = {'User-Agent': user_agent or 'Mozilla/5.0'}
    async with aiohttp.ClientSession(timeout=timeout, cookies=cookies or None, headers=headers) as session:
        await self._download_file(session, url, save_path, headers=headers)
    return {'success': True, 'files': [str(save_path)], 'error': None}
async def _download_file(self, session: aiohttp.ClientSession, url: str,
                         save_path: Path, headers: Dict = None,
                         try_cdn_fallback: bool = False, file_uuid: str = None) -> None:
    """
    Stream a single file to disk, optionally falling back to alternate CDNs.

    The body is written to a sibling '<name>.part' file and only renamed to
    save_path after the transfer finished, so a failed or interrupted
    download never leaves a truncated file under the final name.

    Args:
        session: Open aiohttp session
        url: Primary download URL
        save_path: Final destination path (parent directories are created)
        headers: Optional per-request headers
        try_cdn_fallback: When True and file_uuid is given, every host in
            self.BUNKR_CDNS is tried after the primary URL
        file_uuid: File UUID used to build the alternate CDN URLs

    Raises:
        Exception: When every candidate URL failed; the message carries the
            number of URLs tried and the last error.
    """
    save_path.parent.mkdir(parents=True, exist_ok=True)
    part_path = save_path.with_name(save_path.name + '.part')

    urls_to_try = [url]
    # If CDN fallback enabled and we have a file UUID, add alternate CDNs
    if try_cdn_fallback and file_uuid:
        for cdn in self.BUNKR_CDNS:
            alt_url = f"https://{cdn}/{file_uuid}"
            if alt_url != url:
                urls_to_try.append(alt_url)

    last_error = None
    for try_url in urls_to_try:
        try:
            self.log(f"Downloading: {save_path.name} from {try_url[:60]}...", 'info')
            async with session.get(try_url, headers=headers) as resp:
                if resp.status != 200:
                    last_error = f"HTTP {resp.status}"
                    self.log(f"Download failed: {save_path.name} - {last_error}", 'warning')
                    continue  # Try next CDN

                total_size = int(resp.headers.get('content-length', 0))
                downloaded = 0
                last_log_pct = 0
                with open(part_path, 'wb') as f:
                    async for chunk in resp.content.iter_chunked(65536):  # 64KB chunks
                        f.write(chunk)
                        downloaded += len(chunk)
                        # Log and callback progress every 2%
                        if total_size > 0:
                            pct = int(downloaded * 100 / total_size)
                            if pct >= last_log_pct + 2:
                                self.log(f" {save_path.name}: {pct}% ({downloaded // (1024*1024)}MB / {total_size // (1024*1024)}MB)", 'info')
                                last_log_pct = pct
                                # Call progress callback if provided
                                if self.progress_callback:
                                    try:
                                        self.progress_callback(downloaded, total_size, save_path.name)
                                    except Exception:
                                        pass  # Don't fail download due to callback error

            # Transfer complete: publish the file under its final name
            part_path.replace(save_path)
            self.log(f"Downloaded: {save_path.name} ({downloaded // (1024*1024)}MB)", 'info')
            return  # Success
        except Exception as e:
            last_error = str(e)
            self.log(f"Download error: {save_path.name} - {last_error}", 'warning')
            # Drop whatever was written for this attempt before trying the next CDN
            try:
                part_path.unlink()
            except OSError:
                pass

    raise Exception(f"Download failed after trying {len(urls_to_try)} URLs: {last_error}")
def _sanitize_filename(self, filename: str) -> str:
"""Sanitize filename for filesystem"""
if not filename:
return 'download.bin'
# Remove/replace invalid characters
filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)
filename = filename.strip('. ')
return filename or 'download.bin'
@classmethod
def get_supported_domains(cls) -> List[str]:
    """Return every domain listed in cls.SUPPORTED_HOSTS as one flat list."""
    return [domain for host_domains in cls.SUPPORTED_HOSTS.values() for domain in host_domains]

View File

@@ -0,0 +1,171 @@
"""
Filename parser for extracting dates and metadata from Fansly/paid content filenames.
Supports:
1. Fansly snowflake IDs: 871257582885416960.mp4
2. Embedded date format: 2023-05-11_at_15-51_id_513099759796367360-zRvVUZeP.mp4
3. Date-prefixed files: 2022-07-08.mp4 or 2022-07-08_video.mp4
"""
import re
from datetime import datetime, timezone
from typing import Optional, Dict, Tuple
from pathlib import Path
# Fansly epoch calibrated from known files
# Based on: 513099759796367360 = 2023-05-11 15:51 UTC
# Unix time in milliseconds; added to the upper bits (id >> 22) of a snowflake ID.
# NOTE(review): with this value the sample ID above appears to decode to about
# 2023-05-11 14:37 UTC rather than 15:51 - confirm whether the filename time is
# in a non-UTC zone or whether the epoch needs recalibrating.
FANSLY_EPOCH_MS = 1561483337101
def decode_fansly_snowflake(snowflake_id: str) -> Optional[datetime]:
    """
    Turn a Fansly snowflake ID into a UTC datetime.

    The ID is read Twitter-snowflake style: the millisecond timestamp sits in
    the upper bits (ID shifted right by 22) and is offset by FANSLY_EPOCH_MS.

    Returns:
        Timezone-aware UTC datetime, or None when the ID is not an integer
        string or the resulting timestamp is out of range.
    """
    try:
        millis = (int(snowflake_id) >> 22) + FANSLY_EPOCH_MS
        decoded = datetime.fromtimestamp(millis / 1000, tz=timezone.utc)
    except (ValueError, OverflowError, OSError):
        decoded = None
    return decoded
def parse_filename(filename: str) -> Dict:
    """
    Extract date and Fansly ID information from a filename.

    The extension is dropped and the remaining name is tried against four
    strategies in order; the first one that yields a valid date wins:

    1. Embedded date + ID ('2023-05-11_at_15-51_id_<id>...') - confidence high
    2. Date prefix ('2022-07-08' / '2022_07_08', set to noon UTC) - confidence high
    3. Name that is only a 15-20 digit snowflake ID (optionally '_N') - confidence high
    4. A 15-20 digit snowflake ID anywhere in the name - confidence medium

    Snowflake-derived dates are only accepted for the years 2020-2030.

    Returns:
        {
            'original_filename': str,
            'detected_date': datetime or None,
            'fansly_id': str or None,
            'date_source': str or None,  # 'snowflake', 'embedded', 'prefix', None
            'confidence': str,  # 'high', 'medium', 'low'
        }
    """
    info = {
        'original_filename': filename,
        'detected_date': None,
        'fansly_id': None,
        'date_source': None,
        'confidence': 'low',
    }

    def _found(when, source, confidence, fansly_id=None):
        # Record a successful detection and hand back the finished result
        info['detected_date'] = when
        if fansly_id is not None:
            info['fansly_id'] = fansly_id
        info['date_source'] = source
        info['confidence'] = confidence
        return info

    def _plausible(when):
        # Sanity check: date should be between 2020 and 2030
        return bool(when) and 2020 <= when.year <= 2030

    stem = Path(filename).stem

    # Strategy 1: embedded date format
    # 2023-05-11_at_15-51_id_513099759796367360-zRvVUZeP-YcNs55W9.mp4
    # 2026-01-24_at_06-22_id_871257582885416960_hash2_4547ab5367c6d7ea3a28ac4fc79df018.mp4
    # Also handles spaces: 2023 05 11_at_15 51_id_513099759796367360
    hit = re.search(
        r'(\d{4})[-_ ](\d{2})[-_ ](\d{2})[-_ ]?at[-_ ](\d{2})[-_ ](\d{2})[-_ ]?id[-_ ](\d{15,20})',
        stem,
        re.IGNORECASE,
    )
    if hit:
        *stamp, embedded_id = hit.groups()
        try:
            when = datetime(*map(int, stamp), 0, tzinfo=timezone.utc)
        except ValueError:
            when = None  # e.g. month 13: fall through to the next strategy
        if when is not None:
            return _found(when, 'embedded', 'high', embedded_id)

    # Strategy 2: date prefix (YYYY-MM-DD or YYYY_MM_DD)
    # 2022-07-08.mp4 or 2022-07-08_video.mp4
    hit = re.match(r'^(\d{4})[-_](\d{2})[-_](\d{2})(?:[_\-\s]|$)', stem)
    if hit:
        year, month, day = (int(part) for part in hit.groups())
        try:
            when = datetime(year, month, day, 12, 0, 0, tzinfo=timezone.utc)  # Default to noon
        except ValueError:
            when = None
        if when is not None:
            return _found(when, 'prefix', 'high')

    # Strategy 3: pure Fansly snowflake ID
    # 871257582885416960.mp4 (15-20 digit number)
    hit = re.match(r'^(\d{15,20})(?:_\d+)?$', stem)
    if hit:
        candidate = hit.group(1)
        when = decode_fansly_snowflake(candidate)
        if _plausible(when):
            return _found(when, 'snowflake', 'high', candidate)

    # Strategy 4: Fansly ID embedded anywhere in filename
    # e.g., video_871257582885416960_hd.mp4
    for candidate in re.findall(r'(\d{15,20})', stem):
        when = decode_fansly_snowflake(candidate)
        if _plausible(when):
            return _found(when, 'snowflake', 'medium', candidate)

    return info
def parse_filenames(filenames: list) -> Dict:
    """
    Run parse_filename over several filenames and summarise the dates found.

    Returns:
        {
            'files': [parsed result for each file],
            'earliest_date': datetime or None,
            'latest_date': datetime or None,
            'suggested_date': datetime or None,  # currently the earliest date
            'has_dates': bool,
        }
    """
    parsed = []
    found_dates = []
    for entry in filenames:
        outcome = parse_filename(entry)
        parsed.append(outcome)
        if outcome['detected_date']:
            found_dates.append(outcome['detected_date'])

    earliest = min(found_dates) if found_dates else None
    latest = max(found_dates) if found_dates else None
    return {
        'files': parsed,
        'earliest_date': earliest,
        'latest_date': latest,
        'suggested_date': earliest,  # Use earliest as default
        'has_dates': bool(found_dates),
    }
def format_date_for_display(dt: Optional[datetime]) -> str:
    """
    Format a datetime for display, e.g. 'May 11, 2023 at 3:51 PM'.

    The 12-hour value is computed by hand instead of using the '%-I'
    directive, which is a platform-specific strftime extension and is
    rejected on platforms such as Windows.

    Returns:
        The formatted string, or '' when dt is None.
    """
    if dt is None:
        return ''
    hour_12 = dt.hour % 12 or 12  # 0 and 12 both display as 12
    return f"{dt.strftime('%b %d, %Y')} at {hour_12}:{dt.strftime('%M %p')}"
def format_date_for_input(dt: datetime) -> Tuple[str, str]:
    """
    Split a datetime into the strings HTML date/time inputs expect.

    Returns:
        ('YYYY-MM-DD', 'HH:MM'), or ('', '') when dt is None.
    """
    if dt is not None:
        date_str = dt.strftime('%Y-%m-%d')
        time_str = dt.strftime('%H:%M')
    else:
        date_str = time_str = ''
    return (date_str, time_str)

View File

@@ -0,0 +1,14 @@
"""Backwards-compatibility shim — use xenforo_forum_client instead."""
from .xenforo_forum_client import XenForoForumClient
class HQCelebCornerClient(XenForoForumClient):
    """Legacy alias: an XenForoForumClient with the HQCelebCorner settings filled in."""

    def __init__(self, log_callback=None):
        """Forward the fixed HQCelebCorner configuration plus log_callback to the base client."""
        settings = {
            'service_id': 'hqcelebcorner',
            'base_url': 'https://www.hqcelebcorner.net',
            'cookie_path': '/opt/media-downloader/cookies/forum_cookies_HQCelebCorner.json',
            'log_callback': log_callback,
        }
        super().__init__(**settings)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,312 @@
"""
Dataclass models for the Paid Content feature
"""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Any
@dataclass
class Attachment:
    """Represents a file attachment from a post"""
    name: str
    server_path: str
    file_type: Optional[str] = None
    extension: Optional[str] = None
    download_url: Optional[str] = None
    file_size: Optional[int] = None
    width: Optional[int] = None
    height: Optional[int] = None
    duration: Optional[int] = None
    needs_quality_recheck: bool = False
    is_preview: bool = False

    @classmethod
    def from_api(cls, data: Dict, base_url: str = '') -> 'Attachment':
        """
        Create an Attachment from an API response entry.

        Args:
            data: Dict with optional 'name' and 'path' keys; missing or null
                values are treated as empty strings
            base_url: When given together with a path, the download URL is
                built as '{base_url}/data{path}'

        Returns:
            Attachment whose file_type ('image', 'video', 'archive',
            'document' or 'unknown') is derived from the lower-cased
            extension of the name.
        """
        # 'or' also covers an explicit null from the API, which would
        # otherwise break the extension check below
        name = data.get('name') or ''
        path = data.get('path') or ''

        # Detect file type from extension
        ext = name.rsplit('.', 1)[-1].lower() if '.' in name else ''
        known_types = (
            ('image', {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic'}),
            ('video', {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}),
            ('archive', {'zip', 'rar', '7z', 'tar', 'gz'}),
            ('document', {'pdf', 'doc', 'docx', 'txt'}),
        )
        file_type = next((label for label, exts in known_types if ext in exts), 'unknown')

        return cls(
            name=name,
            server_path=path,
            file_type=file_type,
            extension=ext if ext else None,
            download_url=f"{base_url}/data{path}" if base_url and path else None
        )

    def to_dict(self) -> Dict:
        """
        Convert to dictionary for database storage.

        'needs_quality_recheck' is only included (as 1) when the flag is set;
        is_preview is not part of the output.
        """
        d = {
            'name': self.name,
            'server_path': self.server_path,
            'file_type': self.file_type,
            'extension': self.extension,
            'download_url': self.download_url,
            'file_size': self.file_size,
            'width': self.width,
            'height': self.height,
            'duration': self.duration
        }
        if self.needs_quality_recheck:
            d['needs_quality_recheck'] = 1
        return d
@dataclass
class Post:
    """Represents a post from a creator"""
    post_id: str
    service_id: str
    platform: str
    creator_id: str
    title: Optional[str] = None
    content: Optional[str] = None
    published_at: Optional[str] = None
    added_at: Optional[str] = None
    edited_at: Optional[str] = None
    attachments: List[Attachment] = field(default_factory=list)
    embed_urls: List[str] = field(default_factory=list)
    is_pinned: bool = False
    pinned_at: Optional[str] = None
    auto_tags: List[str] = field(default_factory=list)  # Tag names to auto-apply on sync
    tagged_users: List[str] = field(default_factory=list)  # Instagram users tagged in the post

    @classmethod
    def from_api(cls, data: Dict, service_id: str, platform: str, creator_id: str, base_url: str = '') -> 'Post':
        """
        Create a Post from an API response.

        Args:
            data: Raw post dict; reads 'id', 'title', 'content' / 'substring',
                'published', 'added', 'edited', 'attachments', 'file', 'embed'
            service_id: Service identifier stored on the post
            platform: Platform name stored on the post
            creator_id: Creator identifier stored on the post
            base_url: Passed to Attachment.from_api to build download URLs

        Returns:
            Post with HTML stripped from the content. When the content is
            empty the title is used as content and the title is cleared.
        """
        # Parse attachments ('or []' guards against an explicit null, the same
        # way 'embed' is guarded below)
        attachments = [
            Attachment.from_api(att_data, base_url)
            for att_data in data.get('attachments') or []
        ]

        # Also check file field (some APIs use this instead of attachments)
        file_data = data.get('file')
        if file_data:
            if isinstance(file_data, dict):
                attachments.append(Attachment.from_api(file_data, base_url))
            elif isinstance(file_data, str):
                attachments.append(Attachment(
                    name=file_data.rsplit('/', 1)[-1],
                    server_path=file_data
                ))

        # Content: use 'content' if available, fallback to 'substring' (list endpoint returns truncated)
        content = data.get('content') or data.get('substring') or ''

        # Single post endpoint returns HTML content (e.g. <p>text</p>), strip tags
        if content and '<' in content:
            import re
            content = re.sub(r'<br\s*/?>', '\n', content)
            content = re.sub(r'</p>\s*<p>', '\n\n', content)
            content = re.sub(r'<[^>]+>', '', content)
            content = content.strip()

        title = data.get('title')

        # OnlyFans posts on Coomer have the post text in 'title' and empty 'content'.
        # Copy title to content and clear title (OF posts don't have real titles).
        if not content and title:
            content = title
            title = None

        return cls(
            post_id=str(data.get('id', '')),
            service_id=service_id,
            platform=platform,
            creator_id=creator_id,
            title=title,
            content=content,
            published_at=data.get('published'),
            added_at=data.get('added'),
            edited_at=data.get('edited'),
            attachments=attachments,
            embed_urls=data.get('embed', []) or []
        )

    def to_dict(self) -> Dict:
        """
        Convert to dictionary for database storage.

        Attachments and embeds are reduced to counts/flags; booleans are
        stored as 0/1.
        """
        return {
            'post_id': self.post_id,
            'title': self.title,
            'content': self.content,
            'published_at': self.published_at,
            'added_at': self.added_at,
            'edited_at': self.edited_at,
            'has_attachments': 1 if self.attachments else 0,
            'attachment_count': len(self.attachments),
            'embed_count': len(self.embed_urls),
            'is_pinned': 1 if self.is_pinned else 0,
            'pinned_at': self.pinned_at
        }
@dataclass
class Message:
    """A chat message exchanged with a creator, including its attachments."""
    message_id: str
    platform: str
    service_id: str
    creator_id: str  # Platform-specific creator ID
    text: Optional[str] = None
    sent_at: Optional[str] = None
    is_from_creator: bool = True
    is_tip: bool = False
    tip_amount: Optional[float] = None
    price: Optional[float] = None
    is_free: bool = True
    is_purchased: bool = False
    reply_to_message_id: Optional[str] = None
    attachments: List[Attachment] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """
        Build the dictionary stored in the database.

        Boolean flags become 0/1 and attachments are reduced to a flag and a
        count; platform, service_id and creator_id are not included.
        """
        attachment_count = len(self.attachments)
        row = {
            'message_id': self.message_id,
            'text': self.text,
            'sent_at': self.sent_at,
            'is_from_creator': int(bool(self.is_from_creator)),
            'is_tip': int(bool(self.is_tip)),
            'tip_amount': self.tip_amount,
            'price': self.price,
            'is_free': int(bool(self.is_free)),
            'is_purchased': int(bool(self.is_purchased)),
            'has_attachments': int(bool(self.attachments)),
            'attachment_count': attachment_count,
            'reply_to_message_id': self.reply_to_message_id,
        }
        return row
@dataclass
class Creator:
    """A creator profile as returned by the Coomer/Kemono API."""
    creator_id: str
    service_id: str
    platform: str
    username: str
    display_name: Optional[str] = None
    profile_image_url: Optional[str] = None
    banner_image_url: Optional[str] = None
    bio: Optional[str] = None
    post_count: int = 0

    @classmethod
    def from_api(cls, data: Dict, service_id: str, platform: str, base_url: str = None) -> 'Creator':
        """
        Build a Creator from an API response.

        Args:
            data: Raw creator dict; reads 'id', 'name', 'profile_image',
                'banner_image' and 'post_count'
            service_id: Service identifier stored on the creator
            platform: Platform name, also used in the image URLs
            base_url: API base URL; its host is used to derive image URLs
                when the response carries none

        Returns:
            Creator whose profile/banner URLs fall back to
            https://img.{host}/icons|banners/{platform}/{creator_id}.
        """
        creator_id = str(data.get('id', ''))

        # Image host: 'img.' + API host, with .party mapped to .st
        # (coomer.party/kemono.party images are served from .st)
        image_host = None
        if base_url and creator_id:
            from urllib.parse import urlparse
            api_host = urlparse(base_url).netloc
            image_host = f"img.{api_host.replace('.party', '.st')}"

        # Profile image: prefer the API value, else the icon endpoint
        avatar = data.get('profile_image')
        if image_host and not avatar:
            avatar = f"https://{image_host}/icons/{platform}/{creator_id}"

        # Banner image: prefer the API value, else the banner endpoint
        banner = data.get('banner_image')
        if image_host and not banner:
            banner = f"https://{image_host}/banners/{platform}/{creator_id}"

        return cls(
            creator_id=creator_id,
            service_id=service_id,
            platform=platform,
            username=data.get('name', ''),
            display_name=data.get('name'),
            profile_image_url=avatar,
            banner_image_url=banner,
            post_count=data.get('post_count', 0)
        )

    def to_dict(self) -> Dict:
        """Build the dictionary stored in the database (all nine fields)."""
        columns = (
            'service_id',
            'platform',
            'creator_id',
            'username',
            'display_name',
            'profile_image_url',
            'banner_image_url',
            'bio',
            'post_count',
        )
        return {column: getattr(self, column) for column in columns}
@dataclass
class SyncResult:
    """Outcome counters of one creator sync run."""
    success: bool
    new_posts: int = 0
    new_attachments: int = 0
    downloaded_files: int = 0
    failed_files: int = 0
    skipped_files: int = 0
    error: Optional[str] = None
    downloaded_file_info: Optional[List[Dict]] = None  # List of {file_path, filename, source, content_type}

    def to_dict(self) -> Dict:
        """Return the counters and error as a dict; downloaded_file_info is left out."""
        exported = (
            'success',
            'new_posts',
            'new_attachments',
            'downloaded_files',
            'failed_files',
            'skipped_files',
            'error',
        )
        return {key: getattr(self, key) for key in exported}
@dataclass
class DownloadResult:
    """Outcome of downloading a single file."""
    success: bool
    file_path: Optional[str] = None
    file_hash: Optional[str] = None
    file_size: Optional[int] = None
    error: Optional[str] = None
    is_duplicate: bool = False

    def to_dict(self) -> Dict:
        """Return all six fields as a plain dict."""
        exported = ('success', 'file_path', 'file_hash', 'file_size', 'error', 'is_duplicate')
        return {key: getattr(self, key) for key in exported}

View File

@@ -0,0 +1,729 @@
"""
OnlyFans Direct API Client
Downloads content directly from the OnlyFans API using browser-extracted
credentials and dynamic request signing.
"""
import asyncio
import aiohttp
import re
from datetime import datetime
from typing import List, Optional, Dict, Any, Callable
from urllib.parse import urlparse, urlencode
from modules.base_module import LoggingMixin, RateLimitMixin
from .models import Post, Attachment, Message
from .onlyfans_signing import OnlyFansSigner
class OnlyFansClient(LoggingMixin, RateLimitMixin):
"""
API client for downloading content directly from OnlyFans.
API Endpoints:
- Base URL: https://onlyfans.com/api2/v2
- Auth: Requires browser-extracted credentials (sess, auth_id, x-bc, User-Agent)
- Signing: Every request needs dynamic sign/time/app-token headers
- GET /users/me - Verify auth
- GET /users/{username} - Get user profile
- GET /users/{user_id}/posts?limit=50&offset={offset} - Get posts (paginated)
"""
BASE_URL = "https://onlyfans.com/api2/v2"
SERVICE_ID = "onlyfans_direct"
PLATFORM = "onlyfans"
def __init__(
    self,
    auth_config: Dict[str, str],
    signing_url: Optional[str] = None,
    log_callback: Optional[Callable] = None,
):
    """
    Set up logging, rate limiting, credentials and the request signer.

    Args:
        auth_config: Dict with keys: sess, auth_id, auth_uid (optional), x_bc, user_agent
        signing_url: Optional custom URL for signing rules
        log_callback: Optional logging callback
    """
    self._init_logger('PaidContent', log_callback, default_module='OnlyFansDirect')

    # More conservative rate limiting than Fansly (OF is stricter)
    rate_limits = {
        'min_delay': 1.5,
        'max_delay': 3.0,
        'batch_delay_min': 3,
        'batch_delay_max': 6,
    }
    self._init_rate_limiter(**rate_limits)

    self.auth_config = auth_config
    # The HTTP session is created on first use by _get_session()
    self._session: Optional[aiohttp.ClientSession] = None
    self._signer = OnlyFansSigner(rules_url=signing_url)
async def _get_session(self) -> aiohttp.ClientSession:
    """
    Return the shared aiohttp session, creating it when missing or closed.

    A new session carries the OnlyFans browser headers and a Cookie header
    built from auth_config ('sess' and 'auth_id' are required keys;
    'auth_uid' is added when present), with a 60 second total timeout.
    """
    current = self._session
    if current is not None and not current.closed:
        return current

    config = self.auth_config
    # Build cookie string
    cookie_parts = [f"sess={config['sess']}", f"auth_id={config['auth_id']}"]
    auth_uid = config.get('auth_uid')
    if auth_uid:
        cookie_parts.append(f"auth_uid_{config['auth_id']}={auth_uid}")

    default_headers = {
        'Accept': 'application/json, text/plain, */*',
        'User-Agent': config.get('user_agent', ''),
        'x-bc': config.get('x_bc', ''),
        'Cookie': '; '.join(cookie_parts),
        'Origin': 'https://onlyfans.com',
        'Referer': 'https://onlyfans.com/',
    }
    self._session = aiohttp.ClientSession(
        headers=default_headers,
        timeout=aiohttp.ClientTimeout(total=60),
    )
    return self._session
async def _sign_request(self, endpoint: str) -> Dict[str, str]:
    """
    Compute the signing headers for one API request.

    Args:
        endpoint: API path (e.g. "/users/me"); it is prefixed with /api2/v2
            before signing (matching OF-Scraper)

    Returns:
        The headers produced by the signer plus a 'user-id' header taken
        from auth_config['auth_id'] ('0' when absent).
    """
    account_id = self.auth_config.get('auth_id', '0')
    signed = await self._signer.sign(f"/api2/v2{endpoint}", account_id)
    signed['user-id'] = account_id
    return signed
async def _api_request(self, endpoint: str, params: Optional[Dict] = None) -> Optional[Dict]:
    """
    Make a signed GET request to the OnlyFans API.

    Up to three attempts are made. A 429 response or a timeout triggers a
    wait, fresh signing headers (the signature is time-based) and a retry;
    401, 404, any other status and any other exception end the request
    immediately.

    Args:
        endpoint: API path (e.g. "/users/me")
        params: Optional query parameters

    Returns:
        Parsed JSON response or None on failure
    """
    session = await self._get_session()

    # Include query params in the signing path (OF-Scraper does this)
    sign_endpoint = f"{endpoint}?{urlencode(params)}" if params else endpoint
    sign_headers = await self._sign_request(sign_endpoint)
    url = f"{self.BASE_URL}{endpoint}"

    max_retries = 3
    for attempt in range(max_retries):
        has_retry_left = attempt < max_retries - 1
        try:
            async with session.get(url, params=params, headers=sign_headers) as resp:
                if resp.status == 200:
                    return await resp.json()
                elif resp.status == 401:
                    self.log("OnlyFans auth failed (401) - credentials may be expired", 'error')
                    return None
                elif resp.status == 429:
                    if not has_retry_left:
                        # Nothing follows the last attempt, so waiting would only delay the failure
                        self.log(f"Rate limited (429), giving up after {max_retries} attempts", 'warning')
                        return None
                    # Retry-After may also be an HTTP-date; fall back to the
                    # default delay instead of aborting on a non-numeric value
                    try:
                        retry_after = int(resp.headers.get('Retry-After', 30))
                    except (TypeError, ValueError):
                        retry_after = 30
                    wait = min(max(retry_after, 0) * (attempt + 1), 120)
                    self.log(f"Rate limited (429), waiting {wait}s (attempt {attempt + 1}/{max_retries})", 'warning')
                    await asyncio.sleep(wait)
                    # Refresh signing headers for retry (timestamp changes)
                    sign_headers = await self._sign_request(sign_endpoint)
                    continue
                elif resp.status == 404:
                    self.log(f"Not found (404): {endpoint}", 'debug')
                    return None
                else:
                    text = await resp.text()
                    self.log(f"API error: HTTP {resp.status} for {endpoint}: {text[:200]}", 'warning')
                    return None
        except asyncio.TimeoutError:
            self.log(f"Request timeout for {endpoint} (attempt {attempt + 1})", 'warning')
            if has_retry_left:
                await asyncio.sleep(5 * (attempt + 1))
                sign_headers = await self._sign_request(sign_endpoint)
                continue
            return None
        except Exception as e:
            self.log(f"Request error for {endpoint}: {e}", 'error')
            return None
    return None
@staticmethod
def _strip_html(text: str) -> str:
"""Strip HTML tags and convert common entities to plain text"""
if not text:
return ''
text = re.sub(r'<br\s*/?>', '\n', text)
text = re.sub(r'<[^>]+>', '', text)
text = text.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&#x27;', "'").replace('&quot;', '"')
return text.strip()
async def close(self):
    """Close the aiohttp session if one is open and forget it; otherwise do nothing."""
    session = self._session
    if not session or session.closed:
        return
    await session.close()
    self._session = None
async def __aenter__(self):
    """Enter the async context manager; returns this client itself."""
    return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave the async context manager by awaiting close(); exceptions are not suppressed."""
    await self.close()
async def check_auth(self) -> Dict[str, Any]:
    """
    Verify the stored credentials by requesting /users/me.

    Returns:
        {'valid': True, 'user_id', 'username', 'name'} when the response
        carries an id, otherwise {'valid': False, 'error': ...}.
    """
    self._delay_between_items()
    try:
        me = await self._api_request("/users/me")
        if not (me and me.get('id')):
            return {'valid': False, 'error': 'Invalid credentials or unexpected response'}
        return {
            'valid': True,
            'user_id': str(me['id']),
            'username': me.get('username', ''),
            'name': me.get('name', ''),
        }
    except Exception as e:
        self.log(f"Error checking auth: {e}", 'error')
        return {'valid': False, 'error': str(e)}
async def get_user_info(self, username: str) -> Optional[Dict[str, Any]]:
    """
    Fetch a user profile and normalise it.

    Args:
        username: The OnlyFans username

    Returns:
        Dict with user_id, username, display_name, avatar_url, banner_url,
        bio (HTML stripped), join_date (first 10 characters of joinDate, or
        None) and posts_count; None when the user is not found or an error
        occurs.
    """
    self._delay_between_items()
    try:
        profile = await self._api_request(f"/users/{username}")
        if profile and profile.get('id'):
            raw_bio = profile.get('rawAbout') or profile.get('about') or ''
            joined = (profile.get('joinDate') or '')[:10]
            return {
                'user_id': str(profile['id']),
                'username': profile.get('username', username),
                'display_name': profile.get('name', ''),
                'avatar_url': profile.get('avatar'),
                'banner_url': profile.get('header'),
                'bio': self._strip_html(raw_bio),
                'join_date': joined or None,
                'posts_count': profile.get('postsCount', 0),
            }
        self.log(f"User not found: {username}", 'warning')
        return None
    except Exception as e:
        self.log(f"Error getting user info for {username}: {e}", 'error')
        return None
async def get_single_post(self, post_id: str) -> Optional[Post]:
    """
    Fetch one post by its OnlyFans post ID.

    The creator ID is taken from the response's author.id, falling back to
    authorId.

    Args:
        post_id: The OnlyFans post ID

    Returns:
        Post object, or None when the request fails or the post cannot be parsed
    """
    self._delay_between_items()
    payload = await self._api_request(f"/posts/{post_id}")
    if not payload:
        self.log(f"Post {post_id} not found", 'warning')
        return None

    author = payload.get('author', {})
    fallback_id = payload.get('authorId', '')
    creator_id = str(author.get('id', fallback_id))
    return self._parse_post(payload, creator_id)
async def get_posts(
    self,
    user_id: str,
    username: str,
    since_date: Optional[str] = None,
    until_date: Optional[str] = None,
    days_back: Optional[int] = None,
    max_posts: Optional[int] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[Post]:
    """
    Fetch a creator's timeline posts with offset-based pagination.

    Pages of 50 posts are requested newest-first. Fetching returns early
    as soon as a post older than the since filter is seen or max_posts is
    reached; otherwise, after the last page, pinned posts are requested
    separately and any not already collected are appended.

    Args:
        user_id: The OnlyFans numeric user ID
        username: The username (for logging/reference)
        since_date: Only fetch posts after this date (ISO format)
        until_date: Only fetch posts before this date (ISO format)
        days_back: Fetch posts from the last N days (overrides since_date)
        max_posts: Maximum number of posts to fetch
        progress_callback: Called with (page, total_posts) after each page

    Returns:
        List of Post objects
    """
    def _to_naive(stamp):
        # Parse ISO text and drop tzinfo so all comparisons use naive datetimes
        return datetime.fromisoformat(stamp.replace('Z', '+00:00')).replace(tzinfo=None)

    self.log(f"Fetching posts for {username} (user_id: {user_id})", 'info')

    # Calculate date filters - use naive datetimes to avoid tz comparison issues
    if days_back:
        from datetime import timedelta
        since_date = (datetime.now() - timedelta(days=days_back)).isoformat()

    since_dt = None
    if since_date:
        try:
            since_dt = _to_naive(since_date)
        except (ValueError, TypeError):
            pass

    until_dt = None
    if until_date:
        try:
            until_dt = _to_naive(until_date)
        except (ValueError, TypeError):
            pass

    if since_dt:
        self.log(f"Date filter: since_date={since_dt.isoformat()}", 'debug')

    collected: List[Post] = []
    page_size = 50
    page = 0
    endpoint = f"/users/{user_id}/posts"

    while True:
        self._delay_between_items()
        query = {
            'limit': str(page_size),
            'offset': str(page * page_size),
            'order': 'publish_date_desc',
        }
        data = await self._api_request(endpoint, params=query)
        if not data:
            break

        # OF returns a list of posts directly
        batch = data if isinstance(data, list) else data.get('list', [])
        if not batch:
            break

        for raw_post in batch:
            post = self._parse_post(raw_post, user_id)
            if not post:
                continue

            stamp = post.published_at

            # Posts arrive newest-first, so the first too-old post ends the fetch
            if stamp and since_dt:
                try:
                    if _to_naive(stamp) < since_dt:
                        self.log(f"Reached posts older than since_date ({post.published_at}), stopping", 'debug')
                        return collected
                except (ValueError, TypeError) as e:
                    self.log(f"Date comparison error: {e} (post_date={post.published_at})", 'warning')

            # Posts newer than the until filter are skipped, not a stop condition
            if stamp and until_dt:
                try:
                    if _to_naive(stamp) > until_dt:
                        continue
                except (ValueError, TypeError):
                    pass

            collected.append(post)
            if max_posts and len(collected) >= max_posts:
                self.log(f"Reached max_posts limit: {max_posts}", 'debug')
                return collected

        page += 1
        if progress_callback:
            progress_callback(page, len(collected))

        # If we got fewer results than page_size, we've reached the end
        if len(batch) < page_size:
            break
        self._delay_between_batches()

    # Also fetch pinned posts (they may not appear in the timeline)
    self._delay_between_items()
    pinned_data = await self._api_request(
        endpoint,
        params={'limit': '50', 'offset': '0', 'order': 'publish_date_desc', 'pinned': '1'},
    )
    if pinned_data:
        pinned_batch = pinned_data if isinstance(pinned_data, list) else pinned_data.get('list', [])
        seen_ids = {existing.post_id for existing in collected}
        for raw_post in pinned_batch:
            pinned = self._parse_post(raw_post, user_id)
            if pinned and pinned.post_id not in seen_ids:
                collected.append(pinned)

    self.log(f"Fetched {len(collected)} posts for {username}", 'info')
    return collected
def _parse_post(self, post_data: Dict, user_id: str) -> Optional[Post]:
    """
    Parse an OnlyFans post into a Post model.

    Args:
        post_data: Raw post data from API
        user_id: Creator's user ID

    Returns:
        Post object, or None if the post has no usable id or parsing fails
    """
    try:
        # A missing or null id must not turn into the literal string 'None'.
        raw_id = post_data.get('id')
        post_id = '' if raw_id is None else str(raw_id)
        if not post_id:
            return None

        # Timestamp: ISO strings are kept verbatim; numeric values are treated
        # as epoch seconds and converted to a naive local-time ISO string.
        published_at = None
        raw_date = post_data.get('postedAt') or post_data.get('createdAt')
        if isinstance(raw_date, str):
            published_at = raw_date
        elif isinstance(raw_date, (int, float)):
            try:
                published_at = datetime.fromtimestamp(raw_date).isoformat()
            except (ValueError, TypeError, OSError, OverflowError):
                # Out-of-range epoch: keep the post, just without a date.
                published_at = None

        # Content text
        content = self._strip_html(post_data.get('rawText') or post_data.get('text') or '')

        # Media attachments; items that fail to parse are skipped.
        attachments = []
        for media_item in post_data.get('media', []) or []:
            attachment = self._parse_attachment(media_item)
            if attachment:
                attachments.append(attachment)

        # Video-site links mentioned in the text are surfaced as embed URLs.
        embed_urls = []
        if content:
            url_pattern = r'https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/|vimeo\.com/|dailymotion\.com/video/)\S+'
            embed_urls = re.findall(url_pattern, content)

        return Post(
            post_id=post_id,
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            creator_id=user_id,
            title=None,
            content=content,
            published_at=published_at,
            added_at=datetime.now().isoformat(),
            attachments=attachments,
            embed_urls=embed_urls,
            is_pinned=bool(post_data.get('isPinned')),
            pinned_at=post_data.get('pinnedAt'),
        )
    except Exception as e:
        # Best effort: one malformed post must not abort the whole fetch.
        self.log(f"Error parsing post: {e}", 'error')
        return None
def _parse_attachment(self, media_item: Dict) -> Optional[Attachment]:
    """
    Parse an OnlyFans media item into an Attachment.

    OF media structure:
    {
        id, type, source: {source: url, width, height, duration},
        full: {source: url, ...}, preview: {source: url, ...}
    }
    The variants may also be nested under a 'files' key. Each variant's URL
    is read from 'url' or, failing that, 'source'.

    URL preference: 'full', then 'source', then 'preview', then a top-level
    'src'. An attachment whose only URL came from 'preview' (or whose
    'canView' is false) is flagged as a preview.

    Args:
        media_item: Raw media dict from API

    Returns:
        Attachment object or None if parsing fails
    """
    def variant(container, key):
        # Return container[key] when it is a dict, otherwise an empty dict.
        value = container.get(key)
        return value if isinstance(value, dict) else {}

    try:
        media_id = str(media_item.get('id', ''))
        # 'type' can be present but null; treat it as unknown instead of
        # failing the whole attachment.
        media_type = str(media_item.get('type') or '').lower()

        # Map OF media types to our file types
        type_map = {
            'photo': 'image',
            'video': 'video',
            'audio': 'audio',
            'gif': 'image',
        }
        file_type = type_map.get(media_type, 'unknown')

        # Current OF API nests media under 'files' key
        files = media_item.get('files') or media_item
        full_info = variant(files, 'full')
        source_info = variant(files, 'source')
        preview_info = variant(files, 'preview')

        full_url = full_info.get('url') or full_info.get('source')
        source_url = source_info.get('url') or source_info.get('source')

        # Try 'full' first (higher quality)
        download_url = full_url
        width = full_info.get('width')
        height = full_info.get('height')
        duration = full_info.get('duration')

        # Fallback to 'source'; only fills metadata that 'full' did not give
        if not download_url:
            download_url = source_url
            width = width or source_info.get('width')
            height = height or source_info.get('height')
            duration = duration or source_info.get('duration')

        # OF DRM videos use FairPlay SAMPLE-AES encryption — cannot be downloaded.
        # Pick up the duration from the media item itself, then fall through
        # to the preview frame below.
        if not download_url and media_type == 'video' and not duration:
            duration = media_item.get('duration')

        # Fallback to 'preview' for any content type
        if not download_url:
            download_url = preview_info.get('url') or preview_info.get('source')
            width = width or preview_info.get('width')
            height = height or preview_info.get('height')

        # Some OF responses have src directly
        if not download_url:
            download_url = media_item.get('src')

        # Determine the extension from the LAST path segment only, so a dot
        # in a directory name cannot leak into the extension.
        ext = ''
        if download_url:
            last_segment = urlparse(download_url).path.rsplit('/', 1)[-1]
            if '.' in last_segment:
                ext = last_segment.rsplit('.', 1)[-1].lower()
                if ext == 'jpeg':
                    ext = 'jpg'
        # No extension from the URL: guess from the declared media type
        if not ext:
            if media_type == 'photo':
                ext = 'jpg'
            elif media_type == 'video':
                ext = 'mp4'

        filename = f"{media_id}.{ext}" if ext else str(media_id)

        # Override file_type based on actual extension (OF sometimes misreports type)
        video_exts = {'mp4', 'mov', 'webm', 'avi', 'mkv', 'flv', 'm4v', 'wmv', 'mpg', 'mpeg'}
        if ext in video_exts and file_type != 'video':
            file_type = 'video'

        # Duration may be in seconds (float or int)
        if duration is not None:
            try:
                duration = int(float(duration))
            except (ValueError, TypeError, OverflowError):
                duration = None

        # Locked content (canView false) is always a preview
        can_view = media_item.get('canView', True)
        is_preview = not can_view
        if not download_url and not can_view:
            self.log(f"PPV/locked content: {filename}", 'debug')

        # Viewable, but the URL can only have come from the preview fallback
        if not is_preview and download_url:
            if not (full_url or source_url or media_item.get('src')):
                is_preview = True

        return Attachment(
            name=filename,
            server_path=f"/onlyfans/{media_id}",
            file_type=file_type,
            extension=ext if ext else None,
            download_url=download_url,
            file_size=None,
            width=width,
            height=height,
            duration=duration,
            is_preview=is_preview,
        )
    except Exception as e:
        # Best effort: a malformed media item is skipped, not fatal.
        self.log(f"Error parsing attachment: {e}", 'error')
        return None
# ==================== MESSAGES ====================
async def get_messages(self, user_id: str, max_messages: int = 500) -> List[Message]:
    """
    Fetch messages from a conversation with a creator.

    Uses GET /chats/{user_id}/messages with cursor-based pagination.
    The 'id' param is used as cursor for older messages.

    Args:
        user_id: OnlyFans numeric user ID of the creator
        max_messages: Maximum number of messages to return

    Returns:
        List of at most max_messages Message objects
    """
    page_size = 50
    messages = []
    cursor_id = None
    page = 0

    while len(messages) < max_messages:
        page += 1
        params = {'limit': page_size, 'order': 'desc'}
        if cursor_id:
            params['id'] = cursor_id

        data = await self._api_request(f"/chats/{user_id}/messages", params=params)
        if not data:
            break

        # Response is a dict with 'list' key containing messages
        msg_list = data.get('list', []) if isinstance(data, dict) else data
        if not msg_list:
            break

        for msg_data in msg_list:
            msg = self._parse_message(msg_data, user_id)
            if msg:
                messages.append(msg)

        self.log(f"Fetched page {page}: {len(msg_list)} messages (total: {len(messages)})", 'debug')

        # A short page means there is nothing older to fetch
        if len(msg_list) < page_size:
            break

        # Use the last message's id as cursor for the next page; stop if the
        # cursor does not advance, otherwise the same page would repeat forever.
        last_id = msg_list[-1].get('id')
        if last_id and str(last_id) != str(cursor_id):
            cursor_id = last_id
        else:
            break

    # Pages are appended whole, so the final page can overshoot the cap.
    if len(messages) > max_messages:
        messages = messages[:max_messages]

    self.log(f"Fetched {len(messages)} messages for user {user_id}", 'info')
    return messages
def _parse_message(self, msg_data: Dict, creator_user_id: str) -> Optional[Message]:
    """
    Parse an OnlyFans message into a Message model.

    Args:
        msg_data: Raw message dict from API
        creator_user_id: Numeric user ID of the creator (to determine direction)

    Returns:
        Message object, or None if the message has no id or parsing fails
    """
    try:
        # A missing or null id must not turn into the literal string 'None'.
        raw_id = msg_data.get('id')
        msg_id = '' if raw_id is None else str(raw_id)
        if not msg_id:
            return None

        # Determine if message is from creator ('fromUser' may be null)
        from_user = msg_data.get('fromUser') or {}
        from_user_id = str(from_user.get('id', ''))
        is_from_creator = (from_user_id == str(creator_user_id))

        # Parse text
        text = self._strip_html(msg_data.get('text') or '')

        # Parse timestamp: ISO strings are normalised; numeric values are
        # treated as epoch seconds, matching _parse_post.
        created_at = msg_data.get('createdAt')
        sent_at = None
        if created_at:
            if isinstance(created_at, str):
                try:
                    sent_at = datetime.fromisoformat(created_at.replace('Z', '+00:00')).isoformat()
                except ValueError:
                    # Unparseable string: keep the raw value rather than lose it
                    sent_at = created_at
            elif isinstance(created_at, (int, float)):
                try:
                    sent_at = datetime.fromtimestamp(created_at).isoformat()
                except (ValueError, OSError, OverflowError):
                    sent_at = None

        # PPV/price info
        price = msg_data.get('price')
        is_free = msg_data.get('isFree', True)
        is_purchased = msg_data.get('isOpened', False) or msg_data.get('canPurchase') is False
        is_tip = msg_data.get('isTip', False)
        tip_amount = msg_data.get('tipAmount')

        # Parse media attachments (same structure as posts)
        attachments = []
        for media_item in msg_data.get('media', []) or []:
            att = self._parse_attachment(media_item)
            if att:
                attachments.append(att)

        return Message(
            message_id=msg_id,
            platform=self.PLATFORM,
            service_id=self.SERVICE_ID,
            creator_id=str(creator_user_id),
            text=text if text else None,
            sent_at=sent_at,
            is_from_creator=is_from_creator,
            is_tip=bool(is_tip),
            tip_amount=float(tip_amount) if tip_amount else None,
            price=float(price) if price else None,
            is_free=bool(is_free),
            is_purchased=bool(is_purchased),
            attachments=attachments,
        )
    except Exception as e:
        # Best effort: one malformed message must not abort the whole fetch.
        self.log(f"Error parsing message: {e}", 'error')
        return None

View File

@@ -0,0 +1,109 @@
"""
OnlyFans Request Signing Module
Handles the dynamic request signing required by the OnlyFans API.
Fetches signing rules from the DATAHOARDERS/dynamic-rules GitHub repo
and computes SHA-1 based signatures for each API request.
Isolated module so it's easy to update when OF changes their signing scheme.
"""
import hashlib
import time
from typing import Dict, Optional
import aiohttp
RULES_URL = "https://raw.githubusercontent.com/DATAHOARDERS/dynamic-rules/main/onlyfans.json"
class OnlyFansSigner:
    """
    Computes request signatures for the OnlyFans API.

    Uses dynamic rules fetched from a public GitHub repo (same source as OF-Scraper).
    Rules are cached in memory on the instance and refreshed every 6 hours.
    """

    RULES_TTL = 6 * 3600  # 6 hours

    def __init__(self, rules_url: Optional[str] = None):
        """
        Args:
            rules_url: Override for the rules JSON location; defaults to RULES_URL
        """
        self.rules_url = rules_url or RULES_URL
        self._rules: Optional[Dict] = None
        self._rules_fetched_at: float = 0

    @property
    def rules_stale(self) -> bool:
        """Check if cached rules need refreshing"""
        if self._rules is None:
            return True
        return (time.time() - self._rules_fetched_at) > self.RULES_TTL

    async def get_rules(self) -> Dict:
        """
        Fetch signing rules, using cache if fresh.

        If the refresh fails for any reason (non-200 response, connection
        error, timeout, invalid JSON) and older rules are cached, the stale
        rules are returned instead of failing.

        Returns:
            Dict with keys: static_param, format, checksum_indexes,
            checksum_constants, checksum_constant, app_token

        Raises:
            RuntimeError: Non-200 response and nothing is cached
            aiohttp.ClientError / asyncio.TimeoutError / ValueError:
                Transport or decoding failure and nothing is cached
        """
        if not self.rules_stale:
            return self._rules

        import asyncio  # local import: this module does not import asyncio at the top

        timeout = aiohttp.ClientTimeout(total=15)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(self.rules_url) as resp:
                    if resp.status != 200:
                        raise RuntimeError(
                            f"Failed to fetch OF signing rules: HTTP {resp.status}"
                        )
                    # content_type=None: the raw GitHub host serves JSON as text/plain
                    rules = await resp.json(content_type=None)
            if not isinstance(rules, dict):
                raise ValueError("OF signing rules payload is not a JSON object")
        except (aiohttp.ClientError, asyncio.TimeoutError, RuntimeError, ValueError):
            if self._rules is not None:
                # Use stale cache rather than failing
                return self._rules
            raise

        self._rules = rules
        self._rules_fetched_at = time.time()
        return self._rules

    async def sign(self, endpoint_path: str, user_id: str = "0") -> Dict[str, str]:
        """
        Compute signing headers for an OnlyFans API request.

        Args:
            endpoint_path: The full URL path (e.g. "/api2/v2/users/me")
            user_id: The authenticated user's ID (from auth_id cookie)

        Returns:
            Dict with 'sign', 'time', 'app-token' headers
        """
        rules = await self.get_rules()

        # Timestamp in milliseconds (matching OF-Scraper's implementation)
        timestamp = str(round(time.time() * 1000))

        # 1. Build the message to hash
        msg = "\n".join([
            rules["static_param"],
            timestamp,
            endpoint_path,
            str(user_id),
        ])

        # 2. SHA-1 hash; the checksum below works on the ASCII bytes of the hex digest
        sha1_hash = hashlib.sha1(msg.encode("utf-8")).hexdigest()
        sha1_bytes = sha1_hash.encode("ascii")

        # 3. Checksum from indexed byte positions + single constant
        #    (matching OF-Scraper's implementation)
        checksum_indexes = rules["checksum_indexes"]
        checksum_constant = rules.get("checksum_constant", 0)
        checksum = sum(sha1_bytes[i] for i in checksum_indexes) + checksum_constant

        # 4. Build the sign header using the format template
        #    Typical format: "53760:{}:{:x}:69723085"
        sign_value = rules["format"].format(sha1_hash, abs(checksum))

        return {
            "sign": sign_value,
            "time": timestamp,
            "app-token": rules["app_token"],
        }

View File

@@ -0,0 +1,755 @@
"""
Pornhub Client - Fetches creator info and videos using yt-dlp
"""
import asyncio
import html as html_module
import json
import os
import re
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
class PornhubClient(LoggingMixin):
"""
Client for fetching Pornhub creator information and videos using yt-dlp
Supports:
- Pornstar pages (pornhub.com/pornstar/name)
- Channel pages (pornhub.com/channels/name)
- User pages (pornhub.com/users/name)
- Model pages (pornhub.com/model/name)
"""
SERVICE_ID = 'pornhub'
PLATFORM = 'pornhub'
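# Quality presets for yt-dlp
# Pornhub serves single combined streams with IDs like '1080p', '720p', etc.,
# NOT separate video+audio streams like YouTube, so each preset relies on its
# trailing 'best[...]' / 'best' alternative when 'bestvideo+bestaudio' matches nothing.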
QUALITY_PRESETS = {
'best': 'bestvideo+bestaudio/best',
'1080p': 'bestvideo[height<=1080]+bestaudio/best[height<=1080]/best',
'720p': 'bestvideo[height<=720]+bestaudio/best[height<=720]/best',
'480p': 'bestvideo[height<=480]+bestaudio/best[height<=480]/best',
}
def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None):
    """
    Set up logging, locate yt-dlp and prepare the cookie/page caches.

    Args:
        ytdlp_path: Explicit path to the yt-dlp executable; searched for when omitted
        unified_db: Database handle used to read stored cookies (optional)
        log_callback: Passed through to the logger initialisation
    """
    self._init_logger('PaidContent', log_callback, default_module='Pornhub')

    # An explicit path wins; otherwise search the usual install locations.
    resolved_path = ytdlp_path if ytdlp_path else self._find_ytdlp()
    self.ytdlp_path = resolved_path
    if not resolved_path:
        self.log("yt-dlp not found, Pornhub support will be disabled", 'warning')

    # Database reference for cookie access, plus the lazily created
    # Netscape-format cookie file derived from it.
    self.unified_db = unified_db
    self._cookies_file = None

    # Profile page HTML keyed by URL (avoids re-fetching for avatar/banner/bio)
    self._profile_page_cache: Dict[str, Optional[str]] = {}
def _find_ytdlp(self) -> Optional[str]:
"""Find yt-dlp executable"""
common_paths = [
'/opt/media-downloader/venv/bin/yt-dlp',
'/usr/local/bin/yt-dlp',
'/usr/bin/yt-dlp',
'/opt/homebrew/bin/yt-dlp',
os.path.expanduser('~/.local/bin/yt-dlp'),
]
for path in common_paths:
if os.path.isfile(path) and os.access(path, os.X_OK):
return path
try:
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def is_available(self) -> bool:
"""Check if yt-dlp is available"""
return self.ytdlp_path is not None
def _get_cookies_file(self) -> Optional[str]:
"""Get path to cookies file, creating it from database if needed"""
if self._cookies_file and os.path.exists(self._cookies_file):
return self._cookies_file
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
row = cursor.fetchone()
if row and row[0]:
data = json.loads(row[0])
# Support both {"cookies": [...]} and [...] formats
if isinstance(data, dict) and 'cookies' in data:
cookies_list = data['cookies']
elif isinstance(data, list):
cookies_list = data
else:
cookies_list = []
if cookies_list:
# Write cookies to temp file in Netscape format
fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='pornhub_cookies_')
with os.fdopen(fd, 'w') as f:
f.write("# Netscape HTTP Cookie File\n")
for cookie in cookies_list:
domain = cookie.get('domain', '')
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
path = cookie.get('path', '/')
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
expiry = str(int(cookie.get('expirationDate', 0)))
name = cookie.get('name', '')
value = cookie.get('value', '')
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
self.log(f"Loaded {len(cookies_list)} cookies from pornhub scraper", 'debug')
return self._cookies_file
except Exception as e:
self.log(f"Could not load cookies: {e}", 'debug')
return None
def _get_cookies_list(self) -> Optional[list]:
"""Get cookies as a list of dicts for aiohttp requests"""
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
row = cursor.fetchone()
if row and row[0]:
data = json.loads(row[0])
if isinstance(data, dict) and 'cookies' in data:
return data['cookies']
elif isinstance(data, list):
return data
except Exception as e:
self.log(f"Could not load cookies list: {e}", 'debug')
return None
def _get_base_cmd(self) -> List[str]:
"""Get base yt-dlp command with cookies if available"""
cmd = [self.ytdlp_path]
cookies_file = self._get_cookies_file()
if cookies_file:
cmd.extend(['--cookies', cookies_file])
return cmd
def cleanup(self):
"""Clean up temporary files"""
if self._cookies_file and os.path.exists(self._cookies_file):
try:
os.unlink(self._cookies_file)
except Exception:
pass
self._cookies_file = None
self._profile_page_cache.clear()
@staticmethod
def extract_creator_id(url: str) -> Optional[Tuple[str, str]]:
"""
Extract creator type and identifier from Pornhub URL
Returns:
Tuple of (type, id) where type is 'pornstar', 'channels', 'users', or 'model'
or None if not a valid Pornhub creator URL
"""
patterns = [
(r'pornhub\.com/pornstar/([a-zA-Z0-9_-]+)', 'pornstar'),
(r'pornhub\.com/channels/([a-zA-Z0-9_-]+)', 'channels'),
(r'pornhub\.com/users/([a-zA-Z0-9_-]+)', 'users'),
(r'pornhub\.com/model/([a-zA-Z0-9_-]+)', 'model'),
]
for pattern, creator_type in patterns:
match = re.search(pattern, url)
if match:
return (creator_type, match.group(1))
return None
@staticmethod
def normalize_creator_url(creator_id: str, creator_type: str = 'pornstar') -> str:
"""Convert creator ID to a consistent URL format
Args:
creator_id: Creator name/identifier (may be 'type/name' format)
creator_type: Default type if not embedded in creator_id
"""
# Already a full URL
if creator_id.startswith('http://') or creator_id.startswith('https://'):
return creator_id
# Handle 'type/name' format from URL parser
if '/' in creator_id:
parts = creator_id.split('/', 1)
creator_type = parts[0]
creator_id = parts[1]
return f"https://www.pornhub.com/{creator_type}/{creator_id}"
def _get_listing_url(self, url: str) -> str:
"""Get the URL to use for listing videos from a creator page.
For pornstars and models, append /videos to get the video listing.
For channels and users, the base URL already lists videos.
"""
# Parse out the type
parsed = self.extract_creator_id(url)
if parsed:
creator_type, _ = parsed
if creator_type in ('pornstar', 'model'):
# Strip any trailing slash and append /videos
url = url.rstrip('/')
if not url.endswith('/videos'):
url = f"{url}/videos"
return url
async def get_creator_info(self, url: str) -> Optional[Dict]:
"""
Get creator information using yt-dlp + profile page scraping
Returns dict with creator metadata or None if not found
"""
if not self.is_available():
return None
creator_type_id = self.extract_creator_id(url)
creator_type = creator_type_id[0] if creator_type_id else 'pornstar'
# Try to scrape the display name from the profile page first
creator_name = None
try:
page_html = await self.get_profile_page(url)
if page_html:
# Look for <h1 itemprop="name">Name</h1> inside nameSubscribe div
name_match = re.search(r'<div class="nameSubscribe">.*?<h1[^>]*>\s*(.+?)\s*</h1>', page_html, re.DOTALL)
if name_match:
creator_name = html_module.unescape(name_match.group(1).strip())
self.log(f"Found creator name from profile page: {creator_name}", 'debug')
except Exception as e:
self.log(f"Could not scrape creator name: {e}", 'debug')
# If page scraping didn't find a name, try yt-dlp
if not creator_name:
try:
listing_url = self._get_listing_url(url)
cmd = self._get_base_cmd() + [
'--no-warnings',
'--flat-playlist',
'-j',
'--playlist-items', '1',
listing_url
]
result = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, stderr = await result.communicate()
if result.returncode == 0:
for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
if not line:
continue
try:
data = json.loads(line)
playlist_title = data.get('playlist_title') or ''
creator_name = (data.get('channel') or data.get('uploader')
or playlist_title.replace(' - Videos', '') or None)
if creator_name:
creator_name = html_module.unescape(creator_name)
break
except json.JSONDecodeError:
continue
except Exception as e:
self.log(f"yt-dlp creator info failed: {e}", 'debug')
# Fall back to deriving name from URL slug
if not creator_name and creator_type_id:
creator_name = creator_type_id[1].replace('-', ' ').title()
if creator_name:
return {
'creator_id': creator_type_id[1] if creator_type_id else None,
'creator_name': creator_name,
'creator_url': url,
'creator_type': creator_type,
}
return None
async def get_creator_videos(self, url: str, since_date: str = None,
max_videos: int = None,
progress_callback=None) -> List[Dict]:
"""
Get all videos from a creator page using --flat-playlist for speed.
Args:
url: Pornhub creator URL
since_date: Only fetch videos published after this date (ISO format)
max_videos: Maximum number of videos to fetch
progress_callback: Callback function(count) for progress updates
Returns:
List of video metadata dicts
"""
if not self.is_available():
return []
try:
listing_url = self._get_listing_url(url)
# Use --flat-playlist for fast listing (avoids per-video HTTP requests)
cmd = self._get_base_cmd() + [
'--no-warnings',
'--flat-playlist',
'-j',
'--socket-timeout', '30',
'--retries', '3',
listing_url
]
if max_videos:
cmd.extend(['--playlist-items', f'1:{max_videos}'])
self.log(f"Fetching videos from: {url}", 'info')
result = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, stderr = await result.communicate()
if result.returncode != 0:
error = stderr.decode('utf-8', errors='replace')
self.log(f"Failed to get creator videos: {error}", 'warning')
return []
videos = []
for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
if not line:
continue
try:
data = json.loads(line)
# Skip non-video entries
if data.get('_type') == 'playlist':
continue
video_id = data.get('id')
if not video_id:
continue
# Flat-playlist doesn't provide upload_date for Pornhub, but check anyway
upload_date = data.get('upload_date')
if upload_date:
try:
upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
except ValueError:
pass
# Decode HTML entities in title (flat-playlist returns them encoded)
title = html_module.unescape(data.get('title', f'Video {video_id}'))
# Build video URL
video_url = (data.get('webpage_url') or data.get('url')
or f"https://www.pornhub.com/view_video.php?viewkey={video_id}")
videos.append({
'video_id': video_id,
'title': title,
'description': data.get('description', ''),
'upload_date': upload_date,
'duration': data.get('duration'),
'view_count': data.get('view_count'),
'thumbnail': data.get('thumbnail'),
'url': video_url,
})
if progress_callback:
progress_callback(len(videos))
if max_videos and len(videos) >= max_videos:
break
except json.JSONDecodeError:
continue
self.log(f"Found {len(videos)} videos", 'info')
return videos
except Exception as e:
self.log(f"Error getting creator videos: {e}", 'error')
return []
async def download_video(self, video_url: str, output_dir: Path, quality: str = 'best',
progress_callback=None) -> Dict:
"""
Download a video
Args:
video_url: Pornhub video URL
output_dir: Directory to save the video
quality: Quality preset
progress_callback: Callback for download progress
Returns:
Dict with success status and file info
"""
if not self.is_available():
return {'success': False, 'error': 'yt-dlp not available'}
try:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s')
format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])
cmd = self._get_base_cmd() + [
'--no-warnings',
'-f', format_str,
'-o', output_template,
'--print-json',
'--no-playlist',
'--user-agent', 'Mozilla/5.0',
'--referer', 'https://www.pornhub.com/',
'--merge-output-format', 'mp4',
'--concurrent-fragments', '4',
'--no-part',
'--retries', '20',
video_url
]
self.log(f"Downloading video: {video_url}", 'debug')
result = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, stderr = await result.communicate()
if result.returncode != 0:
error_msg = stderr.decode('utf-8', errors='replace').strip()
if 'Video unavailable' in error_msg or 'not available' in error_msg:
error_msg = 'Video unavailable or private'
elif 'premium' in error_msg.lower():
error_msg = 'Video requires premium access'
elif len(error_msg) > 200:
error_msg = error_msg[:200] + '...'
return {'success': False, 'error': error_msg}
# Parse output JSON
video_info = None
for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
try:
video_info = json.loads(line)
break
except json.JSONDecodeError:
continue
if not video_info:
# Try to find downloaded file
files = list(output_dir.glob('*.mp4'))
if files:
file_path = max(files, key=lambda f: f.stat().st_mtime)
return {
'success': True,
'file_path': str(file_path),
'filename': file_path.name,
'file_size': file_path.stat().st_size
}
return {'success': False, 'error': 'Could not find downloaded file'}
file_path = video_info.get('_filename') or video_info.get('filename')
if file_path:
file_path = Path(file_path)
return {
'success': True,
'file_path': str(file_path) if file_path else None,
'filename': file_path.name if file_path else None,
'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'),
'title': video_info.get('title'),
'duration': video_info.get('duration'),
'video_id': video_info.get('id'),
'upload_date': video_info.get('upload_date'),
'timestamp': video_info.get('timestamp'),
'thumbnail': video_info.get('thumbnail'),
}
except Exception as e:
self.log(f"Error downloading video: {e}", 'error')
return {'success': False, 'error': str(e)}
async def get_profile_page(self, url: str) -> Optional[str]:
"""Fetch profile page HTML via aiohttp (with cookies if available).
Results are cached to avoid re-fetching for avatar/banner/bio."""
# Strip /videos suffix for profile page
base_url = re.sub(r'/videos/?$', '', url)
if base_url in self._profile_page_cache:
return self._profile_page_cache[base_url]
try:
import aiohttp
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
}
# Build simple cookies dict for the session
cookies_dict = {}
cookies_list = self._get_cookies_list()
if cookies_list:
for cookie in cookies_list:
name = cookie.get('name', '')
value = cookie.get('value', '')
if name:
cookies_dict[name] = value
async with aiohttp.ClientSession(cookies=cookies_dict) as session:
async with session.get(
base_url,
headers=headers,
timeout=aiohttp.ClientTimeout(total=15)
) as resp:
if resp.status == 200:
text = await resp.text()
self._profile_page_cache[base_url] = text
return text
except Exception as e:
self.log(f"Could not fetch profile page: {e}", 'debug')
self._profile_page_cache[base_url] = None
return None
async def get_profile_image(self, url: str) -> Optional[str]:
"""Scrape profile page for avatar/photo URL"""
try:
page_html = await self.get_profile_page(url)
if not page_html:
return None
# Look for avatar image: <img id="getAvatar" src="...">
avatar_match = re.search(r'<img[^>]*id=["\']getAvatar["\'][^>]*src=["\']([^"\']+)["\']', page_html)
if avatar_match:
self.log("Found Pornhub profile avatar", 'debug')
return avatar_match.group(1)
# Try og:image meta tag
og_match = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_html)
if not og_match:
og_match = re.search(r'<meta\s+content="([^"]+)"\s+property="og:image"', page_html)
if og_match:
return og_match.group(1)
except Exception as e:
self.log(f"Could not fetch profile image: {e}", 'debug')
return None
async def get_profile_bio(self, url: str) -> Optional[str]:
"""Scrape bio/about section from profile page"""
try:
page_html = await self.get_profile_page(url)
if not page_html:
return None
# Look for aboutMeSection -> div with the actual text
# Structure: <section class="aboutMeSection ..."><div class="title">About Name</div><div>Bio text</div></section>
about_match = re.search(
r'<section\s+class="aboutMeSection[^"]*"[^>]*>.*?<div class="title">[^<]*</div>\s*<div>\s*(.*?)\s*</div>',
page_html, re.DOTALL
)
if about_match:
bio_text = re.sub(r'<[^>]+>', '', about_match.group(1)).strip()
if bio_text:
self.log("Found Pornhub profile bio", 'debug')
return html_module.unescape(bio_text)
# Fallback: look for biographyAbout section
bio_match = re.search(
r'class="biographyAbout[^"]*"[^>]*>.*?<div class="content[^"]*">(.*?)</div>',
page_html, re.DOTALL
)
if bio_match:
bio_text = re.sub(r'<[^>]+>', '', bio_match.group(1)).strip()
if bio_text:
self.log("Found Pornhub profile bio (fallback)", 'debug')
return html_module.unescape(bio_text)
except Exception as e:
self.log(f"Could not fetch profile bio: {e}", 'debug')
return None
async def get_profile_banner(self, url: str) -> Optional[str]:
"""Scrape banner/cover image if available"""
try:
page_html = await self.get_profile_page(url)
if not page_html:
return None
# Look for cover image: <img id="coverPictureDefault" src="...">
cover_match = re.search(
r'<img[^>]*id=["\']coverPictureDefault["\'][^>]*src=["\']([^"\']+)["\']',
page_html
)
if cover_match:
self.log("Found Pornhub profile banner", 'debug')
return cover_match.group(1)
# Fallback: any img inside coverImage div
cover_match = re.search(
r'<div class="coverImage">\s*<img[^>]*src=["\']([^"\']+)["\']',
page_html, re.DOTALL
)
if cover_match:
self.log("Found Pornhub profile banner (div)", 'debug')
return cover_match.group(1)
except Exception as e:
self.log(f"Could not fetch profile banner: {e}", 'debug')
return None
async def get_profile_info(self, url: str) -> Optional[Dict]:
"""Scrape all profile info from the page in one pass"""
page_html = await self.get_profile_page(url)
if not page_html:
return None
info = {}
# Extract infoPiece data (Gender, Birth Place, Height, etc.)
info_pieces = re.findall(
r'<div class="infoPiece">\s*<span>\s*(.*?)\s*</span>\s*(.*?)\s*</div>',
page_html, re.DOTALL
)
for label, value in info_pieces:
label = re.sub(r'<[^>]+>', '', label).strip().rstrip(':')
value = re.sub(r'<[^>]+>', '', value).strip()
if label and value:
info[label.lower().replace(' ', '_')] = value
return info if info else None
async def get_joined_date(self, url: str) -> Optional[str]:
"""Extract a joined/career start date from profile info"""
try:
profile_info = await self.get_profile_info(url)
if not profile_info:
return None
# Pornstar pages have "Career Start and End: 2011 to Present"
career = profile_info.get('career_start_and_end')
if career:
# Extract start year: "2011 to Present" -> "2011"
match = re.match(r'(\d{4})', career)
if match:
return match.group(1)
# User/model pages might not have career info but could have other dates
return None
except Exception as e:
self.log(f"Could not get joined date: {e}", 'debug')
return None
async def get_creator(self, url: str) -> Optional[Creator]:
    """
    Build a Creator object from a creator URL.

    Returns:
        Creator with a 'type/name' creator_id, or None when
        get_creator_info finds nothing
    """
    info = await self.get_creator_info(url)
    if not info:
        return None

    # creator_id is 'type/name' when the URL parses, else whatever info holds
    id_parts = self.extract_creator_id(url)
    if id_parts:
        creator_id = f"{id_parts[0]}/{id_parts[1]}"
    else:
        creator_id = info.get('creator_id', '')

    # The profile page is normally cached by get_creator_info, so this is cheap
    avatar_url = await self.get_profile_image(url)
    display_name = info.get('creator_name')

    return Creator(
        creator_id=creator_id,
        service_id='pornhub',
        platform='pornhub',
        username=info.get('creator_name', 'Unknown'),
        display_name=display_name,
        profile_image_url=avatar_url,
    )
async def get_posts(self, url: str, since_date: str = None,
                    max_videos: int = None, progress_callback=None) -> List[Post]:
    """
    Return a creator's videos as Post objects.

    All arguments are forwarded unchanged to ``get_creator_videos``. Every
    video becomes one Post carrying a single video Attachment whose
    ``server_path`` and ``download_url`` are both the video URL.
    """
    videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)

    # creator_id is 'type/name' when the URL parses, otherwise empty
    id_parts = self.extract_creator_id(url)
    creator_id = f"{id_parts[0]}/{id_parts[1]}" if id_parts else ''

    def to_post(video) -> Post:
        """Wrap one scraped video dict into a Post with its attachment."""
        media = Attachment(
            name=f"{video['title']}.mp4",
            file_type='video',
            extension='.mp4',
            server_path=video['url'],
            download_url=video['url'],
            duration=video.get('duration'),
        )
        return Post(
            post_id=video['video_id'],
            service_id='pornhub',
            platform='pornhub',
            creator_id=creator_id,
            title=video['title'],
            # Fall back to the title when there is no description
            content=video.get('description') or video['title'],
            published_at=video.get('upload_date'),
            attachments=[media],
        )

    return [to_post(video) for video in videos]

View File

@@ -0,0 +1,678 @@
"""
Reddit Client for Paid Content - Uses gallery-dl to fetch subreddit posts and download media.
Adapts the gallery-dl + metadata parsing pattern from reddit_community_monitor.py
to produce Post/Attachment objects for the paid content system.
"""
import asyncio
import json
import os
import shutil
import subprocess
import tempfile
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, List, Optional
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class RedditClient(LoggingMixin):
    """
    Fetches subreddit content for the paid content system through gallery-dl.

    gallery-dl downloads the media while posts are being fetched, so the
    attachments produced by this client already point at files on disk
    (inside a temporary directory); the sync handler is expected to move
    them to their final location.
    """

    SERVICE_ID = 'reddit'
    PLATFORM = 'reddit'

    def __init__(self, unified_db=None, log_callback=None):
        """Set up logging, remember the database handle and locate gallery-dl.

        Args:
            unified_db: Database wrapper used to look up stored Reddit cookies (optional)
            log_callback: Forwarded to the logging mixin
        """
        self._init_logger('PaidContent', log_callback, default_module='Reddit')
        self.unified_db = unified_db

        # Prefer whatever is on PATH, fall back to the bundled virtualenv binary
        located = shutil.which('gallery-dl')
        if located:
            self.gallery_dl_path = located
        else:
            self.gallery_dl_path = '/opt/media-downloader/venv/bin/gallery-dl'
def get_subreddit_info(self, subreddit: str) -> Optional[Dict]:
    """Get basic subreddit info from Reddit's public ``about.json`` endpoint.

    Args:
        subreddit: Subreddit name (without r/)

    Returns:
        * Successful lookup: dict with ``creator_id`` (lowercased display
          name), ``creator_name`` (``r/<name>``), ``display_name``, ``bio``,
          ``joined_date`` (``YYYY-MM-DD`` or None), ``profile_image_url``
          and ``banner_image_url``.
        * HTTP error other than 404 (e.g. 403 for private/quarantined):
          dict with only ``creator_id`` and ``creator_name``.
        * 404 or any other failure: None.
    """
    import urllib.request
    import urllib.error

    try:
        # Quick check via Reddit's public JSON endpoint
        url = f'https://www.reddit.com/r/{subreddit}/about.json'
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read().decode())

        sub_data = data.get('data', {})
        display_name = sub_data.get('display_name', subreddit)
        title = sub_data.get('title', '')

        # Extract icon - community_icon is higher res, icon_img is fallback.
        # The query string is dropped and HTML entities in the URL decoded.
        icon_url = (sub_data.get('community_icon') or sub_data.get('icon_img') or '').split('?')[0]
        icon_url = icon_url.replace('&amp;', '&') if icon_url else None

        # Extract banner - banner_background_image is the main one
        banner_url = sub_data.get('banner_background_image') or sub_data.get('mobile_banner_image') or ''
        banner_url = banner_url.split('?')[0] if banner_url else None
        if banner_url:
            banner_url = banner_url.replace('&amp;', '&')

        # Build bio from title + public description + subscriber count
        public_desc = sub_data.get('public_description', '')
        bio_parts = []
        if title:
            bio_parts.append(title)
        if public_desc and public_desc != title:
            bio_parts.append(public_desc)
        subscribers = sub_data.get('subscribers')
        if subscribers:
            bio_parts.append(f"{subscribers:,} subscribers")
        # Separate the fragments; joining with '' ran them together
        # ("TitleDescription1,234 subscribers").
        bio = ' | '.join(bio_parts) if bio_parts else None

        # Subreddit creation date
        created_utc = sub_data.get('created_utc')
        joined_date = None
        if created_utc:
            try:
                joined_date = datetime.fromtimestamp(created_utc, tz=timezone.utc).strftime('%Y-%m-%d')
            except (ValueError, OSError, OverflowError, TypeError):
                # A malformed timestamp only costs us the date, not the lookup
                pass

        # Use the subreddit title as display name (e.g. "Reddit Pics")
        # Fall back to r/name format if no title
        friendly_name = title if title else f'r/{display_name}'

        return {
            'creator_id': display_name.lower(),
            'creator_name': f'r/{display_name}',
            'display_name': friendly_name,
            'bio': bio,
            'joined_date': joined_date,
            'profile_image_url': icon_url or None,
            'banner_image_url': banner_url or None,
        }
    except urllib.error.HTTPError as e:
        if e.code == 404:
            self.log(f"Subreddit r/{subreddit} not found (404)", 'warning')
            return None
        elif e.code == 403:
            # Private/quarantined - still exists, return basic info
            self.log(f"Subreddit r/{subreddit} is private/quarantined", 'warning')
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
        else:
            self.log(f"HTTP {e.code} checking r/{subreddit}", 'warning')
            # Return basic info and let sync verify
            return {
                'creator_id': subreddit.lower(),
                'creator_name': f'r/{subreddit}',
            }
    except Exception as e:
        self.log(f"Error getting subreddit info for r/{subreddit}: {e}", 'error')
        return None
def get_posts(self, subreddit: str, since_date: str = None, max_posts: int = 0,
              progress_callback=None) -> tuple:
    """Fetch posts and download media from a subreddit using gallery-dl.

    Args:
        subreddit: Subreddit name (without r/)
        since_date: ISO date string; skip posts older than this
        max_posts: Maximum posts to fetch (0 = unlimited)
        progress_callback: Optional callable(downloaded_count, skipped_count, total_seen)
            for live progress updates (forwarded to run_gallery_dl)

    Returns:
        Tuple of (List[Post], temp_dir_path) - caller must clean up temp_dir
        when done moving files. Returns ([], None) on failure or when nothing
        was downloaded; the temp dir is already removed in that case.
    """
    temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{subreddit}_')

    try:
        # run_gallery_dl() returns only counters; the downloaded paths are
        # delivered through batch_callback, so gather them here. (Passing the
        # counters dict on as the file list made every call fail.)
        downloaded: List[Path] = []
        self.run_gallery_dl(subreddit, temp_dir, since_date, max_posts,
                            progress_callback=progress_callback,
                            batch_callback=downloaded.extend)

        if not downloaded:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None

        # Group files by post using metadata sidecars
        grouped = self._group_files_by_post(downloaded, temp_dir, subreddit)
        if not grouped:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None

        posts = []
        for post_id, post_data in grouped.items():
            attachments = []
            for file_path in post_data['files']:
                ext = file_path.suffix.lower()
                file_type = self._detect_file_type(ext)

                attachments.append(Attachment(
                    name=file_path.name,
                    file_type=file_type,
                    extension=ext,
                    server_path=str(file_path),  # temp path, will be moved
                    download_url=None,  # Already downloaded
                    file_size=file_path.stat().st_size if file_path.exists() else None,
                ))

            if not attachments:
                continue

            post = Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=subreddit.lower(),
                title=post_data.get('title'),
                content=post_data.get('title'),
                published_at=post_data.get('date'),
                attachments=attachments,
            )
            posts.append(post)

        self.log(f"Parsed {len(posts)} posts with {sum(len(p.attachments) for p in posts)} attachments from r/{subreddit}", 'info')
        return posts, temp_dir

    except Exception as e:
        self.log(f"Error fetching posts from r/{subreddit}: {e}", 'error')
        shutil.rmtree(temp_dir, ignore_errors=True)
        return [], None
def run_gallery_dl(self, subreddit: str, temp_dir: str,
                   since_date: str = None, max_posts: int = 0,
                   progress_callback=None, batch_callback=None,
                   batch_size: int = 50) -> dict:
    """Run gallery-dl to download media from a subreddit.

    Streams gallery-dl's stdout line-by-line. A line starting with ``# `` is
    counted as skipped (already in the download archive); every other
    non-empty line is counted as a download and, when it names an existing
    non-``.json`` file, queued for ``batch_callback``.

    Args:
        subreddit: Subreddit name (without r/)
        temp_dir: Directory gallery-dl downloads into
        since_date: Date string whose first 10 characters are YYYY-MM-DD;
            older posts are filtered out. A value that does not parse is
            ignored with a warning.
        max_posts: Passed to gallery-dl as ``--range 1-N`` (0 = unlimited)
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total. Failures are logged, not raised.
    """
    import time

    # Use a separate download archive for paid content reddit
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
    ]

    # REST API mode to avoid shared OAuth rate limits
    cmd.extend(['-o', 'extractor.reddit.api=rest'])

    # Limit posts (0 = unlimited)
    if max_posts > 0:
        cmd.extend(['--range', f'1-{max_posts}'])

    # Date filtering. The cutoff is interpolated into an expression that
    # gallery-dl evaluates, so it is validated as a real date first.
    if since_date:
        try:
            cutoff = since_date[:10]  # YYYY-MM-DD
            datetime.strptime(cutoff, '%Y-%m-%d')
        except (ValueError, TypeError):
            self.log(f"Ignoring unparseable since_date {since_date!r} for r/{subreddit}", 'warning')
        else:
            cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])

    cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')

    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])

    self.log(f"Running gallery-dl for r/{subreddit}", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')

    dl_count = 0
    skip_count = 0
    pending_files = []
    timeout_secs = 7200  # 2 hours

    try:
        # stderr is sent to a temp file rather than a pipe: nothing reads a
        # stderr pipe while stdout is being streamed, so a chatty run could
        # fill the pipe buffer and block gallery-dl (and this loop) forever.
        with tempfile.TemporaryFile() as stderr_file:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_file, text=True
            )
            start_time = time.time()
            timed_out = False

            try:
                for raw_line in proc.stdout:
                    if time.time() - start_time > timeout_secs:
                        timed_out = True
                        break

                    line = raw_line.strip()
                    if not line:
                        continue

                    if line.startswith('# '):
                        # Skipped file (already in archive)
                        skip_count += 1
                    else:
                        # Downloaded file - gallery-dl prints the full path
                        dl_count += 1
                        file_path = Path(line)
                        if file_path.exists() and not file_path.name.endswith('.json'):
                            pending_files.append(file_path)

                    total = dl_count + skip_count
                    if progress_callback and total % 5 == 0:
                        progress_callback(dl_count, skip_count, total)

                    # Flush batch for processing
                    if batch_callback and len(pending_files) >= batch_size:
                        batch_callback(list(pending_files))
                        pending_files.clear()

                if not timed_out:
                    # stdout hit EOF; give the process the rest of its budget to exit
                    remaining = max(0.0, timeout_secs - (time.time() - start_time))
                    try:
                        proc.wait(timeout=remaining)
                    except subprocess.TimeoutExpired:
                        timed_out = True

                if timed_out:
                    proc.kill()
                    proc.wait()
                    self.log(f"gallery-dl timed out for r/{subreddit}", 'error')
            finally:
                # Never leave gallery-dl running if a callback raised
                if proc.poll() is None:
                    proc.kill()
                    proc.wait()
                proc.stdout.close()

            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()

            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)

            returncode = proc.returncode
            if returncode not in (None, 0, 1, 4, 5):
                stderr_file.seek(0)
                stderr = stderr_file.read().decode(errors='replace')
                self.log(f"gallery-dl returned code {returncode} for r/{subreddit}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')

    except Exception as e:
        self.log(f"gallery-dl failed for r/{subreddit}: {e}", 'error')

    self.log(f"gallery-dl done for r/{subreddit}: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
def _group_files_by_post(self, files: List[Path], temp_dir: str,
                         subreddit: str) -> Dict[str, Dict]:
    """Bucket downloaded files by Reddit post ID using their metadata JSON sidecars.

    For each file the sidecar is looked up as ``<name>.<ext>.json`` and then
    ``<name>.json``. The post ID comes from the first of the metadata keys
    ``id`` / ``reddit_id`` / ``parent_id`` that is present, otherwise from
    the file name (``subreddit_postid_num.ext``). Title, date and source URL
    are taken from the first file seen for a post.

    Returns:
        Dict mapping reddit_post_id -> {
            'files': [Path],
            'title': str,
            'date': str,
            'source_url': str
        }
    """
    id_keys = ('id', 'reddit_id', 'parent_id')
    date_formats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d')
    grouped: Dict[str, Dict] = {}

    for media_path in files:
        # Locate the sidecar written next to the media file
        sidecar = media_path.with_suffix(media_path.suffix + '.json')
        if not sidecar.exists():
            sidecar = media_path.with_suffix('.json')

        meta = {}
        if sidecar.exists():
            try:
                with open(sidecar, 'r', encoding='utf-8') as handle:
                    meta = json.load(handle)
            except Exception as e:
                self.log(f"Failed to parse metadata for {media_path.name}: {e}", 'debug')

        # Post ID: metadata first, file name as fallback
        post_id = next((str(meta[key]) for key in id_keys if key in meta), None)
        if not post_id:
            stem_parts = media_path.stem.split('_')
            if len(stem_parts) >= 3:
                post_id = stem_parts[-2]
            elif len(stem_parts) == 2:
                post_id = stem_parts[-1]
            else:
                post_id = media_path.stem

        # Post date: 'date' (string or epoch), then 'created_utc', then now
        when = None
        if 'date' in meta:
            raw_date = meta['date']
            if isinstance(raw_date, str):
                for fmt in date_formats:
                    try:
                        as_utc = datetime.strptime(raw_date, fmt).replace(tzinfo=timezone.utc)
                        when = as_utc.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
                        break
                    except ValueError:
                        continue
                # Unrecognised format: keep the raw string
                when = when or raw_date
            elif isinstance(raw_date, (int, float)):
                try:
                    when = datetime.fromtimestamp(raw_date, tz=timezone.utc).isoformat()
                except (ValueError, OSError):
                    pass

        if not when and 'created_utc' in meta:
            try:
                when = datetime.fromtimestamp(meta['created_utc'], tz=timezone.utc).isoformat()
            except (ValueError, OSError):
                pass

        if not when:
            when = datetime.now().isoformat()

        title = meta.get('title', meta.get('description', ''))
        sub_name = meta.get('subreddit', subreddit)
        if sub_name:
            source_url = f"https://www.reddit.com/r/{sub_name}/comments/{post_id}"
        else:
            source_url = ''

        # The first file of a post fixes its title/date/source_url
        entry = grouped.setdefault(post_id, {
            'files': [],
            'title': title,
            'date': when,
            'source_url': source_url,
        })
        entry['files'].append(media_path)

    return grouped
def _get_cookies_file(self) -> Optional[str]:
    """Fetch the stored Reddit cookies value from the ``scrapers`` table.

    Returns:
        The ``cookies`` column of the row named 'reddit', or None when no
        database is configured, no non-empty value exists, or the lookup
        fails (failures are logged at debug level).
    """
    db = self.unified_db
    if not db:
        return None

    query = "SELECT cookies FROM scrapers WHERE name = 'reddit' AND cookies IS NOT NULL"
    try:
        with db.get_connection() as conn:
            cur = conn.cursor()
            cur.execute(query)
            record = cur.fetchone()
            stored = record[0] if record else None
            if stored:
                return stored
    except Exception as e:
        self.log(f"Could not load Reddit cookies: {e}", 'debug')
    return None
def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
    """Write a JSON array of cookies to ``output_path`` in Netscape cookie-file format.

    Each cookie becomes one tab-separated line: domain, subdomain flag
    (TRUE when the domain starts with '.'), path, secure flag, expiry as an
    integer, name, value. The expiry is read from ``expirationDate``, then
    ``expiry``, then ``expires``; a missing or null value is written as 0.

    Returns:
        True when the file was written, False when the JSON is not a list
        or anything fails (failures are logged at error level).
    """
    def flag(condition) -> str:
        """Render a truthy/falsy value the way the cookie format expects."""
        return 'TRUE' if condition else 'FALSE'

    try:
        cookie_list = json.loads(cookies_json)
        if not isinstance(cookie_list, list):
            return False

        with open(output_path, 'w') as out:
            out.write("# Netscape HTTP Cookie File\n")
            out.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")

            for entry in cookie_list:
                domain = entry.get('domain', '')
                subdomains = flag(domain.startswith('.'))
                cookie_path = entry.get('path', '/')
                secure = flag(entry.get('secure', False))

                # Lowest-priority key first; later keys override earlier ones
                raw_expiry = 0
                for key in ('expires', 'expiry', 'expirationDate'):
                    if key in entry:
                        raw_expiry = entry[key]
                if raw_expiry is None:
                    raw_expiry = 0
                expiry = str(int(float(raw_expiry)))

                name = entry.get('name', '')
                value = entry.get('value', '')
                out.write(f"{domain}\t{subdomains}\t{cookie_path}\t{secure}\t{expiry}\t{name}\t{value}\n")
        return True
    except Exception as e:
        self.log(f"Failed to write Netscape cookie file: {e}", 'error')
        return False
def get_pullpush_post_ids(self, subreddit: str, after_ts: int = 0,
                          before_ts: int = None,
                          progress_callback=None) -> List[Dict]:
    """Fetch all historical post IDs for a subreddit from the Pullpush (Pushshift) API.

    Paginates through the archive in ascending ``created_utc`` order, 100
    posts per request, sleeping 2 seconds between requests. An HTTP 429 is
    retried after a 5 second wait, at most a few times in a row; any other
    error stops pagination and whatever was collected so far is returned.

    Args:
        subreddit: Subreddit name (without r/)
        after_ts: Unix timestamp to start from (0 = beginning of time)
        before_ts: Unix timestamp to stop at (None = no upper limit)
        progress_callback: Optional callable(fetched_count, message)

    Returns:
        List of dicts: [{id, title, created_utc, url, is_gallery, selftext}, ...]
    """
    import time
    import urllib.error
    import urllib.parse
    import urllib.request

    base_url = 'https://api.pullpush.io/reddit/search/submission/'
    # Consecutive 429 responses tolerated before giving up; without a cap a
    # persistently rate-limited API kept this loop spinning forever.
    max_rate_limit_retries = 5

    all_posts = []
    current_after = after_ts
    page = 0
    rate_limit_hits = 0

    while True:
        query = {
            'subreddit': subreddit,
            'size': 100,
            'sort': 'asc',
            'sort_type': 'created_utc',
            'after': current_after,
        }
        if before_ts is not None:
            query['before'] = before_ts

        # urlencode keeps unusual characters in the name from breaking the URL
        url = f'{base_url}?{urllib.parse.urlencode(query)}'

        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
            })
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.loads(resp.read().decode())
        except urllib.error.HTTPError as e:
            if e.code == 429:
                rate_limit_hits += 1
                if rate_limit_hits > max_rate_limit_retries:
                    self.log(f"Pullpush still rate limited after {max_rate_limit_retries} retries "
                             f"for r/{subreddit}, giving up", 'error')
                    break
                self.log(f"Pullpush rate limited, waiting 5s...", 'warning')
                time.sleep(5)
                continue
            self.log(f"Pullpush HTTP {e.code} for r/{subreddit}: {e}", 'error')
            break
        except Exception as e:
            self.log(f"Pullpush request failed for r/{subreddit}: {e}", 'error')
            break

        # Only successful requests count as pages
        rate_limit_hits = 0
        page += 1

        posts = data.get('data', [])
        if not posts:
            break

        for post in posts:
            all_posts.append({
                'id': post.get('id', ''),
                'title': post.get('title', ''),
                'created_utc': post.get('created_utc', 0),
                'url': post.get('url', ''),
                'is_gallery': post.get('is_gallery', False),
                'selftext': post.get('selftext', ''),
            })

        last_ts = posts[-1].get('created_utc', 0)

        if progress_callback:
            progress_callback(len(all_posts),
                              f"Fetched {len(all_posts)} post IDs (page {page})")

        # Handle stuck pagination - same timestamp repeating
        if last_ts <= current_after:
            current_after = last_ts + 1
        else:
            current_after = last_ts

        # If we got fewer than 100, we've reached the end
        if len(posts) < 100:
            break

        # Rate limit: 2s between requests
        time.sleep(2)

    self.log(f"Pullpush: fetched {len(all_posts)} total post IDs for r/{subreddit}", 'info')
    return all_posts
def run_gallery_dl_urls(self, urls_file: str, temp_dir: str,
                        progress_callback=None, batch_callback=None,
                        batch_size: int = 50) -> dict:
    """Run gallery-dl with --input-file to download specific Reddit post URLs.

    Same streaming/batch pattern as run_gallery_dl() but reads URLs from a file
    instead of scraping a subreddit listing. A stdout line starting with
    ``# `` counts as skipped; any other non-empty line counts as a download.

    Args:
        urls_file: Path to file containing one URL per line
        temp_dir: Directory for gallery-dl to download into
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total. Failures are logged, not raised.
    """
    import time

    # Same archive as normal Reddit paid content sync
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
        '-o', 'extractor.reddit.api=rest',
        '--input-file', urls_file,
    ]

    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])

    self.log(f"Running gallery-dl with input file ({urls_file})", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')

    dl_count = 0
    skip_count = 0
    pending_files = []
    timeout_secs = 14400  # 4 hours for backfill (can be large)

    try:
        # stderr is sent to a temp file rather than a pipe: nothing reads a
        # stderr pipe while stdout is being streamed, so a chatty run could
        # fill the pipe buffer and block gallery-dl (and this loop) forever.
        with tempfile.TemporaryFile() as stderr_file:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_file, text=True
            )
            start_time = time.time()
            timed_out = False

            try:
                for raw_line in proc.stdout:
                    if time.time() - start_time > timeout_secs:
                        timed_out = True
                        break

                    line = raw_line.strip()
                    if not line:
                        continue

                    if line.startswith('# '):
                        skip_count += 1
                    else:
                        dl_count += 1
                        file_path = Path(line)
                        if file_path.exists() and not file_path.name.endswith('.json'):
                            pending_files.append(file_path)

                    total = dl_count + skip_count
                    if progress_callback:
                        progress_callback(dl_count, skip_count, total)

                    if batch_callback and len(pending_files) >= batch_size:
                        batch_callback(list(pending_files))
                        pending_files.clear()

                if not timed_out:
                    # stdout hit EOF; give the process the rest of its budget to exit
                    remaining = max(0.0, timeout_secs - (time.time() - start_time))
                    try:
                        proc.wait(timeout=remaining)
                    except subprocess.TimeoutExpired:
                        timed_out = True

                if timed_out:
                    proc.kill()
                    proc.wait()
                    self.log("gallery-dl backfill timed out", 'error')
            finally:
                # Never leave gallery-dl running if a callback raised
                if proc.poll() is None:
                    proc.kill()
                    proc.wait()
                proc.stdout.close()

            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()

            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)

            returncode = proc.returncode
            if returncode not in (None, 0, 1, 4, 5):
                stderr_file.seek(0)
                stderr = stderr_file.read().decode(errors='replace')
                self.log(f"gallery-dl backfill returned code {returncode}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr[:500]}", 'debug')

    except Exception as e:
        self.log(f"gallery-dl backfill failed: {e}", 'error')

    self.log(f"gallery-dl backfill done: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
@staticmethod
def _detect_file_type(ext: str) -> str:
    """Classify a file extension as 'image', 'video' or 'unknown'.

    The extension is compared case-insensitively and leading dots are ignored.
    """
    known_kinds = {
        'image': ('jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'),
        'video': ('mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg'),
    }
    normalized = ext.lower().lstrip('.')
    for kind, extensions in known_kinds.items():
        if normalized in extensions:
            return kind
    return 'unknown'

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,259 @@
"""
Snapchat Client for Paid Content - Wraps SnapchatClientDownloader for paid content system.
Maps spotlights and highlights to the Post/Attachment model used by the paid content scraper.
"""
from datetime import datetime
from typing import Dict, List, Optional
from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
class SnapchatPaidContentClient(LoggingMixin):
    """
    Paid-content adapter around the existing SnapchatClientDownloader.

    Every story, highlight or spotlight collection is mapped to a single
    Post whose Attachments are the individual snaps.
    """

    SERVICE_ID = 'snapchat'
    PLATFORM = 'snapchat'

    def __init__(self, unified_db=None, log_callback=None):
        """Set up logging and remember the database handle.

        The underlying downloader is not created here; see _get_downloader().
        """
        self._init_logger('PaidContent', log_callback, default_module='Snapchat')
        # Created lazily on first use
        self._downloader = None
        self.unified_db = unified_db
def _get_downloader(self):
    """Return the shared SnapchatClientDownloader, creating it on first use."""
    if self._downloader is not None:
        return self._downloader

    # Imported here so the module is only loaded when a downloader is needed
    from modules.snapchat_client_module import SnapchatClientDownloader

    self._downloader = SnapchatClientDownloader(
        show_progress=False,
        use_database=False,
        log_callback=self.log_callback,
        unified_db=self.unified_db,
    )
    return self._downloader
def get_creator_info(self, username: str) -> Optional[Dict]:
    """Get creator information from the profile page's __NEXT_DATA__ payload.

    The display name is taken from ``userProfile.userInfo.displayName`` and,
    failing that, from a page title of the form
    "DisplayName (@username) | Snapchat...". The avatar is the Bitmoji 3D
    URL when present, otherwise the first link-preview image.

    Args:
        username: Snapchat username (without @)

    Returns:
        Dict with ``creator_id``, ``creator_name`` and ``profile_image_url``.
        When the page cannot be fetched, only ``creator_id`` and
        ``creator_name`` (both the username) are returned.
    """
    def as_dict(value) -> Dict:
        """Treat JSON null (or any non-object) as an empty object."""
        return value if isinstance(value, dict) else {}

    downloader = self._get_downloader()

    profile_url = f"https://story.snapchat.com/@{username}"
    html = downloader._fetch_page(profile_url)
    if not html:
        return {'creator_id': username, 'creator_name': username}

    data = downloader._extract_next_data(html)
    display_name = username
    avatar_url = None

    if data:
        # Any level of the payload may be null, so every hop goes through
        # as_dict() instead of relying on .get() defaults (which only apply
        # when the key is absent, not when its value is null).
        props = as_dict(as_dict(as_dict(data).get('props')).get('pageProps'))

        # userProfile uses a $case/userInfo wrapper
        user_info = as_dict(as_dict(props.get('userProfile')).get('userInfo'))
        if user_info:
            name = user_info.get('displayName')
            name = name.strip() if isinstance(name, str) else ''
            if name:
                display_name = name

            # Bitmoji 3D avatar URL (best quality)
            bitmoji = as_dict(user_info.get('bitmoji3d'))
            avatar_url = bitmoji.get('avatarUrl') or bitmoji.get('url')

        # linkPreview OG images as avatar (preview/square.jpeg - good quality)
        if not avatar_url:
            link_preview = as_dict(props.get('linkPreview'))
            for img_key in ('facebookImage', 'twitterImage'):
                img = as_dict(link_preview.get(img_key))
                if img.get('url'):
                    avatar_url = img['url']
                    break

        # pageMetadata.pageTitle sometimes has the display name
        if display_name == username:
            page_title = as_dict(props.get('pageMetadata')).get('pageTitle')
            # Format: "DisplayName (@username) | Snapchat..."
            if isinstance(page_title, str) and '(@' in page_title:
                name_part = page_title.split('(@')[0].strip()
                if name_part:
                    display_name = name_part

    return {
        'creator_id': username,
        'creator_name': display_name,
        'profile_image_url': avatar_url,
    }
def get_creator(self, username: str) -> Optional[Creator]:
    """Build the Creator model for a Snapchat user.

    Returns None only when get_creator_info() yields nothing.
    """
    info = self.get_creator_info(username)
    if not info:
        return None

    shown_name = info.get('creator_name')
    avatar = info.get('profile_image_url')
    return Creator(
        creator_id=username,
        service_id=self.SERVICE_ID,
        platform=self.PLATFORM,
        username=info.get('creator_name', username),
        display_name=shown_name,
        profile_image_url=avatar,
    )
def get_posts(self, username: str, since_date: str = None) -> List[Post]:
    """Return a user's stories, highlights and spotlights as Post objects.

    Args:
        username: Snapchat username (without @)
        since_date: ISO date string; collections whose snaps are all older
            than this are skipped. An unparseable value disables the cutoff.

    Returns:
        List of Post objects (one per collection that has attachments), in
        the order: story, highlights, spotlights.
    """
    downloader = self._get_downloader()

    # Parse cutoff date into a naive datetime
    cutoff_dt = None
    if since_date:
        try:
            if 'T' not in since_date:
                cutoff_dt = datetime.strptime(since_date[:10], '%Y-%m-%d')
            else:
                # Drop a UTC designator ('Z' or '+00:00') before parsing
                naive_iso = since_date.replace('Z', '+00:00').replace('+00:00', '')
                cutoff_dt = datetime.fromisoformat(naive_iso)
        except (ValueError, IndexError):
            pass

    # Discover content from profile (spotlights, highlights, stories)
    profile_content = downloader.get_profile_content(username)
    spotlight_urls = profile_content.get('spotlights', [])
    highlight_sets = profile_content.get('highlight_collections', [])
    story_set = profile_content.get('story_collection')

    story_label = 'stories' if story_set else 'no stories'
    self.log(f"Found {len(spotlight_urls)} spotlights, "
             f"{len(highlight_sets)} highlights, "
             f"{story_label} "
             f"for @{username}", 'info')

    posts = []

    def keep_if_useful(collection) -> None:
        """Convert a collection and keep the Post when it has attachments."""
        post = self._collection_to_post(collection, username, cutoff_dt)
        if post and post.attachments:
            posts.append(post)

    # Story snaps and highlights come inline with the profile page
    if story_set and story_set.snaps:
        keep_if_useful(story_set)

    for highlight in highlight_sets:
        keep_if_useful(highlight)

    # Spotlights still require a per-URL fetch for full metadata
    for spotlight_url in spotlight_urls:
        spotlight = downloader.get_spotlight_metadata(spotlight_url)
        if spotlight:
            keep_if_useful(spotlight)

    self.log(f"Mapped {len(posts)} posts with attachments for @{username}", 'info')
    return posts
def _collection_to_post(self, collection, username: str, cutoff_dt=None) -> Optional[Post]:
    """Turn one snap collection into a Post with one Attachment per snap.

    Returns None when the collection has no snaps, when every timestamped
    snap is older than ``cutoff_dt``, or when no snap has a media URL. The
    post date is the earliest snap timestamp (``YYYY-MM-DD``), or None when
    no snap carries a timestamp.
    """
    snaps = collection.snaps
    if not snaps:
        return None

    stamps = [snap.timestamp for snap in snaps if snap.timestamp]
    published_at = min(stamps).strftime('%Y-%m-%d') if stamps else None

    # The cutoff applies to the collection as a whole: it is dropped only
    # when even its newest snap predates the cutoff
    if cutoff_dt and stamps and max(stamps) < cutoff_dt:
        return None

    def to_attachment(snap) -> Attachment:
        """Describe a single snap as a downloadable attachment."""
        # Extension follows the media type
        ext = '.mp4' if snap.media_type == 'video' else '.jpg'
        if snap.media_id:
            name = f"{snap.media_id}{ext}"
        else:
            name = f"snap_{snap.index}{ext}"
        return Attachment(
            name=name,
            file_type=snap.media_type,
            extension=ext,
            server_path=snap.media_url,
            download_url=snap.media_url,
            width=snap.width if snap.width else None,
            height=snap.height if snap.height else None,
            # Milliseconds -> whole seconds
            duration=snap.duration_ms // 1000 if snap.duration_ms else None,
        )

    attachments = [to_attachment(snap) for snap in snaps if snap.media_url]
    if not attachments:
        return None

    # Title and content are both the collection title (None when empty)
    label = collection.title or None

    # The tag is the collection type in title case, e.g. "Spotlight"
    kind_tag = collection.collection_type.title()

    return Post(
        post_id=collection.collection_id,
        service_id=self.SERVICE_ID,
        platform=self.PLATFORM,
        creator_id=username,
        title=label,
        content=label,
        published_at=published_at,
        attachments=attachments,
        auto_tags=[kind_tag],
    )
def download_snap(self, media_url: str, output_path: str) -> bool:
    """Download a single snap file through the downloader's HTTP session.

    Args:
        media_url: Direct URL to the media file (``&amp;`` entities are decoded)
        output_path: Local path to save the file; missing parent directories
            are created

    Returns:
        True if download succeeded. A non-200 response, an empty body or any
        exception is logged and reported as False.
    """
    import os

    downloader = self._get_downloader()
    session = downloader._get_session()

    try:
        url = media_url.replace('&amp;', '&')
        resp = session.get(url, timeout=60)
        if resp.status_code != 200 or len(resp.content) == 0:
            self.log(f"Download failed: HTTP {resp.status_code}, size={len(resp.content)}", 'warning')
            return False

        # A bare filename has no directory part; os.makedirs('') would raise
        # and turn a successful download into a reported failure.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(output_path, 'wb') as f:
            f.write(resp.content)
        return True
    except Exception as e:
        self.log(f"Download error: {e}", 'error')
        return False

View File

@@ -0,0 +1,508 @@
"""
Soundgasm + Liltsome Archive Client for Paid Content
Handles:
- Soundgasm profile scraping (no auth/Cloudflare needed)
- Liltsome archive (liltsome.yerf.org) as supplementary source
- Bracket tag parsing from audio titles: [F4M] [Whisper] etc.
- Direct HTTP audio downloads (.m4a)
"""
import asyncio
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import quote
import aiohttp
import aiofiles
from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
# ---------------------------------------------------------------------------
# Bracket tag helpers
# ---------------------------------------------------------------------------
def parse_bracket_tags(title: str) -> Tuple[str, List[str]]:
    """Extract [bracket] tags from a title, normalize, return (clean_title, tags).

    Tags are stripped, lowercased and de-duplicated in order of first
    appearance. The clean title is the input with every bracket group
    removed; a run of adjacent groups (and the whitespace around it) is
    replaced by a single space.

    >>> parse_bracket_tags("My [F4M] [Whisper] Story")
    ('My Story', ['f4m', 'whisper'])
    """
    tags = re.findall(r'\[([^\]]+)\]', title)
    # Consume a whole run of groups at once: replacing them one at a time
    # left a double space where two tags sat side by side mid-title.
    clean_title = re.sub(r'(?:\s*\[[^\]]+\])+\s*', ' ', title).strip()

    normalized: List[str] = []
    seen: Set[str] = set()
    for tag in tags:
        tag_lower = tag.strip().lower()
        if tag_lower and tag_lower not in seen:
            seen.add(tag_lower)
            normalized.append(tag_lower)

    return clean_title, normalized
def format_tag_display(tag_lower: str) -> str:
    """Format a normalized lowercase tag for display.

    Gender tags (f4m, m4f, f4a ...) -> uppercase.
    Everything else -> each word capitalized.

    >>> format_tag_display("don't stop")
    "Don't Stop"
    """
    if re.match(r'^[a-z]+\d[a-z]+$', tag_lower):
        return tag_lower.upper()

    # str.title() treats an apostrophe as a word boundary ("don't" ->
    # "Don'T"), so capitalize letter runs ourselves and keep apostrophe
    # suffixes attached to their word.
    return re.sub(
        r"[^\W\d_]+(?:['’][^\W\d_]+)*",
        lambda word: word.group(0).capitalize(),
        tag_lower,
    )
# ---------------------------------------------------------------------------
# SoundgasmClient
# ---------------------------------------------------------------------------
class SoundgasmClient(LoggingMixin):
    """Fetches audio posts from Soundgasm and from the Liltsome archive."""

    SERVICE_ID = 'soundgasm'
    PLATFORM = 'soundgasm'

    # Remote endpoints
    SOUNDGASM_BASE = 'https://soundgasm.net'
    LILTSOME_BASE = 'https://liltsome.yerf.org'
    LILTSOME_LIBRARY_URL = LILTSOME_BASE + '/data/library.json'

    # On-disk copy of the Liltsome library and the ETag stored next to it
    LILTSOME_CACHE_PATH = Path('/opt/media-downloader/data/liltsome_library.json')
    LILTSOME_ETAG_PATH = Path('/opt/media-downloader/data/liltsome_library.json.etag')

    # Browser-like headers sent with every HTTP request
    HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        ),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    def __init__(self, log_callback=None):
        """Set up logging and an empty in-memory Liltsome library cache."""
        self._init_logger('PaidContent', log_callback, default_module='Soundgasm')
        # Cached in memory per sync run
        self._liltsome_data: Optional[Dict] = None
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
async def get_profile_info(self, username: str) -> Optional[Dict]:
    """Summarise how many posts Soundgasm and/or Liltsome hold for a user.

    Both sources are queried; a failure in either one is logged at debug
    level and otherwise ignored.

    Returns:
        Dict with ``username``, ``post_count`` (the larger of the two
        counts) and ``source`` ('soundgasm' when the Soundgasm profile
        exists, else 'liltsome'), or None when neither source knows the user.
    """
    count = 0
    origin = None

    # Soundgasm profile page first
    try:
        profile_entries = await self._fetch_soundgasm_profile(username)
        if profile_entries is not None:
            count = len(profile_entries)
            origin = 'soundgasm'
    except Exception as e:
        self.log(f"Soundgasm profile fetch failed for {username}: {e}", 'debug')

    # Liltsome may hold additional posts
    try:
        archive_entries = await self._get_liltsome_entries(username)
        if archive_entries:
            count = max(count, len(archive_entries))
            origin = 'liltsome' if origin is None else origin
    except Exception as e:
        self.log(f"Liltsome lookup failed for {username}: {e}", 'debug')

    if origin is None and count == 0:
        return None

    return {
        'username': username,
        'post_count': count,
        'source': origin,
    }
async def get_posts(self, username: str, known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> List[Post]:
    """Collect posts from Soundgasm and then Liltsome, dropping duplicate post IDs.

    IDs in ``known_post_ids`` count as already seen. A failure in one source
    is logged as a warning and does not prevent the other from being
    queried. ``progress_callback`` (if given) receives the running number
    of collected posts after each source.
    """
    seen_ids: Set[str] = set(known_post_ids or set())
    collected: List[Post] = []

    def absorb(batch) -> None:
        """Append posts whose ID has not been seen yet."""
        for candidate in batch:
            if candidate.post_id in seen_ids:
                continue
            seen_ids.add(candidate.post_id)
            collected.append(candidate)

    # 1. Soundgasm (may fail if account deleted - that's OK)
    try:
        sg_posts = await self._fetch_soundgasm_posts(username, seen_ids)
        absorb(sg_posts)
        self.log(f"Soundgasm: {len(sg_posts)} new posts for {username}", 'info')
    except Exception as e:
        self.log(f"Soundgasm fetch failed for {username} (account may be deleted): {e}", 'warning')

    if progress_callback:
        progress_callback(len(collected))

    # 2. Liltsome archive (always)
    try:
        lt_posts = await self._fetch_liltsome_posts(username, seen_ids)
        absorb(lt_posts)
        self.log(f"Liltsome: {len(lt_posts)} new posts for {username}", 'info')
    except Exception as e:
        self.log(f"Liltsome fetch failed for {username}: {e}", 'warning')

    if progress_callback:
        progress_callback(len(collected))

    return collected
async def download_audio(self, download_url: str, output_path: Path) -> Dict:
    """Download an audio file via direct HTTP GET.

    The response is streamed into a sibling ``<name>.part`` file and only
    renamed onto ``output_path`` once the transfer has completed, so a
    failed or interrupted download never leaves a truncated file at the
    final location (and never clobbers an existing good file).

    Args:
        download_url: Direct URL of the audio file.
        output_path: Final destination; parent directories are created.

    Returns:
        ``{'success': True, 'file_path': str, 'file_size': int}`` on success,
        otherwise ``{'success': False, 'error': str}``. Errors are logged and
        reported through the dict rather than raised.
    """
    partial_path = None
    try:
        partial_path = output_path.with_name(output_path.name + '.part')
        output_path.parent.mkdir(parents=True, exist_ok=True)
        timeout = aiohttp.ClientTimeout(total=300)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(download_url, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    return {'success': False, 'error': f'HTTP {resp.status}'}
                total = 0
                async with aiofiles.open(str(partial_path), 'wb') as f:
                    async for chunk in resp.content.iter_chunked(65536):
                        await f.write(chunk)
                        total += len(chunk)
        # Publish the finished file in one step.
        partial_path.replace(output_path)
        return {
            'success': True,
            'file_path': str(output_path),
            'file_size': total,
        }
    except Exception as e:
        self.log(f"Download failed for {download_url}: {e}", 'error')
        return {'success': False, 'error': str(e)}
    finally:
        # Drop any leftover partial file (also on cancellation). After a
        # successful rename the file no longer exists, which is fine.
        if partial_path is not None:
            try:
                partial_path.unlink()
            except OSError:
                pass
# ------------------------------------------------------------------
# Soundgasm scraping
# ------------------------------------------------------------------
async def _fetch_soundgasm_profile(self, username: str) -> Optional[List[Dict]]:
    """Scrape a user's Soundgasm profile page for its audio links.

    Args:
        username: Soundgasm user whose ``/u/<username>`` page is fetched.

    Returns:
        A list of ``{'slug': ..., 'title': ...}`` dicts, one per matching
        link found on the page (possibly empty), or ``None`` when the page
        does not answer with HTTP 200.
    """
    profile_url = f'{self.SOUNDGASM_BASE}/u/{username}'
    request_timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=request_timeout) as session:
        async with session.get(profile_url, headers=self.HEADERS) as resp:
            if resp.status == 404:
                return None
            if resp.status != 200:
                self.log(f"Soundgasm profile returned {resp.status}", 'warning')
                return None
            page_source = await resp.text()

    # Links look like <a href="https://soundgasm.net/u/{username}/{slug}">title</a>;
    # the host part is optional so relative links match as well.
    link_pattern = re.compile(
        r'<a\s+href="(?:https?://soundgasm\.net)?/u/' + re.escape(username) + r'/([^"]+)"[^>]*>\s*([^<]+)',
        re.IGNORECASE,
    )
    return [
        {'slug': hit.group(1).strip(), 'title': hit.group(2).strip()}
        for hit in link_pattern.finditer(page_source)
    ]
async def _fetch_soundgasm_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
    """Build Post objects for Soundgasm audios that are not yet known.

    The profile listing supplies the slugs; each unseen slug's detail page
    is fetched for title, description and audio URL. Entries without an
    audio URL are skipped, and a failure on one entry is logged at debug
    level without aborting the rest.

    Args:
        username: Soundgasm user to scan.
        seen_ids: Post IDs (slugs) to skip.

    Returns:
        One Post per newly found audio, each with a single audio Attachment.
    """
    listing = await self._fetch_soundgasm_profile(username)
    if not listing:
        return []

    results: List[Post] = []
    request_timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=request_timeout) as session:
        for item in listing:
            slug = item['slug']
            if slug in seen_ids:
                continue
            try:
                detail = await self._fetch_soundgasm_detail(session, username, slug)
                if detail is None:
                    continue

                raw_title = detail.get('title', item.get('title', slug))
                clean_title, tags = parse_bracket_tags(raw_title)
                description = detail.get('description', '')
                audio_url = detail.get('audio_url')
                if not audio_url:
                    continue

                # Extension comes from the last URL path segment (query string
                # ignored); fall back to .m4a when that segment has no dot.
                basename = audio_url.split('?')[0].split('/')[-1]
                ext = ('.' + basename.rsplit('.', 1)[1]) if '.' in basename else '.m4a'

                audio_file = Attachment(
                    name=f"{slug}{ext}",
                    file_type='audio',
                    extension=ext.lstrip('.'),
                    server_path=f'/u/{username}/{slug}',
                    download_url=audio_url,
                )
                results.append(Post(
                    post_id=slug,
                    service_id='soundgasm',
                    platform='soundgasm',
                    creator_id=username,
                    title=clean_title or None,
                    content=description or None,
                    published_at=None,  # Soundgasm has no dates
                    attachments=[audio_file],
                    auto_tags=tags,
                ))
            except Exception as e:
                self.log(f"Error fetching Soundgasm detail for {slug}: {e}", 'debug')
    return results
async def _fetch_soundgasm_detail(self, session: aiohttp.ClientSession,
                                  username: str, slug: str) -> Optional[Dict]:
    """Fetch one Soundgasm audio page and extract its metadata.

    Title and description are taken from the page markup, so HTML character
    references (``&amp;``, ``&#39;`` ...) are decoded before they are
    returned.

    Args:
        session: Open HTTP session used for the request.
        username: Owner of the audio.
        slug: Audio slug; also the fallback title.

    Returns:
        ``{'title', 'description', 'audio_url'}``, or ``None`` when the page
        does not answer with HTTP 200 or contains no ``m4a: "..."`` audio
        URL. ``description`` is ``None`` when the page has no description
        block.
    """
    # Imported locally so this method does not depend on the module's
    # top-level imports.
    from html import unescape

    url = f'{self.SOUNDGASM_BASE}/u/{username}/{slug}'
    async with session.get(url, headers=self.HEADERS) as resp:
        if resp.status != 200:
            return None
        page = await resp.text()

    # Audio URL: m4a: "https://..." -- without it the entry is useless.
    audio_match = re.search(r'm4a:\s*"([^"]+)"', page)
    if not audio_match:
        return None
    audio_url = audio_match.group(1)

    # Title: <div aria-label="title"...>Title Text</div>, else the <title> tag.
    title = None
    title_match = re.search(r'aria-label="title"[^>]*>([^<]+)', page)
    if title_match:
        title = unescape(title_match.group(1)).strip()
    if not title:
        title_match = re.search(r'<title>([^<]+)</title>', page, re.IGNORECASE)
        if title_match:
            title = unescape(title_match.group(1)).strip()
            # Remove " - Soundgasm" suffix if present
            title = re.sub(r'\s*[-–—]\s*Soundgasm.*$', '', title, flags=re.IGNORECASE).strip()

    # Description: <div class="jp-description">...</div>
    description = None
    desc_match = re.search(r'class="jp-description"[^>]*>(.*?)</div>', page, re.DOTALL)
    if desc_match:
        text = re.sub(r'<br\s*/?>', '\n', desc_match.group(1))
        text = re.sub(r'<[^>]+>', '', text)
        # Decode entities only after tags are stripped, so escaped text such
        # as "&lt;3" is not mistaken for markup.
        description = unescape(text).strip()

    return {
        'title': title or slug,
        'description': description,
        'audio_url': audio_url,
    }
# ------------------------------------------------------------------
# Liltsome archive
# ------------------------------------------------------------------
async def _ensure_liltsome_cache(self) -> bool:
    """Download/refresh the Liltsome library.json using ETag-based invalidation.

    A HEAD request compares the server's ETag with the stored one; the full
    file is downloaded only when they differ or no cache exists. The
    download is streamed into a ``.part`` sibling and renamed over the cache
    file only when complete, so a failed transfer cannot corrupt or destroy
    a previously good cache.

    Returns:
        True if a cache file is available (fresh or pre-existing), False
        otherwise. Network and I/O errors are logged, not raised.
    """
    etag_file = self.LILTSOME_ETAG_PATH
    cache_file = self.LILTSOME_CACHE_PATH
    partial_file = cache_file.with_name(cache_file.name + '.part')

    stored_etag = None
    if etag_file.exists():
        try:
            stored_etag = etag_file.read_text().strip()
        except (OSError, ValueError) as e:
            # An unreadable ETag only costs a re-download; keep going.
            self.log(f"Could not read stored Liltsome ETag: {e}", 'debug')

    timeout = aiohttp.ClientTimeout(total=600)  # 131MB can take a while
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # HEAD request to check ETag
            async with session.head(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    self.log(f"Liltsome HEAD returned {resp.status}", 'warning')
                    return cache_file.exists()
                remote_etag = resp.headers.get('ETag', '').strip()
            if stored_etag and remote_etag and stored_etag == remote_etag and cache_file.exists():
                self.log("Liltsome cache is fresh (ETag match)", 'debug')
                return True

            # Download the full library
            self.log("Downloading Liltsome library.json (this may take a while)...", 'info')
            async with session.get(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                if resp.status != 200:
                    self.log(f"Liltsome GET returned {resp.status}", 'warning')
                    return cache_file.exists()
                cache_file.parent.mkdir(parents=True, exist_ok=True)
                async with aiofiles.open(str(partial_file), 'wb') as f:
                    async for chunk in resp.content.iter_chunked(262144):
                        await f.write(chunk)
                new_etag = resp.headers.get('ETag', remote_etag or '').strip()

        # Only now replace the old cache, in a single rename.
        partial_file.replace(cache_file)
        if new_etag:
            etag_file.write_text(new_etag)
        self.log("Liltsome library.json downloaded successfully", 'info')
        self._liltsome_data = None  # force re-parse
        return True
    except Exception as e:
        self.log(f"Failed to refresh Liltsome cache: {e}", 'warning')
        return cache_file.exists()
    finally:
        # Remove a leftover partial download; absent after a successful
        # rename or when no download was started.
        try:
            partial_file.unlink()
        except OSError:
            pass
async def _load_liltsome_data(self) -> Optional[Dict]:
    """Return the parsed Liltsome library, parsing the cache file on first use.

    The parsed data is memoised on ``self._liltsome_data``; the JSON parse
    itself runs in a worker thread via ``asyncio.to_thread``.

    Returns:
        The parsed library, or ``None`` when the cache file is missing or
        cannot be parsed (the parse error is logged).
    """
    memoised = self._liltsome_data
    if memoised is not None:
        return memoised

    library_path = self.LILTSOME_CACHE_PATH
    if not library_path.exists():
        return None

    try:
        parsed = await asyncio.to_thread(self._read_liltsome_json, library_path)
        self._liltsome_data = parsed
    except Exception as e:
        self.log(f"Failed to parse Liltsome library.json: {e}", 'error')
        return None
    return parsed
@staticmethod
def _read_liltsome_json(path: Path) -> Dict:
    """Read the file at ``path`` as UTF-8 text and parse it as JSON.

    This is a blocking call; ``_load_liltsome_data`` runs it in a thread.
    """
    with open(path, encoding='utf-8') as handle:
        payload = handle.read()
    return json.loads(payload)
async def _get_liltsome_entries(self, username: str) -> Optional[List[Dict]]:
    """Look up an artist's audio entries in the Liltsome library.

    The cache is refreshed first, then the library is searched for an artist
    whose ``id`` or ``name`` equals ``username`` (case-insensitive).

    library.json structure: {"artists": [{"id": "name", "files": {"audio": [...]}}]}

    Returns:
        The matching artist's ``files.audio`` list (``[]`` when ``files`` is
        not a dict or has no ``audio`` key), or ``None`` when the library is
        unavailable or no artist matches.
    """
    await self._ensure_liltsome_cache()
    library = await self._load_liltsome_data()
    if not library:
        return None

    wanted = username.lower()
    # Top-level is normally {"artists": [...]}; a bare list is accepted too.
    roster = library.get('artists', []) if isinstance(library, dict) else library
    for record in roster:
        identifiers = (
            str(record.get('id', '')).lower(),
            str(record.get('name', '')).lower(),
        )
        if wanted not in identifiers:
            continue
        # Audio entries are in files.audio
        file_map = record.get('files', {})
        return file_map.get('audio', []) if isinstance(file_map, dict) else []
    return None
async def _fetch_liltsome_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
    """Turn a user's Liltsome archive entries into Post objects.

    Each entry becomes one Post with a single audio Attachment. Post IDs are
    prefixed with ``liltsome-`` so they cannot collide with Soundgasm slugs;
    entries whose ID is already in ``seen_ids`` are skipped.

    Args:
        username: Artist to look up in the archive.
        seen_ids: Post IDs to skip.

    Returns:
        Posts for the entries that were not already seen.
    """
    archive_entries = await self._get_liltsome_entries(username)
    if not archive_entries:
        return []

    converted: List[Post] = []
    for item in archive_entries:
        filename = item.get('filename', '')
        path = item.get('path', '')
        metadata = item.get('metadata')
        duration = metadata.get('duration') if isinstance(metadata, dict) else None

        # Build post_id: prefix with liltsome- to avoid collision
        sanitized_name = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) if filename else path
        post_id = f'liltsome-{sanitized_name}'
        if post_id in seen_ids:
            continue

        # Bracket tags embedded in the title supplement Liltsome's own tags.
        clean_title, title_tags = parse_bracket_tags(item.get('title', filename))

        # Ordered, de-duplicated merge: normalised archive tags first, then
        # any title tag not already present.
        merged = dict.fromkeys(
            tag for tag in (raw.strip().lower() for raw in item.get('tags', [])) if tag
        )
        merged.update(dict.fromkeys(title_tags))
        all_tags: List[str] = list(merged)

        # Build download URL
        if path:
            download_url = f'{self.LILTSOME_BASE}/audio_files/{quote(path, safe="/")}'
        else:
            download_url = None

        # Extension: from the filename, else the path, else default to m4a.
        ext = next(
            (src.rsplit('.', 1)[1].lower() for src in (filename, path) if src and '.' in src),
            'm4a',
        )

        if filename.endswith(f'.{ext}'):
            attachment_name = filename
        else:
            attachment_name = f"{sanitized_name}.{ext}"

        audio_file = Attachment(
            name=attachment_name,
            file_type='audio',
            extension=ext,
            server_path=path or filename,
            download_url=download_url,
            file_size=item.get('size'),
            duration=duration,
        )
        converted.append(Post(
            post_id=post_id,
            service_id='soundgasm',
            platform='soundgasm',
            creator_id=username,
            title=clean_title or None,
            content=None,
            published_at=None,
            attachments=[audio_file],
            auto_tags=all_tags,
        ))
    return converted

View File

@@ -0,0 +1,827 @@
"""
TikTok Client for Paid Content - Uses yt-dlp for listing and gallery-dl for downloading
Adapts the hybrid approach from modules/tiktok_module.py into the paid content client pattern.
"""
import asyncio
import html as html_module
import json
import os
import re
import subprocess
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import aiohttp
from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
class TikTokClient(LoggingMixin):
"""
Client for fetching TikTok creator information and videos.
Uses yt-dlp for listing (fast flat-playlist) and gallery-dl for downloading
(handles carousels/slideshows properly).
"""
SERVICE_ID = 'tiktok'
PLATFORM = 'tiktok'
def __init__(self, unified_db=None, log_callback=None):
    """Set up logging, locate the external tools and reset cached state.

    Args:
        unified_db: Optional database handle, stored as ``self.unified_db``.
        log_callback: Optional callback passed on to ``_init_logger``.

    A warning is logged for each of yt-dlp / gallery-dl that cannot be
    found.
    """
    self._init_logger('PaidContent', log_callback, default_module='TikTok')
    self.ytdlp_path = self._find_executable('yt-dlp')
    self.gallery_dl_path = self._find_executable('gallery-dl')
    self.unified_db = unified_db
    self._cookies_file = None
    self._last_pinned_posts = {}

    missing_tool_warnings = (
        (self.ytdlp_path, "yt-dlp not found, TikTok listing will be disabled"),
        (self.gallery_dl_path, "gallery-dl not found, TikTok downloading will be disabled"),
    )
    for tool_path, warning in missing_tool_warnings:
        if not tool_path:
            self.log(warning, 'warning')
def _find_executable(self, name: str) -> Optional[str]:
    """Locate an executable by name.

    A fixed list of well-known install locations is checked first; if none
    of them holds an executable file, the ``PATH`` is searched with
    ``shutil.which`` (no external ``which`` binary is required).

    Args:
        name: Program name, e.g. ``'yt-dlp'``.

    Returns:
        Path of the executable, or ``None`` when it cannot be found.
    """
    import shutil  # local import: not among this module's top-level imports

    candidate_paths = [
        f'/opt/media-downloader/venv/bin/{name}',
        f'/usr/local/bin/{name}',
        f'/usr/bin/{name}',
        f'/opt/homebrew/bin/{name}',
        os.path.expanduser(f'~/.local/bin/{name}'),
    ]
    for candidate in candidate_paths:
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate

    return shutil.which(name)
def is_available(self) -> bool:
    """Return True only when both yt-dlp and gallery-dl were located."""
    required_tools = (self.ytdlp_path, self.gallery_dl_path)
    return all(tool is not None for tool in required_tools)
def cleanup(self):
    """Delete the temporary cookies file, if one exists.

    Removal is best-effort: any error while deleting is ignored.
    """
    cookie_path = self._cookies_file
    if not cookie_path or not os.path.exists(cookie_path):
        return
    try:
        os.unlink(cookie_path)
    except Exception:
        pass
def _get_cookies_file(self) -> Optional[str]:
    """Get path to a Netscape-format cookies file, creating it from the database if needed.

    An existing temp file is reused. Otherwise the ``scrapers`` table is
    checked for the ids ``'tiktok'`` and ``'tiktok_client'`` (in that order)
    and the first non-empty cookie list is written to a new temp file.

    The file contents are rendered completely before the file is created,
    and ``self._cookies_file`` is only set once the file has been written,
    so a malformed cookie can never leave a half-written file behind that
    later calls would treat as valid.

    Returns:
        Path of the cookies file, or ``None`` when there is no database, no
        stored cookies, or loading failed (failure is logged at debug level).
    """
    if self._cookies_file and os.path.exists(self._cookies_file):
        return self._cookies_file
    if not self.unified_db:
        return None

    import tempfile

    temp_path = None
    try:
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            # Check for tiktok scraper cookies
            for scraper_id in ('tiktok', 'tiktok_client'):
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
                row = cursor.fetchone()
                if not (row and row[0]):
                    continue

                data = json.loads(row[0])
                if isinstance(data, dict) and 'cookies' in data:
                    cookies_list = data['cookies']
                elif isinstance(data, list):
                    cookies_list = data
                else:
                    cookies_list = []
                if not cookies_list:
                    continue

                lines = ["# Netscape HTTP Cookie File\n"]
                for cookie in cookies_list:
                    domain = cookie.get('domain', '')
                    include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                    path = cookie.get('path', '/')
                    secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                    # expirationDate may be missing, null, a float or a
                    # numeric string; anything unusable is written as 0.
                    try:
                        expiry = str(int(float(cookie.get('expirationDate') or 0)))
                    except (TypeError, ValueError, OverflowError):
                        expiry = '0'
                    name = cookie.get('name', '')
                    value = cookie.get('value', '')
                    lines.append(
                        f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n"
                    )

                fd, temp_path = tempfile.mkstemp(suffix='.txt', prefix='tiktok_cookies_')
                with os.fdopen(fd, 'w') as f:
                    f.writelines(lines)
                # Publish the path only after the file is complete.
                self._cookies_file = temp_path
                self.log(f"Loaded {len(cookies_list)} TikTok cookies", 'debug')
                return self._cookies_file
    except Exception as e:
        self.log(f"Could not load TikTok cookies: {e}", 'debug')
        # Remove a temp file that was created but never published.
        if temp_path and self._cookies_file != temp_path:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
    return None
def _save_cookies_back(self):
    """Read updated cookies from the temp file and save them back to the database.

    yt-dlp and gallery-dl update the cookies file with refreshed tokens
    from TikTok (e.g. msToken), so those changes need to be persisted.
    Cookies from the file override stored cookies with the same
    (name, domain); stored cookies not present in the file are kept.

    After a successful save the temp file is deleted and
    ``self._cookies_file`` is cleared, so the next use regenerates it from
    the database instead of leaving one stale cookie file behind per call.
    Does nothing when there is no cookies file, no database, or the file
    holds no cookies; failures are logged at debug level.
    """
    cookie_path = self._cookies_file
    if not cookie_path or not os.path.exists(cookie_path):
        return
    if not self.unified_db:
        return
    try:
        import http.cookiejar
        jar = http.cookiejar.MozillaCookieJar(cookie_path)
        jar.load(ignore_discard=True, ignore_expires=True)
        updated_cookies = [
            {
                'name': cookie.name,
                'value': cookie.value,
                'domain': cookie.domain,
                'path': cookie.path,
                'secure': cookie.secure,
                'expirationDate': cookie.expires or 0,
            }
            for cookie in jar
        ]
        if not updated_cookies:
            return

        # Merge updated cookies back to DB
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('tiktok',))
            row = cursor.fetchone()
            if row and row[0]:
                existing_data = json.loads(row[0])
                existing_cookies = existing_data if isinstance(existing_data, list) else existing_data.get('cookies', [])
                # Merge: updated cookies override existing by name+domain
                cookie_map = {(c.get('name'), c.get('domain')): c for c in existing_cookies}
                for c in updated_cookies:
                    cookie_map[(c['name'], c['domain'])] = c
                final_cookies = list(cookie_map.values())
            else:
                final_cookies = updated_cookies

        self.unified_db.save_scraper_cookies('tiktok', final_cookies, merge=False)
        self.log(f"Saved {len(final_cookies)} refreshed cookies back to DB", 'debug')

        # Clear cached file so next use gets fresh cookies from DB, and
        # remove the now-obsolete temp file so it does not accumulate.
        self._cookies_file = None
        try:
            os.unlink(cookie_path)
        except OSError as e:
            self.log(f"Could not remove temp cookies file: {e}", 'debug')
    except Exception as e:
        self.log(f"Failed to save cookies back: {e}", 'debug')
def _get_base_cmd(self) -> List[str]:
    """Return the yt-dlp argv prefix, with ``--cookies <file>`` appended when a cookies file is available."""
    cookies_file = self._get_cookies_file()
    cookie_args = ['--cookies', cookies_file] if cookies_file else []
    return [self.ytdlp_path, *cookie_args]
@staticmethod
def extract_username(url: str) -> Optional[str]:
    """Return the handle following ``tiktok.com/@`` in ``url``, or None if there is none.

    The handle consists of ASCII letters, digits, ``_`` and ``.``.
    """
    found = re.search(r'tiktok\.com/@([a-zA-Z0-9_.]+)', url)
    return found.group(1) if found else None
@staticmethod
def normalize_creator_url(username: str) -> str:
    """Turn a username (with or without leading ``@``) into a profile URL.

    Values that already start with ``http://`` or ``https://`` are returned
    unchanged.
    """
    if username.startswith(('http://', 'https://')):
        return username
    username = username.lstrip('@')
    return f"https://www.tiktok.com/@{username}"
async def _resolve_channel_id(self, username: str) -> Optional[str]:
"""Resolve a TikTok username to a channel_id (secUid).
When yt-dlp can't extract the secondary user ID from the profile page,
we try to find a video URL for the user and then extract the
channel_id (secUid) from that video's metadata via yt-dlp.
A video URL is looked for in this order:
1. the TikTok oEmbed response for the profile (embed HTML, then thumbnail URL);
2. an already stored attachment download_url for this creator in the database.
Returns the channel_id string, or None when yt-dlp is unavailable, no video
URL could be found, yt-dlp yields no channel id, or any step raises
(the exception is logged at debug level).
"""
if not self.ytdlp_path:
return None
try:
# Step 1: Get a video URL from this user via the oembed embed HTML
video_url = None
async with aiohttp.ClientSession() as session:
# The oembed HTML often contains a video ID we can use
oembed_url = f"https://www.tiktok.com/oembed?url=https://www.tiktok.com/@{username}"
async with session.get(oembed_url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
if resp.status == 200:
data = await resp.json()
embed_html = data.get('html', '')
# Extract video URL from embed iframe
# First choice: a full video URL in a cite="..." attribute.
match = re.search(r'cite="(https://www\.tiktok\.com/@[^"]+/video/\d+)"', embed_html)
if not match:
# Second choice: a bare video id, from which the URL is rebuilt.
match = re.search(r'data-video-id="(\d+)"', embed_html)
if match:
video_url = f"https://www.tiktok.com/@{username}/video/{match.group(1)}"
else:
video_url = match.group(1)
if not video_url:
# oembed thumbnail_url sometimes contains the video ID
thumb = data.get('thumbnail_url', '')
vid_match = re.search(r'/video/(\d+)', thumb)
if vid_match:
video_url = f"https://www.tiktok.com/@{username}/video/{vid_match.group(1)}"
if not video_url:
# Step 1b: Check if we have any existing video URLs in the database
if self.unified_db:
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("""
SELECT a.download_url FROM paid_content_attachments a
JOIN paid_content_posts p ON a.post_id = p.id
JOIN paid_content_creators c ON p.creator_id = c.id
WHERE c.username = ? AND a.download_url LIKE '%tiktok.com%'
LIMIT 1
""", (username,))
row = cursor.fetchone()
if row and row[0]:
video_url = row[0]
except Exception:
# Best-effort lookup: a database error just means no URL from this source.
pass
if not video_url:
self.log(f"No video URL found for @{username} to resolve channel_id", 'debug')
return None
# Step 2: Use yt-dlp to get the channel_id from the single video
self.log(f"Resolving channel_id from video: {video_url}", 'debug')
cmd = self._get_base_cmd() + [
'-j',
'--no-warnings',
'--no-download',
'--socket-timeout', '30',
video_url
]
result = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, stderr = await result.communicate()
if result.returncode == 0:
# Each non-empty stdout line is parsed as JSON; the first one that
# carries a channel_id (or, failing that, a playlist_id) wins.
for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
if not line.strip():
continue
try:
video_data = json.loads(line)
channel_id = video_data.get('channel_id') or video_data.get('playlist_id')
if channel_id:
self.log(f"Resolved @{username} channel_id: {channel_id[:30]}...", 'info')
return channel_id
except json.JSONDecodeError:
continue
except Exception as e:
self.log(f"Failed to resolve channel_id for @{username}: {e}", 'debug')
return None
async def get_creator_info(self, url: str) -> Optional[Dict]:
"""Get creator information using yt-dlp + profile page scraping.
The display name is taken from the first playlist item reported by yt-dlp
(with a ``tiktokuser:<channel_id>`` retry when yt-dlp cannot extract the
user ID); avatar and bio come from scraping the profile page, which can
also supply the display name if yt-dlp did not.
Returns a dict with the keys ``creator_id``, ``creator_name``,
``creator_url``, ``profile_image_url`` and ``bio``, or None when no
username can be extracted from ``url``. Failures of either lookup are
logged at debug level and leave the corresponding fields at their defaults.
"""
username = self.extract_username(url)
if not username:
return None
profile_url = self.normalize_creator_url(username)
# Default display name is the handle itself until something better is found.
creator_name = username
# Try yt-dlp for display name from video metadata
if self.ytdlp_path:
try:
cmd = self._get_base_cmd() + [
'--no-warnings',
'--flat-playlist',
'-j',
'--playlist-items', '1',
'--socket-timeout', '30',
profile_url
]
result = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, stderr = await result.communicate()
if result.returncode == 0:
# Use the first stdout line that parses as JSON.
for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
if not line:
continue
try:
data = json.loads(line)
creator_name = (data.get('channel') or data.get('uploader')
or data.get('playlist_title') or username)
break
except json.JSONDecodeError:
continue
else:
# Fallback: try tiktokuser: scheme if secondary user ID extraction fails
err_text = stderr.decode('utf-8', errors='replace')
if 'secondary user ID' in err_text or 'Unable to extract' in err_text:
channel_id = await self._resolve_channel_id(username)
if channel_id:
fb_cmd = self._get_base_cmd() + [
'--no-warnings', '--flat-playlist',
'-j', '--playlist-items', '1', '--socket-timeout', '30',
f"tiktokuser:{channel_id}"
]
fb_result = await asyncio.create_subprocess_exec(
*fb_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
)
fb_stdout, _ = await fb_result.communicate()
if fb_result.returncode == 0:
# Same parsing as the primary attempt, applied to the fallback output.
for line in fb_stdout.decode('utf-8', errors='replace').strip().split('\n'):
if not line:
continue
try:
data = json.loads(line)
creator_name = (data.get('channel') or data.get('uploader')
or data.get('playlist_title') or username)
break
except json.JSONDecodeError:
continue
except Exception as e:
self.log(f"Failed to get creator info via yt-dlp: {e}", 'debug')
# Scrape profile page for avatar and bio
profile_image = None
bio = None
try:
profile_image, bio, page_name = await self._scrape_profile_page(profile_url)
# Only adopt the scraped name when yt-dlp did not provide one
# (i.e. creator_name is still the bare handle).
if page_name and creator_name == username:
creator_name = page_name
except Exception as e:
self.log(f"Failed to scrape profile page: {e}", 'debug')
return {
'creator_id': username,
'creator_name': creator_name,
'creator_url': profile_url,
'profile_image_url': profile_image,
'bio': bio,
}
async def _fetch_profile_with_cookies(self, url: str) -> Optional[str]:
    """Fetch a TikTok profile page using curl_cffi with cookies from the database.

    Args:
        url: Profile page URL.

    Returns:
        The page HTML when the response is HTTP 200 and contains
        ``avatarLarger``; otherwise ``None`` (no cookies available, captcha
        page, other response, or any error, which is logged at debug level).

    The curl_cffi session is always closed, including on the success path.
    NOTE(review): ``session.get`` is a blocking call inside a coroutine, so
    it holds the event loop for up to the 15 s timeout -- confirm whether
    that is acceptable for callers.
    """
    cookies_file = self._get_cookies_file()
    if not cookies_file:
        return None
    try:
        from curl_cffi import requests as cf_requests
        import http.cookiejar

        # Load cookies from the Netscape file
        jar = http.cookiejar.MozillaCookieJar(cookies_file)
        jar.load(ignore_discard=True, ignore_expires=True)

        # Try multiple browser versions for curl_cffi compatibility
        for _browser in ("chrome136", "chrome131", "chrome"):
            try:
                session = cf_requests.Session(impersonate=_browser)
                break
            except Exception:
                continue
        else:
            session = cf_requests.Session()

        try:
            for cookie in jar:
                session.cookies.set(cookie.name, cookie.value, domain=cookie.domain)
            resp = session.get(url, timeout=15)
            if resp.status_code == 200 and 'avatarLarger' in resp.text:
                self.log("Fetched TikTok profile with cookies (curl_cffi)", 'debug')
                return resp.text
            elif 'captcha' in resp.text.lower():
                self.log("TikTok profile still returned captcha with cookies", 'debug')
        finally:
            # Release the session on every exit path; previously the
            # successful return skipped close().
            session.close()
    except Exception as e:
        self.log(f"curl_cffi profile fetch failed: {e}", 'debug')
    return None
async def _scrape_profile_page(self, url: str) -> tuple:
"""
Scrape TikTok profile page for avatar, bio and display name.
The page is fetched with aiohttp; if that yields nothing or what looks like
a captcha page, a cookie-authenticated fetch is tried instead. Values are
read from the embedded __UNIVERSAL_DATA_FOR_REHYDRATION__ JSON first, and
any value still missing is then looked for with regexes over the raw HTML.
Side effect: when the embedded JSON lists pinned posts,
``self._last_pinned_posts`` is replaced with their video IDs.
Returns (profile_image_url, bio, display_name); each element is None when
it could not be determined. Errors are logged at debug level, not raised.
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
}
profile_image = None
bio = None
display_name = None
try:
page_html = None
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
if resp.status == 200:
page_html = await resp.text()
# If we got a captcha page, try curl_cffi with cookies
# (a page that mentions captcha but still contains avatarLarger is kept as-is)
if not page_html or ('captcha' in page_html.lower() and 'avatarLarger' not in page_html):
page_html = await self._fetch_profile_with_cookies(url)
if not page_html:
return (None, None, None)
# Try structured JSON first (__UNIVERSAL_DATA_FOR_REHYDRATION__)
rehydration_match = re.search(
r'<script[^>]*id="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>(.*?)</script>',
page_html, re.DOTALL
)
if rehydration_match:
try:
rdata = json.loads(rehydration_match.group(1))
user_detail = (rdata.get('__DEFAULT_SCOPE__', {})
.get('webapp.user-detail', {}))
user = user_detail.get('userInfo', {}).get('user', {})
if user:
avatar_val = user.get('avatarLarger') or user.get('avatarMedium')
# Avatar values ending in .mp4 are not accepted as a profile image.
if avatar_val and not avatar_val.endswith('.mp4'):
profile_image = avatar_val
self.log("Found TikTok profile avatar (rehydration)", 'debug')
sig_val = user.get('signature', '')
if sig_val and sig_val.strip():
bio = sig_val.strip()
self.log("Found TikTok bio (rehydration)", 'debug')
nick_val = user.get('nickname')
if nick_val:
display_name = nick_val
self.log(f"Found TikTok display name (rehydration): {display_name}", 'debug')
# Extract pinned post IDs
pinned_list = user_detail.get('pinnedList', [])
if pinned_list:
# The previous mapping is only replaced when this page lists pinned posts.
self._last_pinned_posts = {}
for item in pinned_list:
vid = str(item.get('id', ''))
if vid:
self._last_pinned_posts[vid] = {'pinned_at': None}
if self._last_pinned_posts:
self.log(f"Found {len(self._last_pinned_posts)} pinned TikTok posts", 'debug')
except (json.JSONDecodeError, KeyError):
# Unparseable embedded JSON: fall through to the regex fallbacks below.
pass
# Fallback: regex extraction from raw HTML
# Use json.loads to decode values (handles \uXXXX, surrogate pairs, and raw UTF-8)
if not profile_image:
avatar_match = re.search(r'"avatarLarger":"([^"]+)"', page_html)
if not avatar_match:
avatar_match = re.search(r'"avatarMedium":"([^"]+)"', page_html)
if avatar_match:
try:
avatar_url = json.loads(f'"{avatar_match.group(1)}"')
except (json.JSONDecodeError, ValueError):
avatar_url = avatar_match.group(1)
if avatar_url and not avatar_url.endswith('.mp4'):
profile_image = avatar_url
self.log("Found TikTok profile avatar", 'debug')
if not bio:
sig_match = re.search(r'"signature":"([^"]*)"', page_html)
if sig_match:
try:
raw_bio = json.loads(f'"{sig_match.group(1)}"')
except (json.JSONDecodeError, ValueError):
raw_bio = sig_match.group(1)
if raw_bio and raw_bio.strip():
bio = raw_bio.strip()
self.log("Found TikTok bio", 'debug')
if not display_name:
nick_match = re.search(r'"nickname":"([^"]+)"', page_html)
if nick_match:
try:
display_name = json.loads(f'"{nick_match.group(1)}"')
except (json.JSONDecodeError, ValueError):
display_name = nick_match.group(1)
self.log(f"Found TikTok display name: {display_name}", 'debug')
# Extract banner/cover from "coverLarger" field
# (stored separately, not returned here but could be used later)
except asyncio.TimeoutError:
self.log("TikTok profile page request timed out", 'debug')
except Exception as e:
self.log(f"Error scraping TikTok profile: {e}", 'debug')
# Whatever was collected before an error is still returned.
return (profile_image, bio, display_name)
async def get_creator_videos(self, url: str, since_date: str = None,
max_videos: int = None,
progress_callback=None) -> List[Dict]:
"""
Get all videos from a TikTok profile using yt-dlp --flat-playlist -j.
Uses JSON output to properly handle multi-line descriptions/titles.
If yt-dlp fails because it cannot extract the user ID, the listing is
retried once with ``tiktokuser:<channel_id>``.
Args:
url: Profile URL; the username is extracted from it.
since_date: Optional ISO date or datetime string; videos whose upload date
is earlier are skipped. An unparseable value disables the filter.
max_videos: Optional cap; parsing stops once this many videos are collected.
progress_callback: Optional callable invoked with the running count
after each accepted video.
Returns list of video metadata dicts with the keys ``video_id``, ``title``,
``description``, ``upload_date`` (YYYY-MM-DD or None), ``url`` and
``username``. Returns an empty list when yt-dlp is unavailable, the
username cannot be extracted, listing fails, or an exception occurs.
"""
if not self.ytdlp_path:
return []
username = self.extract_username(url)
if not username:
return []
profile_url = self.normalize_creator_url(username)
try:
# Use yt-dlp flat-playlist with JSON output for full metadata
cmd = self._get_base_cmd() + [
'--flat-playlist',
'-j',
'--no-warnings',
'--socket-timeout', '30',
profile_url
]
self.log(f"Fetching TikTok videos for @{username}", 'info')
result = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, stderr = await result.communicate()
if result.returncode != 0:
error = stderr.decode('utf-8', errors='replace')
# Fallback: if yt-dlp can't extract secondary user ID, try tiktokuser: scheme
if 'secondary user ID' in error or 'Unable to extract' in error:
self.log(f"yt-dlp can't extract user ID for @{username}, trying channel_id fallback", 'info')
channel_id = await self._resolve_channel_id(username)
if channel_id:
fallback_cmd = self._get_base_cmd() + [
'--flat-playlist',
'-j',
'--no-warnings',
'--socket-timeout', '30',
f"tiktokuser:{channel_id}"
]
fb_result = await asyncio.create_subprocess_exec(
*fallback_cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
# stdout is rebound here, so the parsing below consumes the fallback's output.
stdout, stderr = await fb_result.communicate()
if fb_result.returncode == 0:
self.log(f"Fallback tiktokuser: succeeded for @{username}", 'info')
else:
fb_error = stderr.decode('utf-8', errors='replace')
self.log(f"Fallback also failed for @{username}: {fb_error}", 'warning')
return []
else:
self.log(f"Could not resolve channel_id for @{username}", 'warning')
return []
else:
self.log(f"Failed to list TikTok videos: {error}", 'warning')
return []
lines = stdout.decode('utf-8', errors='replace').strip().split('\n')
# Parse since_date for filtering
# The cutoff is kept as a YYYYMMDD string so it can be compared directly
# with the upload_date strings below.
cutoff_str = None
if since_date:
try:
if 'T' in since_date:
cutoff_dt = datetime.fromisoformat(since_date.replace('Z', '+00:00').replace('+00:00', ''))
else:
cutoff_dt = datetime.strptime(since_date[:10], '%Y-%m-%d')
cutoff_str = cutoff_dt.strftime('%Y%m%d')
except (ValueError, IndexError):
pass
videos = []
for line in lines:
if not line.strip():
continue
try:
data = json.loads(line)
except json.JSONDecodeError:
continue
video_id = str(data.get('id', ''))
if not video_id:
continue
upload_date = data.get('upload_date', '')
title = data.get('title', '')
description = data.get('description', '')
# Skip posts where yt-dlp returned no metadata at all
# When cookies are expired, yt-dlp returns no date, no title,
# and no description. Real posts with empty captions still have
# upload_date, so we use that as the key signal.
if not upload_date and not title and not description:
self.log(f"Skipping TikTok {video_id}: no metadata (cookies may be expired)", 'debug')
continue
# Title and description fall back to each other, then to a generic label.
title = title or description or f"TikTok video #{video_id}"
description = description or title
# Filter by date if cutoff specified
# (videos without an upload_date are never filtered out)
if cutoff_str and upload_date and upload_date < cutoff_str:
continue
# Format upload_date to ISO
formatted_date = None
if upload_date and len(upload_date) == 8 and upload_date.isdigit():
formatted_date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"
video_url = data.get('url') or f"https://www.tiktok.com/@{username}/video/{video_id}"
videos.append({
'video_id': video_id,
'title': title,
'description': description,
'upload_date': formatted_date,
'url': video_url,
'username': username,
})
if progress_callback:
progress_callback(len(videos))
if max_videos and len(videos) >= max_videos:
break
self.log(f"Found {len(videos)} TikTok videos for @{username}", 'info')
# Persist any cookies the subprocess refreshed in the cookies file.
self._save_cookies_back()
return videos
except Exception as e:
self.log(f"Error getting TikTok videos: {e}", 'error')
self._save_cookies_back()
return []
async def download_video(self, video_url: str, output_dir: Path, username: str = '') -> Dict:
    """
    Download a TikTok video/carousel using gallery-dl.

    gallery-dl handles both regular videos and carousel/slideshow posts.
    Files already present in ``output_dir`` before the run are ignored, as are
    ``.json`` metadata files and audio-only files.

    Args:
        video_url: URL of the TikTok post to download.
        output_dir: Directory that receives the files (created if missing).
        username: Not used by this method; kept for interface compatibility.

    Returns:
        On success a dict with ``success``, ``file_path``, ``filename``,
        ``file_size``, ``all_files``, ``file_count`` and ``is_carousel``.
        On failure ``{'success': False, 'error': <message>}``.
    """
    import re  # local import: only needed for the natural-sort key below

    if not self.gallery_dl_path:
        return {'success': False, 'error': 'gallery-dl not available'}

    audio_exts = ('.mp3', '.m4a', '.aac', '.wav', '.ogg')
    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}

    def natural_key(path):
        # re.split with a capturing group alternates text / digit-run tokens,
        # so odd positions are always numeric. Comparing those as ints keeps
        # "id_2" ahead of "id_10", which a plain string sort gets wrong.
        tokens = re.split(r'(\d+)', path.name)
        return [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]

    try:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        cmd = [
            self.gallery_dl_path,
            '--write-metadata',
            '-D', str(output_dir),
            '-f', '{id}_{num}.{extension}',
        ]

        # Add cookies for age-restricted / login-required content
        cookies_file = self._get_cookies_file()
        if cookies_file:
            cmd.extend(['--cookies', cookies_file])
        cmd.append(video_url)

        self.log(f"Downloading TikTok: {video_url}", 'debug')

        # Snapshot existing files before download so we only pick up new ones
        existing_files = {f.name for f in output_dir.iterdir() if f.is_file()}

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await result.communicate()

        # Newly created media files only (no .json metadata, no audio-only files)
        downloaded_files = [
            f for f in output_dir.iterdir()
            if f.is_file()
            and f.name not in existing_files
            and f.suffix.lower() != '.json'
            and f.suffix.lower() not in audio_exts
        ]

        if result.returncode != 0:
            # gallery-dl exit code 4 = partial failure (e.g. slideshow images OK but audio failed)
            # If we got media files, treat as success
            if downloaded_files:
                self.log(f"gallery-dl partial failure (code {result.returncode}) but {len(downloaded_files)} files downloaded", 'debug')
            else:
                error_msg = stderr.decode('utf-8', errors='replace').strip()
                if 'not available' in error_msg.lower() or '404' in error_msg:
                    error_msg = 'Video not available (deleted or private)'
                elif len(error_msg) > 200:
                    error_msg = error_msg[:200] + '...'
                return {'success': False, 'error': error_msg}

        if not downloaded_files:
            return {'success': False, 'error': 'No files downloaded'}

        # Natural sort keeps carousel order even past nine items
        # (id_1.jpg, id_2.jpg, ..., id_10.jpg)
        downloaded_files.sort(key=natural_key)
        primary_file = downloaded_files[0]

        # A photo carousel is more than one file, all of them images
        is_carousel = len(downloaded_files) > 1 and all(
            f.suffix.lower() in image_exts for f in downloaded_files
        )

        self._save_cookies_back()
        return {
            'success': True,
            'file_path': str(primary_file),
            'filename': primary_file.name,
            'file_size': primary_file.stat().st_size,
            'all_files': [str(f) for f in downloaded_files],
            'file_count': len(downloaded_files),
            'is_carousel': is_carousel,
        }
    except Exception as e:
        self.log(f"Error downloading TikTok video: {e}", 'error')
        self._save_cookies_back()
        return {'success': False, 'error': str(e)}
async def get_creator(self, url: str) -> Optional[Creator]:
    """Build a Creator for the TikTok profile at ``url``.

    Returns None when ``get_creator_info`` yields nothing for the URL.
    """
    profile = await self.get_creator_info(url)
    if not profile:
        return None

    handle = profile.get('creator_id', '')
    fields = {
        'creator_id': handle,
        'service_id': 'tiktok',
        'platform': 'tiktok',
        # Fall back to the handle only when the profile has no creator_name key
        'username': profile.get('creator_name', handle),
        'display_name': profile.get('creator_name'),
        'profile_image_url': profile.get('profile_image_url'),
        'bio': profile.get('bio'),
    }
    return Creator(**fields)
async def get_posts(self, url: str, since_date: str = None,
                    max_videos: int = None, progress_callback=None) -> List[Post]:
    """Return the creator's TikTok videos wrapped as Post objects.

    The arguments are forwarded unchanged to ``get_creator_videos``. Every
    post carries one placeholder video attachment pointing at the post URL;
    whether the post is really a video or a carousel is only known once it
    has been downloaded.
    """
    videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)
    creator = self.extract_username(url) or ''

    def to_post(entry):
        # Single attachment for now; the actual download determines the type
        media = Attachment(
            name=f"{entry['video_id']}.mp4",
            file_type='video',
            extension='.mp4',
            server_path=entry['url'],
            download_url=entry['url'],
        )
        return Post(
            post_id=entry['video_id'],
            service_id='tiktok',
            platform='tiktok',
            creator_id=creator,
            title=None,
            content=entry.get('description') or entry.get('title', ''),
            published_at=entry.get('upload_date'),
            attachments=[media],
        )

    return [to_post(entry) for entry in videos]

View File

@@ -0,0 +1,751 @@
"""
Twitch Clips Client - Fetches channel clips using yt-dlp
"""
import aiohttp
import asyncio
import hashlib
import json
import os
import re
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
class TwitchThumbnailCache:
    """Cache for Twitch clip thumbnails.

    Each thumbnail is stored in ``cache_dir`` under the MD5 hex digest of its
    URL, so the same URL always maps to the same local file.
    """

    def __init__(self, cache_dir: str = None):
        """Create the cache directory (and any missing parents)."""
        self.cache_dir = Path(cache_dir or '/opt/media-downloader/data/cache/twitch_thumbnails')
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_path(self, thumbnail_url: str) -> Path:
        """Get local cache path for a thumbnail URL"""
        # Hash of the URL is the filename (a stable cache key, not a security measure)
        url_hash = hashlib.md5(thumbnail_url.encode()).hexdigest()
        # Extract extension from URL or default to jpg
        lowered = thumbnail_url.lower()
        ext = '.jpg'
        if '.png' in lowered:
            ext = '.png'
        elif '.webp' in lowered:
            ext = '.webp'
        return self.cache_dir / f"{url_hash}{ext}"

    def get_cached(self, thumbnail_url: str) -> Optional[str]:
        """Get cached thumbnail path if it exists, else None"""
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)
        return None

    async def cache_thumbnail(self, thumbnail_url: str,
                              session: Optional["aiohttp.ClientSession"] = None) -> Optional[str]:
        """Download and cache a thumbnail, return local path.

        Args:
            thumbnail_url: Remote thumbnail URL; empty/None returns None.
            session: Optional shared aiohttp session. When omitted, a
                temporary session is created and closed here.

        Returns:
            Local file path as a string, or None when the download fails or
            the server does not answer with HTTP 200.
        """
        if not thumbnail_url:
            return None

        # Check if already cached
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)

        # Write to a side file first and rename into place, so an interrupted
        # write never leaves a truncated file that would count as cached.
        part_path = cache_path.with_name(f"{cache_path.name}.{os.getpid()}.part")
        try:
            close_session = session is None
            if close_session:
                session = aiohttp.ClientSession()
            try:
                async with session.get(thumbnail_url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
                    if resp.status != 200:
                        return None
                    content = await resp.read()
            finally:
                if close_session:
                    await session.close()

            with open(part_path, 'wb') as f:
                f.write(content)
            os.replace(part_path, cache_path)
            return str(cache_path)
        except Exception:
            # Deliberately best-effort: a missing thumbnail must never fail the
            # caller. Remove any partial side file so it cannot accumulate.
            try:
                os.unlink(part_path)
            except OSError:
                pass
            return None

    async def cache_thumbnails_batch(self, thumbnail_urls: List[str], max_concurrent: int = 5) -> Dict[str, str]:
        """Cache multiple thumbnails in parallel, return url->local_path mapping.

        URLs that fail to download are left out of the result. Falsy entries
        are skipped and duplicate URLs are fetched only once.
        """
        result = {}

        # Filter out already cached; dict.fromkeys de-duplicates in order
        to_download = []
        for url in dict.fromkeys(thumbnail_urls):
            if not url:
                continue
            cached = self.get_cached(url)
            if cached:
                result[url] = cached
            else:
                to_download.append(url)

        if not to_download:
            return result

        # Download concurrently, at most max_concurrent requests in flight
        async with aiohttp.ClientSession() as session:
            semaphore = asyncio.Semaphore(max_concurrent)

            async def download_one(url: str):
                async with semaphore:
                    path = await self.cache_thumbnail(url, session)
                    if path:
                        result[url] = path

            await asyncio.gather(*[download_one(url) for url in to_download])

        return result
class TwitchClient(LoggingMixin):
    """
    Client for fetching Twitch channel clips using yt-dlp

    Supports:
    - Channel clips URLs (twitch.tv/username/clips)
    - Fetching channel metadata
    - Listing all clips from a channel
    - Downloading clips
    """

    # Quality presets for yt-dlp: preset name -> format selector handed to
    # yt-dlp's -f option by download_clip. Unknown preset names fall back to
    # the 'best' entry there.
    QUALITY_PRESETS = {
        'best': 'best',
        '1080p': 'best[height<=1080]',
        '720p': 'best[height<=720]',
        '480p': 'best[height<=480]',
    }
def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None, cache_dir: str = None):
    """Set up logging, locate yt-dlp and create the thumbnail cache.

    Args:
        ytdlp_path: Explicit yt-dlp executable; auto-detected when omitted.
        unified_db: Database handle used later to load cookies.
        log_callback: Passed through to the logger initialisation.
        cache_dir: Directory for the thumbnail cache (cache default if None).
    """
    self._init_logger('PaidContent', log_callback, default_module='Twitch')

    # Database reference for cookie access; the cookie temp file is created lazily
    self.unified_db = unified_db
    self._cookies_file = None

    # Locate the yt-dlp executable unless the caller supplied one
    resolved = ytdlp_path or self._find_ytdlp()
    self.ytdlp_path = resolved
    if not resolved:
        self.log("yt-dlp not found, Twitch support will be disabled", 'warning')

    # Thumbnail cache
    self.thumbnail_cache = TwitchThumbnailCache(cache_dir)
def _find_ytdlp(self) -> Optional[str]:
"""Find yt-dlp executable"""
common_paths = [
'/opt/media-downloader/venv/bin/yt-dlp', # Prefer venv version (kept up to date)
'/usr/local/bin/yt-dlp',
'/usr/bin/yt-dlp',
'/opt/homebrew/bin/yt-dlp',
os.path.expanduser('~/.local/bin/yt-dlp'),
]
for path in common_paths:
if os.path.isfile(path) and os.access(path, os.X_OK):
return path
try:
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def is_available(self) -> bool:
    """Report whether a yt-dlp executable path has been set."""
    executable = self.ytdlp_path
    return executable is not None
def _get_cookies_file(self) -> Optional[str]:
"""Get path to cookies file, creating it from database if needed"""
if self._cookies_file and os.path.exists(self._cookies_file):
return self._cookies_file
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
# Try twitch-specific cookies first, then fall back to ytdlp
for scraper_id in ['twitch', 'ytdlp']:
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
row = cursor.fetchone()
if row and row[0]:
data = json.loads(row[0])
# Support both {"cookies": [...]} and [...] formats
if isinstance(data, dict) and 'cookies' in data:
cookies_list = data['cookies']
elif isinstance(data, list):
cookies_list = data
else:
cookies_list = []
if cookies_list:
# Write cookies to temp file in Netscape format
fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='twitch_cookies_')
with os.fdopen(fd, 'w') as f:
f.write("# Netscape HTTP Cookie File\n")
for cookie in cookies_list:
domain = cookie.get('domain', '')
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
path = cookie.get('path', '/')
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
expiry = str(int(cookie.get('expirationDate', 0)))
name = cookie.get('name', '')
value = cookie.get('value', '')
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
self.log(f"Loaded {len(cookies_list)} cookies from {scraper_id} scraper", 'debug')
return self._cookies_file
except Exception as e:
self.log(f"Could not load cookies: {e}", 'debug')
return None
def _get_base_cmd(self) -> List[str]:
"""Get base yt-dlp command with cookies if available"""
cmd = [self.ytdlp_path]
cookies_file = self._get_cookies_file()
if cookies_file:
cmd.extend(['--cookies', cookies_file])
return cmd
def cleanup(self):
    """Delete the temporary cookie file, if one was created and still exists."""
    stale = self._cookies_file
    if not (stale and os.path.exists(stale)):
        return
    try:
        os.unlink(stale)
    except Exception:
        # Best-effort removal; a leftover temp file is not fatal
        pass
    self._cookies_file = None
@staticmethod
def extract_channel_name(url: str) -> Optional[str]:
"""
Extract channel name from Twitch URL
Supports:
- twitch.tv/username
- twitch.tv/username/clips
- m.twitch.tv/username/clips
"""
patterns = [
r'twitch\.tv/([a-zA-Z0-9_]+)(?:/clips)?',
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
return match.group(1).lower()
return None
@staticmethod
def normalize_clips_url(channel_name: str) -> str:
"""Convert channel name to clips URL with all-time filter"""
return f"https://www.twitch.tv/{channel_name}/clips?filter=clips&range=all"
async def get_channel_info(self, channel_url: str, count_clips: bool = True) -> Optional[Dict]:
    """
    Get channel information and optionally count all clips

    Args:
        channel_url: Twitch channel URL.
        count_clips: When True, run a second yt-dlp pass that lists every
            clip id to count them (slow for channels with many clips).

    Returns:
        Dict with ``channel_id``, ``channel_name``, ``channel_url``,
        ``clips_url``, ``thumbnail`` (taken from the first clip) and
        ``clip_count``; None when yt-dlp is unavailable, the URL has no
        channel name, yt-dlp fails, or no clip metadata is returned.
    """
    if not self.is_available():
        return None

    channel_name = self.extract_channel_name(channel_url)
    if not channel_name:
        return None

    try:
        clips_url = self.normalize_clips_url(channel_name)

        # First get basic info from first clip
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            '--playlist-items', '1',
            clips_url
        ]
        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await result.communicate()

        if result.returncode != 0:
            # errors='replace': stderr is not guaranteed to be valid UTF-8, and
            # a strict decode would turn this expected failure into an exception
            error = stderr.decode('utf-8', errors='replace')
            self.log(f"Failed to get channel info: {error}", 'warning')
            return None

        first_clip_data = None
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not line:
                continue
            try:
                first_clip_data = json.loads(line)
                break
            except json.JSONDecodeError:
                continue

        if not first_clip_data:
            return None

        # Count all clips if requested (this can take a while for channels with many clips)
        clip_count = 0
        if count_clips:
            self.log(f"Counting clips for {channel_name}...", 'debug')
            count_cmd = self._get_base_cmd() + [
                '--no-warnings',
                '--flat-playlist',
                '--print', 'id',
                clips_url
            ]
            count_result = await asyncio.create_subprocess_exec(
                *count_cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            count_stdout, _ = await count_result.communicate()
            if count_result.returncode == 0:
                id_lines = count_stdout.decode('utf-8', errors='replace').strip().split('\n')
                clip_count = sum(1 for entry in id_lines if entry)
                self.log(f"Found {clip_count} clips for {channel_name}", 'info')

        return {
            'channel_id': channel_name,
            'channel_name': channel_name,
            'channel_url': f"https://www.twitch.tv/{channel_name}",
            'clips_url': clips_url,
            'thumbnail': first_clip_data.get('thumbnail'),
            'clip_count': clip_count,
        }
    except Exception as e:
        self.log(f"Error getting channel info: {e}", 'error')
        return None
async def get_channel_clips(self, channel_url: str, since_date: str = None,
                            max_clips: int = None, progress_callback=None,
                            cache_thumbnails: bool = True) -> List[Dict]:
    """
    Get all clips from a channel

    Args:
        channel_url: Twitch channel URL
        since_date: Only fetch clips created after this date (ISO format)
        max_clips: Maximum number of clips to fetch
        progress_callback: Callback function(count) for progress updates
        cache_thumbnails: Whether to download and cache thumbnails locally

    Returns:
        List of clip metadata dicts with cached thumbnail paths. Empty when
        yt-dlp is unavailable, the URL has no channel name, or yt-dlp fails.
    """
    if not self.is_available():
        return []

    channel_name = self.extract_channel_name(channel_url)
    if not channel_name:
        self.log(f"Could not extract channel name from URL: {channel_url}", 'error')
        return []

    try:
        clips_url = self.normalize_clips_url(channel_name)

        # Use flat-playlist for faster extraction (full metadata available in flat mode for Twitch clips)
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            clips_url
        ]

        # Add date filter at yt-dlp level for efficiency.
        # NOTE: `datetime` must stay the module-level import. Importing it
        # inside this branch makes the name function-local, and the timestamp
        # parsing below then raises UnboundLocalError whenever since_date is
        # not given.
        if since_date:
            try:
                # Convert ISO date to YYYYMMDD format for yt-dlp
                date_obj = datetime.fromisoformat(since_date.replace('Z', '+00:00'))
                dateafter = date_obj.strftime('%Y%m%d')
                cmd.extend(['--dateafter', dateafter])
                self.log(f"Filtering clips after {dateafter}", 'debug')
            except (ValueError, AttributeError):
                pass

        if max_clips:
            cmd.extend(['--playlist-items', f'1:{max_clips}'])

        self.log(f"Fetching clips from channel: {channel_name}", 'info')

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await result.communicate()

        if result.returncode != 0:
            error = stderr.decode('utf-8', errors='replace')
            self.log(f"Failed to get channel clips: {error}", 'warning')
            return []

        clips = []
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not line:
                continue
            try:
                data = json.loads(line)
                clip_id = data.get('id')
                if not clip_id:
                    continue

                # Parse timestamp to ISO format
                timestamp = data.get('timestamp')
                upload_date = data.get('upload_date')
                if timestamp:
                    try:
                        upload_date = datetime.fromtimestamp(timestamp).isoformat()
                    except (ValueError, OSError):
                        pass
                elif upload_date:
                    # Convert YYYYMMDD to ISO format
                    try:
                        upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
                    except ValueError:
                        pass

                # Stop at the first clip that is not newer than since_date
                # (plain ISO-string comparison)
                if since_date and upload_date and upload_date <= since_date:
                    self.log(f"Reached clip from {upload_date}, stopping", 'debug')
                    break

                # Extract clip slug from URL
                clip_url = data.get('url') or data.get('webpage_url', '')
                clip_slug = clip_url.split('/')[-1] if clip_url else clip_id

                clips.append({
                    'clip_id': clip_id,
                    'clip_slug': clip_slug,
                    'title': data.get('title', f'Clip {clip_id}'),
                    'upload_date': upload_date,
                    'timestamp': timestamp,
                    'duration': data.get('duration'),
                    'view_count': data.get('view_count'),
                    'thumbnail': data.get('thumbnail'),
                    'url': clip_url,
                    'language': data.get('language'),
                    'channel_name': channel_name,
                })

                if progress_callback:
                    progress_callback(len(clips))

                if max_clips and len(clips) >= max_clips:
                    break
            except json.JSONDecodeError:
                continue

        self.log(f"Found {len(clips)} clips", 'info')

        # Cache thumbnails if requested
        if cache_thumbnails and clips:
            thumbnail_urls = [c.get('thumbnail') for c in clips if c.get('thumbnail')]
            if thumbnail_urls:
                self.log(f"Caching {len(thumbnail_urls)} thumbnails...", 'debug')
                cached_paths = await self.thumbnail_cache.cache_thumbnails_batch(thumbnail_urls)
                # Update clips with cached thumbnail paths
                for clip in clips:
                    thumb_url = clip.get('thumbnail')
                    if thumb_url and thumb_url in cached_paths:
                        clip['thumbnail_cached'] = cached_paths[thumb_url]
                self.log(f"Cached {len(cached_paths)} thumbnails", 'debug')

        return clips
    except Exception as e:
        self.log(f"Error getting channel clips: {e}", 'error')
        return []
async def download_clip(self, clip_url: str, output_dir: Path, quality: str = 'best',
                        progress_callback=None) -> Dict:
    """
    Download one Twitch clip with yt-dlp.

    Args:
        clip_url: Twitch clip URL
        output_dir: Directory to save the clip (created if missing)
        quality: Key into QUALITY_PRESETS; unknown keys use the 'best' preset
        progress_callback: Accepted but not used by this method

    Returns:
        Dict with ``success`` plus file info, or ``success=False`` and ``error``
    """
    if not self.is_available():
        return {'success': False, 'error': 'yt-dlp not available'}

    try:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        # File names keep the (truncated) title and the clip id
        name_template = str(target_dir / '%(title).100s_%(id)s.%(ext)s')
        selector = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])

        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '-f', selector,
            '-o', name_template,
            '--print-json',
            clip_url
        ]
        self.log(f"Downloading clip: {clip_url}", 'debug')

        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        out_bytes, err_bytes = await proc.communicate()

        if proc.returncode != 0:
            message = err_bytes.decode('utf-8', errors='replace').strip()
            if len(message) > 200:
                message = message[:200] + '...'
            return {'success': False, 'error': message}

        # The first stdout line that parses as JSON is the clip metadata
        meta = None
        for raw_line in out_bytes.decode('utf-8', errors='replace').strip().split('\n'):
            try:
                meta = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            break

        if meta:
            saved = meta.get('_filename') or meta.get('filename')
            saved = Path(saved) if saved else None
            if saved and saved.exists():
                size = saved.stat().st_size
            else:
                size = meta.get('filesize')
            return {
                'success': True,
                'file_path': str(saved) if saved else None,
                'filename': saved.name if saved else None,
                'file_size': size,
                'title': meta.get('title'),
                'duration': meta.get('duration'),
                'clip_id': meta.get('id'),
                'upload_date': meta.get('upload_date'),
                'thumbnail': meta.get('thumbnail'),
            }

        # No metadata on stdout: fall back to the most recently modified .mp4
        candidates = list(target_dir.glob('*.mp4'))
        if not candidates:
            return {'success': False, 'error': 'Could not find downloaded file'}
        newest = max(candidates, key=lambda f: f.stat().st_mtime)
        return {
            'success': True,
            'file_path': str(newest),
            'filename': newest.name,
            'file_size': newest.stat().st_size
        }
    except Exception as e:
        self.log(f"Error downloading clip: {e}", 'error')
        return {'success': False, 'error': str(e)}
async def get_channel_avatar(self, channel_name: str) -> Optional[str]:
    """
    Return the channel's avatar URL, taken from ``get_channel_profile``.

    Returns None if the profile cannot be fetched or has no avatar.
    """
    profile = await self.get_channel_profile(channel_name)
    if not profile:
        return None
    return profile.get('avatar')
async def get_channel_profile(self, channel_name: str) -> Optional[Dict]:
    """
    Fetch channel profile info using Twitch's GQL API.

    Args:
        channel_name: Twitch login name. Only ASCII letters, digits and
            underscores are accepted, because the name is embedded in the
            query text.

    Returns:
        Dict with whichever of ``avatar``, ``banner``, ``display_name``,
        ``bio``, ``joined_date`` and ``external_links`` (a JSON string) were
        present. None when the name is invalid, the user is not found, the
        response is not HTTP 200, or the request fails.
    """
    # The login is interpolated into the GraphQL document below, so anything
    # that could break out of the string literal is rejected up front.
    if not isinstance(channel_name, str) or not re.fullmatch(r'[a-zA-Z0-9_]+', channel_name):
        self.log(f"Invalid Twitch channel name: {channel_name!r}", 'warning')
        return None

    try:
        import aiohttp

        async with aiohttp.ClientSession() as session:
            headers = {
                'Client-Id': 'kimne78kx3ncx6brgo4mv6wki5h1ko',  # Public Twitch web client ID
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            }

            # GQL query for comprehensive user info
            query = '''
            query {
            user(login: "%s") {
            id
            login
            displayName
            description
            createdAt
            profileImageURL(width: 300)
            bannerImageURL
            offlineImageURL
            channel {
            socialMedias {
            name
            url
            }
            }
            }
            }
            ''' % channel_name

            async with session.post(
                'https://gql.twitch.tv/gql',
                headers=headers,
                json={'query': query},
                timeout=aiohttp.ClientTimeout(total=15)
            ) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    user = data.get('data', {}).get('user')
                    if not user:
                        self.log(f"Twitch user not found: {channel_name}", 'warning')
                        return None

                    result = {}

                    # Avatar
                    if user.get('profileImageURL'):
                        result['avatar'] = user['profileImageURL']

                    # Banner - prefer offlineImageURL (larger), fall back to bannerImageURL
                    if user.get('offlineImageURL'):
                        result['banner'] = user['offlineImageURL']
                    elif user.get('bannerImageURL'):
                        result['banner'] = user['bannerImageURL']

                    # Display name
                    if user.get('displayName'):
                        result['display_name'] = user['displayName']

                    # Bio/description
                    if user.get('description'):
                        result['bio'] = user['description']

                    # Joined date (format: "Jun 10, 2016")
                    if user.get('createdAt'):
                        try:
                            created_dt = datetime.fromisoformat(user['createdAt'].replace('Z', '+00:00'))
                            result['joined_date'] = created_dt.strftime('%b %d, %Y')
                            self.log(f"Found Twitch joined date: {result['joined_date']}", 'debug')
                        except (ValueError, TypeError):
                            pass

                    # Social links. `channel` / `socialMedias` may be JSON null,
                    # so use `or` fallbacks rather than .get() defaults (which
                    # only apply when the key is missing).
                    social_medias = (user.get('channel') or {}).get('socialMedias') or []
                    links = []
                    for social in social_medias:
                        name = social.get('name', 'Link')
                        url = social.get('url', '')
                        if url:
                            # Capitalize first letter of name
                            title = name.capitalize() if name else 'Link'
                            links.append({'title': title, 'url': url})
                    if links:
                        result['external_links'] = json.dumps(links)
                        self.log(f"Found {len(links)} Twitch external links", 'debug')

                    if result:
                        self.log(f"Fetched Twitch profile via GQL for {channel_name}: {list(result.keys())}", 'debug')
                    return result
    except Exception as e:
        self.log(f"Could not fetch Twitch profile: {e}", 'debug')

    return None
async def get_creator(self, channel_url: str) -> Optional[Creator]:
    """
    Build a Creator for the channel behind ``channel_url``.

    Returns None when ``get_channel_info`` yields nothing.
    """
    info = await self.get_channel_info(channel_url)
    if not info:
        return None

    name = info.get('channel_name') or self.extract_channel_name(channel_url)

    # Use the real channel avatar rather than a clip thumbnail
    avatar = await self.get_channel_avatar(name)

    fields = {
        'creator_id': info.get('channel_id') or name,
        'service_id': 'twitch',
        'platform': 'twitch',
        'username': name or 'Unknown',
        'display_name': name,
        'profile_image_url': avatar,
        'post_count': info.get('clip_count', 0),
    }
    return Creator(**fields)
async def get_posts(self, channel_url: str, since_date: str = None,
                    max_clips: int = None, progress_callback=None) -> List[Post]:
    """
    Return the channel's clips wrapped as Post objects.

    The arguments are forwarded unchanged to ``get_channel_clips``; every
    clip becomes one Post with a single video attachment.
    """
    clips = await self.get_channel_clips(channel_url, since_date, max_clips, progress_callback)

    def to_post(clip):
        video = Attachment(
            name=f"{clip['title']}.mp4",
            file_type='video',
            extension='.mp4',
            server_path=clip['url'],  # the clip URL doubles as server_path
            download_url=clip['url'],
            duration=clip.get('duration'),
        )
        return Post(
            post_id=clip['clip_id'],
            service_id='twitch',
            platform='twitch',
            creator_id=clip.get('channel_name', ''),
            title=clip['title'],
            content='',  # Clips don't have descriptions
            published_at=clip.get('upload_date'),
            attachments=[video],
        )

    return [to_post(clip) for clip in clips]

View File

@@ -0,0 +1,484 @@
"""
Utility functions for Paid Content feature
"""
import re
from typing import Optional, Tuple
from urllib.parse import urlparse
def _extract_xenforo_search_query(parsed) -> Optional[str]:
"""Extract the 'q' search parameter from a XenForo search URL."""
from urllib.parse import parse_qs, unquote_plus
qs = parse_qs(parsed.query)
query = qs.get('q', [''])[0]
if not query:
m = re.search(r'[&?]q=([^&]+)', parsed.query)
if m:
query = unquote_plus(m.group(1))
return query or None
def parse_creator_url(url: str) -> Optional[Tuple[str, str, str]]:
    """
    Parse a creator URL from any supported site.

    Sites are recognised by host name; the host-agnostic Coppermine gallery
    heuristic is only tried after every host-specific handler, so a
    gallery-looking path on a known site (e.g. reddit.com/r/gallery) is not
    misclassified.

    Args:
        url: URL like https://coomer.party/onlyfans/user/creatorid
            or https://www.youtube.com/@channelhandle
            or https://www.youtube.com/channel/UCxxxxx
            or https://www.twitch.tv/username/clips
            or https://fansly.com/username

    Returns:
        Tuple of (service_id, platform, creator_id) or None if invalid
    """
    try:
        parsed = urlparse(url)
        host = parsed.netloc.lower()

        # Handle YouTube URLs
        if 'youtube.com' in host or 'youtu.be' in host:
            channel_id = _extract_youtube_channel_id(url)
            if channel_id:
                return ('youtube', 'youtube', channel_id)
            return None

        # Handle Twitch URLs
        if 'twitch.tv' in host:
            channel_name = _extract_twitch_channel_name(url)
            if channel_name:
                return ('twitch', 'twitch', channel_name)
            return None

        # Handle Fansly URLs (direct API)
        if 'fansly.com' in host:
            username = _extract_fansly_username(url)
            if username:
                return ('fansly_direct', 'fansly', username)
            return None

        # Handle OnlyFans URLs (direct API)
        if 'onlyfans.com' in host:
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if path_parts:
                username = path_parts[0]
                if username.lower() not in ('my', 'api2', 'settings', 'search', 'notifications', 'chats', 'vault', 'lists', 'bookmarks', 'statements', 'help', 'terms', 'privacy', 'dmca', 'contact'):
                    return ('onlyfans_direct', 'onlyfans', username)
            return None

        # Handle Pornhub URLs
        if 'pornhub.com' in host:
            creator_id = _extract_pornhub_creator_id(url)
            if creator_id:
                return ('pornhub', 'pornhub', creator_id)
            return None

        # Handle XHamster URLs
        if 'xhamster' in host:
            creator_id = _extract_xhamster_creator_id(url)
            if creator_id:
                return ('xhamster', 'xhamster', creator_id)
            return None

        # Handle TikTok URLs
        if 'tiktok.com' in host:
            username = _extract_tiktok_username(url)
            if username:
                return ('tiktok', 'tiktok', username)
            return None

        # Handle Instagram URLs
        if 'instagram.com' in host:
            username = _extract_instagram_username(url)
            if username:
                return ('instagram', 'instagram', username)
            return None

        # Handle BestEyeCandy URLs
        if 'besteyecandy.com' in host:
            cid_match = re.search(r'cid-(\d+)', parsed.path)
            slug_match = re.search(r'/([^/]+)\.html$', parsed.path)
            if cid_match and slug_match:
                slug = slug_match.group(1)
                return ('besteyecandy', 'besteyecandy', f"{cid_match.group(1)}/{slug}")
            elif cid_match:
                return ('besteyecandy', 'besteyecandy', cid_match.group(1))
            return None

        # Handle Bellazon URLs (forum threads as creators)
        if 'bellazon' in host:
            match = re.search(r'/topic/(\d+)-([^/]+)', parsed.path)
            if match:
                topic_id = match.group(1)
                return ('bellazon', 'bellazon', topic_id)
            return None

        # Handle Reddit URLs
        if 'reddit.com' in host:
            # Handle reddit.com/r/subreddit, old.reddit.com/r/subreddit, etc.
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if len(path_parts) >= 2 and path_parts[0] == 'r':
                subreddit = path_parts[1].lower()
                return ('reddit', 'reddit', subreddit)
            return None

        # Handle Snapchat URLs
        if 'snapchat.com' in host:
            # Handle snapchat.com/@username and story.snapchat.com/@username
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if path_parts:
                username = path_parts[0].lstrip('@')
                if username:
                    return ('snapchat', 'snapchat', username)
            return None

        # Handle HQCelebCorner URLs
        if 'hqcelebcorner' in host:
            query = _extract_xenforo_search_query(parsed)
            if query:
                return ('hqcelebcorner', 'hqcelebcorner', query)
            return None

        # Handle PicturePub URLs
        if 'picturepub' in host:
            query = _extract_xenforo_search_query(parsed)
            if query:
                return ('picturepub', 'picturepub', query)
            return None

        # Handle Soundgasm URLs
        if 'soundgasm.net' in host:
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if len(path_parts) >= 2 and path_parts[0] in ('u', 'user'):
                return ('soundgasm', 'soundgasm', path_parts[1])
            return None

        # Handle Liltsome URLs (archive, maps to soundgasm platform)
        if 'liltsome.yerf.org' in host:
            # Hash-based routing: /#/artist/{name}
            fragment = parsed.fragment  # e.g. "/artist/kinkyshibby"
            if fragment:
                parts = [p for p in fragment.strip('/').split('/') if p]
                if len(parts) >= 2 and parts[0] == 'artist':
                    return ('soundgasm', 'soundgasm', parts[1])
            return None

        # Handle Coomer/Kemono URLs; path is /platform/user/creatorid
        service_id = None
        if 'coomer' in host:
            service_id = 'coomer'
        elif 'kemono' in host:
            service_id = 'kemono'
        if service_id:
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if len(path_parts) >= 3 and path_parts[1] == 'user':
                platform = path_parts[0]
                creator_id = path_parts[2]
                return (service_id, platform, creator_id)
            return None

        # Fallback for unknown hosts: Coppermine gallery URLs
        # Match: domain.com/gallery/, domain.com/cpg/, domain.com/coppermine/
        # Also match direct index.php/thumbnails.php/displayimage.php pages
        if any(p in parsed.path.lower() for p in ['/gallery/', '/cpg/', '/coppermine/']) or \
                re.search(r'(?:index|thumbnails|displayimage)\.php', parsed.path):
            # Normalize to gallery root
            base_path = re.sub(
                r'(?:index|thumbnails|displayimage)\.php.*$', '', parsed.path
            )
            base_path = base_path.rstrip('/')
            if base_path:
                # Use domain + path as creator_id (e.g. kylie-jenner.org/gallery)
                creator_id = host.replace('www.', '') + base_path
                return ('coppermine', 'coppermine', creator_id)

        return None
    except Exception:
        return None
def _extract_youtube_channel_id(url: str) -> Optional[str]:
"""
Extract channel identifier from various YouTube URL formats
Supports:
- youtube.com/channel/UC...
- youtube.com/@handle
- youtube.com/c/channelname
- youtube.com/user/username
"""
patterns = [
r'youtube\.com/channel/([a-zA-Z0-9_-]+)',
r'youtube\.com/@([a-zA-Z0-9_.-]+)',
r'youtube\.com/c/([a-zA-Z0-9_-]+)',
r'youtube\.com/user/([a-zA-Z0-9_-]+)',
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
return match.group(1)
return None
def _extract_twitch_channel_name(url: str) -> Optional[str]:
"""
Extract channel name from Twitch URL
Supports:
- twitch.tv/username
- twitch.tv/username/clips
- m.twitch.tv/username/clips
"""
patterns = [
r'twitch\.tv/([a-zA-Z0-9_]+)(?:/clips)?',
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
return match.group(1).lower()
return None
def _extract_fansly_username(url: str) -> Optional[str]:
"""
Extract username from Fansly URL
Supports:
- fansly.com/username
- fansly.com/username/posts
- fansly.com/username/media
"""
patterns = [
r'fansly\.com/([a-zA-Z0-9_.-]+)(?:/(?:posts|media))?',
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
username = match.group(1)
# Filter out known non-username paths
if username.lower() not in ('explore', 'search', 'settings', 'notifications', 'messages', 'live'):
return username
return None
def _extract_pornhub_creator_id(url: str) -> Optional[str]:
"""Extract creator identifier from Pornhub URL, returns 'type/name' format"""
patterns = [
r'pornhub\.com/pornstar/([a-zA-Z0-9_-]+)',
r'pornhub\.com/channels/([a-zA-Z0-9_-]+)',
r'pornhub\.com/users/([a-zA-Z0-9_-]+)',
r'pornhub\.com/model/([a-zA-Z0-9_-]+)',
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
# Store as "type/name" to preserve the URL type
type_match = re.search(r'pornhub\.com/(pornstar|channels|users|model)/', url)
return f"{type_match.group(1)}/{match.group(1)}" if type_match else match.group(1)
return None
def _extract_xhamster_creator_id(url: str) -> Optional[str]:
"""Extract creator identifier from XHamster URL, returns 'type/name' format"""
patterns = [
r'xhamster\d*\.com/creators/([a-zA-Z0-9_-]+)',
r'xhamster\d*\.com/channels/([a-zA-Z0-9_-]+)',
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
type_match = re.search(r'xhamster\d*\.com/(creators|channels)/', url)
return f"{type_match.group(1)}/{match.group(1)}" if type_match else match.group(1)
return None
def _extract_tiktok_username(url: str) -> Optional[str]:
"""Extract username from TikTok URL"""
match = re.search(r'tiktok\.com/@([a-zA-Z0-9_.]+)', url)
if match:
return match.group(1)
return None
def _extract_instagram_username(url: str) -> Optional[str]:
"""Extract username from Instagram URL"""
match = re.search(r'instagram\.com/([a-zA-Z0-9_.]+)/?', url)
if match:
username = match.group(1).lower()
non_usernames = {
'explore', 'reels', 'stories', 'p', 'tv', 'accounts',
'direct', 'about', 'legal', 'developer', 'privacy',
'terms', 'help', 'api', 'reel', 'tags'
}
if username not in non_usernames:
return username
return None
def parse_post_url(url: str) -> Optional[Tuple[str, str, str, str]]:
    """
    Split a Coomer/Kemono post URL into its components.

    Args:
        url: URL like https://coomer.party/onlyfans/user/creatorid/post/postid

    Returns:
        Tuple of (service_id, platform, creator_id, post_id), or None when the
        host is neither service or the path does not have the
        ``/platform/user/<creator>/post/<post>`` layout.
    """
    try:
        parts = urlparse(url)
        hostname = parts.netloc.lower()

        # 'coomer' is tested before 'kemono', matching the lookup order.
        service_id = next(
            (name for name in ('coomer', 'kemono') if name in hostname),
            None,
        )
        if service_id is None:
            return None

        # Expected path: /platform/user/creatorid/post/postid
        segments = [seg for seg in parts.path.strip('/').split('/') if seg]
        if len(segments) < 5 or segments[1] != 'user' or segments[3] != 'post':
            return None

        return (service_id, segments[0], segments[2], segments[4])
    except Exception:
        return None
def format_file_size(size_bytes: int) -> str:
    """
    Render a byte count as a short human-readable string.

    The value is divided by 1024 until its magnitude drops below 1024 and is
    printed with one decimal and the matching unit (B .. TB, then PB).
    ``None`` yields ``'Unknown'``.
    """
    if size_bytes is None:
        return 'Unknown'

    units = ('B', 'KB', 'MB', 'GB', 'TB')
    value = size_bytes
    index = 0
    while index < len(units):
        if abs(value) < 1024.0:
            return f"{value:.1f} {units[index]}"
        value = value / 1024.0
        index += 1
    # Anything still >= 1024 TB is reported in petabytes.
    return f"{value:.1f} PB"
def sanitize_filename(name: str, max_length: int = 200) -> str:
    """
    Sanitize a string for use in a filename.

    Characters that are invalid in file names (``<>:"/\\|?*`` and control
    characters) are dropped, whitespace runs become a single ``-``, and
    leading/trailing dots and hyphens are removed.

    Args:
        name: String to sanitize
        max_length: Maximum length of result

    Returns:
        Sanitized filename, or ``'unnamed'`` when nothing usable remains
    """
    if not name:
        return 'unnamed'
    # Remove/replace invalid characters
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', name)
    name = re.sub(r'\s+', '-', name.strip())
    name = name.strip('.-')
    if len(name) > max_length:
        # Cutting can expose a new trailing '.' or '-'; strip it again so the
        # result never ends in one.
        name = name[:max_length].rstrip('.-')
    return name or 'unnamed'
def extract_platform_from_domain(domain: str) -> Optional[str]:
    """
    Map a domain name to its platform identifier.

    The comparison is case-insensitive and ignores a leading ``www.``.

    Returns:
        The platform name, or None when the domain is not a known platform.
    """
    domain = domain.lower()
    # Only a *leading* 'www.' is a prefix to drop; removing it anywhere in the
    # string would turn e.g. 'discord.www.com' into 'discord.com'.
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    platform_domains = {
        'onlyfans.com': 'onlyfans',
        'fansly.com': 'fansly',
        'patreon.com': 'patreon',
        'fanbox.cc': 'fanbox',
        'gumroad.com': 'gumroad',
        'subscribestar.com': 'subscribestar',
        'subscribestar.adult': 'subscribestar',
        'discord.com': 'discord',
        'discord.gg': 'discord',
        'candfans.jp': 'candfans',
    }
    return platform_domains.get(domain)
def detect_content_type(filename: str) -> str:
    """
    Classify a file by the extension after its last dot.

    Returns one of 'image', 'video', 'audio', 'archive', 'document', or
    'unknown' (empty name, no extension, or unrecognised extension).
    """
    if not filename:
        return 'unknown'

    ext = filename.rpartition('.')[2].lower() if '.' in filename else ''

    # Checked in this order; the first category containing the extension wins.
    categories = (
        ('image', {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}),
        ('video', {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg', '3gp'}),
        ('audio', {'mp3', 'wav', 'flac', 'aac', 'm4a', 'ogg', 'wma'}),
        ('archive', {'zip', 'rar', '7z', 'tar', 'gz', 'bz2'}),
        ('document', {'pdf', 'doc', 'docx', 'txt', 'rtf', 'odt'}),
    )
    for label, extensions in categories:
        if ext in extensions:
            return label
    return 'unknown'
def get_service_platforms(service_id: str) -> list:
    """
    Return the platform names a service can download from.

    Unknown service ids yield an empty list.
    """
    supported = {
        'coomer': ['onlyfans', 'fansly', 'candfans'],
        'kemono': ['patreon', 'fanbox', 'gumroad', 'subscribestar', 'discord'],
        'youtube': ['youtube'],
        'twitch': ['twitch'],
        'fansly_direct': ['fansly'],
        'onlyfans_direct': ['onlyfans'],
        'pornhub': ['pornhub'],
        'xhamster': ['xhamster'],
        'tiktok': ['tiktok'],
        'instagram': ['instagram'],
        'soundgasm': ['soundgasm'],
        'bellazon': ['bellazon'],
        'besteyecandy': ['besteyecandy'],
        'snapchat': ['snapchat'],
        'reddit': ['reddit'],
        'coppermine': ['coppermine'],
        'hqcelebcorner': ['hqcelebcorner'],
        'picturepub': ['picturepub'],
    }
    if service_id in supported:
        return supported[service_id]
    return []
def get_service_base_url(service_id: str) -> Optional[str]:
    """
    Look up the fallback base URL for a service.

    Note: For dynamic URLs, use the database (paid_content_services table).
    The values returned here are the client's built-in defaults only.

    Returns:
        The default base URL, or None when the service id has no default.
    """
    # Imported lazily to avoid a circular dependency with api_client.
    from .api_client import PaidContentAPIClient

    default_urls = PaidContentAPIClient.DEFAULT_SERVICE_URLS
    return default_urls.get(service_id)

View File

@@ -0,0 +1,744 @@
"""
Generic XenForo Forum Client for Paid Content
Scrapes XenForo-based celebrity image forums (HQCelebCorner, PicturePub, etc.)
treating each celebrity name as a "creator" and each matching thread as a post.
Images are hosted on external hosts (imagebam, pixhost, imagetwist, etc.)
and resolved via ImageHostHandler from forum_downloader.
"""
import asyncio
import html
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse, unquote_plus
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class XenForoForumClient(LoggingMixin):
    """Generic client for scraping XenForo-based forum threads."""
    # Endpoint of the FlareSolverr service that page fetches fall back to.
    FLARESOLVERR_URL = 'http://localhost:8191/v1'
    # Browser-like default headers sent with outgoing HTTP requests.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    # File extensions that mark a URL path as pointing directly at an image.
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
    # External image host domains to look for in post links
    IMAGE_HOST_DOMAINS = [
        'imagebam.com', 'pixhost.to', 'imagetwist.com', 'imgur.com',
        'imgbox.com', 'postimg.cc', 'postimages.org', 'catbox.moe',
        'turboimagehost.com', 'imageban.ru', 'img.yt', 'acidimg.cc',
        'pixxxels.cc', 'imx.to', 'imgbb.com', 'ibb.co',
    ]
def __init__(self, service_id: str, base_url: str, cookie_path: str, log_callback=None):
    """
    Configure the client for one forum.

    Args:
        service_id: Identifier of the forum service; also used as the
            default logging module name.
        base_url: Forum root URL; trailing slashes are removed.
        cookie_path: Path of the cookie JSON file read by ``_load_cookies``.
        log_callback: Optional callback handed to the logging mixin.
    """
    normalized_base = base_url.rstrip('/')
    self.SERVICE_ID = service_id
    self.BASE_URL = normalized_base
    self.COOKIE_PATH = cookie_path
    self._init_logger('PaidContent', log_callback, default_module=service_id)
    # Filled on first use by _load_cookies / _get_image_host_handler.
    self._cookies: Optional[Dict[str, str]] = None
    self._image_host_handler = None
# ------------------------------------------------------------------
# Cookie handling
# ------------------------------------------------------------------
def _load_cookies(self) -> Dict[str, str]:
"""Load Playwright-format cookies and convert to {name: value} dict."""
if self._cookies is not None:
return self._cookies
try:
cookie_path = Path(self.COOKIE_PATH)
if cookie_path.exists():
with open(cookie_path, 'r') as f:
raw_cookies = json.load(f)
self._cookies = {c['name']: c['value'] for c in raw_cookies}
self.log(f"Loaded {len(self._cookies)} cookies from {self.COOKIE_PATH}", 'debug')
else:
self.log(f"Cookie file not found: {self.COOKIE_PATH}", 'warning')
self._cookies = {}
except Exception as e:
self.log(f"Error loading cookies: {e}", 'warning')
self._cookies = {}
return self._cookies
def _get_cookie_header(self) -> str:
"""Build Cookie header string from loaded cookies."""
cookies = self._load_cookies()
return '; '.join(f'{k}={v}' for k, v in cookies.items())
def _get_request_headers(self) -> Dict[str, str]:
"""Get headers with cookies for authenticated requests."""
headers = dict(self.HEADERS)
cookie_str = self._get_cookie_header()
if cookie_str:
headers['Cookie'] = cookie_str
return headers
# ------------------------------------------------------------------
# Image host handling
# ------------------------------------------------------------------
def _get_image_host_handler(self):
"""Get or create ImageHostHandler instance."""
if self._image_host_handler is None:
try:
from modules.forum_downloader import ImageHostHandler
self._image_host_handler = ImageHostHandler
self.log("Loaded ImageHostHandler from forum_downloader", 'debug')
except ImportError:
self.log("ImageHostHandler not available", 'warning')
self._image_host_handler = False # sentinel to avoid retrying
return self._image_host_handler if self._image_host_handler is not False else None
# ------------------------------------------------------------------
# HTTP helpers
# ------------------------------------------------------------------
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """
    GET *url* with the client's cookies and return the response text.

    A 403 response, or any exception raised during the request, is retried
    through FlareSolverr. Any other non-200 status is logged and yields None.
    """
    request_headers = self._get_request_headers()
    try:
        async with session.get(url, headers=request_headers, allow_redirects=True) as response:
            if response.status == 200:
                return await response.text()
            if response.status != 403:
                self.log(f"HTTP {response.status} for {url}", 'warning')
                return None
            self.log(f"Got 403 for {url}, trying FlareSolverr", 'debug')
            return await self._fetch_via_flaresolverr(url)
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
        return await self._fetch_via_flaresolverr(url)
async def _fetch_via_flaresolverr(self, url: str) -> Optional[str]:
    """
    Fetch a page using FlareSolverr to bypass Cloudflare.

    A temporary FlareSolverr session is created, used for one ``request.get``
    carrying the client's cookies, and destroyed again. The ``requests``
    calls are synchronous, so each one runs in the default executor to keep
    the event loop responsive while FlareSolverr works (up to 70 seconds).

    Returns:
        The page HTML reported by FlareSolverr, or None when ``requests`` is
        not installed, the session cannot be created, or the fetch fails.
    """
    try:
        import requests as std_requests
    except ImportError:
        self.log("requests library not available for FlareSolverr", 'warning')
        return None

    loop = asyncio.get_running_loop()

    async def post(payload: Dict, timeout: int):
        # Off-load the blocking HTTP call to a worker thread.
        return await loop.run_in_executor(
            None,
            lambda: std_requests.post(self.FLARESOLVERR_URL, json=payload, timeout=timeout),
        )

    fs_session_id = None
    try:
        # Create session
        resp = await post({'cmd': 'sessions.create'}, 30)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log("Failed to create FlareSolverr session", 'warning')
            return None
        fs_session_id = data.get('session')

        # Fetch page
        cookies = self._load_cookies()
        resp = await post({
            'cmd': 'request.get',
            'url': url,
            'session': fs_session_id,
            'cookies': [{'name': k, 'value': v} for k, v in cookies.items()],
            'maxTimeout': 60000,
        }, 70)
        page_data = resp.json()
        if page_data.get('status') == 'ok':
            return page_data.get('solution', {}).get('response', '')
        self.log(f"FlareSolverr failed for {url}: {page_data.get('message', 'unknown')}", 'warning')
        return None
    except Exception as e:
        self.log(f"FlareSolverr error for {url}: {e}", 'warning')
        return None
    finally:
        if fs_session_id:
            # Best-effort cleanup: a failure here must not mask the result,
            # but it is logged so leaked sessions can be noticed.
            try:
                await post({
                    'cmd': 'sessions.destroy',
                    'session': fs_session_id,
                }, 10)
            except Exception as e:
                self.log(f"Failed to destroy FlareSolverr session {fs_session_id}: {e}", 'debug')
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
async def search_threads(self, query: str) -> List[Dict]:
    """
    Search the forum for threads whose title matches a celebrity name.

    The search form page is fetched first to obtain the ``_xfToken`` value,
    then the search is POSTed and every result page reachable through a
    "Next" link is parsed.

    Returns:
        The thread dicts produced by ``_parse_search_results``; an empty
        list when the form page or the search request fails.
    """
    found: List[Dict] = []
    client_timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=client_timeout) as session:
        # XenForo search: POST form to /search/search
        post_headers = self._get_request_headers()
        post_headers['Content-Type'] = 'application/x-www-form-urlencoded'

        # Need CSRF token - fetch search page first
        form_page = await self._fetch_page(session, f'{self.BASE_URL}/search/')
        if not form_page:
            self.log("Failed to fetch search page", 'warning')
            return found

        token_match = re.search(r'name="_xfToken"\s+value="([^"]+)"', form_page)
        payload = {
            'keywords': query,
            'search_type': 'post',
            'c[title_only]': '1',
            'order': 'date',
            '_xfToken': token_match.group(1) if token_match else '',
        }

        try:
            async with session.post(f'{self.BASE_URL}/search/search', headers=post_headers,
                                    data=payload, allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"Search returned HTTP {resp.status}", 'warning')
                    return found
                current_html = await resp.text()
                current_url = str(resp.url)
        except Exception as e:
            self.log(f"Search failed: {e}", 'error')
            return found

        found = self._parse_search_results(current_html)

        # Follow "Next" links until a page is missing, empty, or the last one.
        next_page = 2
        while True:
            next_url = self._find_next_search_page(current_html, current_url, next_page)
            if not next_url:
                break
            await asyncio.sleep(0.3)
            current_html = await self._fetch_page(session, next_url)
            if not current_html:
                break
            extra = self._parse_search_results(current_html)
            if not extra:
                break
            found.extend(extra)
            next_page += 1

    self.log(f"Search for '{query}' found {len(found)} threads", 'info')
    return found
async def get_thread_info(self, thread_url: str) -> Optional[Dict]:
    """
    Fetch the given thread URL and extract the thread's metadata.

    Returns:
        ``{thread_id, title, reply_count, page_count, url}`` where ``url``
        has its fragment and trailing slash removed and a missing title
        becomes ``'Untitled'``; None when the page cannot be fetched or an
        error occurs.
    """
    client_timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=client_timeout) as session:
            first_page = await self._fetch_page(session, thread_url)
            if not first_page:
                return None

            title = self._extract_title(first_page)
            page_count = self._extract_page_count(first_page)
            reply_count = self._extract_reply_count(first_page)
            thread_id = self._extract_thread_id(thread_url)
            canonical_url = thread_url.split('#')[0].rstrip('/')

            return {
                'thread_id': thread_id,
                'title': title or 'Untitled',
                'reply_count': reply_count,
                'page_count': page_count,
                'url': canonical_url,
            }
    except Exception as e:
        self.log(f"Error getting thread info for {thread_url}: {e}", 'error')
        return None
async def get_thread_images(self, thread_url: str, page_count: int = None,
                            start_page: int = 1) -> List[Dict]:
    """
    Walk the pages of a thread and collect its image links.

    Args:
        thread_url: URL of the thread.
        page_count: Number of pages; when omitted, page 1 is fetched to
            determine it and scanning continues from page 2.
        start_page: First page to fetch when ``page_count`` is supplied.

    Returns:
        The dicts produced by ``_extract_image_links`` (``url`` and ``host``
        keys), de-duplicated by URL in first-seen order. Scanning stops at
        the first page that cannot be fetched.
    """
    collected: List[Dict] = []
    known_urls: Set[str] = set()

    def absorb(page_source: str) -> int:
        """Add unseen links from one page; return how many were new."""
        added = 0
        for entry in self._extract_image_links(page_source):
            link = entry['url']
            if link in known_urls:
                continue
            known_urls.add(link)
            collected.append(entry)
            added += 1
        return added

    client_timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=client_timeout) as session:
        # If page_count not provided, fetch page 1 to determine it
        if page_count is None:
            first_page = await self._fetch_page(session, thread_url)
            if not first_page:
                return collected
            page_count = self._extract_page_count(first_page)
            absorb(first_page)
            start_page = 2

        for page_num in range(start_page, page_count + 1):
            page_url = self._build_page_url(thread_url, page_num)
            await asyncio.sleep(0.5)  # Rate limit
            page_html = await self._fetch_page(session, page_url)
            if not page_html:
                self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                break
            new_count = absorb(page_html)
            self.log(f"Page {page_num}/{page_count}: {new_count} new image links", 'debug')

    self.log(f"Total: {len(collected)} unique image links from {page_count} pages", 'info')
    return collected
async def resolve_image_url(self, host_page_url: str, session: aiohttp.ClientSession = None) -> Optional[str]:
    """
    Resolve an image host page URL to a direct image URL.

    Resolution is attempted in this order: the ImageHostHandler on the URL
    alone, the imgbox thumbnail-to-original rewrite, the handler on the
    fetched page, and finally the manual HTML patterns.

    Args:
        host_page_url: URL of the image host page (or thumbnail).
        session: Session to reuse; when omitted a temporary one is created
            and closed before returning.

    Returns:
        The direct image URL, or None when it cannot be determined.
    """
    handler = self._get_image_host_handler()

    # Try direct extraction without fetching the page
    if handler:
        direct = handler.extract_direct_url(host_page_url)
        if direct:
            return direct

    # imgbox thumbnail → full image conversion (thumbs2 → images2)
    thumb = re.match(r'https?://thumbs(\d*)\.imgbox\.com/([a-f0-9]+/[a-f0-9]+/)(\w+)_t\.\w+', host_page_url)
    if thumb:
        shard, folder, image_id = thumb.groups()
        return f"https://images{shard}.imgbox.com/{folder}{image_id}_o.jpg"

    # For hosts that need page content, fetch and parse
    owns_session = session is None
    if owns_session:
        client_timeout = aiohttp.ClientTimeout(total=30)
        session = aiohttp.ClientSession(timeout=client_timeout)

    try:
        request_headers = dict(self.HEADERS)
        # ImageBam requires sfw_inter=1 cookie to bypass consent page
        if 'imagebam' in host_page_url:
            request_headers['Cookie'] = 'sfw_inter=1'

        try:
            async with session.get(host_page_url, headers=request_headers,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    return None
                body = await resp.text()
                landed_url = str(resp.url)
        except Exception as e:
            self.log(f"Failed to fetch image host page {host_page_url}: {e}", 'debug')
            return None

        # Try handler with page content
        if handler:
            direct = handler.extract_direct_url(host_page_url, page_content=body)
            if direct:
                return direct

        # Manual extraction fallbacks
        return self._extract_direct_image_from_html(host_page_url, body, landed_url)
    finally:
        if owns_session:
            await session.close()
# ------------------------------------------------------------------
# HTML parsing helpers
# ------------------------------------------------------------------
def _parse_search_results(self, html_content: str) -> List[Dict]:
"""Parse XenForo search results page for thread links."""
threads = []
# Parse each contentRow block to extract title, URL, and date
for block_match in re.finditer(
r'<div\s+class="contentRow[^"]*"[^>]*>(.*?)</div>\s*</div>\s*</div>',
html_content, re.DOTALL
):
block = block_match.group(1)
# Extract thread URL and title
title_match = re.search(
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
block, re.DOTALL
)
if not title_match:
continue
url = title_match.group(1)
title_raw = title_match.group(2)
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
if not title:
continue
if not url.startswith('http'):
url = self.BASE_URL + url
thread_id = self._extract_thread_id(url)
if not thread_id:
continue
# Extract date from <time datetime="..."> tag
published_at = None
time_match = re.search(r'<time[^>]+datetime="([^"]+)"', block)
if time_match:
published_at = time_match.group(1)
threads.append({
'thread_id': thread_id,
'title': title,
'url': url.split('#')[0].rstrip('/'),
'reply_count': 0,
'published_at': published_at,
})
# Fallback: if contentRow block parsing found nothing, try simpler title-only parsing
if not threads:
for m in re.finditer(
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
html_content, re.DOTALL
):
url = m.group(1)
title_raw = m.group(2)
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
if not title:
continue
if not url.startswith('http'):
url = self.BASE_URL + url
thread_id = self._extract_thread_id(url)
if not thread_id:
continue
threads.append({
'thread_id': thread_id,
'title': title,
'url': url.split('#')[0].rstrip('/'),
'reply_count': 0,
'published_at': None,
})
# Deduplicate by thread_id
seen = set()
unique = []
for t in threads:
if t['thread_id'] not in seen:
seen.add(t['thread_id'])
unique.append(t)
return unique
def _find_next_search_page(self, html_content: str, current_url: str, page_num: int) -> Optional[str]:
"""Find URL for the next page of search results."""
# XenForo pagination: <a href="...page-{N}..." class="pageNav-page">
pattern = rf'<a\s+href="([^"]*)"[^>]*class="pageNav-jump[^"]*"[^>]*>\s*Next'
m = re.search(pattern, html_content, re.IGNORECASE)
if m:
url = m.group(1)
if not url.startswith('http'):
url = self.BASE_URL + html.unescape(url)
return url
return None
# Domains/patterns for non-content images (reaction GIFs, emojis, signatures, etc.)
# _is_junk_url rejects any URL that contains one of these substrings.
JUNK_URL_PATTERNS = [
    'giphy.com', 'tenor.com', 'gfycat.com',  # reaction GIFs
    'jsdelivr.net', 'joypixels', 'twemoji',  # emoji CDNs
    'wp-content/',  # WordPress media (blog graphics, profile pics)
    '/unicode/', '/emoji/',  # emoji paths
    'haboodadi.com',  # forum signature images
]
# Image hosts that are permanently dead (DNS gone / domain expired)
# URLs containing one of these are also treated as junk by _is_junk_url.
DEAD_HOSTS = [
    'someimage.com',
]
def _extract_image_links(self, page_html: str) -> List[Dict]:
"""Extract image host links from all posts on a page."""
images = []
# Find all message bodies: XenForo uses <article class="message ..."> and
# <div class="bbWrapper"> for post content
for content_match in re.finditer(
r'<div\s+class="bbWrapper">(.*?)</div>\s*(?:</div>|<div\s+class="(?:js-post|message))',
page_html, re.DOTALL
):
content = content_match.group(1)
# Extract links to known image hosts
for link_match in re.finditer(r'<a\s+[^>]*href="([^"]+)"[^>]*>', content):
link_url = html.unescape(link_match.group(1))
if self._is_image_host_url(link_url) and not self._is_junk_url(link_url):
images.append({'url': link_url, 'host': self._identify_host(link_url)})
# Also catch direct image URLs (full-size, not thumbnails)
# NOTE: Skip images hosted on known image host CDNs (imgbox, imgur, etc.)
# — legitimate gallery images are posted as <a href> links to host pages
# (handled above), while inline <img> from these hosts are signatures.
for img_match in re.finditer(r'<img\s+[^>]*src="([^"]+)"[^>]*>', content):
img_url = html.unescape(img_match.group(1))
# Skip thumbnails, avatars, smilies, and junk
if any(skip in img_url.lower() for skip in [
'thumb', 'avatar', 'smili', 'emoji', 'icon', 'logo',
'data/assets', '/styles/', 'xenforo'
]):
continue
if self._is_junk_url(img_url):
continue
# Skip inline images from known image hosts — these are signatures,
# not gallery content (gallery images come through as <a> links above)
if self._is_image_host_url(img_url):
continue
if self._is_direct_image_url(img_url):
images.append({'url': img_url, 'host': 'direct'})
return images
def _is_junk_url(self, url: str) -> bool:
"""Filter out non-content images: reaction GIFs, emojis, blog graphics, dead hosts, etc."""
url_lower = url.lower()
if any(pat in url_lower for pat in self.JUNK_URL_PATTERNS):
return True
if any(host in url_lower for host in self.DEAD_HOSTS):
return True
return False
def _is_image_host_url(self, url: str) -> bool:
"""Check if a URL belongs to a known image hosting service."""
try:
domain = urlparse(url).netloc.lower()
return any(host in domain for host in self.IMAGE_HOST_DOMAINS)
except Exception:
return False
def _is_direct_image_url(self, url: str) -> bool:
"""Check if a URL points directly to an image file."""
try:
path = urlparse(url).path.lower()
return any(path.endswith(f'.{ext}') for ext in self.IMAGE_EXTS)
except Exception:
return False
def _identify_host(self, url: str) -> str:
"""Identify which image host a URL belongs to."""
handler = self._get_image_host_handler()
if handler:
host = handler.identify_host(url)
if host:
return host
# Fallback
try:
domain = urlparse(url).netloc.lower()
for host_domain in self.IMAGE_HOST_DOMAINS:
if host_domain in domain:
return host_domain.split('.')[0]
except Exception:
pass
return 'unknown'
def _extract_direct_image_from_html(self, url: str, page_content: str, final_url: str) -> Optional[str]:
"""Manually extract direct image URL from host page HTML."""
domain = urlparse(url).netloc.lower()
# imagebam: <img class="main-image ..." src="..."> (class may have extra classes)
if 'imagebam' in domain:
m = re.search(r'<img\s+[^>]*src="(https?://images\d*\.imagebam\.com/[^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
m = re.search(r'<img\s+[^>]*class="main-image[^"]*"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# Alternative: og:image meta tag
m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# pixhost: <img id="image" src="..."> or img.pixhost.to URL
if 'pixhost' in domain:
m = re.search(r'<img\s+[^>]*id="image"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# Convert thumbnail URL to full: t{N}.pixhost.to/thumbs/ -> img{N}.pixhost.to/images/
m = re.search(r'https?://t(\d+)\.pixhost\.to/thumbs/(\d+)/(.+)', url)
if m:
return f"https://img{m.group(1)}.pixhost.to/images/{m.group(2)}/{m.group(3)}"
# imagetwist: <img class="pic" src="...">
if 'imagetwist' in domain:
m = re.search(r'<img\s+[^>]*class="pic"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
m = re.search(r'<p\s+[^>]*style="text-align:center"[^>]*>\s*<img\s+[^>]*src="([^"]+)"',
page_content)
if m:
return html.unescape(m.group(1))
# imgbox: <img id="img" src="..."> or src before id
if 'imgbox' in domain:
m = re.search(r'<img\s+[^>]*id="img"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
m = re.search(r'<img\s+[^>]*src="([^"]+)"[^>]*id="img"', page_content)
if m:
return html.unescape(m.group(1))
# Direct image URL pattern
m = re.search(r'(https?://images\d*\.imgbox\.com/[^\s"<>]+)', page_content)
if m:
return html.unescape(m.group(1))
# turboimagehost: <img class="uImage" src="...">
if 'turboimagehost' in domain:
m = re.search(r'<img\s+[^>]*class="uImage"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# acidimg: <img class="centred" src="...">
if 'acidimg' in domain:
m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# pixxxels: same pattern as acidimg
if 'pixxxels' in domain:
m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# imx.to: <img class="image-show" src="...">
if 'imx.to' in domain:
m = re.search(r'<img\s+[^>]*class="image-show"[^>]*src="([^"]+)"', page_content)
if m:
return html.unescape(m.group(1))
# Generic: try og:image meta tag
m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
if m:
img_url = html.unescape(m.group(1))
if self._is_direct_image_url(img_url):
return img_url
return None
# ------------------------------------------------------------------
# Utility helpers
# ------------------------------------------------------------------
@staticmethod
def _extract_title(page_html: str) -> Optional[str]:
"""Extract thread title from XenForo <h1 class="p-title-value">."""
m = re.search(r'<h1\s+class="p-title-value"[^>]*>(.*?)</h1>', page_html, re.DOTALL)
if m:
# Remove inner tags (like <span> for prefixes/labels, viewer count spans)
title = re.sub(r'<[^>]+>', '', m.group(1))
# Clean up non-breaking spaces and extra whitespace
title = title.replace('\xa0', ' ')
title = re.sub(r'\s*\(\d+\s*Viewer[s]?\)', '', title) # Remove "(1 Viewer)"
title = re.sub(r'\s+', ' ', title).strip()
return html.unescape(title)
# Fallback: <title> — strip common XenForo site name suffixes
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
if m:
title = html.unescape(m.group(1).strip())
title = re.sub(r'\s*[-–—|]\s*(?:HQCelebCorner|PicturePub|XenForo).*$', '', title, flags=re.IGNORECASE).strip()
return title
return None
@staticmethod
def _extract_page_count(page_html: str) -> int:
"""Extract total page count from XenForo pagination."""
# <li class="pageNav-page"><a href="...">42</a></li>
pages = re.findall(r'<li\s+class="pageNav-page[^"]*">\s*<a[^>]*>(\d+)</a>', page_html)
if pages:
return max(int(p) for p in pages)
return 1
@staticmethod
def _extract_reply_count(page_html: str) -> int:
"""Extract reply count from XenForo thread info."""
# <dl class="pairs pairs--inline"><dt>Replies</dt><dd>123</dd></dl>
m = re.search(r'<dt>Replies</dt>\s*<dd>([\d,]+)</dd>', page_html)
if m:
return int(m.group(1).replace(',', ''))
return 0
@staticmethod
def _extract_thread_id(url: str) -> Optional[str]:
"""Extract thread ID from XenForo URL.
Handles both formats:
- /threads/title.12345/
- /index.php?threads/title.12345/
"""
m = re.search(r'threads/[^/]*?\.(\d+)', url)
if m:
return m.group(1)
# Fallback: just /threads/{id}/
m = re.search(r'threads/(\d+)', url)
if m:
return m.group(1)
return None
@staticmethod
def _build_page_url(thread_url: str, page_num: int) -> str:
"""Build paginated thread URL for XenForo.
Handles: /index.php?threads/slug.12345/page-2
"""
# Remove existing page- suffix and fragment
base = thread_url.split('#')[0].rstrip('/')
base = re.sub(r'/page-\d+$', '', base)
if page_num == 1:
return base + '/'
return f'{base}/page-{page_num}'
@staticmethod
def _get_extension(filename_or_url: str) -> str:
"""Get lowercase file extension."""
clean = filename_or_url.split('?')[0].split('#')[0]
if '.' in clean.split('/')[-1]:
return clean.rsplit('.', 1)[-1].lower()
return ''
@staticmethod
def _filename_from_url(url: str) -> str:
"""Extract filename from URL path."""
path = urlparse(url).path
name = path.rstrip('/').split('/')[-1]
return name if name else 'unnamed.jpg'

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1025
modules/pg_adapter.py Normal file

File diff suppressed because it is too large Load Diff

690
modules/plex_client.py Normal file
View File

@@ -0,0 +1,690 @@
"""Plex Media Server client for linking appearances to library items"""
import asyncio
import uuid
from typing import Dict, List, Optional, Any
from web.backend.core.http_client import http_client
from modules.universal_logger import get_logger
logger = get_logger('Plex')
# Plex API constants
PLEX_TV_API = "https://plex.tv/api/v2"
PLEX_AUTH_URL = "https://app.plex.tv/auth"
CLIENT_IDENTIFIER = "media-downloader-appearances"
PRODUCT_NAME = "Media Downloader"
class PlexOAuth:
"""Handle Plex OAuth PIN-based authentication flow"""
def __init__(self):
    """Prepare the identification headers sent with every plex.tv request."""
    identity = {
        'X-Plex-Client-Identifier': CLIENT_IDENTIFIER,
        'X-Plex-Product': PRODUCT_NAME,
    }
    self._headers = {
        'Accept': 'application/json',
        **identity,
        'X-Plex-Version': '1.0.0',
        'X-Plex-Device': 'Web',
        'X-Plex-Platform': 'Web',
    }
async def create_pin(self) -> Optional[Dict]:
    """
    Create a new PIN for authentication.

    Returns:
        Dict with 'id', 'code', 'auth_url' and 'expires_at', or None when
        the response lacks an id or code, or the request fails.
    """
    try:
        response = await http_client.post(
            f"{PLEX_TV_API}/pins",
            headers=self._headers,
            data={'strong': 'true'}
        )
        payload = response.json()
        pin_id, pin_code = payload.get('id'), payload.get('code')
        if not (pin_id and pin_code):
            return None

        # Build the auth URL for the user to visit
        auth_url = (
            f"{PLEX_AUTH_URL}#?"
            f"clientID={CLIENT_IDENTIFIER}&"
            f"code={pin_code}&"
            f"context%5Bdevice%5D%5Bproduct%5D={PRODUCT_NAME.replace(' ', '%20')}"
        )
        logger.info(f"Created Plex PIN {pin_id}")
        return {
            'id': pin_id,
            'code': pin_code,
            'auth_url': auth_url,
            'expires_at': payload.get('expiresAt'),
        }
    except Exception as e:
        logger.error(f"Failed to create Plex PIN: {e}")
        return None
async def check_pin(self, pin_id: int) -> Optional[str]:
    """
    Check if the user has authenticated with the PIN.

    Args:
        pin_id: The PIN ID returned from create_pin

    Returns:
        The auth token once the PIN response carries one; None while it
        does not, or when the request fails.
    """
    try:
        response = await http_client.get(f"{PLEX_TV_API}/pins/{pin_id}", headers=self._headers)
        auth_token = response.json().get('authToken')
        if not auth_token:
            return None
        logger.info("Plex authentication successful")
        return auth_token
    except Exception as e:
        logger.error(f"Failed to check Plex PIN: {e}")
        return None
async def wait_for_auth(self, pin_id: int, timeout: int = 120, poll_interval: int = 2) -> Optional[str]:
    """
    Poll for authentication completion.

    The timeout is enforced against the event loop's monotonic clock, so
    time spent inside ``check_pin`` counts towards it and a zero or
    negative ``poll_interval`` cannot make the loop run forever.

    Args:
        pin_id: The PIN ID to check
        timeout: Maximum seconds to wait
        poll_interval: Seconds between checks

    Returns:
        The auth token if successful, None on timeout/failure
    """
    loop = asyncio.get_running_loop()
    deadline = loop.time() + timeout

    while loop.time() < deadline:
        token = await self.check_pin(pin_id)
        if token:
            return token
        remaining = deadline - loop.time()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        await asyncio.sleep(min(poll_interval, remaining))

    logger.warning(f"Plex authentication timed out after {timeout}s")
    return None
async def get_user_info(self, token: str) -> Optional[Dict]:
    """
    Get information about the authenticated user.

    Args:
        token: Plex auth token

    Returns:
        Dict with the 'username', 'email', 'thumb' and 'title' values of the
        plex.tv user response, or None when the request fails.
    """
    wanted_fields = ('username', 'email', 'thumb', 'title')
    try:
        response = await http_client.get(
            f"{PLEX_TV_API}/user",
            headers={**self._headers, 'X-Plex-Token': token},
        )
        profile = response.json()
        return {field: profile.get(field) for field in wanted_fields}
    except Exception as e:
        logger.error(f"Failed to get Plex user info: {e}")
        return None
async def get_user_servers(self, token: str) -> List[Dict]:
    """
    Get list of Plex servers available to the user.

    For every resource whose ``provides`` is ``'server'`` one connection is
    chosen in this order: relay, remote HTTPS, any remote, local. Resources
    without a usable connection are omitted.

    Args:
        token: Plex auth token

    Returns:
        List of server dictionaries (name, machineIdentifier, owned, url,
        local, relay, accessToken, all_connections); empty list on failure
    """
    try:
        url = f"{PLEX_TV_API}/resources"
        headers = {**self._headers, 'X-Plex-Token': token}
        params = {'includeHttps': 1, 'includeRelay': 1}
        response = await http_client.get(url, headers=headers, params=params)
        data = response.json()
        servers = []
        for resource in data:
            if resource.get('provides') != 'server':
                continue
            # A null connection list is treated the same as an empty one
            connections = resource.get('connections') or []
            # Prefer non-local (relay/remote) connections for server-to-server communication
            # Local connections often use internal IPs that aren't reachable externally
            remote = [c for c in connections if not c.get('local')]
            relay_conn = next((c for c in remote if c.get('relay')), None)
            # `or ''` keeps a null uri from raising and discarding every server
            https_conn = next((c for c in remote if 'https' in (c.get('uri') or '')), None)
            any_remote = remote[0] if remote else None
            local_conn = next((c for c in connections if c.get('local')), None)
            # Every connection is either local or remote, so no further fallback is needed
            best_conn = relay_conn or https_conn or any_remote or local_conn
            if not best_conn:
                continue
            # Also include all connection URLs for debugging/manual selection
            all_urls = [{'url': c.get('uri'), 'local': c.get('local', False), 'relay': c.get('relay', False)} for c in connections]
            servers.append({
                'name': resource.get('name'),
                'machineIdentifier': resource.get('clientIdentifier'),
                'owned': resource.get('owned', False),
                'url': best_conn.get('uri'),
                'local': best_conn.get('local', False),
                'relay': best_conn.get('relay', False),
                'accessToken': resource.get('accessToken'),
                'all_connections': all_urls,
            })
        return servers
    except Exception as e:
        logger.error(f"Failed to get Plex servers: {e}")
        return []
class PlexClient:
    """Client for interacting with Plex Media Server API"""

    def __init__(self, base_url: str, token: str):
        """
        Store the server address and credentials used by every request.

        Args:
            base_url: Plex server URL (e.g., 'http://192.168.1.100:32400');
                trailing slashes are removed
            token: Plex authentication token
        """
        normalized_url = base_url.rstrip('/')
        # Headers sent with each request: the token plus a JSON Accept header
        request_headers = {}
        request_headers['X-Plex-Token'] = token
        request_headers['Accept'] = 'application/json'
        self.base_url = normalized_url
        self.token = token
        self._headers = request_headers
async def test_connection(self) -> bool:
    """
    Check that the server answers on its ``/identity`` endpoint.

    Returns:
        True if the request and JSON parsing succeed, False otherwise
    """
    try:
        url = f"{self.base_url}/identity"
        container = (await http_client.get(url, headers=self._headers)).json().get('MediaContainer', {})
        server_name = container.get('friendlyName', 'Unknown')
        logger.info(f"Connected to Plex server: {server_name}")
    except Exception as e:
        logger.error(f"Plex connection test failed: {e}")
        return False
    else:
        return True
async def get_libraries(self) -> List[Dict]:
    """
    List the library sections reported by the server.

    Returns:
        One dict per section with ``id`` (the section key), ``title``,
        ``type`` and ``uuid``; empty list when the request fails
    """
    try:
        url = f"{self.base_url}/library/sections"
        response = await http_client.get(url, headers=self._headers)
        sections = response.json().get('MediaContainer', {}).get('Directory', [])
        return [
            {
                'id': section.get('key'),
                'title': section.get('title'),
                'type': section.get('type'),  # 'movie', 'show', 'artist', etc.
                'uuid': section.get('uuid'),
            }
            for section in sections
        ]
    except Exception as e:
        logger.error(f"Failed to get Plex libraries: {e}")
        return []
async def search_by_tmdb_id(self, tmdb_id: int, media_type: str = 'movie') -> Optional[Dict]:
    """
    Look up a library item by its TMDB ID.

    Args:
        tmdb_id: The Movie Database ID
        media_type: 'movie' or 'show' (anything other than 'movie' is
            queried as a show)

    Returns:
        Dict with ratingKey, title, year, thumb, type and librarySectionID
        of the first hit, or None when nothing matches or the request fails
    """
    try:
        # Plex uses guid format like: tmdb://12345
        guid = f"tmdb://{tmdb_id}"
        # Search across all libraries
        url = f"{self.base_url}/library/all"
        type_code = 2  # 1=movie, 2=show
        if media_type == 'movie':
            type_code = 1
        params = {'guid': guid, 'type': type_code}
        response = await http_client.get(url, headers=self._headers, params=params)
        hits = response.json().get('MediaContainer', {}).get('Metadata', [])
        if not hits:
            return None
        first = hits[0]
        fields = ('ratingKey', 'title', 'year', 'thumb', 'type', 'librarySectionID')
        return {field: first.get(field) for field in fields}
    except Exception as e:
        logger.debug(f"TMDB search failed for {tmdb_id}: {e}")
        return None
async def search_by_title(self, title: str, year: Optional[int] = None,
                          media_type: str = 'movie') -> Optional[Dict]:
    """
    Look up a library item by title, optionally narrowed by year.

    Args:
        title: Movie or show title
        year: Optional release year; the first result with this year wins
        media_type: 'movie' or 'show'

    Returns:
        Dict describing the year-matched result, else the first result,
        else None (also None when the request fails)
    """
    def summarize(found):
        # Same summary shape for the year match and the first-result fallback
        return {
            'ratingKey': found.get('ratingKey'),
            'title': found.get('title'),
            'year': found.get('year'),
            'thumb': found.get('thumb'),
            'type': found.get('type'),
            'librarySectionID': found.get('librarySectionID'),
        }

    try:
        url = f"{self.base_url}/search"
        params = {
            'query': title,
            'type': 1 if media_type == 'movie' else 2
        }
        response = await http_client.get(url, headers=self._headers, params=params)
        results = response.json().get('MediaContainer', {}).get('Metadata', [])
        # If year provided, filter for matching year
        if year:
            for candidate in results:
                if candidate.get('year') == year:
                    return summarize(candidate)
        # Return first result if no exact year match
        return summarize(results[0]) if results else None
    except Exception as e:
        logger.debug(f"Title search failed for '{title}': {e}")
        return None
async def get_episode(self, show_rating_key: str, season: int, episode: int) -> Optional[Dict]:
    """
    Find one episode of a show by season and episode number.

    Args:
        show_rating_key: Plex ratingKey for the show
        season: Season number (compared with each leaf's ``parentIndex``)
        episode: Episode number (compared with each leaf's ``index``)

    Returns:
        Episode dict with ratingKey, title, etc. or None if not found or
        the request fails
    """
    try:
        # Get all episodes of the show
        url = f"{self.base_url}/library/metadata/{show_rating_key}/allLeaves"
        response = await http_client.get(url, headers=self._headers)
        leaves = response.json().get('MediaContainer', {}).get('Metadata', [])
        found = next(
            (leaf for leaf in leaves
             if leaf.get('parentIndex') == season and leaf.get('index') == episode),
            None,
        )
        if found is None:
            return None
        return {
            'ratingKey': found.get('ratingKey'),
            'title': found.get('title'),
            'season': season,
            'episode': episode,
            'show_rating_key': show_rating_key,
            'type': 'episode',
        }
    except Exception as e:
        logger.debug(f"Episode search failed for S{season}E{episode}: {e}")
        return None
async def get_all_episodes(self, show_rating_key: str) -> Dict[tuple, Dict]:
    """
    Fetch every episode of a show, keyed by (season, episode).

    Args:
        show_rating_key: Plex ratingKey for the show

    Returns:
        Dict mapping (season_num, episode_num) to episode info; entries
        lacking either number are left out. Empty dict on failure.
    """
    episodes_map = {}
    try:
        url = f"{self.base_url}/library/metadata/{show_rating_key}/allLeaves"
        response = await http_client.get(url, headers=self._headers)
        for leaf in response.json().get('MediaContainer', {}).get('Metadata', []):
            season, episode = leaf.get('parentIndex'), leaf.get('index')
            if season is None or episode is None:
                continue
            episodes_map[(season, episode)] = {
                'ratingKey': leaf.get('ratingKey'),
                'title': leaf.get('title'),
                'season': season,
                'episode': episode,
                'show_rating_key': show_rating_key,
                'air_date': leaf.get('originallyAvailableAt'),
            }
        logger.debug(f"Found {len(episodes_map)} episodes for show {show_rating_key}")
        return episodes_map
    except Exception as e:
        logger.debug(f"Failed to get episodes for show {show_rating_key}: {e}")
        return {}
def get_watch_url(self, rating_key: str) -> str:
    """
    Build a Plex Web link for an item when no machine identifier is known.

    Args:
        rating_key: Plex ratingKey for the item

    Returns:
        URL under this client's base URL pointing at the item details page
    """
    # Plex Web URL format: /web/index.html#!/server/{machineId}/details?key=/library/metadata/{ratingKey}
    # The server segment is the fixed placeholder "1" here.
    watch_url = f"{self.base_url}/web/index.html#!/server/1/details?key=%2Flibrary%2Fmetadata%2F{rating_key}"
    return watch_url
async def get_server_identity(self) -> Optional[Dict]:
    """
    Read the server's ``/identity`` endpoint.

    Returns:
        Dict with machineIdentifier, friendlyName and version taken from
        the response's MediaContainer, or None when the request fails
    """
    try:
        url = f"{self.base_url}/identity"
        payload = (await http_client.get(url, headers=self._headers)).json()
        container = payload.get('MediaContainer', {})
        wanted_fields = ('machineIdentifier', 'friendlyName', 'version')
        return {field: container.get(field) for field in wanted_fields}
    except Exception as e:
        logger.error(f"Failed to get server identity: {e}")
        return None
def get_full_watch_url(self, rating_key: str, machine_id: str) -> str:
    """
    Build a Plex Web link that addresses the server by machine identifier.

    Args:
        rating_key: Plex ratingKey for the item
        machine_id: Plex server machine identifier

    Returns:
        Complete Plex Web URL under this client's base URL
    """
    # "/library/metadata/<key>" with the slashes already percent-encoded
    encoded_key = f"%2Flibrary%2Fmetadata%2F{rating_key}"
    watch_url = f"{self.base_url}/web/index.html#!/server/{machine_id}/details?key={encoded_key}"
    return watch_url
async def search_by_actor(self, actor_name: str) -> List[Dict]:
    """
    Search Plex library for all movies and TV shows featuring an actor.

    Each movie/show library is queried with an ``actor`` filter, then the
    detail metadata of every hit is fetched to find the character name.

    Args:
        actor_name: Name of the actor to search for

    Returns:
        List of appearance dicts (title, year, poster URL, character name,
        Plex keys, ...); empty list when the overall search fails. A
        failure inside one library is logged and that library is skipped.
    """
    appearances = []
    seen_keys = set()  # Track to avoid duplicates
    actor_name_lower = actor_name.lower()
    try:
        # Get all libraries
        libraries = await self.get_libraries()
        for library in libraries:
            lib_key = library.get('id')
            lib_type = library.get('type')
            # Only search movie and show libraries
            if lib_type not in ('movie', 'show'):
                continue
            try:
                # Use actor filter to find all content featuring this actor
                url = f"{self.base_url}/library/sections/{lib_key}/all"
                params = {
                    'type': 1 if lib_type == 'movie' else 2,  # 1=movie, 2=show
                    'actor': actor_name,  # Plex accepts actor name directly
                }
                response = await http_client.get(url, headers=self._headers, params=params)
                data = response.json()
                items = data.get('MediaContainer', {}).get('Metadata', [])
                logger.debug(f"Found {len(items)} {lib_type}s for '{actor_name}' in library {library.get('title')}")
                for item in items:
                    rating_key = item.get('ratingKey')
                    if not rating_key or rating_key in seen_keys:
                        continue
                    seen_keys.add(rating_key)
                    # Get detailed metadata for character name
                    detail_url = f"{self.base_url}/library/metadata/{rating_key}"
                    detail_response = await http_client.get(detail_url, headers=self._headers)
                    detail_data = detail_response.json()
                    detail_items = detail_data.get('MediaContainer', {}).get('Metadata', [])
                    if not detail_items:
                        continue
                    detail = detail_items[0]
                    # Find the actor's role/character name
                    character_name = None
                    for role in detail.get('Role', []):
                        role_tag = (role.get('tag') or '').lower()
                        # An empty string is a substring of every string, so a
                        # role without a tag would otherwise "match" any actor.
                        if not role_tag:
                            continue
                        if actor_name_lower in role_tag or role_tag in actor_name_lower:
                            character_name = role.get('role')
                            break
                    # Build poster URL with auth token
                    thumb = detail.get('thumb')
                    poster_url = None
                    if thumb:
                        poster_url = f"{self.base_url}{thumb}?X-Plex-Token={self.token}"
                    # Build appearance data
                    appearance = {
                        'appearance_type': 'Movie' if lib_type == 'movie' else 'TV',
                        'show_name': detail.get('title'),
                        'episode_title': None,
                        'network': detail.get('studio'),
                        'appearance_date': detail.get('originallyAvailableAt'),
                        'year': detail.get('year'),
                        'status': 'aired',
                        'description': detail.get('summary'),
                        'poster_url': poster_url,
                        'credit_type': 'acting',
                        'character_name': character_name,
                        'plex_rating_key': rating_key,
                        'plex_library_id': lib_key,
                        'source': 'plex',
                    }
                    # For TV shows, get episode count
                    if lib_type == 'show':
                        appearance['episode_count'] = detail.get('leafCount', 1)
                    appearances.append(appearance)
                    logger.info(f"Found Plex appearance: {actor_name} in '{detail.get('title')}'" +
                                (f" as {character_name}" if character_name else ""))
                    # Small delay between detail requests
                    await asyncio.sleep(0.02)
            except Exception as e:
                logger.debug(f"Error searching library {lib_key}: {e}")
                continue
        logger.info(f"Found {len(appearances)} Plex appearances for {actor_name}")
        return appearances
    except Exception as e:
        logger.error(f"Failed to search Plex by actor: {e}")
        return []
async def batch_match_appearances(self, appearances: List[Dict], on_match=None) -> Dict[int, Dict]:
    """
    Match multiple appearances to Plex library items.

    Each title is searched by TMDB ID first, then by title (with the year
    from ``release_date`` when present). For TV appearances that carry a
    season and episode number, the specific episode is looked up in the
    show's episode list, which is fetched once per show.

    Args:
        appearances: List of appearance dicts with tmdb_show_id or tmdb_movie_id
        on_match: Optional async callback(appearance_id, match_info) called for each match

    Returns:
        Dict mapping appearance ID to Plex match info (plex_rating_key,
        plex_show_rating_key, plex_library_id, plex_watch_url)
    """
    matches = {}
    server_info = await self.get_server_identity()
    machine_id = server_info.get('machineIdentifier') if server_info else None
    # (tmdb_id, media_type) -> Plex item or None, so each title is searched once
    tmdb_cache: Dict[tuple, Optional[Dict]] = {}
    # show ratingKey -> {(season, episode): episode info}, one download per show
    episode_cache: Dict[str, Dict[tuple, Dict]] = {}
    for appearance in appearances:
        appearance_id = appearance.get('id')
        if not appearance_id:
            continue
        # Determine media type and TMDB ID
        tmdb_id = appearance.get('tmdb_movie_id') or appearance.get('tmdb_show_id')
        is_movie = appearance.get('appearance_type') == 'Movie'
        media_type = 'movie' if is_movie else 'show'
        if not tmdb_id:
            continue
        cache_key = (tmdb_id, media_type)
        if cache_key in tmdb_cache:
            plex_item = tmdb_cache[cache_key]
        else:
            # Rate limiting
            await asyncio.sleep(0.1)
            # Try TMDB ID first
            plex_item = await self.search_by_tmdb_id(tmdb_id, media_type)
            # Fall back to title search if no TMDB match
            if not plex_item:
                title = appearance.get('movie_name') or appearance.get('show_name')
                year = None
                if appearance.get('release_date'):
                    try:
                        year = int(appearance['release_date'][:4])
                    except (ValueError, TypeError):
                        pass  # unparseable date: search without a year
                if title:
                    plex_item = await self.search_by_title(title, year, media_type)
            tmdb_cache[cache_key] = plex_item
        if not plex_item:
            continue
        show_rating_key = plex_item.get('ratingKey')  # Always the show/movie key
        rating_key = show_rating_key if is_movie else None  # Movies get the key, TV starts with None
        library_id = plex_item.get('librarySectionID')
        # For TV shows with season/episode data, try to match the specific episode
        season = appearance.get('season_number')
        episode = appearance.get('episode_number')
        # `is not None` keeps season 0 (specials) and episode 0 eligible
        if not is_movie and season is not None and episode is not None:
            show_key = str(show_rating_key)
            if show_key not in episode_cache:
                episode_cache[show_key] = await self.get_all_episodes(show_rating_key)
            episode_item = episode_cache[show_key].get((season, episode))
            if episode_item:
                rating_key = episode_item.get('ratingKey')  # Episode-specific key
            # If episode not found, rating_key stays None - episode not in Plex
        if not rating_key:
            watch_url = None
        elif machine_id:
            watch_url = self.get_full_watch_url(rating_key, machine_id)
        else:
            watch_url = self.get_watch_url(rating_key)
        match_info = {
            'plex_rating_key': rating_key,  # Episode key if found, movie key for movies, None for missing TV episodes
            'plex_show_rating_key': show_rating_key if not is_movie else None,  # Show key for TV (for series-level navigation)
            'plex_library_id': library_id,
            'plex_watch_url': watch_url,
        }
        matches[appearance_id] = match_info
        # Call the on_match callback for real-time updates
        if on_match:
            await on_match(appearance_id, match_info)
    logger.info(f"Matched {len(matches)} of {len(appearances)} appearances to Plex library")
    return matches

445
modules/podchaser_client.py Normal file
View File

@@ -0,0 +1,445 @@
"""Podchaser GraphQL API client for podcast guest appearances tracking"""
import asyncio
from datetime import datetime, timedelta
from typing import Dict, List, Optional
from web.backend.core.http_client import http_client
from modules.universal_logger import get_logger
logger = get_logger('Podchaser')
class PodchaserClient:
    """Client for interacting with the Podchaser GraphQL API"""

    API_URL = "https://api.podchaser.com/graphql"

    def __init__(self, api_key: str):
        """
        Keep the token and prepare the headers sent with every query.

        Args:
            api_key: Bearer token for the API. Despite the name this is the
                access token (already exchanged from client credentials).
        """
        request_headers = {}
        request_headers["Authorization"] = f"Bearer {api_key}"
        request_headers["Content-Type"] = "application/json"
        self.api_key = api_key
        self.headers = request_headers
@classmethod
async def from_client_credentials(cls, client_id: str, client_secret: str):
    """
    Build a client by exchanging client credentials for an access token.

    Args:
        client_id: Podchaser client ID
        client_secret: Podchaser client secret

    Returns:
        PodchaserClient instance constructed with the returned access token

    Raises:
        Exception: when the response contains errors or no access token;
            any error from the request itself is logged and re-raised
    """
    from web.backend.core.http_client import http_client
    mutation = """
mutation GetToken($client_id: String!, $client_secret: String!) {
requestAccessToken(
input: {
grant_type: CLIENT_CREDENTIALS
client_id: $client_id
client_secret: $client_secret
}
) {
access_token
}
}
"""
    request_body = {
        "query": mutation,
        "variables": {
            "client_id": client_id,
            "client_secret": client_secret
        },
    }
    try:
        response = await http_client.post(
            cls.API_URL,
            json=request_body,
            headers={"Content-Type": "application/json"}
        )
        data = response.json()
        if "errors" in data:
            logger.error(f"Failed to get Podchaser access token: {data['errors']}")
            raise Exception(f"Podchaser authentication failed: {data['errors']}")
        token_container = data.get("data", {}).get("requestAccessToken", {})
        access_token = token_container.get("access_token")
        if not access_token:
            raise Exception("No access token returned from Podchaser")
        logger.info("Successfully obtained Podchaser access token")
        return cls(access_token)
    except Exception as e:
        # Logged here and handed back to the caller unchanged
        logger.error(f"Error getting Podchaser access token: {e}")
        raise
async def _execute_query(self, query: str, variables: Optional[Dict] = None) -> Dict:
    """
    POST a GraphQL document and return the response's ``data`` member.

    Args:
        query: GraphQL query text
        variables: Optional variables; only sent when truthy

    Returns:
        The ``data`` value of the response; an empty dict when the
        response carries ``errors`` or the request raises
    """
    payload = {"query": query}
    if variables:
        payload["variables"] = variables
    try:
        response = await http_client.post(
            self.API_URL,
            json=payload,
            headers=self.headers
        )
        data = response.json()
        if "errors" not in data:
            return data.get("data", {})
        logger.error(f"GraphQL errors: {data['errors']}")
        return {}
    except Exception as e:
        logger.error(f"Podchaser API error: {e}")
        return {}
async def search_creator_by_creators_endpoint(self, name: str) -> Optional[Dict]:
    """
    Search for a creator using the creators endpoint.

    Up to 10 candidates are requested. An exact, case-insensitive name
    match (ignoring surrounding whitespace) is preferred; otherwise the
    first candidate is returned.

    Args:
        name: Creator name to search for

    Returns:
        The chosen creator dict, or None when the response has no
        candidates (including a null ``creators`` or ``data`` member)
    """
    query = """
query FindCreator($term: String!) {
creators(searchTerm: $term, first: 10) {
data {
pcid
name
informalName
subtitle
imageUrl
url
episodeAppearanceCount
}
}
}
"""
    variables = {"term": name}
    data = await self._execute_query(query, variables)
    # `or` guards: GraphQL may return explicit nulls at any level
    creators = ((data or {}).get("creators") or {}).get("data") or []
    if not creators:
        return None
    # Prefer exact case-insensitive match
    name_lower = name.strip().lower()
    for creator in creators:
        if creator.get("name") and creator["name"].strip().lower() == name_lower:
            logger.info(f"Found exact creator match: {creator['name']} (pcid: {creator.get('pcid')})")
            return creator
    # Return first result if no exact match
    first = creators[0]
    logger.info(f"Found creator: {first.get('name')} (pcid: {first.get('pcid')})")
    return first
async def search_creator(self, name: str) -> Optional[Dict]:
    """
    Find a creator by name.

    Delegates to ``search_creator_by_creators_endpoint`` and returns
    whatever it returns (a creator dict or None).
    """
    creator = await self.search_creator_by_creators_endpoint(name)
    return creator
async def get_creator_guest_appearances(self, creator_id: str, days_back: int = 30, days_ahead: int = 365) -> List[Dict]:
    """
    Get all guest AND host appearances (episodeCredits) for a creator.

    Pages through the creator's episode credits (20 per page, at most 10
    pages) and keeps episodes whose air date falls inside the window
    ``[today - days_back, today + days_ahead]``. Credits without an
    episode or a parseable air date are skipped. Null members in the
    response are treated as empty.

    Args:
        creator_id: Podchaser creator ID
        days_back: How many days in the past to search
        days_ahead: How many days in the future to search

    Returns:
        List of episode appearances with metadata (both guest and host roles)
    """
    today = datetime.now().date()
    cutoff_past = today - timedelta(days=days_back)
    cutoff_future = today + timedelta(days=days_ahead)
    query = """
query GetCreatorAppearances($creatorId: String!, $page: Int) {
creator(identifier: {type: PCID, id: $creatorId}) {
pcid
name
episodeCredits(
filters: { role: ["guest", "host"] }
first: 20
page: $page
sort: {sortBy: DATE, direction: DESCENDING}
) {
data {
role {
code
title
}
episode {
id
title
description
url
imageUrl
audioUrl
airDate
podcast {
id
title
imageUrl
url
categories {
title
slug
}
}
}
}
paginatorInfo {
currentPage
hasMorePages
lastPage
}
}
}
}
"""
    page = 1
    max_pages = 10  # Limit to prevent excessive API calls
    appearances = []
    while page <= max_pages:
        variables = {
            "creatorId": str(creator_id),
            "page": page
        }
        data = await self._execute_query(query, variables)
        if not data or "creator" not in data or not data["creator"]:
            break
        creator_data = data["creator"]
        # `or {}` / `or []` guard against explicit nulls in the response
        credits_page = creator_data.get("episodeCredits") or {}
        episode_credits = credits_page.get("data") or []
        logger.info(f"Fetched {len(episode_credits)} episodes from Podchaser (page {page})")
        for credit in episode_credits:
            episode = (credit or {}).get("episode")
            if not episode:
                continue
            # Check air date
            air_date_str = episode.get("airDate")
            if not air_date_str:
                continue
            try:
                # Handle both "YYYY-MM-DD" and "YYYY-MM-DD HH:MM:SS" formats
                # by parsing only the first 10 characters
                air_date = datetime.strptime(air_date_str[:10], "%Y-%m-%d").date()
            except (ValueError, TypeError) as e:
                logger.debug(f"Date parse error for episode: {e}")
                continue
            # Only include episodes within our time window
            if not (cutoff_past <= air_date <= cutoff_future):
                continue
            podcast = episode.get("podcast") or {}
            role_obj = credit.get("role")
            role_name = role_obj.get("title") if isinstance(role_obj, dict) else None
            appearances.append({
                "podchaser_episode_id": episode.get("id"),
                "episode_title": episode.get("title"),
                "podcast_name": podcast.get("title"),
                "description": episode.get("description"),
                "air_date": air_date_str,
                "episode_url": episode.get("url"),
                "audio_url": episode.get("audioUrl"),
                "poster_url": episode.get("imageUrl") or podcast.get("imageUrl"),
                "role": role_name,
                "podchaser_podcast_id": podcast.get("id"),
            })
        # Check if there are more pages
        paginator = credits_page.get("paginatorInfo") or {}
        if not paginator.get("hasMorePages"):
            break
        page += 1
        await asyncio.sleep(0.15)  # Rate limiting
    logger.info(f"Returning {len(appearances)} guest/host appearances for creator {creator_id}")
    return appearances
async def get_creator_podcast_episodes(self, creator_name: str, days_back: int = 30, days_ahead: int = 365) -> List[Dict]:
    """
    Get podcast episodes where the creator is a host.

    Searches podcasts by the creator's name (first 5 results), keeps those
    with a ``host`` credit whose creator name contains, or is contained
    in, ``creator_name`` (case-insensitive), and returns their episodes
    whose air date falls inside ``[today - days_back, today + days_ahead]``.
    Null members in the response are treated as empty.

    Args:
        creator_name: Creator's name to search for
        days_back: How many days in the past to search
        days_ahead: How many days in the future to search

    Returns:
        List of podcast episodes with metadata
    """
    today = datetime.now().date()
    cutoff_past = today - timedelta(days=days_back)
    cutoff_future = today + timedelta(days=days_ahead)
    # Search for podcasts by creator name
    query = """
query SearchPodcastByHost($searchTerm: String!) {
podcasts(searchTerm: $searchTerm, first: 5) {
data {
id
title
imageUrl
url
credits(first: 20) {
data {
role {
code
title
}
creator {
pcid
name
}
}
}
episodes(first: 50, sort: {sortBy: AIR_DATE, direction: DESCENDING}) {
data {
id
title
description
url
imageUrl
audioUrl
airDate
}
}
}
}
}
"""
    variables = {"searchTerm": creator_name}
    data = await self._execute_query(query, variables)
    # `or` guards: any level of the response may be an explicit null
    podcasts = ((data or {}).get("podcasts") or {}).get("data") or []
    wanted_name = creator_name.lower()
    appearances = []
    for podcast in podcasts:
        # Check if the creator is a host of this podcast
        is_host = False
        host_role = None
        for credit in (podcast.get("credits") or {}).get("data") or []:
            creator = credit.get("creator") or {}
            role = credit.get("role") or {}
            credited_name = creator.get("name")
            if role.get("code") != "host" or not credited_name:
                continue
            credited_lower = credited_name.lower()
            if wanted_name in credited_lower or credited_lower in wanted_name:
                is_host = True
                host_role = role.get("title")
                break
        if not is_host:
            continue
        # Get episodes from this podcast
        for episode in (podcast.get("episodes") or {}).get("data") or []:
            air_date_str = episode.get("airDate")
            if not air_date_str:
                continue
            try:
                # Handle both "YYYY-MM-DD" and "YYYY-MM-DD HH:MM:SS" formats
                # by parsing only the first 10 characters
                air_date = datetime.strptime(air_date_str[:10], "%Y-%m-%d").date()
            except (ValueError, TypeError):
                continue
            # Only include episodes within our time window
            if not (cutoff_past <= air_date <= cutoff_future):
                continue
            appearances.append({
                "podchaser_episode_id": episode.get("id"),
                "episode_title": episode.get("title"),
                "podcast_name": podcast.get("title"),
                "description": episode.get("description"),
                "air_date": air_date_str,
                "episode_url": episode.get("url"),
                "audio_url": episode.get("audioUrl"),
                "poster_url": episode.get("imageUrl") or podcast.get("imageUrl"),
                "role": host_role,
                "podchaser_podcast_id": podcast.get("id"),
            })
    return appearances
async def find_upcoming_podcast_appearances(self, creator_id: str, creator_name: str = None) -> List[Dict]:
    """
    Collect a creator's recent and upcoming podcast appearances.

    Combines credited appearances (episodeCredits) with episodes of
    podcasts the creator hosts, both limited to 365 days back and 365 days
    ahead. Entries sharing an episode ID are collapsed to the one with the
    longer description, and the result is ordered by air date, newest first.

    Args:
        creator_id: Podchaser creator ID (pcid)
        creator_name: Creator's name (required for podcast search; the
            hosted-podcast lookup is skipped when it is empty)
    """
    def description_length(entry):
        # Length of the stringified description (a None description counts as "None")
        return len(str(entry.get("description", "")))

    guest_appearances = await self.get_creator_guest_appearances(
        creator_id,
        days_back=365,  # Look back 1 year for recent episodes
        days_ahead=365
    )
    # For hosted episodes, we need the creator name
    if not creator_name:
        logger.warning(f"No creator name provided for {creator_id}, skipping podcast host search")
        hosted_episodes = []
    else:
        hosted_episodes = await self.get_creator_podcast_episodes(
            creator_name,
            days_back=365,  # Look back 1 year for recent episodes
            days_ahead=365
        )
    # Combine and deduplicate by episode ID
    best_by_episode = {}
    for candidate in guest_appearances + hosted_episodes:
        episode_id = candidate.get("podchaser_episode_id")
        if not episode_id:
            continue
        current = best_by_episode.get(episode_id)
        # If duplicate, prefer the one with more info
        if current is None or description_length(candidate) > description_length(current):
            best_by_episode[episode_id] = candidate
    # Sort by air date
    ordered = list(best_by_episode.values())
    ordered.sort(key=lambda x: x.get("air_date", ""), reverse=True)
    return ordered

View File

@@ -0,0 +1,873 @@
#!/usr/bin/env python3
"""
Private Gallery Encryption Module
Provides security features for the Private Gallery:
- Password hashing with bcrypt
- Key derivation with Argon2id
- File encryption/decryption with AES-256-GCM
- Field encryption with Fernet
- Session token management
"""
import os
import secrets
import hashlib
import base64
import time
from datetime import datetime, timedelta
from typing import Optional, Dict, Tuple
from pathlib import Path
from threading import Lock
try:
import bcrypt
except ImportError:
bcrypt = None
try:
from argon2 import PasswordHasher
from argon2.low_level import hash_secret_raw, Type
ARGON2_AVAILABLE = True
except ImportError:
ARGON2_AVAILABLE = False
try:
from cryptography.fernet import Fernet
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
CRYPTO_AVAILABLE = True
except ImportError:
CRYPTO_AVAILABLE = False
from modules.universal_logger import get_logger
logger = get_logger('PrivateGalleryCrypto')
class PrivateGalleryCrypto:
"""
Handles all encryption operations for the Private Gallery.
Security features:
- Passwords hashed with bcrypt (cost factor 12)
- Encryption key derived from password using Argon2id
- Files encrypted with AES-256-GCM
- Database fields encrypted with Fernet (AES-128-CBC + HMAC)
- Session tokens with configurable timeout
"""
# Argon2id parameters (RFC 9106 low-memory "second recommended" profile: t=3, m=64 MiB, p=4)
ARGON2_TIME_COST = 3
ARGON2_MEMORY_COST = 65536 # 64 MiB
ARGON2_PARALLELISM = 4
ARGON2_HASH_LENGTH = 32 # 256 bits for AES-256
# AES-GCM parameters
AES_KEY_SIZE = 32 # 256 bits
AES_NONCE_SIZE = 12 # 96 bits (GCM recommended)
AES_TAG_SIZE = 16 # 128 bits
# Encryption chunk size for streaming large files
CHUNK_SIZE = 8 * 1024 * 1024 # 8 MB chunks
CHUNKED_THRESHOLD = 50 * 1024 * 1024 # Use chunked encryption for files > 50 MB
CHUNKED_MAGIC = b'\x01PGCE' # Magic bytes: version 1, Private Gallery Chunked Encryption
def __init__(self):
    """
    Create a locked instance: no sessions and no key material loaded.

    Logs a warning when bcrypt or argon2-cffi is unavailable.

    Raises:
        ImportError: when the cryptography library is unavailable
    """
    self._sessions: Dict[str, Dict] = {}  # token -> {expiry, username}
    self._session_lock = Lock()
    # Key material stays unset until initialize_encryption is called
    self._derived_key: Optional[bytes] = None
    self._fernet: Optional[Fernet] = None
    self._aesgcm: Optional[AESGCM] = None
    # Check dependencies
    optional_dependencies = (
        (bcrypt, "bcrypt not available - password hashing will use fallback"),
        (ARGON2_AVAILABLE, "argon2-cffi not available - key derivation will use PBKDF2"),
    )
    for available, message in optional_dependencies:
        if not available:
            logger.warning(message)
    if not CRYPTO_AVAILABLE:
        raise ImportError("cryptography library required for Private Gallery")
# =========================================================================
# PASSWORD HASHING (bcrypt)
# =========================================================================
def hash_password(self, password: str) -> str:
    """
    Produce a storable hash for a password.

    Uses bcrypt with cost factor 12 when bcrypt is importable; otherwise
    PBKDF2-HMAC-SHA256 with a random 16-byte salt and 600000 iterations,
    encoded as ``pbkdf2$<b64 salt>$<b64 key>``.

    Args:
        password: Plain text password

    Returns:
        Hash string (includes the salt)
    """
    if not bcrypt:
        # Fallback to PBKDF2 if bcrypt not available
        salt = secrets.token_bytes(16)
        kdf = PBKDF2HMAC(
            algorithm=hashes.SHA256(),
            length=32,
            salt=salt,
            iterations=600000,
        )
        key = kdf.derive(password.encode('utf-8'))
        return f"pbkdf2${base64.b64encode(salt).decode()}${base64.b64encode(key).decode()}"
    bcrypt_salt = bcrypt.gensalt(rounds=12)
    return bcrypt.hashpw(password.encode('utf-8'), bcrypt_salt).decode('utf-8')
def verify_password(self, password: str, password_hash: str) -> bool:
    """
    Check a password against a stored hash.

    Hashes starting with ``pbkdf2$`` are verified with PBKDF2; anything
    else is handed to bcrypt when it is available.

    Args:
        password: Plain text password to check
        password_hash: Stored hash to verify against

    Returns:
        True if password matches; False on mismatch, malformed hash,
        missing bcrypt, or any error
    """
    try:
        if not password_hash.startswith('pbkdf2$'):
            if not bcrypt:
                return False
            return bcrypt.checkpw(
                password.encode('utf-8'),
                password_hash.encode('utf-8')
            )
        # PBKDF2 fallback hash: "pbkdf2$<salt>$<key>"
        parts = password_hash.split('$')
        if len(parts) != 3:
            return False
        salt = base64.b64decode(parts[1])
        stored_key = base64.b64decode(parts[2])
        kdf = PBKDF2HMAC(
            algorithm=hashes.SHA256(),
            length=32,
            salt=salt,
            iterations=600000,
        )
        try:
            kdf.verify(password.encode('utf-8'), stored_key)
        except Exception:
            return False
        return True
    except Exception as e:
        logger.error(f"Password verification failed: {e}")
        return False
# =========================================================================
# KEY DERIVATION (Argon2id or PBKDF2)
# =========================================================================
def derive_key(self, password: str, salt: bytes) -> bytes:
    """
    Turn a password and salt into a raw encryption key.

    Uses Argon2id with the class-level ARGON2_* parameters when argon2 is
    available, otherwise PBKDF2-HMAC-SHA256 with 600000 iterations.

    Args:
        password: User's password
        salt: Random salt (should be stored)

    Returns:
        Derived key bytes (ARGON2_HASH_LENGTH or AES_KEY_SIZE long)
    """
    secret = password.encode('utf-8')
    if not ARGON2_AVAILABLE:
        # Fallback to PBKDF2 with high iterations
        kdf = PBKDF2HMAC(
            algorithm=hashes.SHA256(),
            length=self.AES_KEY_SIZE,
            salt=salt,
            iterations=600000,  # OWASP recommended minimum
        )
        return kdf.derive(secret)
    return hash_secret_raw(
        secret=secret,
        salt=salt,
        time_cost=self.ARGON2_TIME_COST,
        memory_cost=self.ARGON2_MEMORY_COST,
        parallelism=self.ARGON2_PARALLELISM,
        hash_len=self.ARGON2_HASH_LENGTH,
        type=Type.ID  # Argon2id
    )
def generate_salt(self) -> bytes:
    """Return 16 random bytes from the ``secrets`` module for use as a salt."""
    salt = secrets.token_bytes(16)
    return salt
def initialize_encryption(self, password: str, salt: bytes) -> None:
    """
    Derive the key and set up the field and file ciphers.

    Must be called after successful unlock.

    Args:
        password: User's password
        salt: Stored salt for key derivation
    """
    key = self.derive_key(password, salt)
    self._derived_key = key
    # Fernet (field encryption) takes the 32-byte key in url-safe base64 form
    self._fernet = Fernet(base64.urlsafe_b64encode(key))
    # AES-GCM (file encryption) takes the raw key
    self._aesgcm = AESGCM(key)
    logger.info("Encryption initialized successfully")
def clear_encryption(self) -> None:
    """Drop the derived key and both cipher objects held on this instance (on lock)."""
    for attr_name in ('_derived_key', '_fernet', '_aesgcm'):
        setattr(self, attr_name, None)
    logger.info("Encryption keys cleared")
def is_initialized(self) -> bool:
    """Report whether a derived key is currently held (i.e. unlocked)."""
    key = self._derived_key
    return key is not None
# =========================================================================
# FIELD ENCRYPTION (Fernet - for database fields)
# =========================================================================
def encrypt_field(self, plaintext: str) -> str:
    """
    Encrypt a database field value.

    Args:
        plaintext: Plain text to encrypt

    Returns:
        Base64-encoded encrypted string ("" when plaintext is empty)

    Raises:
        RuntimeError: If encryption has not been initialized
    """
    fernet = self._fernet
    if not fernet:
        raise RuntimeError("Encryption not initialized - call initialize_encryption first")

    if not plaintext:
        return ""

    token = fernet.encrypt(plaintext.encode('utf-8'))
    # The token gets one more URL-safe base64 layer before being stored as text
    return base64.urlsafe_b64encode(token).decode('utf-8')
def decrypt_field(self, ciphertext: str) -> str:
    """
    Decrypt a database field value.

    Args:
        ciphertext: Base64-encoded encrypted string

    Returns:
        Decrypted plain text; "" for empty input; the literal
        "[Decryption Error]" when decoding or decryption fails

    Raises:
        RuntimeError: If encryption has not been initialized
    """
    fernet = self._fernet
    if not fernet:
        raise RuntimeError("Encryption not initialized - call initialize_encryption first")

    if not ciphertext:
        return ""

    try:
        # Strip the outer base64 layer, then let Fernet verify and decrypt
        token = base64.urlsafe_b64decode(ciphertext.encode('utf-8'))
        return fernet.decrypt(token).decode('utf-8')
    except Exception as e:
        logger.error(f"Field decryption failed: {e}")
        return "[Decryption Error]"
# =========================================================================
# FILE ENCRYPTION (AES-256-GCM)
# =========================================================================
def encrypt_file(self, input_path: Path, output_path: Path) -> bool:
    """
    Encrypt a file using AES-GCM.

    Files up to CHUNKED_THRESHOLD bytes use the single-shot format:
        [nonce][encrypted data + tag]

    Larger files use the chunked format so the whole file is never held
    in memory:
        [CHUNKED_MAGIC][4-byte chunk_size BE]
        [nonce][4-byte encrypted_len BE][encrypted chunk + tag]  (repeated)

    Args:
        input_path: Path to plaintext file
        output_path: Path for encrypted output (parent directories are created)

    Returns:
        True if successful; False on any error, in which case a partially
        written output file is removed

    Raises:
        RuntimeError: If encryption has not been initialized
    """
    aesgcm = self._aesgcm
    if not aesgcm:
        raise RuntimeError("Encryption not initialized")

    try:
        total_size = input_path.stat().st_size
        output_path.parent.mkdir(parents=True, exist_ok=True)

        if total_size > self.CHUNKED_THRESHOLD:
            # Large file: chunked encryption
            import struct

            with open(input_path, 'rb') as src, open(output_path, 'wb') as dst:
                # Header
                dst.write(self.CHUNKED_MAGIC)
                dst.write(struct.pack('>I', self.CHUNK_SIZE))

                # One independently-nonced AES-GCM message per chunk
                for block in iter(lambda: src.read(self.CHUNK_SIZE), b''):
                    block_nonce = secrets.token_bytes(self.AES_NONCE_SIZE)
                    sealed = aesgcm.encrypt(block_nonce, block, None)
                    dst.write(block_nonce)
                    dst.write(struct.pack('>I', len(sealed)))
                    dst.write(sealed)
        else:
            # Small file: single-shot encryption (backward compatible)
            single_nonce = secrets.token_bytes(self.AES_NONCE_SIZE)
            with open(input_path, 'rb') as src:
                data = src.read()
            sealed = aesgcm.encrypt(single_nonce, data, None)
            with open(output_path, 'wb') as dst:
                dst.write(single_nonce)
                dst.write(sealed)

        return True
    except Exception as e:
        logger.error(f"File encryption failed: {e}")
        # Clean up partial output
        if output_path.exists():
            try:
                output_path.unlink()
            except Exception:
                pass
        return False
def _is_chunked_format(self, input_path: Path) -> bool:
    """Return True when the file starts with CHUNKED_MAGIC; False otherwise or if it cannot be read."""
    try:
        with open(input_path, 'rb') as fh:
            expected = self.CHUNKED_MAGIC
            return fh.read(len(expected)) == expected
    except Exception:
        return False
def decrypt_file(self, input_path: Path, output_path: Optional[Path] = None) -> Optional[bytes]:
    """
    Decrypt a file encrypted with AES-GCM.

    Handles both single-shot and chunked formats; chunked files are
    delegated to _decrypt_file_chunked.

    Args:
        input_path: Path to encrypted file
        output_path: Optional path to write decrypted file

    Returns:
        Decrypted bytes if output_path is None. When output_path is given,
        None on success. None is also returned on any failure, so a None
        result alone does not distinguish the two.

    Raises:
        RuntimeError: If encryption has not been initialized
    """
    aesgcm = self._aesgcm
    if not aesgcm:
        raise RuntimeError("Encryption not initialized")

    try:
        if self._is_chunked_format(input_path):
            return self._decrypt_file_chunked(input_path, output_path)

        # Single-shot format: [nonce][ciphertext+tag]
        with open(input_path, 'rb') as src:
            nonce = src.read(self.AES_NONCE_SIZE)
            if len(nonce) != self.AES_NONCE_SIZE:
                raise ValueError("Invalid encrypted file: missing nonce")
            sealed = src.read()

        data = aesgcm.decrypt(nonce, sealed, None)

        if not output_path:
            return data

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'wb') as dst:
            dst.write(data)
        return None
    except Exception as e:
        logger.error(f"File decryption failed: {e}")
        return None
def _decrypt_file_chunked(self, input_path: Path, output_path: Optional[Path] = None) -> Optional[bytes]:
    """
    Decrypt a chunked-format encrypted file.

    Layout read: [CHUNKED_MAGIC][4-byte chunk_size BE], then repeated
    [nonce][4-byte encrypted_len BE][encrypted chunk + tag] records.

    Args:
        input_path: Path to the chunked encrypted file
        output_path: Optional path to stream the decrypted data to

    Returns:
        Decrypted bytes if output_path is None, else None on success.
        None on any failure. If the failure happens after output_path was
        opened for writing, the partial output file is removed so that a
        truncated plaintext file is never mistaken for a successful result.
    """
    import struct

    fout = None  # set only once this call has created/truncated output_path
    try:
        parts = [] if output_path is None else None

        with open(input_path, 'rb') as fin:
            # Read header
            magic = fin.read(len(self.CHUNKED_MAGIC))
            if magic != self.CHUNKED_MAGIC:
                raise ValueError("Invalid chunked file header")
            chunk_size_bytes = fin.read(4)
            # chunk_size from header is informational here (actual sizes are
            # per-chunk); unpacking still rejects a truncated header
            struct.unpack('>I', chunk_size_bytes)

            if output_path:
                output_path.parent.mkdir(parents=True, exist_ok=True)
                fout = open(output_path, 'wb')

            try:
                while True:
                    # Read chunk: [nonce][4-byte encrypted_len][encrypted data]
                    nonce = fin.read(self.AES_NONCE_SIZE)
                    if len(nonce) == 0:
                        break  # EOF
                    if len(nonce) != self.AES_NONCE_SIZE:
                        raise ValueError("Truncated chunk nonce")

                    enc_len_bytes = fin.read(4)
                    if len(enc_len_bytes) != 4:
                        raise ValueError("Truncated chunk length")
                    enc_len = struct.unpack('>I', enc_len_bytes)[0]

                    encrypted_chunk = fin.read(enc_len)
                    if len(encrypted_chunk) != enc_len:
                        raise ValueError("Truncated chunk data")

                    decrypted_chunk = self._aesgcm.decrypt(nonce, encrypted_chunk, None)

                    if fout:
                        fout.write(decrypted_chunk)
                    else:
                        parts.append(decrypted_chunk)
            finally:
                if fout:
                    fout.close()

        if output_path:
            return None
        return b''.join(parts)
    except Exception as e:
        logger.error(f"Chunked file decryption failed for {input_path}: {type(e).__name__}: {e}")
        # Remove the partial plaintext we started writing; callers get None
        # for both success and failure when output_path is used.
        if fout is not None:
            try:
                output_path.unlink()
            except OSError:
                pass
        return None
def re_encrypt_to_chunked(self, file_path: Path) -> bool:
    """
    Re-encrypt a single-shot encrypted file to chunked format in-place.

    The single-shot ciphertext is read and decrypted as a whole, so the
    complete plaintext is held in memory while it is re-encrypted chunk by
    chunk into a temporary sibling file. That file then replaces the
    original via Path.replace.

    Args:
        file_path: Path to the single-shot encrypted file

    Returns:
        True if successful, False if already chunked or on error

    Raises:
        RuntimeError: If encryption has not been initialized
    """
    aesgcm = self._aesgcm
    if not aesgcm:
        raise RuntimeError("Encryption not initialized")

    if self._is_chunked_format(file_path):
        return False  # Already chunked

    import struct

    temp_path = file_path.with_suffix(f'.enc.{secrets.token_hex(4)}.tmp')

    try:
        # Decrypt the single-shot file fully (required by AES-GCM)
        with open(file_path, 'rb') as src:
            old_nonce = src.read(self.AES_NONCE_SIZE)
            if len(old_nonce) != self.AES_NONCE_SIZE:
                raise ValueError("Invalid encrypted file")
            sealed = src.read()

        plaintext = aesgcm.decrypt(old_nonce, sealed, None)
        del sealed  # Free memory

        # Write chunked format to temp file
        with open(temp_path, 'wb') as dst:
            dst.write(self.CHUNKED_MAGIC)
            dst.write(struct.pack('>I', self.CHUNK_SIZE))

            total = len(plaintext)
            position = 0
            while position < total:
                piece = plaintext[position:position + self.CHUNK_SIZE]
                position += len(piece)

                piece_nonce = secrets.token_bytes(self.AES_NONCE_SIZE)
                sealed_piece = aesgcm.encrypt(piece_nonce, piece, None)

                dst.write(piece_nonce)
                dst.write(struct.pack('>I', len(sealed_piece)))
                dst.write(sealed_piece)

        del plaintext  # Free memory

        # Swap the finished temp file into place
        temp_path.replace(file_path)
        return True
    except Exception as e:
        logger.error(f"Re-encryption to chunked failed for {file_path}: {e}")
        if temp_path.exists():
            try:
                temp_path.unlink()
            except Exception:
                pass
        return False
def decrypt_file_streaming(self, input_path: Path) -> Optional[bytes]:
    """
    Decrypt a file fully into memory and return the bytes.

    This is decrypt_file() with no output path, so the whole plaintext is
    materialized at once. Only suitable for small files; for large chunked
    files, use decrypt_file_generator() instead.

    Args:
        input_path: Path to encrypted file

    Returns:
        Decrypted bytes or None on error
    """
    decrypted = self.decrypt_file(input_path, output_path=None)
    return decrypted
def decrypt_file_generator(self, input_path: Path):
    """
    Generator that yields decrypted data for streaming.

    For chunked files, yields one decrypted chunk at a time.
    For single-shot files, yields the entire content as a single item.
    Because this is a generator, the checks below run on first iteration,
    not when the function is called.

    Args:
        input_path: Path to encrypted file

    Yields:
        bytes: Decrypted data chunks

    Raises:
        RuntimeError: If encryption has not been initialized
        ValueError: If the file is truncated or has no nonce
    """
    import struct

    if not self._aesgcm:
        raise RuntimeError("Encryption not initialized")

    nonce_len = self.AES_NONCE_SIZE

    if not self._is_chunked_format(input_path):
        # Single-shot: yield everything at once
        with open(input_path, 'rb') as src:
            nonce = src.read(nonce_len)
            if len(nonce) != nonce_len:
                raise ValueError("Invalid encrypted file: missing nonce")
            sealed = src.read()
            yield self._aesgcm.decrypt(nonce, sealed, None)
        return

    with open(input_path, 'rb') as src:
        # Skip header (magic + 4-byte chunk size)
        src.read(len(self.CHUNKED_MAGIC))
        src.read(4)

        while True:
            nonce = src.read(nonce_len)
            if not nonce:
                break  # clean EOF between records
            if len(nonce) != nonce_len:
                raise ValueError("Truncated chunk nonce")

            length_field = src.read(4)
            if len(length_field) != 4:
                raise ValueError("Truncated chunk length")
            (sealed_len,) = struct.unpack('>I', length_field)

            sealed = src.read(sealed_len)
            if len(sealed) != sealed_len:
                raise ValueError("Truncated chunk data")

            yield self._aesgcm.decrypt(nonce, sealed, None)
def decrypt_file_range_generator(self, input_path: Path, start: int, end: int):
    """
    Generator that yields only the decrypted bytes for a specific byte range.

    For chunked files, only decrypts the necessary chunks and slices them.
    Chunk positions are computed from the chunk size recorded in the file
    header, so files written with a chunk size other than the current
    CHUNK_SIZE are still addressed correctly. This relies on every chunk
    except the last being exactly that size, which is what encrypt_file and
    re_encrypt_to_chunked write.

    For single-shot files, decrypts all and slices.

    Args:
        input_path: Path to encrypted file
        start: Start byte offset (inclusive)
        end: End byte offset (inclusive)

    Yields:
        bytes: Decrypted data for the requested range

    Raises:
        RuntimeError: If encryption has not been initialized
        ValueError: If the header or a chunk is truncated/invalid
    """
    import struct

    if not self._aesgcm:
        raise RuntimeError("Encryption not initialized")

    if not self._is_chunked_format(input_path):
        # Single-shot: decrypt all and slice
        with open(input_path, 'rb') as f:
            nonce = f.read(self.AES_NONCE_SIZE)
            if len(nonce) != self.AES_NONCE_SIZE:
                raise ValueError("Invalid encrypted file: missing nonce")
            ciphertext = f.read()
        plaintext = self._aesgcm.decrypt(nonce, ciphertext, None)
        yield plaintext[start:end + 1]
        return

    # Header: magic + 4-byte chunk_size
    header_size = len(self.CHUNKED_MAGIC) + 4

    with open(input_path, 'rb') as fin:
        # Use the chunk size this file was actually written with
        fin.seek(len(self.CHUNKED_MAGIC))
        chunk_size_bytes = fin.read(4)
        if len(chunk_size_bytes) != 4:
            raise ValueError("Invalid chunked file header")
        chunk_size = struct.unpack('>I', chunk_size_bytes)[0]
        if chunk_size == 0:
            raise ValueError("Invalid chunked file header")

        first_chunk = start // chunk_size
        last_chunk = end // chunk_size

        # Each full encrypted chunk: nonce + 4-byte length + (chunk_size + tag)
        enc_chunk_stride = self.AES_NONCE_SIZE + 4 + chunk_size + self.AES_TAG_SIZE

        for chunk_idx in range(first_chunk, last_chunk + 1):
            # Seek to this chunk's position in the encrypted file
            fin.seek(header_size + chunk_idx * enc_chunk_stride)

            nonce = fin.read(self.AES_NONCE_SIZE)
            if len(nonce) == 0:
                break  # requested range extends past the last chunk
            if len(nonce) != self.AES_NONCE_SIZE:
                raise ValueError("Truncated chunk nonce")

            enc_len_bytes = fin.read(4)
            if len(enc_len_bytes) != 4:
                raise ValueError("Truncated chunk length")
            enc_len = struct.unpack('>I', enc_len_bytes)[0]

            encrypted_chunk = fin.read(enc_len)
            if len(encrypted_chunk) != enc_len:
                raise ValueError("Truncated chunk data")

            decrypted_chunk = self._aesgcm.decrypt(nonce, encrypted_chunk, None)

            # Calculate which part of this chunk we need
            chunk_start_byte = chunk_idx * chunk_size
            slice_start = max(start - chunk_start_byte, 0)
            slice_end = min(end - chunk_start_byte + 1, len(decrypted_chunk))

            yield decrypted_chunk[slice_start:slice_end]
# =========================================================================
# SESSION MANAGEMENT
# =========================================================================
def create_session(self, username: str = "user", timeout_minutes: int = 30) -> str:
    """
    Create a new session token and register it in the session table.

    Args:
        username: Username for the session
        timeout_minutes: Session timeout in minutes

    Returns:
        Session token string
    """
    token = secrets.token_urlsafe(32)
    expires_at = datetime.now() + timedelta(minutes=timeout_minutes)

    with self._session_lock:
        record = dict(expiry=expires_at, username=username, created_at=datetime.now())
        self._sessions[token] = record

    logger.info(f"Created session for {username}, expires in {timeout_minutes} minutes")
    return token
def verify_session(self, token: str) -> Optional[Dict]:
    """
    Look up a session token and check it has not expired.

    An expired session is removed from the session table as a side effect.

    Args:
        token: Session token to verify

    Returns:
        The stored session info dict if valid, None otherwise
    """
    with self._session_lock:
        record = self._sessions.get(token)
        if not record:
            return None

        still_valid = datetime.now() <= record['expiry']
        if still_valid:
            return record

        # Expired - remove it
        self._sessions.pop(token)
        return None
def refresh_session(self, token: str, timeout_minutes: int = 30) -> bool:
    """
    Push a live session's expiry forward from the current time.

    A session that has already expired is removed instead of refreshed.

    Args:
        token: Session token to refresh
        timeout_minutes: New timeout in minutes

    Returns:
        True if refreshed, False if token is unknown or expired
    """
    with self._session_lock:
        record = self._sessions.get(token)
        if not record:
            return False

        expired = record['expiry'] < datetime.now()
        if expired:
            self._sessions.pop(token)
        else:
            record['expiry'] = datetime.now() + timedelta(minutes=timeout_minutes)
        return not expired
def invalidate_session(self, token: str) -> bool:
    """
    Remove a session token from the session table (logout/lock).

    Args:
        token: Session token to invalidate

    Returns:
        True if invalidated, False if not found
    """
    with self._session_lock:
        if token not in self._sessions:
            return False
        self._sessions.pop(token)
        return True
def invalidate_all_sessions(self) -> int:
    """
    Empty the session table (master lock).

    Returns:
        Number of sessions invalidated
    """
    with self._session_lock:
        removed = len(self._sessions)
        self._sessions.clear()
    return removed
def cleanup_expired_sessions(self) -> int:
    """
    Remove every session whose expiry lies before the current time.

    Returns:
        Number of sessions removed
    """
    with self._session_lock:
        cutoff = datetime.now()
        # Collect first: the dict must not change size while being iterated
        stale = [tok for tok, rec in self._sessions.items() if rec['expiry'] < cutoff]
        for tok in stale:
            self._sessions.pop(tok)
        return len(stale)
def get_active_session_count(self) -> int:
    """
    Get count of active (non-expired) sessions.

    Expired sessions are purged first; the remaining table size is then
    read while holding the session lock, as every other session method does.

    Returns:
        Number of sessions left after cleanup
    """
    self.cleanup_expired_sessions()
    # cleanup_expired_sessions acquires the lock itself, so it is called
    # before (not inside) this block.
    with self._session_lock:
        return len(self._sessions)
# Global instance
# Module-wide PrivateGalleryCrypto singleton; stays None until
# get_private_gallery_crypto() creates it on first use.
_crypto_instance: Optional[PrivateGalleryCrypto] = None
# Held while checking/creating the singleton so concurrent first callers
# end up sharing one instance.
_crypto_lock = Lock()
def get_private_gallery_crypto() -> PrivateGalleryCrypto:
    """Return the module-wide crypto instance, creating it under the lock on first use."""
    global _crypto_instance

    with _crypto_lock:
        instance = _crypto_instance
        if instance is None:
            instance = PrivateGalleryCrypto()
            _crypto_instance = instance
        return instance
def export_key_to_file(path: str) -> bool:
    """
    Save the current derived key from the global crypto instance to a file.

    The key is written as JSON ({"derived_key": <base64>}) to a temporary
    sibling file that is created with mode 0600 *before* any key material is
    written, then moved over the destination. On failure the temporary file
    is removed so no partial key file is left behind.

    Args:
        path: File path to write the key material to

    Returns:
        True if successful, False if encryption is not initialized or the
        write fails
    """
    import json as _json

    crypto = get_private_gallery_crypto()
    if not crypto.is_initialized() or crypto._derived_key is None:
        logger.warning("Cannot export key: encryption not initialized")
        return False

    tmp_path = None
    try:
        key_data = {
            'derived_key': base64.b64encode(crypto._derived_key).decode('utf-8')
        }
        key_path = Path(path)
        key_path.parent.mkdir(parents=True, exist_ok=True)

        # Write atomically via temp file
        tmp_path = key_path.with_suffix('.tmp')
        # Create with restrictive permissions up front: open(..., 'w') would
        # use the process umask and expose the key until a later chmod.
        fd = os.open(str(tmp_path), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, 'w') as f:
            # The creation mode is ignored when a stale temp file already
            # exists, so tighten it explicitly before writing the key.
            os.chmod(str(tmp_path), 0o600)
            _json.dump(key_data, f)
        tmp_path.replace(key_path)

        logger.info(f"Exported encryption key to {path}")
        return True
    except Exception as e:
        logger.error(f"Failed to export key to {path}: {e}")
        # Do not leave (possibly partial) key material lying around
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass
        return False
def load_key_from_file(path: str) -> Optional[PrivateGalleryCrypto]:
    """
    Build a ready-to-use crypto instance from a key file.

    The file is expected to be JSON with a base64 'derived_key' entry. A new
    PrivateGalleryCrypto is created (the global instance is not touched) and
    its key, Fernet and AES-GCM members are populated directly.

    Args:
        path: File path containing the key material

    Returns:
        Initialized PrivateGalleryCrypto instance, or None if the file does
        not exist or cannot be loaded
    """
    import json as _json

    key_path = Path(path)
    if not key_path.exists():
        return None

    try:
        with open(key_path, 'r') as fh:
            payload = _json.load(fh)

        raw_key = base64.b64decode(payload['derived_key'])

        instance = PrivateGalleryCrypto()
        instance._derived_key = raw_key
        # Field encryption: Fernet takes the key URL-safe base64-encoded
        instance._fernet = Fernet(base64.urlsafe_b64encode(raw_key))
        # File encryption: AES-GCM uses the raw key bytes
        instance._aesgcm = AESGCM(raw_key)
        return instance
    except Exception as e:
        logger.error(f"Failed to load key from {path}: {e}")
        return None
def delete_key_file(path: str) -> bool:
    """
    Delete the key file if it exists.

    Args:
        path: File path of the key file

    Returns:
        True if the file was removed or was already absent, False on error
    """
    try:
        target = Path(path)
        if not target.exists():
            return True
        target.unlink()
        logger.info(f"Deleted key file {path}")
        return True
    except Exception as e:
        logger.error(f"Failed to delete key file {path}: {e}")
        return False

View File

@@ -0,0 +1,961 @@
#!/usr/bin/env python3
"""
Pushover Notification Module
Sends professional push notifications when new media is downloaded
"""
import os
import requests
from datetime import datetime
from typing import Dict, Optional, Any
from pathlib import Path
from modules.universal_logger import get_logger
logger = get_logger('Notifier')
class PushoverNotifier:
"""Handles Pushover push notifications for media downloads"""
# Pushover API endpoint
API_URL = "https://api.pushover.net/1/messages.json"
# Plural forms for proper grammar
PLURALS = {
'story': 'stories',
'video': 'videos',
'photo': 'photos',
'image': 'images',
'reel': 'reels',
'post': 'posts',
'thread': 'threads',
'item': 'items',
'media': 'media', # Already plural (singular: medium)
'tagged': 'tagged', # "Tagged" doesn't change in plural (7 Tagged Photos)
'audio': 'audio', # Uncountable (3 Audio Downloaded)
}
# Priority levels
PRIORITY_LOW = -2
PRIORITY_NORMAL = -1
PRIORITY_DEFAULT = 0
PRIORITY_HIGH = 1
PRIORITY_EMERGENCY = 2
# Platform emoji/icons for better visual appeal
PLATFORM_ICONS = {
'instagram': '📸',
'fastdl': '📸',
'imginn': '📸',
'toolzu': '📸',
'tiktok': '🎵',
'forums': '💬',
'snapchat': '👻',
'youtube': '▶️',
'twitter': '🐦',
'easynews': '📰',
}
# Platform name mapping (service name -> user-friendly platform name)
PLATFORM_NAMES = {
'fastdl': 'Instagram',
'imginn': 'Instagram',
'toolzu': 'Instagram',
'instagram': 'Instagram',
'tiktok': 'TikTok',
'snapchat': 'Snapchat',
'forums': 'Forum',
'easynews': 'Easynews',
}
# Content type icons
CONTENT_ICONS = {
'post': '🖼️',
'story': '',
'reel': '🎬',
'video': '🎥',
'image': '🖼️',
'thread': '🧵',
'photo': '📷',
'audio': '🎵',
}
def __init__(self, user_key: str, api_token: str, enabled: bool = True,
             default_priority: int = 0, device: str = None, include_image: bool = True,
             unified_db=None, enable_review_queue_notifications: bool = True):
    """
    Initialize Pushover notifier

    Args:
        user_key: Your Pushover user key
        api_token: Your Pushover application API token
        enabled: Whether notifications are enabled
        default_priority: Default notification priority (-2 to 2)
        device: Specific device name to send to (optional)
        include_image: Whether to include image thumbnails in notifications (default: True)
        unified_db: UnifiedDatabase instance for recording notifications (optional)
        enable_review_queue_notifications: Whether to send push notifications for review queue items (default: True)
    """
    # Credentials
    self.user_key = user_key
    self.api_token = api_token

    # Delivery options
    self.enabled = enabled
    self.default_priority = default_priority
    self.device = device
    self.include_image = include_image
    self.enable_review_queue_notifications = enable_review_queue_notifications

    # Optional persistence of sent/failed notifications
    self.unified_db = unified_db

    # Per-instance outcome counters
    self.stats = dict.fromkeys(('sent', 'failed', 'skipped'), 0)

    # Tracking for database recording
    self._current_notification_context = None
def _record_notification(self, title: str, message: str, priority: int, status: str, response_data: dict, image_path: str = None):
    """Record notification to database

    Inserts one row into the notifications table using the platform/source/
    content_type/download_count/metadata held in the current notification
    context, then makes a best-effort broadcast to connected frontends.
    Nothing is recorded when there is no database or no context.

    Once a recording attempt has started, the context is always cleared
    afterwards - including when the insert fails - so a later notification
    can never be recorded with stale context.

    Args:
        title: Notification title
        message: Notification message
        priority: Priority level
        status: Status ('sent' or 'failed')
        response_data: Response from Pushover API
        image_path: Optional path to thumbnail image
    """
    if not self.unified_db:
        logger.debug("[Pushover] No database connection available for recording notification")
        return

    if not self._current_notification_context:
        logger.debug("[Pushover] No notification context available for recording")
        return

    try:
        import json

        context = self._current_notification_context

        # Work on a copy so adding image_path does not mutate the metadata
        # dict owned by whoever set the context
        metadata = dict(context.get('metadata') or {})
        if image_path:
            metadata['image_path'] = str(image_path)

        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                INSERT INTO notifications (
                    platform, source, content_type, message, title,
                    priority, download_count, sent_at, status, response_data, metadata
                ) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), ?, ?, ?)
            """, (
                context.get('platform'),
                context.get('source'),
                context.get('content_type'),
                message,
                title,
                priority,
                context.get('download_count', 1),
                status,
                json.dumps(response_data) if response_data else None,
                json.dumps(metadata) if metadata else None
            ))
            conn.commit()

        logger.info(f"[Pushover] Recorded notification to database: {title} (status: {status})")

        # Broadcast to frontend for real-time toast notification
        try:
            from web.backend.api import manager
            if manager and manager.active_connections:
                manager.broadcast_sync({
                    'type': 'notification_created',
                    'notification': {
                        'title': title,
                        'message': message,
                        'platform': context.get('platform'),
                        'source': context.get('source'),
                        'content_type': context.get('content_type'),
                        'download_count': context.get('download_count', 1),
                        'status': status,
                    }
                })
        except Exception:
            # Fail silently - API may not be running or manager not available
            pass

    except Exception as e:
        logger.warning(f"[Pushover] Failed to record notification to database: {e}")
        import traceback
        logger.warning(f"[Pushover] Traceback: {traceback.format_exc()}")
    finally:
        # Clear context after the attempt (success or failure) to prevent
        # stale data on subsequent notifications
        self._current_notification_context = None
def _get_platform_display_name(self, platform: str, source: str = None) -> str:
    """
    Convert service name to user-friendly platform name

    Args:
        platform: Service/platform name (fastdl, imginn, toolzu, etc.)
        source: Source/username (for forums, this is the forum name)

    Returns:
        User-friendly platform name (Instagram, TikTok, etc.); unknown
        platforms fall back to the title-cased input
    """
    key = platform.lower()

    # For forums, the forum name (source) doubles as the display name
    is_forum = key == 'forums'
    if is_forum and source:
        return source.title()

    fallback = platform.title()
    return self.PLATFORM_NAMES.get(key, fallback)
def _pluralize(self, word: str, count: int) -> str:
    """
    Get the correct plural form of a word

    Args:
        word: Singular word
        count: Count to determine if plural needed

    Returns:
        Singular or plural form ("item"/"items" when word is None or empty)
    """
    # Handle None or empty word
    if not word:
        return "item" if count == 1 else "items"

    if count == 1:
        return word

    lowered = word.lower()

    # Custom plural, title-cased when the input starts with a capital
    custom = self.PLURALS.get(lowered)
    if custom is not None:
        return custom.title() if word[0].isupper() else custom

    # Already a known plural form, or already ends in 's': leave untouched
    if lowered in self.PLURALS.values() or lowered.endswith('s'):
        return word

    # Default: just add 's'
    return f"{word}s"
def _extract_random_video_frame(self, video_path: str) -> Optional[str]:
    """
    Extract a random frame from a video file

    Runs ffprobe to read the video duration, picks a random timestamp in the
    middle 80% of the video, and runs ffmpeg to write that frame to a newly
    created temporary .jpg file.

    Args:
        video_path: Path to the video file

    Returns:
        Path to extracted frame (temp file) or None if extraction failed.
        On success the temp file is left on disk; this method does not
        delete it.
    """
    import subprocess
    import random
    import tempfile

    try:
        # Get video duration using ffprobe
        ffprobe_cmd = [
            'ffprobe',
            '-v', 'error',
            '-show_entries', 'format=duration',
            '-of', 'default=noprint_wrappers=1:nokey=1',
            video_path
        ]

        result = subprocess.run(
            ffprobe_cmd,
            capture_output=True,
            text=True,
            timeout=10
        )

        if result.returncode != 0:
            logger.warning(f"[Pushover] ffprobe failed to get video duration: {result.stderr[:200]}")
            return None

        # Non-numeric ffprobe output raises ValueError here, which is
        # handled by the outer except below
        duration = float(result.stdout.strip())

        # Skip first and last 10% to avoid black frames
        start_offset = duration * 0.1
        end_offset = duration * 0.9

        if end_offset <= start_offset:
            # Video too short, just use middle
            timestamp = duration / 2
        else:
            # Pick random timestamp in the middle 80%
            timestamp = random.uniform(start_offset, end_offset)

        logger.debug(f"[Pushover] Video duration: {duration:.2f}s, extracting frame at {timestamp:.2f}s")

        # Create temp file for the frame
        temp_fd, temp_path = tempfile.mkstemp(suffix='.jpg', prefix='pushover_frame_')
        os.close(temp_fd)  # Close the file descriptor, ffmpeg will write to it

        # Flipped to True only on the successful return so the finally
        # block knows whether to delete the temp file
        success = False
        try:
            # Extract frame using ffmpeg
            ffmpeg_cmd = [
                'ffmpeg',
                '-ss', str(timestamp),  # Seek to timestamp
                '-i', video_path,  # Input file
                '-vframes', '1',  # Extract 1 frame
                '-q:v', '2',  # High quality
                '-y',  # Overwrite output
                temp_path
            ]

            result = subprocess.run(
                ffmpeg_cmd,
                capture_output=True,
                text=True,
                timeout=30
            )

            if result.returncode != 0:
                logger.debug(f"[Pushover] ffmpeg failed: {result.stderr}")
                return None

            # Verify the frame was created
            if Path(temp_path).exists() and Path(temp_path).stat().st_size > 0:
                success = True
                return temp_path
            else:
                logger.debug("[Pushover] Frame extraction produced empty file")
                return None

        except subprocess.TimeoutExpired:
            logger.debug("[Pushover] Video frame extraction timed out")
            return None
        finally:
            # Clean up temp file if extraction failed
            if not success:
                try:
                    Path(temp_path).unlink(missing_ok=True)
                except OSError:
                    pass

    except Exception as e:
        # Covers everything outside the inner try, e.g. ffprobe missing or
        # timing out, or an unparsable duration
        logger.debug(f"[Pushover] Error extracting video frame: {e}")
        return None
def send_notification(self,
                      title: str,
                      message: str,
                      priority: int = None,
                      url: str = None,
                      url_title: str = None,
                      sound: str = None,
                      device: str = None,
                      html: bool = False,
                      image_path: str = None,
                      max_retries: int = 3,
                      retry_delay: int = 5) -> bool:
    """
    Send a Pushover notification with automatic retry on transient failures

    HTTP 5xx responses and connection/timeout errors are retried with
    exponential backoff. API-level errors, other HTTP statuses and
    unexpected exceptions are not retried. Every terminal outcome is counted
    in self.stats and passed to _record_notification.

    The optional image attachment is opened once per attempt and is always
    closed again, including when the HTTP request raises.

    Args:
        title: Notification title
        message: Notification message
        priority: Priority level (-2 to 2)
        url: Supplementary URL
        url_title: Title for the URL
        sound: Notification sound name
        device: Specific device to send to
        html: Enable HTML formatting
        image_path: Path to image file to attach as thumbnail
        max_retries: Maximum number of retry attempts (default 3)
        retry_delay: Initial retry delay in seconds, doubles each retry (default 5)

    Returns:
        True if notification sent successfully
    """
    import time

    if not self.enabled:
        logger.debug("[Pushover] Notifications disabled, skipping")
        self.stats['skipped'] += 1
        return False

    if not self.user_key or not self.api_token:
        logger.warning("[Pushover] Missing user_key or api_token")
        self.stats['failed'] += 1
        return False

    # Normalize priority
    actual_priority = priority if priority is not None else self.default_priority

    # Prepare payload
    payload = {
        'token': self.api_token,
        'user': self.user_key,
        'title': title,
        'message': message,
        'priority': actual_priority,
    }

    # Add optional parameters
    if url:
        payload['url'] = url
    if url_title:
        payload['url_title'] = url_title
    if sound:
        payload['sound'] = sound
    if device or self.device:
        payload['device'] = device or self.device
    if html:
        payload['html'] = 1

    # Attachable image types and the MIME type sent for each
    image_mime_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.webp': 'image/webp',
    }

    # Retry loop with exponential backoff
    for attempt in range(max_retries):
        attachment = None
        try:
            # Check if we have an image to attach
            files = None
            if image_path:
                img_path = Path(image_path)
                mime_type = image_mime_types.get(img_path.suffix.lower())

                # Only attach if file exists and is an image
                if mime_type and img_path.exists():
                    try:
                        # Open and attach the image
                        attachment = open(img_path, 'rb')
                        files = {'attachment': (img_path.name, attachment, mime_type)}
                        logger.debug(f"[Pushover] Attaching image: {img_path.name}")
                    except Exception as e:
                        logger.warning(f"[Pushover] Failed to attach image {image_path}: {e}")

            try:
                response = requests.post(self.API_URL, data=payload, files=files, timeout=30)
            finally:
                # Close the attachment even when the request raises, so
                # failed attempts do not leak one file handle each
                if attachment is not None:
                    attachment.close()

            if response.status_code == 200:
                result = response.json()
                if result.get('status') == 1:
                    request_id = result.get('request', 'unknown')
                    if attempt > 0:
                        logger.info(f"[Pushover] Notification sent after {attempt + 1} attempt(s): {title} (request: {request_id})")
                    else:
                        logger.info(f"[Pushover] Notification sent: {title} (request: {request_id})")
                    self.stats['sent'] += 1

                    # Record to database if available and we have context
                    self._record_notification(title, message, actual_priority, 'sent', result, image_path)

                    return True
                else:
                    # API returned error status - don't retry client errors
                    logger.error(f"[Pushover] API error: {result}")
                    self.stats['failed'] += 1

                    # Record failure to database
                    self._record_notification(title, message, actual_priority, 'failed', result, image_path)

                    return False

            # Handle HTTP errors with retry logic
            elif response.status_code >= 500:
                # Server error (5xx) - retry with backoff
                if attempt < max_retries - 1:
                    wait_time = retry_delay * (2 ** attempt)
                    logger.warning(f"[Pushover] HTTP {response.status_code}: {response.text[:100]}, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
                    time.sleep(wait_time)
                    continue
                else:
                    # Max retries exceeded
                    logger.error(f"[Pushover] HTTP {response.status_code} after {max_retries} attempts: {response.text}")
                    self.stats['failed'] += 1
                    self._record_notification(title, message, actual_priority, 'failed', {'error': f"HTTP {response.status_code} after {max_retries} retries"}, image_path)
                    return False
            else:
                # Client error (4xx) - don't retry
                logger.error(f"[Pushover] HTTP {response.status_code}: {response.text}")
                self.stats['failed'] += 1
                self._record_notification(title, message, actual_priority, 'failed', {'error': response.text}, image_path)
                return False

        except (requests.ConnectionError, requests.Timeout) as e:
            # Network errors - retry with backoff
            if attempt < max_retries - 1:
                wait_time = retry_delay * (2 ** attempt)
                logger.warning(f"[Pushover] Network error: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
                time.sleep(wait_time)
                continue
            else:
                # Max retries exceeded
                logger.error(f"[Pushover] Network error after {max_retries} attempts: {e}")
                self.stats['failed'] += 1
                self._record_notification(title, message, actual_priority, 'failed', {'error': f"Network error after {max_retries} retries: {str(e)}"}, image_path)
                return False

        except Exception as e:
            # Other exceptions - don't retry
            logger.error(f"[Pushover] Failed to send notification: {e}")
            self.stats['failed'] += 1
            self._record_notification(title, message, actual_priority, 'failed', {'error': str(e)}, image_path)
            return False

    # Only reached when the loop body never ran (max_retries <= 0)
    return False
def notify_download(self,
                    platform: str,
                    source: str,
                    content_type: str,
                    filename: str = None,
                    search_term: str = None,
                    count: int = 1,
                    metadata: Dict[str, Any] = None,
                    priority: int = None) -> bool:
    """
    Send a professional notification for a new download

    Args:
        platform: Platform name (instagram, tiktok, forum, etc.)
        source: Username or source identifier
        content_type: Type of content (post, story, reel, thread, etc.);
            a falsy value is treated as 'item'
        filename: Optional filename (accepted for API compatibility; this
            method does not read it)
        search_term: Optional search term (for forum searches)
        count: Number of items downloaded (default 1)
        metadata: Additional metadata dictionary; a 'post_date' entry
            (datetime or ISO-8601 string) is shown in the message
        priority: Notification priority

    Returns:
        True if notification sent successfully
    """
    metadata = metadata or {}
    # Handle None content_type
    content_type = content_type or 'item'

    # Get appropriate icons
    platform_icon = self.PLATFORM_ICONS.get(platform.lower(), '📥')
    content_icon = self.CONTENT_ICONS.get(content_type.lower(), '📄')

    # Build title with proper grammar
    if count > 1:
        plural_type = self._pluralize(content_type, count)
        title = f"{platform_icon} {count} {plural_type.title()} Downloaded"
    else:
        title = f"{platform_icon} New {content_type.title()} Downloaded"

    # Platform line: the display-name helper converts service names to a
    # user-friendly platform name (forum name for forums).
    platform_display = self._get_platform_display_name(platform, source)
    message_parts = [f"📱 <b>Platform:</b> {platform_display}"]

    # Add source/username (skip for forums since source becomes the platform name)
    if source and platform.lower() != 'forums':
        message_parts.append(f"{content_icon} <b>Source:</b> {source}")

    # Add search term if available
    if search_term:
        message_parts.append(f"🔍 <b>Search:</b> {search_term}")

    # Add post date if available; a malformed value must never block the
    # notification, so it is logged and the line is simply omitted.
    post_date = metadata.get('post_date')
    if post_date:
        try:
            if isinstance(post_date, str):
                post_date = datetime.fromisoformat(post_date)
            date_str = post_date.strftime("%Y-%m-%d %H:%M")
            message_parts.append(f"📅 <b>Posted:</b> {date_str}")
        except Exception as e:
            logger.debug(f"[Pushover] Ignoring unparseable post_date {post_date!r}: {e}")

    # Add timestamp
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    message_parts.append(f"⏰ <b>Downloaded:</b> {now}")
    message = "\n".join(message_parts)

    # Context for database recording. Merge the search term INTO the caller's
    # metadata (on a copy) instead of replacing it, so extra metadata is not
    # lost whenever a search term is present.
    context_metadata = dict(metadata)
    if search_term:
        context_metadata['search_term'] = search_term
    self._current_notification_context = {
        'platform': platform,
        'source': source,
        'content_type': content_type,
        'download_count': count,
        'metadata': context_metadata
    }

    # High-priority notifications get an explicit sound; otherwise leave it unset
    sound = None
    if priority and priority >= self.PRIORITY_HIGH:
        sound = "pushover"  # Default urgent sound

    return self.send_notification(
        title=title,
        message=message,
        priority=priority,
        sound=sound,
        html=True
    )
def notify_batch_download(self,
                          platform: str,
                          downloads: list,
                          search_term: str = None,
                          is_review_queue: bool = False) -> bool:
    """
    Send one notification summarising a batch of downloads

    Args:
        platform: Platform name
        downloads: List of download dicts with keys: source, content_type, filename, file_path
        search_term: Optional search term
        is_review_queue: True if these are review queue items (no face match)

    Returns:
        True if notification sent successfully; False when `downloads` is
        empty or review queue notifications are disabled
    """
    if not downloads:
        return False

    # Review queue notifications can be disabled. Always prefer the current
    # database value and fall back to the value cached on the instance.
    if is_review_queue:
        if self.unified_db:
            try:
                from modules.settings_manager import SettingsManager
                settings_manager = SettingsManager(str(self.unified_db.db_path))
                pushover_settings = settings_manager.get('pushover', {})
                enable_review_notifications = pushover_settings.get('enable_review_queue_notifications', True)
                if not enable_review_notifications:
                    logger.debug("[Pushover] Skipping review queue notification (disabled in settings)")
                    return False
            except Exception as e:
                logger.warning(f"[Pushover] Could not check review queue notification setting, using cached value: {e}")
                # Fall back to cached value
                if not self.enable_review_queue_notifications:
                    logger.debug("[Pushover] Skipping review queue notification (disabled in cached settings)")
                    return False
        else:
            # No database, use cached value
            if not self.enable_review_queue_notifications:
                logger.debug("[Pushover] Skipping review queue notification (disabled in settings)")
                return False

    # Source and content type are taken from the first download; the batch
    # is treated as coming from a single source.
    first_download = downloads[0]
    source = first_download.get('source') or None
    first_content_type = first_download.get('content_type') or 'item'

    # Collect all media file paths for the notification database record
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.heic', '.heif', '.avif', '.tiff', '.tif'}
    video_extensions = {'.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v', '.flv'}
    audio_extensions = {'.mp3', '.wav', '.flac', '.aac', '.m4a', '.ogg', '.wma'}
    all_media_paths = []
    for dl in downloads:
        file_path = dl.get('file_path')
        if not file_path or not Path(file_path).exists():
            continue
        suffix = Path(file_path).suffix.lower()
        # `or ''` (not a .get default) so an explicit None content_type
        # does not raise AttributeError on .lower()
        ct = (dl.get('content_type') or '').lower()
        if ct == 'audio' or suffix in audio_extensions:
            media_type = 'audio'
        elif ct == 'image' or suffix in image_extensions:
            media_type = 'image'
        elif ct == 'video' or suffix in video_extensions:
            media_type = 'video'
        else:
            continue
        all_media_paths.append({
            'file_path': file_path,
            # Fall back to the path's basename when filename is missing or None
            'filename': dl.get('filename') or Path(file_path).name,
            'media_type': media_type
        })

    # Set context for database recording with all media files
    metadata = {}
    if search_term:
        metadata['search_term'] = search_term
    if all_media_paths:
        metadata['media_files'] = all_media_paths  # Store all media files for notifications page
    self._current_notification_context = {
        'platform': platform,
        'source': source,
        'content_type': first_content_type,
        'download_count': len(downloads),
        'metadata': metadata if metadata else None
    }

    # Use different icon for review queue
    if is_review_queue:
        platform_icon = "👁️"  # Eye icon for review
    else:
        platform_icon = self.PLATFORM_ICONS.get(platform.lower(), '📥')

    # Group by content type (insertion-ordered, so the first key is the
    # first download's type)
    by_type = {}
    for dl in downloads:
        by_type.setdefault(dl.get('content_type') or 'item', []).append(dl)

    # Build title with proper grammar: a specific plural noun when the batch
    # has a single content type, the generic "Items" otherwise.
    total = len(downloads)
    if len(by_type) == 1:
        subject = self._pluralize(next(iter(by_type)), total).title()
    else:
        subject = "Items"
    title_suffix = "- Review Queue" if is_review_queue else "Downloaded"
    title = f"{platform_icon} {total} {subject} {title_suffix}"

    # Build message
    # Platform line: the display-name helper converts service names to a
    # user-friendly platform name (forum name for forums).
    platform_display = self._get_platform_display_name(platform, source)
    message_parts = [f"📱 <b>Platform:</b> {platform_display}"]

    # Add source/username (skip for forums since source becomes the platform name)
    if source and platform.lower() != 'forums':
        primary_content_type = next(iter(by_type))
        content_icon = self.CONTENT_ICONS.get(primary_content_type.lower(), '📄')
        message_parts.append(f"{content_icon} <b>Source:</b> {source}")

    if search_term:
        message_parts.append(f"🔍 <b>Search:</b> {search_term}")

    # Add review queue notice if applicable
    if is_review_queue:
        message_parts.append(f"\n⚠️ <b>No face match detected</b> - Items moved to review queue for manual review")

    # Summary by type (only show if multiple types)
    if len(by_type) > 1:
        message_parts.append(f"\n<b>Breakdown:</b>")
        for type_name, items in by_type.items():
            type_icon = self.CONTENT_ICONS.get(type_name.lower(), '📄')
            type_count = len(items)
            plural_type = self._pluralize(type_name, type_count)
            message_parts.append(f"{type_icon} {type_count} {plural_type}")

    # Add timestamp
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    message_parts.append(f"\n⏰ <b>Downloaded:</b> {now}")
    message = "\n".join(message_parts)

    # Select a random file for thumbnail attachment (if enabled)
    # Can be an image or video (extract random frame from video)
    import random
    image_path = None
    temp_frame_path = None  # Track temporary frame extractions
    if self.include_image:
        # Deliberately narrower than the sets used for the database record above
        thumb_image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'}
        thumb_video_extensions = {'.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v'}

        # Collect all valid media file paths (images and videos)
        media_files = []
        for dl in downloads:
            file_path = dl.get('file_path')
            if not file_path:
                logger.warning(f"[Pushover] Download entry has no file_path")
                continue
            if not Path(file_path).exists():
                logger.warning(f"[Pushover] Skipping file (doesn't exist): {file_path}")
                continue
            suffix = Path(file_path).suffix.lower()
            if suffix in thumb_image_extensions or suffix in thumb_video_extensions:
                media_files.append(file_path)
            else:
                logger.debug(f"[Pushover] Skipping file (invalid extension): {Path(file_path).name} ({suffix})")
        logger.debug(f"[Pushover] Found {len(media_files)} valid media files out of {len(downloads)} downloads")

        # Randomly select one file if available
        if media_files:
            selected_file = random.choice(media_files)
            selected_suffix = Path(selected_file).suffix.lower()
            if selected_suffix in thumb_image_extensions:
                # It's an image, use directly
                image_path = selected_file
                logger.debug(f"[Pushover] Selected image thumbnail: {Path(image_path).name}")
            elif selected_suffix in thumb_video_extensions:
                # It's a video, extract a random frame
                logger.info(f"[Pushover] Selected video for thumbnail, extracting random frame: {Path(selected_file).name}")
                temp_frame_path = self._extract_random_video_frame(selected_file)
                if temp_frame_path:
                    image_path = temp_frame_path
                    logger.info(f"[Pushover] Successfully extracted video frame for thumbnail: {Path(temp_frame_path).name}")
                else:
                    logger.warning("[Pushover] Failed to extract frame from video - notification will be sent without thumbnail")
        else:
            logger.debug("[Pushover] No media files available for thumbnail attachment")
    else:
        logger.debug("[Pushover] Image thumbnails disabled in settings")

    # Send notification with lower priority for review queue
    priority = -1 if is_review_queue else None  # Low priority for review queue
    try:
        return self.send_notification(
            title=title,
            message=message,
            html=True,
            image_path=image_path,
            priority=priority
        )
    finally:
        # Clean up the extracted frame even if sending raised, so extracted
        # frames never accumulate on disk.
        if temp_frame_path and Path(temp_frame_path).exists():
            try:
                Path(temp_frame_path).unlink()
                logger.debug(f"[Pushover] Cleaned up temp frame: {Path(temp_frame_path).name}")
            except Exception as e:
                logger.debug(f"[Pushover] Failed to cleanup temp frame: {e}")
def notify_error(self, platform: str, error_message: str, source: str = None) -> bool:
    """
    Send a high-priority error notification

    Args:
        platform: Platform name
        error_message: Error description
        source: Optional source/username

    Returns:
        True if notification sent successfully
    """
    # The display-name helper turns a service name into a user-friendly
    # platform name (forum name for forums).
    display_name = self._get_platform_display_name(platform, source)

    lines = [f"<b>Platform:</b> {display_name}"]
    # Forums are skipped here because their source already became the platform name
    show_source = bool(source) and platform.lower() != 'forums'
    if show_source:
        lines.append(f"<b>Source:</b> {source}")
    lines.append(f"\n<b>Error:</b> {error_message}")

    return self.send_notification(
        title=f"⚠️ {display_name} Download Error",
        message="\n".join(lines),
        priority=self.PRIORITY_HIGH,
        sound="siren",
        html=True
    )
def get_stats(self) -> Dict[str, int]:
    """Return a shallow copy of the notification counters.

    The copy lets callers modify the result without touching the
    counters held on the instance.
    """
    snapshot = self.stats.copy()
    return snapshot
def reset_stats(self):
    """Replace the counters with a fresh dict where sent, failed and skipped are 0."""
    self.stats = dict.fromkeys(('sent', 'failed', 'skipped'), 0)
def create_notifier_from_config(config: Dict, unified_db=None) -> Optional[PushoverNotifier]:
    """
    Build a PushoverNotifier from the 'pushover' section of a config dict

    Args:
        config: Configuration dict with pushover settings
        unified_db: UnifiedDatabase instance for recording notifications (optional)

    Returns:
        PushoverNotifier instance, or None when notifications are disabled
        or either credential is missing
    """
    settings = config.get('pushover', {})

    enabled = settings.get('enabled', False)
    if not enabled:
        logger.info("[Pushover] Notifications disabled in config")
        return None

    user_key, api_token = settings.get('user_key'), settings.get('api_token')
    if not (user_key and api_token):
        logger.warning("[Pushover] Missing user_key or api_token in config")
        return None

    # Optional settings fall back to the same defaults as before
    notifier_kwargs = dict(
        user_key=user_key,
        api_token=api_token,
        enabled=True,
        default_priority=settings.get('priority', 0),
        device=settings.get('device'),
        include_image=settings.get('include_image', True),
        unified_db=unified_db,
        enable_review_queue_notifications=settings.get('enable_review_queue_notifications', True),
    )
    return PushoverNotifier(**notifier_kwargs)
if __name__ == "__main__":
    # Manual smoke test. The notifier is created with enabled=False and
    # placeholder credentials; replace them and set enabled=True to test
    # against a real account.
    print("Testing Pushover Notifier...")
    notifier = PushoverNotifier(
        enabled=False,  # Set to True to test
        user_key="YOUR_USER_KEY",
        api_token="YOUR_API_TOKEN",
    )

    # Sample download used for the test notification
    sample_download = {
        'platform': "instagram",
        'source': "evalongoria",
        'content_type': "story",
        'filename': "evalongoria_story_20251018.mp4",
        'metadata': {'post_date': datetime.now()},
    }
    notifier.notify_download(**sample_download)
    print(f"Stats: {notifier.get_stats()}")

File diff suppressed because it is too large Load Diff

3243
modules/scheduler.py Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,194 @@
#!/usr/bin/env python3
"""
Thread-safe WebSocket event emitter for scraper monitoring
Provides real-time events for the scraping monitor page:
- Scraper sessions starting/completing
- File downloads and movements
- Progress updates
"""
from datetime import datetime
from typing import Optional, Dict, Any
class ScraperEventEmitter:
    """Publishes scraper lifecycle events over WebSocket and mirrors the
    active sessions into the shared application state."""

    def __init__(self, websocket_manager=None, app_state=None):
        """
        Store the collaborators used for emitting events

        Args:
            websocket_manager: WebSocket connection manager (optional)
            app_state: Application state for tracking active sessions (optional)
        """
        self.websocket_manager = websocket_manager
        self.app_state = app_state

    def _tracks_sessions(self) -> bool:
        """Return True when app_state is set and exposes active_scraper_sessions."""
        return bool(self.app_state) and hasattr(self.app_state, 'active_scraper_sessions')

    @staticmethod
    def _now() -> str:
        """Return the current local time as an ISO-8601 string."""
        return datetime.now().isoformat()

    def emit_scraper_started(self, session_id: str, platform: str, account: str,
                             content_type: str, estimated_count: int = 0, accounts_list: list = None):
        """
        Announce that a scraper session has begun

        Args:
            session_id: Unique session identifier
            platform: Platform name (instagram, snapchat, etc.)
            account: Account/username being scraped (or comma-separated list)
            content_type: Type of content (stories, posts, etc.)
            estimated_count: Estimated number of items to download
            accounts_list: Optional list of all accounts to be processed
        """
        payload = dict(
            session_id=session_id,
            platform=platform,
            account=account,
            content_type=content_type,
            estimated_count=estimated_count,
            timestamp=self._now(),
        )
        # The accounts list is only sent when the caller supplied a non-empty one
        if accounts_list:
            payload['accounts_list'] = accounts_list

        # Register the session for API retrieval, using the same layout the
        # scheduler uses for its own sessions.
        if self._tracks_sessions():
            initial_status = 'Starting...'
            self.app_state.active_scraper_sessions[session_id] = {
                'session_id': session_id,
                'platform': platform,
                'account': account,
                'content_type': content_type,
                'start_time': self._now(),
                'status': initial_status,
                'detailed_status': initial_status,
                # A zero/unknown estimate is shown against a total of 100
                'progress': {'current': 0, 'total': estimated_count or 100},
                'stats': {'media': 0, 'review': 0, 'failed': 0},
            }

        self._broadcast({'type': 'scraper_started', 'data': payload})

    def emit_scraper_progress(self, session_id: str, status: str,
                              current: int, total: int, current_account: str = None,
                              completed_accounts: list = None):
        """
        Announce a progress update for a running session

        Args:
            session_id: Session identifier
            status: Status message (e.g., "Downloading stories...")
            current: Current item count
            total: Total item count
            current_account: Currently active account/forum name (optional)
            completed_accounts: List of completed accounts (optional)
        """
        payload = dict(
            session_id=session_id,
            status=status,
            progress_current=current,
            progress_total=total,
            timestamp=self._now(),
        )
        if current_account:
            payload['current_account'] = current_account
        if completed_accounts:
            payload['completed_accounts'] = completed_accounts

        # Mirror the update into app_state; sessions that were never
        # registered are left alone.
        if self._tracks_sessions():
            sessions = self.app_state.active_scraper_sessions
            if session_id in sessions:
                tracked = sessions[session_id]
                tracked['status'] = status
                tracked['detailed_status'] = status
                if current_account:
                    # The displayed account follows whichever one is active now
                    tracked['account'] = current_account
                # Nested progress structure, matching the scheduler
                tracked['progress'] = {'current': current, 'total': total}
                if completed_accounts:
                    tracked['completed_accounts'] = completed_accounts

        self._broadcast({'type': 'scraper_progress', 'data': payload})

    def emit_scraper_completed(self, session_id: str, stats: Dict[str, int]):
        """
        Announce that a scraper session has finished

        Args:
            session_id: Session identifier
            stats: Statistics dict with keys: total_downloaded, moved, review, duplicates, failed
        """
        # Drop the session from app_state; unknown ids are ignored
        if self._tracks_sessions():
            self.app_state.active_scraper_sessions.pop(session_id, None)

        completion = {
            'session_id': session_id,
            'stats': stats,
            'timestamp': self._now(),
        }
        self._broadcast({'type': 'scraper_completed', 'data': completion})

    def emit_file_moved(self, session_id: str, platform: str, account: str,
                        filename: str, media_type: str, destination_type: str,
                        destination_path: str, thumbnail_url: str = None,
                        face_match: Dict[str, Any] = None):
        """
        Announce that a file was moved to its destination

        Args:
            session_id: Session identifier
            platform: Platform name
            account: Account/username
            filename: File name
            media_type: 'image' or 'video'
            destination_type: 'media', 'review', or 'recycle'
            destination_path: Full path to destination file
            thumbnail_url: URL to thumbnail (optional)
            face_match: Face recognition result dict (optional); a missing
                or empty value is reported as {'matched': False}
        """
        move_details = {
            'session_id': session_id,
            'platform': platform,
            'account': account,
            'filename': filename,
            'media_type': media_type,
            'destination_type': destination_type,
            'destination_path': destination_path,
            'thumbnail_url': thumbnail_url,
            'face_match': face_match or {'matched': False},
            'timestamp': self._now(),
        }
        self._broadcast({'type': 'file_moved', 'data': move_details})

    def _broadcast(self, message: dict):
        """
        Hand a message to the WebSocket manager, if one is configured

        Args:
            message: Event message dict
        """
        if not self.websocket_manager:
            return
        # broadcast_sync is the manager's entry point intended for emission
        # from background threads
        self.websocket_manager.broadcast_sync(message)

View File

@@ -0,0 +1,652 @@
"""
Scraper Gallery Bridge
Maps scraper accounts (Instagram, TikTok, Snapchat) to private gallery persons.
After each download session, auto-imports new media as gallery posts.
"""
import hashlib
import logging
import mimetypes
import sqlite3
import subprocess
import tempfile
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# Key file passed to load_key_from_file() by get_crypto()
SCRAPER_BRIDGE_KEY_FILE = '/opt/immich/private/.scraper_bridge_key'

# Map scraper module names → platform (grouped by platform, key order preserved)
SCRAPER_TO_PLATFORM = {
    **dict.fromkeys(
        ('fastdl', 'imginn', 'imginn_api', 'instagram_client', 'toolzu',
         'instagram', 'instagram_unified'),
        'instagram',
    ),
    'tiktok': 'tiktok',
    **dict.fromkeys(('snapchat', 'snapchat_client'), 'snapchat'),
}

# Hex colour per platform
PLATFORM_COLORS = dict(
    instagram='#E1306C',
    tiktok='#00f2ea',
    snapchat='#FFFC00',
)

# Human-readable platform names
PLATFORM_LABELS = dict(
    instagram='Instagram',
    tiktok='TikTok',
    snapchat='Snapchat',
)
def get_crypto():
    """Load the crypto helper from the bridge key file.

    Reading the key from a file is what lets background jobs work while the
    gallery is locked. Returns whatever load_key_from_file() yields; when
    that is None a debug message is logged and None is returned.
    """
    from modules.private_gallery_crypto import load_key_from_file

    bridge_crypto = load_key_from_file(SCRAPER_BRIDGE_KEY_FILE)
    if bridge_crypto is not None:
        return bridge_crypto

    logger.debug("Scraper bridge crypto unavailable - key file missing or invalid")
    return None
def get_available_accounts(platform: str, config: dict, db) -> List[Dict[str, Any]]:
    """
    Aggregate usernames from all scraper configs + paid_content_creators for a platform.

    Usernames are stripped and lower-cased before de-duplication. Blank or
    non-string entries are skipped instead of aborting the whole lookup.
    Database lookups are best-effort: if a table cannot be queried the
    failure is logged at debug level and that source is left out.

    Args:
        platform: 'instagram', 'tiktok' or 'snapchat'; any other value
            yields only accounts from the paid_content_creators table
        config: Application config dict holding the per-scraper sections
        db: Object exposing `db_path`, the SQLite database location

    Returns:
        List of dicts sorted by username, each with keys 'username',
        'sources' (sorted list of where the account was found) and
        'is_mapped' (True if already present in
        private_media_scraper_accounts for this platform).
    """
    accounts: Dict[str, set] = {}  # username -> set of sources

    def _register(raw_username, source_name):
        """Normalize one username and record which source listed it."""
        if not isinstance(raw_username, str):
            return
        username = raw_username.strip().lower()
        if username:
            accounts.setdefault(username, set()).add(source_name)

    def _register_accounts_section(section_name):
        """Read `<section>.accounts[].username`, falling back to `<section>.usernames`."""
        section = config.get(section_name) or {}
        if not section.get('enabled', False):
            return
        entries = section.get('accounts', [])
        if not entries and 'usernames' in section:
            entries = [{'username': u} for u in section['usernames']]
        for entry in entries:
            _register(entry.get('username'), section_name)

    def _register_usernames_section(section_name, include_phrase_search=False):
        """Read `<section>.usernames` (and optionally `phrase_search.usernames`)."""
        section = config.get(section_name) or {}
        if not section.get('enabled', False):
            return
        for u in section.get('usernames', []):
            _register(u, section_name)
        if include_phrase_search:
            # phrase_search usernames are also downloadable accounts
            for u in (section.get('phrase_search') or {}).get('usernames', []):
                _register(u, section_name)

    def _query_usernames(sql):
        """Run a single-column username query; the connection is always closed."""
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            return [row[0] for row in conn.execute(sql, (platform,))]
        finally:
            conn.close()

    if platform == 'instagram':
        _register_accounts_section('instagram')
        for scraper_id in ('fastdl', 'imginn', 'imginn_api', 'instagram_client', 'toolzu'):
            _register_usernames_section(scraper_id, include_phrase_search=True)
    elif platform == 'tiktok':
        _register_accounts_section('tiktok')
    elif platform == 'snapchat':
        _register_usernames_section('snapchat')
        _register_usernames_section('snapchat_client')

    # Add from paid_content_creators table
    try:
        creators = _query_usernames(
            'SELECT username FROM paid_content_creators WHERE platform = ? AND enabled = 1'
        )
        for name in creators:
            _register(name, 'paid_content')
    except Exception as e:
        logger.debug(f"Could not query paid_content_creators: {e}")

    # Check which are already mapped
    mapped_usernames = set()
    try:
        mapped = _query_usernames(
            'SELECT username FROM private_media_scraper_accounts WHERE platform = ?'
        )
        mapped_usernames = {name.lower() for name in mapped if isinstance(name, str)}
    except Exception as e:
        logger.debug(f"Could not query private_media_scraper_accounts: {e}")

    return [
        {
            'username': username,
            'sources': sorted(sources),
            'is_mapped': username in mapped_usernames,
        }
        for username, sources in sorted(accounts.items())
    ]
def _ensure_platform_tag(platform: str, db, crypto) -> int:
    """Return the id of the platform's tag in private_gallery_tags, creating it if absent.

    Tag names are stored encrypted, so every row is decrypted and compared
    case-insensitively against the platform label. Rows that cannot be
    decrypted are skipped.
    """
    connection = sqlite3.connect(db.db_path, timeout=10)
    connection.row_factory = sqlite3.Row
    try:
        cur = connection.cursor()
        cur.execute("SELECT id, encrypted_name FROM private_gallery_tags")

        label = PLATFORM_LABELS.get(platform, platform.title())
        wanted = label.lower()
        for tag_row in cur.fetchall():
            try:
                decrypted = crypto.decrypt_field(tag_row['encrypted_name'])
                is_match = decrypted.lower() == wanted
            except Exception:
                # Undecryptable row - skip it and keep scanning
                continue
            if is_match:
                return tag_row['id']

        # No existing tag matched: create one with the platform's colour
        cur.execute('''
            INSERT INTO private_gallery_tags (encrypted_name, color)
            VALUES (?, ?)
        ''', (crypto.encrypt_field(label), PLATFORM_COLORS.get(platform, '#888888')))
        connection.commit()
        new_tag_id = cur.lastrowid
        logger.info(f"Created '{label}' tag with ID {new_tag_id}")
        return new_tag_id
    finally:
        connection.close()
def _get_file_info(file_path: Path) -> Dict[str, Any]:
"""Get file type, mime type, and dimensions."""
ext = file_path.suffix.lower().lstrip('.')
mime_type, _ = mimetypes.guess_type(str(file_path))
if not mime_type:
mime_type = 'application/octet-stream'
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
if ext in image_exts:
file_type = 'image'
elif ext in video_exts:
file_type = 'video'
else:
file_type = 'other'
width, height, duration = 0, 0, 0
if file_type == 'image':
try:
from PIL import Image
with Image.open(file_path) as img:
width, height = img.size
except Exception:
pass
elif file_type == 'video':
try:
result = subprocess.run(
['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_streams', str(file_path)],
capture_output=True, text=True, timeout=15
)
if result.returncode == 0:
import json
probe = json.loads(result.stdout)
for stream in probe.get('streams', []):
if stream.get('codec_type') == 'video':
width = int(stream.get('width', 0))
height = int(stream.get('height', 0))
dur = stream.get('duration')
if dur:
duration = int(float(dur))
break
except Exception:
pass
return {
'file_type': file_type,
'mime_type': mime_type,
'width': width,
'height': height,
'duration': duration,
}
def _compute_perceptual_hash(file_path: Path) -> Optional[str]:
"""Calculate perceptual hash for an image or video file."""
try:
import imagehash
from PIL import Image
except ImportError:
return None
ext = file_path.suffix.lower().lstrip('.')
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
pil_image = None
try:
if ext in video_exts:
try:
import cv2
except ImportError:
return None
cap = cv2.VideoCapture(str(file_path))
if not cap.isOpened():
return None
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
cap.set(cv2.CAP_PROP_POS_FRAMES, int(total_frames * 0.5))
ret, frame = cap.read()
cap.release()
if not ret or frame is None:
return None
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(frame_rgb)
elif ext in image_exts:
pil_image = Image.open(file_path)
else:
return None
return str(imagehash.dhash(pil_image, hash_size=16))
except Exception:
return None
finally:
if pil_image:
try:
pil_image.close()
except Exception:
pass
def _generate_thumbnail(file_path: Path, output_path: Path, file_type: str) -> bool:
"""Generate a thumbnail for an image or video."""
try:
output_path.parent.mkdir(parents=True, exist_ok=True)
if file_type == 'image':
from PIL import Image, ImageOps
with Image.open(file_path) as img:
img = ImageOps.exif_transpose(img)
img.thumbnail((400, 400))
if img.mode in ('RGBA', 'P'):
img = img.convert('RGB')
img.save(output_path, 'JPEG', quality=85)
return True
elif file_type == 'video':
result = subprocess.run([
'ffmpeg', '-y', '-i', str(file_path),
'-ss', '00:00:01', '-vframes', '1',
'-vf', 'scale=400:-1:force_original_aspect_ratio=decrease',
str(output_path)
], capture_output=True, timeout=30)
return result.returncode == 0 and output_path.exists()
except Exception:
pass
return False
def import_new_media(platform: str, username: str, person_id: int,
                     last_imported_at: Optional[str], db, crypto,
                     last_imported_file_id: int = 0) -> int:
    """
    Import new media files from file_inventory into the private gallery.

    Candidate rows are selected by id (> last_imported_file_id) when that
    marker is set, otherwise by created_date (> last_imported_at), and on a
    first run only rows created within the last hour are considered. Each
    usable file is hashed, de-duplicated per person, encrypted into the
    gallery storage and recorded under a single new post for the batch.

    Args:
        platform: Platform name matched against file_inventory.platform.
        username: Account name matched against file_inventory.source.
        person_id: Gallery person the media is attached to.
        last_imported_at: Timestamp marker of the previous import, if any.
        db: Object exposing ``db_path`` (sqlite database file).
        crypto: Object exposing ``encrypt_field`` and ``encrypt_file``.
        last_imported_file_id: Highest file_inventory id already processed.

    Returns count of imported files.
    """
    from datetime import timedelta

    # Use id-based filtering (reliable, monotonically increasing with insertion order).
    # Falls back to created_date only for legacy accounts without last_imported_file_id.
    if last_imported_file_id and last_imported_file_id > 0:
        marker_column, marker_value = 'id', last_imported_file_id
    elif last_imported_at:
        marker_column, marker_value = 'created_date', last_imported_at
    else:
        # First run: only import files from the last 1 hour
        marker_column = 'created_date'
        marker_value = (datetime.now() - timedelta(hours=1)).isoformat()

    # marker_column is one of the two literals chosen above, never caller input.
    files = _bridge_execute(db, f'''
        SELECT id, file_path, filename, created_date FROM file_inventory
        WHERE platform = ? AND source = ? AND {marker_column} > ?
        AND location IN ('final', 'review')
        ORDER BY id ASC
    ''', (platform, username, marker_value), fetch='all', timeout=30)
    if not files:
        return 0

    # Filter to existing files, track max id for updating last_imported_file_id
    valid_files = []
    max_file_id = last_imported_file_id or 0
    for record in files:
        max_file_id = max(max_file_id, record['id'])
        candidate = Path(record['file_path'])
        try:
            usable = candidate.exists() and candidate.stat().st_size > 0
        except OSError:
            # File vanished between the two checks; treat it as missing
            # instead of aborting the import for the whole account.
            usable = False
        if usable:
            valid_files.append({'path': candidate,
                                'created_date': record['created_date'],
                                'id': record['id']})
    if not valid_files:
        return 0

    # Get storage path
    config_row = _bridge_execute(
        db, "SELECT value FROM private_media_config WHERE key = 'storage_path'",
        fetch='one')
    storage_path = Path(config_row['value']) if config_row else Path('/opt/immich/private')
    data_path = storage_path / 'data'
    thumbs_path = storage_path / 'thumbs'
    data_path.mkdir(parents=True, exist_ok=True)
    thumbs_path.mkdir(parents=True, exist_ok=True)

    # Get/create platform tag
    tag_id = _ensure_platform_tag(platform, db, crypto)

    # Create a post for this batch
    now_iso = datetime.now().isoformat()
    encrypted_desc = crypto.encrypt_field(f"{PLATFORM_LABELS.get(platform, platform)} - @{username}")
    encrypted_date = crypto.encrypt_field(now_iso)
    post_id = _bridge_execute(db, '''
        INSERT INTO private_media_posts (person_id, encrypted_description, encrypted_media_date, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?)
    ''', (person_id, encrypted_desc, encrypted_date, now_iso, now_iso))

    media_count = 0
    latest_date = last_imported_at
    for entry in valid_files:
        file_path = entry['path']
        created_date = entry['created_date']
        # Normalize to string for consistent comparison (PostgreSQL returns datetime objects)
        if hasattr(created_date, 'isoformat'):
            created_date = created_date.isoformat()
        try:
            outcome = _import_one_file(
                file_path, post_id, person_id, platform, username,
                encrypted_date, now_iso, data_path, thumbs_path, db, crypto)
            if outcome == 'failed':
                continue
            if outcome == 'imported':
                media_count += 1
            # Duplicates also advance the timestamp marker.
            if created_date and (not latest_date or created_date > latest_date):
                latest_date = created_date
        except Exception as e:
            logger.error(f"Failed to import {file_path.name}: {e}")

    if media_count > 0:
        # Apply platform tag to the post if we imported media
        _bridge_execute(db, '''
            INSERT OR IGNORE INTO private_media_post_tags (post_id, tag_id)
            VALUES (?, ?)
        ''', (post_id, tag_id))
        # Update the mapping row with both timestamp and file id markers
        _bridge_execute(db, '''
            UPDATE private_media_scraper_accounts
            SET last_imported_at = ?,
            last_imported_file_id = ?,
            total_media_imported = total_media_imported + ?,
            updated_at = ?
            WHERE platform = ? AND username = ? AND person_id = ?
        ''', (latest_date or now_iso, max_file_id, media_count, now_iso,
              platform, username, person_id))
        logger.info(f"Imported {media_count} files from {platform}/@{username} to gallery (last_file_id={max_file_id})")
    else:
        # No media imported - still update the file id marker so we don't re-check these files
        if max_file_id > (last_imported_file_id or 0):
            _bridge_execute(db, '''
                UPDATE private_media_scraper_accounts
                SET last_imported_file_id = ?
                WHERE platform = ? AND username = ? AND person_id = ?
            ''', (max_file_id, platform, username, person_id))
        # Delete the empty post
        _bridge_execute(db, "DELETE FROM private_media_posts WHERE id = ?", (post_id,))
    return media_count


def _bridge_execute(db, sql, params=(), fetch=None, timeout=10):
    """Run one statement on a short-lived connection to ``db.db_path``.

    With fetch='all' every row is returned, with fetch='one' the first row
    (or None). Otherwise the statement is committed and the cursor's
    ``lastrowid`` is returned. The connection is always closed.
    """
    conn = sqlite3.connect(db.db_path, timeout=timeout)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        cursor.execute(sql, params)
        if fetch == 'all':
            return cursor.fetchall()
        if fetch == 'one':
            return cursor.fetchone()
        conn.commit()
        return cursor.lastrowid
    finally:
        conn.close()


def _import_one_file(file_path, post_id, person_id, platform, username,
                     encrypted_date, now_iso, data_path, thumbs_path,
                     db, crypto) -> str:
    """Encrypt a single file into the gallery and insert its media row.

    Returns 'imported' on success, 'duplicate' when the same hash is already
    stored for this person, or 'failed' when encrypting the file failed.
    """
    # Calculate file hash
    sha256 = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            sha256.update(chunk)
    file_hash = sha256.hexdigest()

    # Check for duplicates (scoped by person)
    existing = _bridge_execute(
        db, 'SELECT id FROM private_media WHERE file_hash = ? AND person_id = ?',
        (file_hash, person_id), fetch='one')
    if existing:
        logger.debug(f"Duplicate file skipped: {file_path.name}")
        return 'duplicate'

    finfo = _get_file_info(file_path)
    file_size = file_path.stat().st_size
    perceptual_hash = _compute_perceptual_hash(file_path)
    storage_id = str(uuid.uuid4())

    temp_thumb = Path(tempfile.gettempdir()) / f"pg_thumb_{storage_id}.jpg"
    try:
        _generate_thumbnail(file_path, temp_thumb, finfo['file_type'])
        # Encrypt the file
        if not crypto.encrypt_file(file_path, data_path / f"{storage_id}.enc"):
            logger.error(f"Encryption failed for {file_path.name}")
            return 'failed'
        # Encrypt thumbnail
        if temp_thumb.exists():
            crypto.encrypt_file(temp_thumb, thumbs_path / f"{storage_id}.enc")
    finally:
        # The plaintext thumbnail must never outlive the import, whatever
        # happened above (encryption failure, exception, success).
        try:
            temp_thumb.unlink()
        except OSError:
            pass

    # Insert media record
    encrypted_filename = crypto.encrypt_field(file_path.name)
    encrypted_source = crypto.encrypt_field(f"@{username}")
    _bridge_execute(db, '''
        INSERT INTO private_media (
        post_id, storage_id, encrypted_filename, encrypted_description,
        file_hash, file_size, file_type, mime_type,
        width, height, duration, person_id,
        encrypted_media_date, source_type, encrypted_source_path,
        perceptual_hash, created_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (
        post_id,
        storage_id,
        encrypted_filename,
        None,
        file_hash,
        file_size,
        finfo['file_type'],
        finfo['mime_type'],
        finfo['width'],
        finfo['height'],
        finfo['duration'],
        person_id,
        encrypted_date,
        platform,
        encrypted_source,
        perceptual_hash,
        now_iso,
    ))
    return 'imported'
def on_download_complete(task_id: str, download_count: int, db, crypto) -> int:
    """
    Scheduler hook invoked once a download task has finished.

    The scraper module is the part of ``task_id`` before the first ':'. It
    is mapped to a platform, and every mapped account of that platform is
    then checked for new media. Checking all accounts covers:
    - batch tasks (fastdl:all, imginn_api:all)
    - per-user tasks that also fetch phrase_search users
    - plain per-user tasks (toolzu:evalongoria)

    Returns the number of imported files, or 0 when the task id has no ':'
    or its scraper module is not mapped to a platform.
    """
    scraper_module, separator, _ = (task_id or '').partition(':')
    if not separator:
        return 0

    # Map scraper module to platform
    platform = SCRAPER_TO_PLATFORM.get(scraper_module)
    if platform:
        # One task may have downloaded for many users, so look at them all.
        return _import_all_mapped_accounts(platform, db, crypto)
    return 0
def _import_all_mapped_accounts(platform: str, db, crypto) -> int:
    """
    Import new media for every enabled mapped account of ``platform``.

    A failure for one account is logged and does not stop the others.
    Returns the total number of files imported across all accounts.
    """
    connection = sqlite3.connect(db.db_path, timeout=10)
    connection.row_factory = sqlite3.Row
    try:
        lookup = connection.cursor()
        lookup.execute('''
            SELECT id, username, person_id, last_imported_at, last_imported_file_id
            FROM private_media_scraper_accounts
            WHERE platform = ? AND enabled = 1
        ''', (platform,))
        accounts = lookup.fetchall()
    finally:
        connection.close()

    imported_total = 0
    for account in accounts:
        try:
            imported_total += import_new_media(
                platform, account['username'], account['person_id'],
                account['last_imported_at'], db, crypto,
                last_imported_file_id=account['last_imported_file_id'] or 0
            )
        except Exception as e:
            logger.error(f"Gallery bridge batch import error for {platform}/@{account['username']}: {e}")

    if imported_total > 0:
        logger.info(f"Batch import for {platform}: {imported_total} files across {len(accounts)} accounts")
    return imported_total

728
modules/semantic_search.py Normal file
View File

@@ -0,0 +1,728 @@
#!/usr/bin/env python3
"""
Semantic Search Module using CLIP
Provides image/video similarity search and natural language search capabilities
"""
import os
import struct
import numpy as np
from typing import Dict, List, Optional, Tuple, Any
from pathlib import Path
from PIL import Image
import threading
import queue
from datetime import datetime
from modules.universal_logger import get_logger
logger = get_logger('SemanticSearch')
# Global model instance (lazy loaded)
_clip_model = None
_clip_model_name = None
_model_lock = threading.Lock()
def get_configured_model_name() -> str:
    """Return the CLIP model name stored under the 'semantic_search' setting.

    Falls back to 'clip-ViT-B-32' when the setting is not a dict, has no
    'model' entry, or the settings cannot be read at all.
    """
    fallback = 'clip-ViT-B-32'
    try:
        from modules.settings_manager import SettingsManager
        # Settings live in the project database next to the modules package.
        database_file = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
        section = SettingsManager(str(database_file)).get('semantic_search', {})
        if not isinstance(section, dict):
            return fallback
        model = section.get('model', fallback)
        logger.info(f"Configured CLIP model: {model}")
        return model
    except Exception as e:
        logger.error(f"Failed to get configured model: {e}")
        return fallback
def get_clip_model(model_name: str = None):
    """Get or load the CLIP model (thread-safe singleton).

    Args:
        model_name: Model to load; when None the configured name is used.

    Returns:
        The cached SentenceTransformer for ``model_name``. It is (re)loaded
        when nothing is cached or the cached model has a different name.

    Raises:
        Exception: whatever loading the model raises, after logging it.
    """
    global _clip_model, _clip_model_name
    if model_name is None:
        model_name = get_configured_model_name()

    # The whole check/reset/load/return sequence runs under the lock. When
    # the cache was checked and returned outside it, a concurrent reset
    # could slip in between and make a caller return None.
    with _model_lock:
        if _clip_model is not None and _clip_model_name != model_name:
            logger.info(f"Model changed from {_clip_model_name} to {model_name}, reloading...")
            _clip_model = None
            _clip_model_name = None
        if _clip_model is None:
            logger.info(f"Loading CLIP model ({model_name})...")
            try:
                from sentence_transformers import SentenceTransformer
                _clip_model = SentenceTransformer(model_name)
                _clip_model_name = model_name
                logger.info(f"CLIP model {model_name} loaded successfully")
            except Exception as e:
                logger.error(f"Failed to load CLIP model: {e}")
                raise
        return _clip_model
def embedding_to_bytes(embedding: np.ndarray) -> bytes:
    """Serialize an embedding as raw float32 bytes for database storage."""
    as_float32 = np.asarray(embedding, dtype=np.float32)
    return as_float32.tobytes()
def bytes_to_embedding(data: bytes) -> np.ndarray:
    """Rebuild a float32 embedding vector from its stored byte form."""
    vector = np.frombuffer(data, dtype=np.float32)
    return vector
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Calculate cosine similarity between two embeddings.

    Returns 0.0 when either vector has zero length, instead of dividing by
    zero and yielding NaN.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)
class SemanticSearch:
    """Semantic search engine using CLIP embeddings.

    Embeddings are stored as float32 blobs in ``content_embeddings`` keyed
    by ``file_inventory.id``. Searches scan the embeddings of files whose
    location is 'final' and rank them by cosine similarity.
    """

    SUPPORTED_IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}
    SUPPORTED_VIDEO_EXTENSIONS = {'.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v'}
    # Model name recorded with an embedding when the active model is unknown.
    DEFAULT_MODEL_NAME = 'clip-ViT-B-32'

    def __init__(self, unified_db):
        """
        Initialize Semantic Search

        Args:
            unified_db: UnifiedDatabase instance
        """
        self.db = unified_db
        self.logger = get_logger('SemanticSearch')
        self._model = None
        self._model_name = None

    @property
    def model(self):
        """Lazily load the CLIP model and remember which model it is."""
        if self._model is None:
            model_name = get_configured_model_name()
            self._model = get_clip_model(model_name)
            self._model_name = model_name
        return self._model

    def get_image_embedding(self, image_path: str) -> Optional[np.ndarray]:
        """
        Generate CLIP embedding for an image

        Args:
            image_path: Path to the image file

        Returns:
            Embedding vector or None on error
        """
        try:
            with Image.open(image_path) as image:
                # The model is fed RGB data; convert other modes first.
                rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
                return self.model.encode(rgb_image, convert_to_numpy=True)
        except Exception as e:
            self.logger.debug(f"Failed to get embedding for {image_path}: {e}")
            return None

    def get_video_frame_embedding(self, video_path: str, frame_position: float = 0.1) -> Optional[np.ndarray]:
        """
        Generate CLIP embedding for a video by extracting a frame

        Args:
            video_path: Path to the video file
            frame_position: Position in video (0-1) to extract frame from

        Returns:
            Embedding vector or None on error
        """
        # Try cv2 first, fall back to ffmpeg for codecs cv2 can't handle (e.g. AV1)
        image = self._extract_frame_cv2(video_path, frame_position)
        if image is None:
            image = self._extract_frame_ffmpeg(video_path, frame_position)
        if image is None:
            return None
        try:
            return self.model.encode(image, convert_to_numpy=True)
        except Exception as e:
            self.logger.debug(f"Failed to encode video frame for {video_path}: {e}")
            return None
        finally:
            # Clean up image to prevent memory leaks
            try:
                image.close()
            except Exception:
                pass

    def _extract_frame_cv2(self, video_path: str, frame_position: float) -> Optional["Image.Image"]:
        """Extract frame using OpenCV; returns None when no frame can be read."""
        try:
            import cv2
            cap = cv2.VideoCapture(video_path)
            try:
                if not cap.isOpened():
                    return None
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                if total_frames <= 0:
                    return None
                # Clamp so frame_position == 1.0 maps to the last frame
                # rather than one past the end.
                target_frame = min(max(int(total_frames * frame_position), 0), total_frames - 1)
                cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame)
                ret, frame = cap.read()
            finally:
                # Release the capture on every path, including exceptions.
                cap.release()
            if not ret:
                return None
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            return Image.fromarray(frame_rgb)
        except Exception as e:
            self.logger.debug(f"cv2 frame extraction failed for {video_path}: {e}")
            return None

    def _extract_frame_ffmpeg(self, video_path: str, frame_position: float) -> Optional["Image.Image"]:
        """Extract frame using ffmpeg (fallback for codecs cv2 can't handle)"""
        tmp_path = None
        try:
            import subprocess
            import tempfile
            # Get video duration
            probe_cmd = [
                'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
                '-of', 'default=noprint_wrappers=1:nokey=1', video_path
            ]
            result = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=10)
            if result.returncode != 0:
                return None
            duration = float(result.stdout.strip())
            seek_time = duration * frame_position

            # Extract frame to temp file
            with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
                tmp_path = tmp.name
            extract_cmd = [
                'ffmpeg', '-y', '-ss', str(seek_time), '-i', video_path,
                '-vframes', '1', '-q:v', '2', tmp_path
            ]
            result = subprocess.run(extract_cmd, capture_output=True, timeout=30)
            if result.returncode != 0 or not os.path.exists(tmp_path):
                return None

            with Image.open(tmp_path) as frame:
                frame.load()  # Load into memory before deleting file
                # Return an image that no longer depends on the temp file.
                return frame.convert('RGB') if frame.mode != 'RGB' else frame.copy()
        except Exception as e:
            self.logger.debug(f"ffmpeg frame extraction failed for {video_path}: {e}")
            return None
        finally:
            # The temp file exists from NamedTemporaryFile onwards, so it
            # has to be removed on the failure paths too.
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass  # Best effort cleanup of temp file

    def get_text_embedding(self, text: str) -> Optional[np.ndarray]:
        """
        Generate CLIP embedding for text query

        Args:
            text: Text query

        Returns:
            Embedding vector or None on error
        """
        try:
            return self.model.encode(text, convert_to_numpy=True)
        except Exception as e:
            self.logger.error(f"Failed to get text embedding: {e}")
            return None

    def store_embedding(self, file_id: int, embedding: np.ndarray) -> bool:
        """
        Store embedding in database

        The row records the name of the model this instance loaded, or
        DEFAULT_MODEL_NAME when no model has been loaded through it.

        Args:
            file_id: File inventory ID
            embedding: Embedding vector

        Returns:
            Success status
        """
        try:
            embedding_bytes = embedding_to_bytes(embedding)
            model_name = getattr(self, '_model_name', None) or self.DEFAULT_MODEL_NAME
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR REPLACE INTO content_embeddings
                    (file_id, embedding, embedding_model, embedding_version, created_date)
                    VALUES (?, ?, ?, 1, CURRENT_TIMESTAMP)
                ''', (file_id, embedding_bytes, model_name))
            return True
        except Exception as e:
            self.logger.error(f"Failed to store embedding for file {file_id}: {e}")
            return False

    def get_embedding(self, file_id: int) -> Optional[np.ndarray]:
        """
        Get stored embedding from database

        Args:
            file_id: File inventory ID

        Returns:
            Embedding vector or None
        """
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT embedding FROM content_embeddings WHERE file_id = ?
                ''', (file_id,))
                row = cursor.fetchone()
            if row and row['embedding']:
                return bytes_to_embedding(row['embedding'])
            return None
        except Exception as e:
            self.logger.error(f"Failed to get embedding for file {file_id}: {e}")
            return None

    def delete_embedding(self, file_id: int) -> bool:
        """
        Delete embedding for a file

        Args:
            file_id: File inventory ID

        Returns:
            True if deleted, False otherwise
        """
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('DELETE FROM content_embeddings WHERE file_id = ?', (file_id,))
                if cursor.rowcount > 0:
                    self.logger.debug(f"Deleted embedding for file_id {file_id}")
                    return True
            return False
        except Exception as e:
            self.logger.error(f"Failed to delete embedding for file {file_id}: {e}")
            return False

    def delete_embedding_by_path(self, file_path: str) -> bool:
        """
        Delete embedding for a file by its path

        Args:
            file_path: File path

        Returns:
            True if deleted, False otherwise
        """
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # First get the file_id
                cursor.execute('SELECT id FROM file_inventory WHERE file_path = ?', (file_path,))
                row = cursor.fetchone()
                if row:
                    cursor.execute('DELETE FROM content_embeddings WHERE file_id = ?', (row['id'],))
                    if cursor.rowcount > 0:
                        self.logger.debug(f"Deleted embedding for {file_path}")
                        return True
            return False
        except Exception as e:
            self.logger.error(f"Failed to delete embedding for {file_path}: {e}")
            return False

    def generate_embedding_for_file(self, file_id: int, file_path: str, content_type: str = None) -> bool:
        """
        Generate and store embedding for a single file

        Args:
            file_id: File inventory ID
            file_path: Path to the file
            content_type: Optional content type ('image' or 'video'); when
                given it decides the file type, otherwise the extension does

        Returns:
            True if embedding generated and stored successfully
        """
        try:
            if not os.path.exists(file_path):
                self.logger.debug(f"File not found for embedding: {file_path}")
                return False
            ext = Path(file_path).suffix.lower()
            # Determine file type
            if content_type:
                is_image = 'image' in content_type.lower()
                is_video = 'video' in content_type.lower()
            else:
                is_image = ext in self.SUPPORTED_IMAGE_EXTENSIONS
                is_video = ext in self.SUPPORTED_VIDEO_EXTENSIONS

            embedding = None
            if is_image:
                embedding = self.get_image_embedding(file_path)
            elif is_video:
                embedding = self.get_video_frame_embedding(file_path)

            if embedding is not None and self.store_embedding(file_id, embedding):
                self.logger.debug(f"Generated embedding for file_id {file_id}: {Path(file_path).name}")
                return True
            return False
        except Exception as e:
            self.logger.error(f"Failed to generate embedding for file {file_id}: {e}")
            return False

    def get_embedding_stats(self) -> Dict:
        """Get statistics about embeddings in the database.

        Returns a dict with total_embeddings, total_files,
        missing_embeddings and coverage_percent, or {} on error.
        """
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                # Total embeddings for files in 'final' location only
                # (excludes embeddings for files moved to recycle bin or review)
                cursor.execute('''
                    SELECT COUNT(*) FROM content_embeddings ce
                    JOIN file_inventory fi ON ce.file_id = fi.id
                    WHERE fi.location = 'final'
                ''')
                total_embeddings = cursor.fetchone()[0]
                # Total files in final location
                cursor.execute("SELECT COUNT(*) FROM file_inventory WHERE location = 'final'")
                total_files = cursor.fetchone()[0]
                # Files without embeddings
                cursor.execute('''
                    SELECT COUNT(*) FROM file_inventory fi
                    WHERE fi.location = 'final'
                    AND NOT EXISTS (SELECT 1 FROM content_embeddings ce WHERE ce.file_id = fi.id)
                ''')
                missing_embeddings = cursor.fetchone()[0]
            return {
                'total_embeddings': total_embeddings,
                'total_files': total_files,
                'missing_embeddings': missing_embeddings,
                'coverage_percent': round((total_embeddings / total_files * 100) if total_files > 0 else 0, 2)
            }
        except Exception as e:
            self.logger.error(f"Failed to get embedding stats: {e}")
            return {}

    def generate_embeddings_batch(self, limit: int = 100, platform: str = None,
                                  progress_callback=None) -> Dict:
        """
        Generate embeddings for files that don't have them yet

        Args:
            limit: Maximum files to process
            platform: Filter by platform
            progress_callback: Optional callback(processed, total, current_file)

        Returns:
            Dict with success/error counts
        """
        results = {'processed': 0, 'success': 0, 'errors': 0, 'skipped': 0}
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                # Get files without embeddings
                query = '''
                    SELECT fi.id, fi.file_path, fi.content_type, fi.filename
                    FROM file_inventory fi
                    WHERE fi.location = 'final'
                    AND NOT EXISTS (SELECT 1 FROM content_embeddings ce WHERE ce.file_id = fi.id)
                '''
                params = []
                if platform:
                    query += ' AND fi.platform = ?'
                    params.append(platform)
                query += ' LIMIT ?'
                params.append(limit)
                cursor.execute(query, params)
                files = cursor.fetchall()

            total = len(files)
            self.logger.info(f"Processing {total} files for embedding generation")
            for i, file_row in enumerate(files):
                file_id = file_row['id']
                file_path = file_row['file_path']
                content_type = file_row['content_type'] or ''
                filename = file_row['filename'] or ''
                results['processed'] += 1
                if progress_callback:
                    progress_callback(i + 1, total, filename)

                # Skip if file doesn't exist
                if not os.path.exists(file_path):
                    results['skipped'] += 1
                    continue

                # Determine file type
                ext = Path(file_path).suffix.lower()
                if ext in self.SUPPORTED_IMAGE_EXTENSIONS or 'image' in content_type.lower():
                    embedding = self.get_image_embedding(file_path)
                elif ext in self.SUPPORTED_VIDEO_EXTENSIONS or 'video' in content_type.lower():
                    embedding = self.get_video_frame_embedding(file_path)
                else:
                    results['skipped'] += 1
                    continue

                if embedding is not None and self.store_embedding(file_id, embedding):
                    results['success'] += 1
                else:
                    results['errors'] += 1

            self.logger.info(f"Embedding generation complete: {results}")
            return results
        except Exception as e:
            self.logger.error(f"Failed to generate embeddings batch: {e}")
            return results

    def search_by_text(self, query: str, limit: int = 50, platform: str = None,
                       source: str = None, threshold: float = 0.2) -> List[Dict]:
        """
        Search for images/videos using natural language

        Args:
            query: Natural language search query
            limit: Maximum results
            platform: Filter by platform
            source: Filter by source
            threshold: Minimum similarity score (0-1)

        Returns:
            List of files with similarity scores
        """
        try:
            query_embedding = self.get_text_embedding(query)
            if query_embedding is None:
                return []
            return self._search_by_embedding(query_embedding, limit, platform, source, threshold)
        except Exception as e:
            self.logger.error(f"Text search failed: {e}")
            return []

    def search_by_image(self, image_path: str, limit: int = 50, platform: str = None,
                        source: str = None, threshold: float = 0.5) -> List[Dict]:
        """
        Find similar images to a given image

        Args:
            image_path: Path to query image
            limit: Maximum results
            platform: Filter by platform
            source: Filter by source
            threshold: Minimum similarity score (0-1)

        Returns:
            List of similar files with scores
        """
        try:
            query_embedding = self.get_image_embedding(image_path)
            if query_embedding is None:
                return []
            return self._search_by_embedding(query_embedding, limit, platform, source, threshold)
        except Exception as e:
            self.logger.error(f"Image search failed: {e}")
            return []

    def _embedding_for_path(self, file_path: str, content_type: Optional[str] = None) -> Optional[np.ndarray]:
        """Embed a file as video when its extension or content type says so, otherwise as an image."""
        ext = Path(file_path).suffix.lower()
        is_video = ext in self.SUPPORTED_VIDEO_EXTENSIONS or (
            ext not in self.SUPPORTED_IMAGE_EXTENSIONS
            and 'video' in (content_type or '').lower()
        )
        if is_video:
            return self.get_video_frame_embedding(file_path)
        return self.get_image_embedding(file_path)

    def search_by_file_id(self, file_id: int, limit: int = 50, platform: str = None,
                          source: str = None, threshold: float = 0.5) -> List[Dict]:
        """
        Find similar files to a file already in the database

        When the file has no stored embedding, one is generated on the fly
        (from a frame for videos, from the picture for images).

        Args:
            file_id: File inventory ID
            limit: Maximum results
            platform: Filter by platform
            source: Filter by source
            threshold: Minimum similarity score (0-1)

        Returns:
            List of similar files with scores
        """
        try:
            # Get existing embedding
            query_embedding = self.get_embedding(file_id)
            if query_embedding is None:
                # Try to generate it
                with self.db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute(
                        'SELECT file_path, content_type FROM file_inventory WHERE id = ?',
                        (file_id,))
                    row = cursor.fetchone()
                if row:
                    query_embedding = self._embedding_for_path(row['file_path'], row['content_type'])
            if query_embedding is None:
                return []
            results = self._search_by_embedding(query_embedding, limit + 1, platform, source, threshold)
            # Remove the query file itself from results
            return [r for r in results if r['id'] != file_id][:limit]
        except Exception as e:
            self.logger.error(f"Similar file search failed: {e}")
            return []

    def _search_by_embedding(self, query_embedding: np.ndarray, limit: int,
                             platform: str = None, source: str = None,
                             threshold: float = 0.2) -> List[Dict]:
        """
        Internal search using embedding vector

        Stored embeddings whose size differs from the query (for example
        left over from a different model) are skipped, so that one such row
        cannot fail the whole search.

        Args:
            query_embedding: Query embedding vector
            limit: Maximum results
            platform: Filter by platform
            source: Filter by source
            threshold: Minimum similarity score

        Returns:
            List of files with similarity scores, sorted by score
        """
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                # Build query to get all embeddings (with optional filters)
                query = '''
                    SELECT ce.file_id, ce.embedding, fi.file_path, fi.filename,
                    fi.platform, fi.source, fi.content_type, fi.file_size
                    FROM content_embeddings ce
                    JOIN file_inventory fi ON fi.id = ce.file_id
                    WHERE fi.location = 'final'
                '''
                params = []
                if platform:
                    query += ' AND fi.platform = ?'
                    params.append(platform)
                if source:
                    query += ' AND fi.source = ?'
                    params.append(source)
                cursor.execute(query, params)
                rows = cursor.fetchall()

            results = []
            incompatible = 0
            for row in rows:
                embedding = bytes_to_embedding(row['embedding'])
                if embedding.size != query_embedding.size:
                    incompatible += 1
                    continue
                similarity = cosine_similarity(query_embedding, embedding)
                if similarity >= threshold:
                    results.append({
                        'id': row['file_id'],
                        'file_path': row['file_path'],
                        'filename': row['filename'],
                        'platform': row['platform'],
                        'source': row['source'],
                        'content_type': row['content_type'],
                        'file_size': row['file_size'],
                        'similarity': round(similarity, 4)
                    })
            if incompatible:
                self.logger.debug(f"Skipped {incompatible} embeddings with a different dimension")

            # Sort by similarity descending
            results.sort(key=lambda x: x['similarity'], reverse=True)
            return results[:limit]
        except Exception as e:
            self.logger.error(f"Embedding search failed: {e}")
            return []
# Global instance (lazy initialization)
_semantic_search = None
def reset_clip_model():
    """Drop the cached CLIP model so the next request loads it afresh."""
    global _clip_model, _clip_model_name
    with _model_lock:
        _clip_model = _clip_model_name = None
        logger.info("CLIP model cache cleared, will reload on next use")
def get_semantic_search(unified_db=None, force_reload=False):
    """Return the shared SemanticSearch instance, creating it on demand.

    Args:
        unified_db: Database instance used when a new instance is created;
            a UnifiedDatabase is constructed when omitted
        force_reload: If True, clear the cached CLIP model and build a new
            instance (useful when model config changes)
    """
    global _semantic_search
    if _semantic_search is not None and not force_reload:
        return _semantic_search

    if force_reload:
        # Also reset the CLIP model so it reloads with new config
        reset_clip_model()
    if unified_db is None:
        from modules.unified_database import UnifiedDatabase
        unified_db = UnifiedDatabase()
    _semantic_search = SemanticSearch(unified_db)
    return _semantic_search

View File

@@ -0,0 +1,319 @@
#!/usr/bin/env python3
"""
Service Health Monitor - Tracks service failures and sends alerts
Only active during scheduler mode for unattended operation monitoring
"""
import json
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, Optional
from modules.universal_logger import get_logger
class ServiceHealthMonitor:
"""Monitor service health and send alerts when services get stuck"""
def __init__(self,
             state_file: str = "/opt/media-downloader/database/service_health.json",
             config: dict = None,
             error_monitoring_config: dict = None,
             pushover_notifier = None,
             scheduler_mode: bool = False):
    """
    Initialize health monitor

    Args:
        state_file: Path to JSON file storing health state
        config: Configuration dict from settings.json; its top-level keys
            override the defaults below
        error_monitoring_config: Error monitoring settings (for push alert delay)
        pushover_notifier: Instance of PushoverNotifier for alerts
        scheduler_mode: Only monitor when True (scheduler mode)
    """
    # The logger must exist before _load_state() runs: that method logs
    # through self.logger when the state file cannot be read.
    self.logger = get_logger('ServiceHealthMonitor')

    self.state_file = Path(state_file)
    self.state_file.parent.mkdir(parents=True, exist_ok=True)
    self.pushover = pushover_notifier
    self.scheduler_mode = scheduler_mode
    self.error_monitoring_config = error_monitoring_config or {}

    # Default configuration
    self.config = {
        'enabled': True,
        'notification_cooldown_hours': 24,
        'min_consecutive_failures': 2,  # Number of consecutive run failures before alerting
        'services': {
            'fastdl': {'monitor': True, 'notify': True},
            'imginn': {'monitor': True, 'notify': True},
            'snapchat': {'monitor': True, 'notify': True},
            'toolzu': {'monitor': True, 'notify': True},
            'tiktok': {'monitor': True, 'notify': True},
            'forums': {'monitor': True, 'notify': True}
        },
        'pushover': {
            'enabled': True,
            'priority': 0,
            'sound': 'pushover'
        }
    }
    # Merge user config (top-level keys replace the defaults)
    if config:
        self.config.update(config)

    # Load or initialize state
    self.state = self._load_state()
def _load_state(self) -> Dict:
    """Load health state from file.

    Returns the stored state when the file holds a JSON object with a
    'service_health' dict. A missing, unreadable or differently shaped
    file yields a fresh empty state, because the rest of the class indexes
    state['service_health'] directly.
    """
    def _log_error(message: str) -> None:
        # This method can run before the instance has a logger attribute.
        log = getattr(self, 'logger', None) or get_logger('ServiceHealthMonitor')
        log.error(message)

    if self.state_file.exists():
        try:
            with open(self.state_file, 'r') as f:
                loaded = json.load(f)
            if isinstance(loaded, dict) and isinstance(loaded.get('service_health'), dict):
                return loaded
            _log_error("Failed to load health state: unexpected structure")
        except Exception as e:
            _log_error(f"Failed to load health state: {e}")
    # Initialize empty state
    return {'service_health': {}}
def _save_state(self):
    """Save health state to file.

    The state is written to a sibling '.tmp' file and then renamed over
    the real file, so an interrupted write cannot leave truncated JSON
    behind. Failures are logged, not raised.
    """
    tmp_file = self.state_file.with_name(self.state_file.name + '.tmp')
    try:
        with open(tmp_file, 'w') as f:
            json.dump(self.state, f, indent=2, default=str)
        tmp_file.replace(self.state_file)
    except Exception as e:
        self.logger.error(f"Failed to save health state: {e}")
        # Do not leave a half-written temp file around.
        try:
            tmp_file.unlink()
        except OSError:
            pass
def _get_service_state(self, service: str) -> Dict:
    """Return the mutable state record for ``service``, creating a healthy default on first use."""
    blank_record = {
        'status': 'healthy',
        'consecutive_failures': 0,
        'last_success': None,
        'last_failure': None,
        'last_notification_sent': None,
        'failure_type': None,
        'total_failures': 0,
        'total_successes': 0
    }
    # setdefault stores the blank record only when the service is unknown.
    return self.state['service_health'].setdefault(service, blank_record)
def record_success(self, service: str):
    """
    Note a successful run of ``service`` and persist the updated state.

    Does nothing outside scheduler mode or for unmonitored services. When
    the service was marked 'stuck' and notifications are enabled for it, a
    recovery notification is sent after the state is saved.

    Args:
        service: Service name (fastdl, imginn, snapchat, etc.)
    """
    if not (self.scheduler_mode and self._is_monitored(service)):
        return

    service_state = self._get_service_state(service)
    timestamp = datetime.now()
    # Remember the old status before it is overwritten below.
    recovering = service_state['status'] == 'stuck'

    service_state.update(
        status='healthy',
        consecutive_failures=0,
        last_success=timestamp.isoformat(),
        failure_type=None,
    )
    service_state['total_successes'] += 1
    self._save_state()

    if recovering and self._should_notify(service):
        self._send_recovery_notification(service, timestamp)
def record_failure(self, service: str, reason: str = 'unknown'):
    """
    Note a failed run of ``service`` and persist the updated state.

    Does nothing outside scheduler mode or for unmonitored services. Once
    the consecutive failure count reaches 'min_consecutive_failures' the
    service is marked 'stuck', and an alert is sent if notifications are
    enabled for it and the cooldown has expired.

    Args:
        service: Service name (fastdl, imginn, snapchat, etc.)
        reason: Reason for failure (cloudflare, rate_limit, timeout, etc.)
    """
    if not (self.scheduler_mode and self._is_monitored(service)):
        return

    service_state = self._get_service_state(service)
    timestamp = datetime.now()

    service_state['consecutive_failures'] += 1
    service_state['last_failure'] = timestamp.isoformat()
    service_state['failure_type'] = reason
    service_state['total_failures'] += 1

    # Threshold is counted in consecutive failed runs.
    threshold = self.config.get('min_consecutive_failures', 2)
    if service_state['consecutive_failures'] >= threshold:
        service_state['status'] = 'stuck'
        may_alert = self._should_notify(service) and self._notification_cooldown_expired(service)
        if may_alert:
            self._send_alert_notification(service, reason, timestamp)
            service_state['last_notification_sent'] = timestamp.isoformat()

    self._save_state()
def _is_monitored(self, service: str) -> bool:
    """Tell whether health tracking applies to ``service``.

    Monitoring is off for every service when the global 'enabled' flag is
    falsy; otherwise the per-service 'monitor' flag decides, defaulting to
    True for services without an entry.
    """
    globally_enabled = self.config.get('enabled', True)
    if globally_enabled:
        per_service = self.config.get('services', {}).get(service, {})
        return per_service.get('monitor', True)
    return False
def _should_notify(self, service: str) -> bool:
    """Tell whether push notifications may be sent for ``service``.

    Requires a configured pushover client, the global pushover 'enabled'
    flag (default on) and the per-service 'notify' flag (default on).
    """
    globally_on = bool(self.pushover) and self.config.get('pushover', {}).get('enabled', True)
    if not globally_on:
        return False
    per_service = self.config.get('services', {}).get(service, {})
    return per_service.get('notify', True)
def _notification_cooldown_expired(self, service: str) -> bool:
    """Tell whether enough time has passed since the last alert for ``service``.

    Returns True when no alert was ever sent, when the stored timestamp
    cannot be parsed, or when more than the configured cooldown has elapsed.
    """
    last_sent = self._get_service_state(service).get('last_notification_sent')
    if not last_sent:
        # Never sent, can send now
        return True

    try:
        sent_at = datetime.fromisoformat(last_sent)
        # push_alert_delay_hours (error_monitoring config) wins; otherwise
        # notification_cooldown_hours, otherwise 24 hours.
        fallback_hours = self.config.get('notification_cooldown_hours', 24)
        hours = self.error_monitoring_config.get('push_alert_delay_hours', fallback_hours)
        elapsed = datetime.now() - sent_at
        return elapsed > timedelta(hours=hours)
    except (ValueError, TypeError):
        # Error parsing date, allow notification
        return True
def _send_alert_notification(self, service: str, reason: str, now: datetime):
    """Send a Pushover alert saying that ``service`` is stuck.

    Args:
        service: Service name; underscores become spaces and the result is
            title-cased for display.
        reason: Failure reason key (cloudflare, rate_limit, ...). Unknown
            keys are shown verbatim; None/empty is shown as 'Unknown Error'.
        now: Timestamp of the failure being reported.

    Failures while sending are logged, never raised.
    """
    state = self._get_service_state(service)

    # Human-readable age of the last successful run
    time_stuck = "Unknown"
    if state['last_success']:
        try:
            last_success = datetime.fromisoformat(state['last_success'])
            delta = now - last_success
            hours = int(delta.total_seconds() / 3600)
            if hours < 1:
                time_stuck = f"{int(delta.total_seconds() / 60)} minutes ago"
            elif hours < 48:
                time_stuck = f"{hours} hours ago"
            else:
                days = int(hours / 24)
                time_stuck = f"{days} days ago"
        except (ValueError, TypeError):
            pass  # keep "Unknown" when the stored timestamp is unusable

    # Format service name
    service_name = service.replace('_', ' ').title()

    # Format reason
    reason_map = {
        'cloudflare': 'Cloudflare Challenge',
        'cloudflare_challenge': 'Cloudflare Challenge',
        'rate_limit': 'Rate Limited (429)',
        'forbidden': 'Access Forbidden (403)',
        'timeout': 'Connection Timeout',
        'authentication': 'Authentication Required',
        'captcha': 'CAPTCHA Challenge',
        'blocked': 'IP Blocked',
        'unknown': 'Unknown Error'
    }
    # A None/empty or non-string reason must not crash the alert path
    # (calling .lower() on it would raise and abort the caller).
    if reason:
        reason_text = reason_map.get(str(reason).lower(), reason)
    else:
        reason_text = reason_map['unknown']

    # Build message
    title = f"⚠️ Service Alert: {service_name}"
    message = f"""Status: Stuck/Blocked
Issue: {reason_text}
Failed Since: {now.strftime('%b %d, %I:%M %p')} ({state['consecutive_failures']} consecutive failures)
Last successful download: {time_stuck if state['last_success'] else 'Never'}
Action may be required.
"""

    # Send notification
    try:
        priority = self.config.get('pushover', {}).get('priority', 0)
        sound = self.config.get('pushover', {}).get('sound', 'pushover')
        self.pushover.send_notification(
            title=title,
            message=message,
            priority=priority,
            sound=sound
        )
        self.logger.info(f"Sent alert notification for {service}: {reason}")
    except Exception as e:
        self.logger.error(f"Failed to send alert notification: {e}")
def _send_recovery_notification(self, service: str, now: datetime):
    """Send a low-priority "service recovered" push (opt-in).

    Only sent when the 'send_recovery_notifications' config flag is true.
    Failures while sending are logged, never raised.
    """
    recovery_enabled = self.config.get('send_recovery_notifications', False)
    if not recovery_enabled:
        return

    # Result is not used here; the call is kept because the state accessor
    # may initialise the entry — TODO confirm and drop if it is pure.
    self._get_service_state(service)

    display_name = service.replace('_', ' ').title()
    recovered_at = now.strftime('%b %d, %I:%M %p')
    payload = {
        'title': f"✅ Service Recovered: {display_name}",
        'message': f"""Status: Healthy
Service is working again.
Recovered at: {recovered_at}
""",
        'priority': -1,  # Low priority for recovery
        'sound': 'magic',
    }

    try:
        self.pushover.send_notification(**payload)
        self.logger.info(f"Sent recovery notification for {service}")
    except Exception as e:
        self.logger.error(f"Failed to send recovery notification: {e}")
def get_service_status(self, service: str) -> Dict:
    """Return a shallow copy of the tracked state for one service."""
    current = self._get_service_state(service)
    snapshot = current.copy()
    return snapshot
def get_all_status(self) -> Dict:
    """Return a snapshot of the state of every tracked service.

    Returns:
        Mapping of service name to a copy of that service's state dict.
        Both the outer mapping and each per-service dict are copies, so
        callers cannot alter the monitor's internal state by mutating the
        result.
    """
    return {
        name: (state.copy() if isinstance(state, dict) else state)
        for name, state in self.state['service_health'].items()
    }
def reset_service(self, service: str):
    """Forget all tracked state for ``service`` and persist the change.

    Unknown services are ignored (nothing is saved).
    """
    health = self.state['service_health']
    if service not in health:
        return
    del health[service]
    self._save_state()

257
modules/settings_manager.py Normal file
View File

@@ -0,0 +1,257 @@
#!/usr/bin/env python3
"""
Settings Manager for Media Downloader
Handles settings storage in database with JSON file compatibility
"""
import json
import sqlite3
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Any, Union, Tuple
from contextlib import contextmanager
import threading
from modules.universal_logger import get_logger
logger = get_logger('SettingsManager')
class SettingsManager:
    """Manage application settings in a SQLite database (thread-safe).

    Every setting is one row of the ``settings`` table: a text key, the
    serialized value, a ``value_type`` tag that says how to deserialize it,
    and optional category/description metadata.
    """

    def __init__(self, db_path: str):
        """
        Initialize settings manager

        Args:
            db_path: Path to SQLite database
        """
        self.db_path = db_path
        self._write_lock = threading.RLock()  # Reentrant lock for write operations
        self._create_tables()

    @contextmanager
    def _get_connection(self, for_write: bool = False):
        """Yield a fresh connection that is closed on exit.

        Args:
            for_write: When True the body runs while holding the write lock,
                so writers from different threads are serialized.
        """
        conn = sqlite3.connect(self.db_path, timeout=30.0, check_same_thread=False)
        conn.row_factory = sqlite3.Row
        try:
            if for_write:
                with self._write_lock:
                    yield conn
            else:
                yield conn
        finally:
            conn.close()

    def _create_tables(self):
        """Create the settings table and its category index if missing."""
        with self._get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS settings (
                    key TEXT PRIMARY KEY,
                    value TEXT NOT NULL,
                    value_type TEXT NOT NULL,
                    category TEXT,
                    description TEXT,
                    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                    updated_by TEXT DEFAULT 'system'
                )
            ''')
            # Create index for category lookups
            cursor.execute('''
                CREATE INDEX IF NOT EXISTS idx_settings_category
                ON settings(category)
            ''')
            conn.commit()
        logger.info("Settings tables initialized")

    def get(self, key: str, default: Any = None) -> Any:
        """
        Get a setting value

        Args:
            key: Setting key. It is matched exactly against the stored key;
                a dotted key such as 'instagram.enabled' is looked up
                literally and is NOT resolved inside a dict stored under
                'instagram'.
            default: Default value if not found

        Returns:
            Setting value or default
        """
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT value, value_type FROM settings WHERE key = ?', (key,))
            row = cursor.fetchone()
            if not row:
                return default
            value, value_type = row['value'], row['value_type']
            return self._deserialize_value(value, value_type)

    def set(self, key: str, value: Any, category: str = None,
            description: str = None, updated_by: str = 'system'):
        """
        Set a setting value

        Args:
            key: Setting key
            value: Setting value (will be serialized to JSON if needed)
            category: Optional category. When omitted, the category already
                stored for this key (if any) is kept.
            description: Optional description. When omitted, the stored
                description (if any) is kept.
            updated_by: Who updated the setting
        """
        value_str, value_type = self._serialize_value(value)
        with self._get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            # INSERT OR REPLACE rewrites the whole row, so carry over the
            # existing metadata instead of silently resetting it to NULL
            # (which would drop the key from get_category()).
            if category is None or description is None:
                cursor.execute(
                    'SELECT category, description FROM settings WHERE key = ?',
                    (key,)
                )
                existing = cursor.fetchone()
                if existing is not None:
                    if category is None:
                        category = existing['category']
                    if description is None:
                        description = existing['description']
            cursor.execute('''
                INSERT OR REPLACE INTO settings
                (key, value, value_type, category, description, updated_at, updated_by)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', (key, value_str, value_type, category, description,
                  datetime.now().isoformat(), updated_by))
            conn.commit()
        logger.debug(f"Setting updated: {key} = {value_str[:100]}")

    def get_category(self, category: str) -> Dict[str, Any]:
        """
        Get all settings in a category

        Args:
            category: Category name

        Returns:
            Flat dictionary mapping each stored key to its value
        """
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT key, value, value_type
                FROM settings
                WHERE category = ?
            ''', (category,))
            return {
                row['key']: self._deserialize_value(row['value'], row['value_type'])
                for row in cursor.fetchall()
            }

    def get_all(self) -> Dict[str, Any]:
        """
        Get all settings as a nested dictionary

        Dotted keys such as 'instagram.enabled' are expanded into nested
        dicts. Rows are processed in key order so a parent key ('instagram')
        is always applied before its dotted children, which makes the result
        independent of insertion order.

        Returns:
            Nested dictionary of all settings
        """
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT key, value, value_type FROM settings ORDER BY key')
            result = {}
            for row in cursor.fetchall():
                value = self._deserialize_value(row['value'], row['value_type'])
                # Support nested keys like 'instagram.enabled'
                self._set_nested(result, row['key'], value)
            return result

    def delete(self, key: str):
        """Delete a setting (no error if the key does not exist)."""
        with self._get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            cursor.execute('DELETE FROM settings WHERE key = ?', (key,))
            conn.commit()
        logger.debug(f"Setting deleted: {key}")

    def migrate_from_json(self, json_path: str):
        """
        Migrate settings from JSON file to database

        Args:
            json_path: Path to settings.json file. A missing file is logged
                as a warning and otherwise ignored.
        """
        json_file = Path(json_path)
        if not json_file.exists():
            logger.warning(f"JSON file not found: {json_path}")
            return
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_file, 'r', encoding='utf-8') as f:
            settings = json.load(f)
        # Store each top-level entry
        self._migrate_dict(settings, prefix='', category='root')
        logger.info(f"Settings migrated from {json_path}")

    def _migrate_dict(self, data: Dict, prefix: str = '', category: str = None):
        """Store every entry of ``data`` as one setting.

        Despite the name this does not recurse: a dict value is stored whole
        (as a JSON object) under its key rather than being flattened.

        Args:
            data: Mapping of setting name to value.
            prefix: Dotted prefix prepended to every key ('' for none).
            category: Category for all entries; when falsy, dict values use
                their own key and other values use the first prefix segment.
        """
        for key, value in data.items():
            full_key = f"{prefix}.{key}" if prefix else key
            if isinstance(value, dict):
                # Store the entire dict as a value
                self.set(full_key, value, category=category or key)
            else:
                # Store primitive value
                self.set(full_key, value, category=category or prefix.split('.')[0])

    def export_to_json(self, json_path: str):
        """
        Export settings to JSON file

        Args:
            json_path: Path to save settings.json
        """
        settings = self.get_all()
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(settings, f, indent=2)
        logger.info(f"Settings exported to {json_path}")

    def _serialize_value(self, value: Any) -> Tuple[str, str]:
        """
        Serialize value to string and determine type

        Strings are stored verbatim; everything else is JSON-encoded.

        Returns:
            Tuple of (value_string, value_type)
        """
        # bool must be tested before int: bool is a subclass of int.
        if isinstance(value, bool):
            return (json.dumps(value), 'boolean')
        elif isinstance(value, (int, float)):
            return (json.dumps(value), 'number')
        elif isinstance(value, str):
            return (value, 'string')
        elif isinstance(value, list):
            return (json.dumps(value), 'array')
        else:
            # dicts and anything else JSON can encode (e.g. None)
            return (json.dumps(value), 'object')

    def _deserialize_value(self, value_str: str, value_type: str) -> Any:
        """Inverse of _serialize_value: strings verbatim, the rest via JSON."""
        if value_type == 'string':
            return value_str
        else:
            return json.loads(value_str)

    def _set_nested(self, data: Dict, key: str, value: Any):
        """Set value in nested dictionary using dot notation.

        Missing intermediate levels are created. An intermediate level that
        already holds a non-dict value (e.g. key 'a' = 1 followed by
        'a.b' = 2) is replaced by a dict so the dotted key wins instead of
        raising TypeError.
        """
        parts = key.split('.')
        current = data
        for part in parts[:-1]:
            if not isinstance(current.get(part), dict):
                current[part] = {}
            current = current[part]
        current[parts[-1]] = value

View File

@@ -0,0 +1,871 @@
#!/usr/bin/env python3
"""
Snapchat Client Module - Direct HTTP-based Snapchat downloader using curl_cffi.
Replaces Playwright-based scraping with direct HTTP requests. Snapchat embeds
all page data in <script id="__NEXT_DATA__"> JSON tags, so no JavaScript
execution is needed. Uses story.snapchat.com which may not require Cloudflare.
Follows the same pattern as instagram_client_module.py.
"""
import os
import json
import re
import subprocess
import time
import random
import platform
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, List, Set
from modules.base_module import LoggingMixin
from modules.snapchat_scraper import SnapMedia, SnapCollection
class SnapchatClientDownloader(LoggingMixin):
"""Snapchat downloader using direct HTTP via curl_cffi (no Playwright)"""
def __init__(self,
             show_progress: bool = True,
             use_database: bool = True,
             log_callback=None,
             unified_db=None):
    """Set up the Snapchat client downloader.

    Args:
        show_progress: Whether to show download progress
        use_database: Whether to use database for dedup
        log_callback: Optional logging callback
        unified_db: UnifiedDatabase instance
    """
    self._init_logger('SnapchatClient', log_callback, default_module='Download')

    # Identity and per-run bookkeeping
    self.scraper_id = 'snapchat_client'
    self.show_progress = show_progress
    self.use_database = use_database
    self.download_count = 0
    self.downloaded_files: Set[str] = set()
    self.pending_downloads = []

    # HTTP session is created on first use
    self._session = None

    # Database access needs both a handle and the opt-in flag
    database_enabled = bool(unified_db and use_database)
    if not database_enabled:
        self.db = None
        self.unified_db = None
        self.use_database = False
    else:
        from modules.unified_database import SnapchatDatabaseAdapter
        self.db = SnapchatDatabaseAdapter(unified_db)
        self.unified_db = unified_db

    # Activity status manager (optional module)
    try:
        from modules.activity_status import get_activity_manager
    except ImportError:
        self.activity_manager = None
    else:
        self.activity_manager = get_activity_manager(unified_db)

    # Cookie data from DB
    self.cookies = []
    self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
def _get_session(self):
    """Return the shared curl_cffi session, building it on first use.

    The first impersonation profile that the installed curl_cffi accepts is
    used; if none is accepted a plain session is created. Default headers
    are applied and cookies are loaded once, when the session is created.
    """
    if self._session is not None:
        return self._session

    from curl_cffi.requests import Session

    # Try multiple browser versions for curl_cffi compatibility
    created = None
    for profile in ("chrome131", "chrome136", "chrome"):
        try:
            created = Session(impersonate=profile)
        except Exception:
            continue
        break
    if created is None:
        created = Session()
    self._session = created

    default_headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
    }
    self._session.headers.update(default_headers)

    # Load cookies from database
    self._load_cookies()
    return self._session
def _load_cookies(self):
    """Load cookies from database for authenticated requests.

    Looks for cookies stored under 'snapchat_client' first and falls back
    to 'snapchat'. The first scraper that has cookies wins: its cookies are
    copied into the session (entries without a name or value are skipped)
    and, if a user-agent is stored for it in the ``scrapers`` table, that
    user-agent replaces the default. Errors are logged at debug level and
    never raised.
    """
    if not self.unified_db:
        return

    # Try snapchat_client cookies first, fall back to snapchat
    for scraper_id in ['snapchat_client', 'snapchat']:
        try:
            cookies = self.unified_db.get_scraper_cookies(scraper_id)
            if not cookies:
                continue

            self.log(f"Loaded {len(cookies)} cookies from '{scraper_id}' scraper", "debug")
            self.cookies = cookies
            for cookie in cookies:
                name = cookie.get('name', '')
                value = cookie.get('value', '')
                domain = cookie.get('domain', '.snapchat.com')
                if name and value and self._session:
                    self._session.cookies.set(name, value, domain=domain)

            # Check if we have a stored user-agent (important for cf_clearance match)
            try:
                with self.unified_db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute(
                        "SELECT user_agent FROM scrapers WHERE id = ?",
                        (scraper_id,)
                    )
                    row = cursor.fetchone()
                    if row and row[0]:
                        self.user_agent = row[0]
                        if self._session:
                            self._session.headers['User-Agent'] = self.user_agent
            except Exception as e:
                # Best effort: keep the default user-agent, but leave a trace
                self.log(f"Could not load user-agent for '{scraper_id}': {e}", "debug")
            return
        except Exception as e:
            self.log(f"Error loading cookies from '{scraper_id}': {e}", "debug")
def _fetch_page(self, url: str) -> Optional[str]:
    """Fetch a page via HTTP and return the HTML content.

    Tries the story.snapchat.com variant of ``url`` first (no Cloudflare),
    then the www.snapchat.com variant. A URL that contains neither host
    yields a single candidate and is requested only once.

    Args:
        url: Page URL on either Snapchat host.

    Returns:
        The HTML of the first response with status 200 that contains
        ``__NEXT_DATA__``, or None when every attempt failed.
    """
    session = self._get_session()

    story_url = url.replace('www.snapchat.com', 'story.snapchat.com')
    www_url = url.replace('story.snapchat.com', 'www.snapchat.com')

    # dict.fromkeys keeps order and drops the duplicate when both
    # replacements leave the URL unchanged.
    for attempt_url in dict.fromkeys((story_url, www_url)):
        origin = attempt_url.split('/@')[0]  # log the host part only
        try:
            resp = session.get(attempt_url, timeout=30)
            if resp.status_code == 200 and '__NEXT_DATA__' in resp.text:
                return resp.text
            if resp.status_code == 403:
                self.log(f"403 Forbidden from {origin}", "debug")
            elif resp.status_code != 200:
                self.log(f"HTTP {resp.status_code} from {origin}", "debug")
            else:
                self.log(f"No __NEXT_DATA__ in response from {origin}", "debug")
        except Exception as e:
            self.log(f"Error fetching {origin}: {e}", "debug")
    return None
def _extract_next_data(self, html: str) -> Optional[Dict]:
    """Pull the JSON payload out of the page's __NEXT_DATA__ script tag.

    Returns the decoded object, or None when the tag is absent or its
    content is not valid JSON (the latter is logged as an error).
    """
    found = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if found is None:
        return None

    payload = found.group(1)
    try:
        decoded = json.loads(payload)
    except json.JSONDecodeError as e:
        self.log(f"Failed to parse __NEXT_DATA__ JSON: {e}", "error")
        return None
    return decoded
def get_profile_content(self, username: str) -> Dict[str, List]:
    """Get all spotlight URLs, highlight URLs, and inline story/highlight data from a profile.

    Parses the profile page (and its __NEXT_DATA__ JSON) to extract:
    - spotlights: list of spotlight URL strings, in page order, de-duplicated
    - highlights: list of highlight URL strings (only filled by the regex
      fallback used when __NEXT_DATA__ cannot be parsed)
    - story_collection: SnapCollection from story.snapList (recent stories), or None
    - highlight_collections: list of SnapCollection from curatedHighlights (inline data)

    The inline data avoids needing separate HTTP requests for stories and highlights.

    Args:
        username: Snapchat username (without the leading '@').

    Returns:
        Dict with the four keys above; all empty/None when the page could
        not be fetched.
    """
    result = {'spotlights': [], 'highlights': [], 'story_collection': None, 'highlight_collections': []}
    url = f"https://story.snapchat.com/@{username}"
    self.log(f"Fetching profile for @{username}", "info")
    html = self._fetch_page(url)
    if not html:
        self.log(f"Failed to fetch profile page for @{username}", "warning")
        return result

    # Extract spotlight URLs via regex (still needed — spotlight metadata requires per-URL fetch).
    # dict.fromkeys de-duplicates while keeping page order; a set would
    # shuffle the IDs, making any later "first N" truncation arbitrary.
    spotlight_pattern = rf'/@{re.escape(username)}/spotlight/([A-Za-z0-9_-]+)'
    spotlight_ids = list(dict.fromkeys(re.findall(spotlight_pattern, html)))
    result['spotlights'] = [
        f"https://story.snapchat.com/@{username}/spotlight/{sid}"
        for sid in spotlight_ids
    ]
    self.log(f"Found {len(result['spotlights'])} spotlights", "info")

    # Parse __NEXT_DATA__ for stories and highlights (much more reliable than regex)
    data = self._extract_next_data(html)
    if not data:
        # Fall back to regex for highlights
        highlight_pattern = rf'/@{re.escape(username)}/highlight/([A-Za-z0-9-]+)'
        highlight_ids = list(dict.fromkeys(re.findall(highlight_pattern, html)))
        result['highlights'] = [
            f"https://story.snapchat.com/@{username}/highlight/{hid}"
            for hid in highlight_ids
        ]
        self.log(f"Found {len(result['highlights'])} highlights (regex fallback)", "info")
        return result

    props = (data.get('props') or {}).get('pageProps') or {}

    # Extract story snapList (recent stories — not available via individual URLs)
    story = props.get('story') or {}
    story_snaps = story.get('snapList') or []
    if story_snaps:
        story_id = story.get('storyId') or {}
        if isinstance(story_id, dict):
            story_id = story_id.get('value', 'story')
        # Like highlight titles below, the title may be a plain string or
        # wrapped as {'value': ...}; unwrap it so the title is always text.
        story_title = story.get('storyTitle') or ''
        if isinstance(story_title, dict):
            story_title = story_title.get('value', '')
        story_collection = SnapCollection(
            collection_id=story_id or 'story',
            collection_type='story',
            title=story_title or 'Stories',
            username=username,
            url=url
        )
        for snap_data in story_snaps:
            snap = self._parse_snap_data(snap_data)
            if snap:
                story_collection.snaps.append(snap)
        if story_collection.snaps:
            result['story_collection'] = story_collection
            self.log(f"Found {len(story_collection.snaps)} story snaps", "info")

    # Extract curatedHighlights inline (avoids per-highlight HTTP requests)
    curated_highlights = props.get('curatedHighlights') or []
    for highlight in curated_highlights:
        highlight_id = highlight.get('highlightId') or {}
        if isinstance(highlight_id, dict):
            highlight_id = highlight_id.get('value', '')
        title = highlight.get('storyTitle') or {}
        if isinstance(title, dict):
            title = title.get('value', '')
        collection = SnapCollection(
            collection_id=highlight_id,
            collection_type='highlight',
            title=title or 'Untitled Highlight',
            username=username,
            url=f"https://story.snapchat.com/@{username}/highlight/{highlight_id}"
        )
        for snap_data in highlight.get('snapList') or []:
            snap = self._parse_snap_data(snap_data)
            if snap:
                collection.snaps.append(snap)
        # Highlights without any usable snap are dropped
        if collection.snaps:
            result['highlight_collections'].append(collection)
    self.log(f"Found {len(result['highlight_collections'])} highlights (inline)", "info")
    return result
def _parse_snap_data(self, snap_data: Dict) -> Optional[SnapMedia]:
    """Parse a snap from __NEXT_DATA__ snapList into a SnapMedia object.

    Args:
        snap_data: One entry of a ``snapList`` array.

    Returns:
        A SnapMedia, or None when the entry has no media URL. The media ID
        is the file stem following '/d/' in the media URL, falling back to
        the snapId. A missing, zero or unparseable timestamp is replaced by
        the current time so one malformed snap cannot abort parsing of the
        whole page.
    """
    snap_urls = snap_data.get('snapUrls') or {}
    media_url = snap_urls.get('mediaUrl', '')
    if not media_url:
        return None

    snap_id = (snap_data.get('snapId') or {}).get('value', '')
    media_id = ''
    if '/d/' in media_url:
        media_id = media_url.split('/d/')[1].split('.')[0]

    ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
    timestamp = None
    if ts_str and ts_str != '0':
        try:
            timestamp = datetime.fromtimestamp(int(ts_str))
        except (ValueError, TypeError, OverflowError, OSError):
            timestamp = None  # non-numeric or out-of-range epoch value
    if timestamp is None:
        timestamp = datetime.now()

    lat = snap_data.get('lat')
    lng = snap_data.get('lng')

    return SnapMedia(
        media_id=media_id or snap_id,
        media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
        media_url=media_url,
        timestamp=timestamp,
        index=snap_data.get('snapIndex', 0),
        thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
        lat=float(lat) if lat else None,
        lng=float(lng) if lng else None
    )
def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
    """Extract full metadata from a spotlight URL via __NEXT_DATA__.

    Args:
        url: Spotlight page URL.

    Returns:
        A 'spotlight' SnapCollection built from the first entry of
        ``spotlightFeed.spotlightStories``, or None when the page cannot be
        fetched/parsed or contains no spotlight story. Snaps without a
        media URL are skipped because there is nothing to download. Null or
        non-numeric video metadata falls back to defaults instead of
        raising.
    """
    html = self._fetch_page(url)
    if not html:
        return None
    data = self._extract_next_data(html)
    if not data:
        return None

    props = (data.get('props') or {}).get('pageProps') or {}
    feed = props.get('spotlightFeed') or {}
    stories = feed.get('spotlightStories') or []
    if not stories:
        return None

    story_data = stories[0]
    story = story_data.get('story') or {}
    metadata = (story_data.get('metadata') or {}).get('videoMetadata') or {}
    story_id = (story.get('storyId') or {}).get('value', '')
    creator = (metadata.get('creator') or {}).get('personCreator') or {}
    username = creator.get('username', '')

    def _to_int(value, default):
        # Metadata fields may be absent, null or malformed.
        try:
            return int(value)
        except (ValueError, TypeError):
            return default

    collection = SnapCollection(
        collection_id=story_id,
        collection_type='spotlight',
        title=metadata.get('description', ''),
        username=username,
        url=url
    )

    for snap_data in story.get('snapList') or []:
        snap_id = (snap_data.get('snapId') or {}).get('value', '')
        snap_urls = snap_data.get('snapUrls') or {}
        media_url = snap_urls.get('mediaUrl', '')
        if not media_url:
            continue  # nothing downloadable in this entry

        media_id = ''
        if '/d/' in media_url:
            media_id = media_url.split('/d/')[1].split('.')[0]

        ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
        try:
            timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()
        except (ValueError, TypeError, OverflowError, OSError):
            timestamp = datetime.now()

        snap = SnapMedia(
            media_id=media_id or snap_id,
            media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
            media_url=media_url,
            timestamp=timestamp,
            index=snap_data.get('snapIndex', 0),
            thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
            duration_ms=_to_int(metadata.get('durationMs', 0), 0),
            description=metadata.get('description', ''),
            view_count=_to_int(metadata.get('viewCount', 0), 0),
            width=_to_int(metadata.get('width', 540), 540),
            height=_to_int(metadata.get('height', 960), 960)
        )
        collection.snaps.append(snap)
    return collection
def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
    """Extract full metadata from a highlight URL via __NEXT_DATA__.

    Snaps are parsed with ``_parse_snap_data`` — the same helper used for
    highlights embedded in the profile page — so both paths yield identical
    SnapMedia objects: entries without a media URL are skipped and the
    media ID falls back to the snapId instead of being left empty.

    Args:
        url: Highlight page URL; the username is taken from its '@name' part.

    Returns:
        A 'highlight' SnapCollection, or None when the page cannot be
        fetched/parsed or holds no highlight data.
    """
    html = self._fetch_page(url)
    if not html:
        return None
    data = self._extract_next_data(html)
    if not data:
        return None

    props = (data.get('props') or {}).get('pageProps') or {}
    highlight = props.get('highlight') or {}
    if not highlight:
        return None

    # ID and title may be plain strings or wrapped as {'value': ...}
    highlight_id = highlight.get('highlightId') or {}
    if isinstance(highlight_id, dict):
        highlight_id = highlight_id.get('value', '')

    username_match = re.search(r'@([^/]+)', url)
    username = username_match.group(1) if username_match else ''

    title = highlight.get('storyTitle') or {}
    if isinstance(title, dict):
        title = title.get('value', '')

    collection = SnapCollection(
        collection_id=highlight_id,
        collection_type='highlight',
        title=title or 'Untitled Highlight',
        username=username,
        url=url
    )

    for snap_data in highlight.get('snapList') or []:
        snap = self._parse_snap_data(snap_data)
        if snap:
            collection.snaps.append(snap)
    return collection
def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
    """Download a single media file via curl_cffi.

    Args:
        snap: Snap whose ``media_url`` is fetched (HTML-escaped '&amp;' is
            decoded first).
        output_path: Destination file; missing parent directories are
            created. A bare filename is written to the working directory.

    Returns:
        True when a non-empty 200 response was written and tagged, False on
        any HTTP or I/O failure (failures are logged, never raised).
    """
    try:
        url = snap.media_url.replace('&amp;', '&')
        session = self._get_session()
        resp = session.get(url, timeout=60)
        payload = resp.content
        if resp.status_code == 200 and len(payload) > 0:
            # os.makedirs('') raises, so only create a directory when the
            # path actually has a directory component.
            parent = os.path.dirname(output_path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            with open(output_path, 'wb') as f:
                f.write(payload)
            self._set_metadata(output_path, snap)
            return True
        self.log(f"Download failed: HTTP {resp.status_code}", "debug")
        return False
    except Exception as e:
        self.log(f"Error downloading media: {e}", "error")
        return False
def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
    """Set EXIF metadata and file timestamp.

    Runs ``exiftool`` to write the snap's date (and, when available,
    description, view count and GPS position) into the file, then sets the
    filesystem access/modification time to the snap timestamp. The two
    steps are independent: the file time is still set when exiftool is
    missing, fails or times out. All errors are logged at debug level and
    never raised.

    Args:
        file_path: File to tag.
        snap: Source of timestamp, description, view count and position.
        description: Overrides ``snap.description`` when given.
    """
    try:
        date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
        desc = description or snap.description or ""
        if snap.view_count:
            desc += f" [Views: {snap.view_count}]"
        desc = desc.strip()

        ext = os.path.splitext(file_path)[1].lower()
        is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
        is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']

        exif_args = [
            'exiftool', '-overwrite_original', '-ignoreMinorErrors',
            f'-FileModifyDate={date_str}',
        ]
        if is_image:
            exif_args.extend([
                f'-DateTimeOriginal={date_str}',
                f'-CreateDate={date_str}',
                f'-ModifyDate={date_str}',
                f'-MetadataDate={date_str}',
            ])
            if desc:
                exif_args.extend([
                    f'-ImageDescription={desc}',
                    f'-XPComment={desc}',
                    f'-UserComment={desc}',
                ])
            if snap.lat and snap.lng:
                lat_ref = 'N' if snap.lat >= 0 else 'S'
                lng_ref = 'E' if snap.lng >= 0 else 'W'
                exif_args.extend([
                    f'-GPSLatitude={abs(snap.lat)}',
                    f'-GPSLatitudeRef={lat_ref}',
                    f'-GPSLongitude={abs(snap.lng)}',
                    f'-GPSLongitudeRef={lng_ref}',
                ])
        elif is_video:
            exif_args.extend([
                f'-CreateDate={date_str}',
                f'-ModifyDate={date_str}',
                f'-MediaCreateDate={date_str}',
                f'-MediaModifyDate={date_str}',
                f'-TrackCreateDate={date_str}',
                f'-TrackModifyDate={date_str}',
            ])
            if desc:
                exif_args.extend([
                    f'-Description={desc}',
                    f'-Comment={desc}',
                ])
        exif_args.append(file_path)
        # Argument list (no shell), so the description cannot inject commands
        subprocess.run(exif_args, capture_output=True, timeout=30)
    except Exception as e:
        self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")

    # Set filesystem modification time — done in its own try so a missing
    # or failing exiftool does not leave the file stamped with "now".
    try:
        ts = snap.timestamp.timestamp()
        os.utime(file_path, (ts, ts))
    except Exception as e:
        self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")
def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
    """Build '<username>_<YYYYMMDD>_<HHMMSS>_<media_id>.<ext>' for a snap."""
    stamp = snap.timestamp.strftime('%Y%m%d_%H%M%S')
    return "{}_{}_{}.{}".format(username, stamp, snap.media_id, ext)
def _get_processed_posts(self, username: str) -> Set[str]:
    """Get set of media IDs that have been processed.

    IDs come from two places in the ``downloads`` rows recorded for this
    user on platform 'snapchat': the ``media_id`` entry of the JSON
    metadata, and the filename, which follows
    '<username>_<YYYYMMDD>_<HHMMSS>_<media_id>.<ext>'.

    The username prefix is stripped before the filename is split, so
    usernames that themselves contain underscores (e.g. 'john_doe') still
    yield the right media ID. Filenames that do not start with the username
    are parsed the legacy way (fields after the third underscore).

    Returns:
        Set of media IDs; empty when no database is configured or the
        query fails (failures are logged at debug level).
    """
    processed = set()
    if not self.db:
        return processed

    name_prefix = f"{username}_"
    try:
        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT filename, metadata FROM downloads
                WHERE platform = 'snapchat'
                AND source = ?
            ''', (username,))
            for row in cursor.fetchall():
                filename, metadata_str = row
                if filename:
                    if filename.startswith(name_prefix):
                        # Remaining part: <date>_<time>_<media_id>.<ext>;
                        # the media ID itself may contain underscores.
                        tail = filename[len(name_prefix):].split('_', 2)
                        if len(tail) == 3:
                            processed.add(tail[2].split('.')[0])
                    else:
                        parts = filename.split('_')
                        if len(parts) >= 4:
                            processed.add('_'.join(parts[3:]).split('.')[0])
                if metadata_str:
                    try:
                        metadata = json.loads(metadata_str)
                        if 'media_id' in metadata:
                            processed.add(metadata['media_id'])
                    except (json.JSONDecodeError, TypeError, KeyError):
                        pass  # unreadable metadata: rely on the filename only
    except Exception as e:
        self.log(f"Error loading processed posts: {e}", "debug")
    return processed
def _record_download(self, username: str, url: str, filename: str,
                     post_date=None, metadata: dict = None, file_path: str = None,
                     deferred: bool = False):
    """Record a download, either immediately or as a queued entry.

    With ``deferred`` the entry is appended to ``pending_downloads`` (the
    post date is converted to ISO text when it supports ``isoformat``) and
    True is returned. Otherwise the download is written through the
    database adapter if one is configured; write errors are logged at debug
    level and None is returned.
    """
    if deferred:
        if hasattr(post_date, 'isoformat'):
            queued_date = post_date.isoformat()
        else:
            queued_date = post_date
        entry = {
            'username': username,
            'url': url,
            'filename': filename,
            'post_date': queued_date,
            'file_path': file_path,
            'metadata': metadata
        }
        self.pending_downloads.append(entry)
        return True

    if not self.db:
        return

    record = dict(
        username=username,
        url=url,
        filename=filename,
        post_date=post_date,
        metadata=metadata,
        file_path=file_path
    )
    try:
        self.db.mark_downloaded(**record)
    except Exception as e:
        self.log(f"Failed to record download: {e}", "debug")
def get_pending_downloads(self) -> list:
    """Return the queue of downloads awaiting deferred recording.

    The internal list itself is returned, not a copy.
    """
    queued = self.pending_downloads
    return queued
def clear_pending_downloads(self):
    """Reset the deferred-recording queue to an empty list.

    A new list is bound rather than clearing in place, so a list obtained
    earlier from get_pending_downloads() keeps its entries.
    """
    self.pending_downloads = list()
def download(self, username: str, content_type: str = "all", days_back: int = 14,
             max_downloads: int = 50, output_dir: str = None,
             spotlight_dir: str = None, stories_dir: str = None,
             stitch_highlights: bool = True, defer_database: bool = False,
             phrase_config: dict = None) -> int:
    """Download content from a user - compatible with media-downloader interface.

    Resolves the output directories, loads the already-processed media IDs,
    fetches the profile's spotlight and highlight URLs and then downloads
    every snap that is newer than the cutoff date and not yet known.
    Progress is reported through ``self.activity_manager`` (when set) and a
    ``TaskCheckpoint`` tracks which URLs were handled.

    An error while handling one spotlight/highlight is logged and the loop
    moves on; an error in the overall run is logged and the count reached
    so far is returned.

    Args:
        username: Snapchat username
        content_type: "spotlight", "stories", "highlights", or "all"
        days_back: How many days back to download (filters by post date)
        max_downloads: Maximum items to download per content type
        output_dir: Default output directory (used if specific dirs not set)
        spotlight_dir: Output directory for spotlights
        stories_dir: Output directory for stories/highlights
        stitch_highlights: Ignored (kept for backwards compatibility)
        defer_database: If True, defer database recording
        phrase_config: Not used (for interface compatibility)

    Returns:
        Number of files downloaded
    """
    self.defer_database = defer_database
    # Per-run de-duplication set; IDs from earlier runs come from `processed` below
    self.downloaded_files.clear()

    # Set output directories: explicit dir > output_dir > built-in default
    if spotlight_dir:
        spotlight_output = Path(spotlight_dir)
    elif output_dir:
        spotlight_output = Path(output_dir)
    else:
        spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat_client/spotlight/{username}")

    if stories_dir:
        stories_output = Path(stories_dir)
    elif output_dir:
        stories_output = Path(output_dir)
    else:
        stories_output = Path(f"/opt/media-downloader/downloads/snapchat_client/stories/{username}")

    spotlight_output.mkdir(parents=True, exist_ok=True)
    stories_output.mkdir(parents=True, exist_ok=True)

    # Update activity status
    if self.activity_manager:
        self.activity_manager.update_status("Checking Snapchat")

    # Get processed posts (shared with snapchat module - both use platform='snapchat')
    processed = self._get_processed_posts(username)
    self.log(f"Loaded {len(processed)} processed posts from database", "debug")

    # Snaps with a timestamp before this moment are skipped
    cutoff_date = datetime.now() - timedelta(days=days_back)
    downloaded_count = 0

    # Crash recovery checkpoint
    from modules.task_checkpoint import TaskCheckpoint
    checkpoint = TaskCheckpoint(f'snapchat_client:{username}', 'scraping')

    try:
        # Get profile content via HTTP
        content = self.get_profile_content(username)

        # Count total items for checkpoint (each content type is capped at max_downloads)
        total_items = 0
        if content_type in ['spotlight', 'all'] and content['spotlights']:
            total_items += min(len(content['spotlights']), max_downloads)
        if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
            total_items += min(len(content['highlights']), max_downloads)
        checkpoint.start(total_items=total_items)
        if checkpoint.is_recovering():
            self.log(f"Snapchat Client @{username}: recovering — skipping already-processed URLs", "info")

        # Download spotlights
        if content_type in ['spotlight', 'all'] and content['spotlights']:
            spotlight_items = content['spotlights'][:max_downloads]
            self.log(f"Processing {len(spotlight_items)} spotlights...", "info")
            if self.activity_manager:
                self.activity_manager.update_status(
                    "Downloading spotlights",
                    progress_current=0,
                    progress_total=len(spotlight_items)
                )

            for spot_idx, url in enumerate(spotlight_items):
                # Progress is reported before any skip, so it advances on every item
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=spot_idx + 1,
                        progress_total=len(spotlight_items)
                    )

                if checkpoint.is_completed(url):
                    continue
                checkpoint.set_current(url)

                try:
                    # Rate limit between page fetches
                    if spot_idx > 0:
                        time.sleep(random.uniform(1.5, 2.5))

                    spotlight = self.get_spotlight_metadata(url)
                    if not spotlight or not spotlight.snaps:
                        continue

                    # Only the first snap of a spotlight is downloaded
                    snap = spotlight.snaps[0]

                    # Check date filter
                    if snap.timestamp < cutoff_date:
                        self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                        continue

                    # Check if already processed
                    if snap.media_id in processed or snap.media_id in self.downloaded_files:
                        self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                        continue

                    # Download
                    ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                    filename = self._generate_filename(username, snap, ext)
                    output_path = str(spotlight_output / filename)

                    # Rate limit between CDN downloads
                    time.sleep(random.uniform(0.3, 0.5))

                    if self._download_media_file(snap, output_path):
                        self.downloaded_files.add(snap.media_id)
                        downloaded_count += 1
                        self.log(f"Downloaded spotlight: {filename}", "info")

                        self._record_download(
                            username=username,
                            url=url,
                            filename=filename,
                            post_date=snap.timestamp,
                            metadata={
                                'media_id': snap.media_id,
                                'description': snap.description,
                                'view_count': snap.view_count,
                                'content_type': 'spotlight'
                            },
                            file_path=output_path,
                            deferred=defer_database
                        )
                except Exception as e:
                    # One bad spotlight must not abort the whole run
                    self.log(f"Error processing spotlight: {e}", "error")

                # NOTE(review): source indentation was lost; this call is assumed to sit
                # at loop level after the try/except. Iterations that `continue` inside
                # the try never reach it — confirm that is intended.
                checkpoint.mark_completed(url)

        # Rate limit between content types
        if content_type == 'all' and content['spotlights'] and content['highlights']:
            time.sleep(random.uniform(2, 3))

        # Download highlights (stories)
        if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
            highlight_items = content['highlights'][:max_downloads]
            self.log(f"Processing {len(highlight_items)} highlights...", "info")
            if self.activity_manager:
                self.activity_manager.update_status(
                    "Downloading highlights",
                    progress_current=0,
                    progress_total=len(highlight_items)
                )

            for hi_idx, url in enumerate(highlight_items):
                # Progress is reported before any skip, so it advances on every item
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=hi_idx + 1,
                        progress_total=len(highlight_items)
                    )

                if checkpoint.is_completed(url):
                    continue
                checkpoint.set_current(url)

                try:
                    # Rate limit between page fetches
                    if hi_idx > 0:
                        time.sleep(random.uniform(1.5, 2.5))

                    highlight = self.get_highlight_metadata(url)
                    if not highlight or not highlight.snaps:
                        continue

                    # Check if any snap is within date range
                    newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                    if newest_snap.timestamp < cutoff_date:
                        self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                        continue

                    # Check if already processed
                    if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                        self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                        continue

                    # Separate videos and images
                    videos = [s for s in highlight.snaps if s.media_type == 'video']
                    images = [s for s in highlight.snaps if s.media_type == 'image']

                    # Download images individually (date and duplicate filters apply per snap)
                    for snap in images:
                        if snap.timestamp < cutoff_date:
                            continue
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            continue

                        time.sleep(random.uniform(0.3, 0.5))

                        filename = self._generate_filename(username, snap, 'jpg')
                        output_path = str(stories_output / filename)

                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            self.log(f"Downloaded image: {filename}", "info")

                            self._record_download(
                                username=username,
                                url=highlight.url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'highlight_id': highlight.collection_id,
                                    'content_type': 'highlight_image'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )

                    # Download videos individually (date and duplicate filters apply per snap)
                    for snap in videos:
                        if snap.timestamp < cutoff_date:
                            continue
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            continue

                        time.sleep(random.uniform(0.3, 0.5))

                        filename = self._generate_filename(username, snap, 'mp4')
                        output_path = str(stories_output / filename)

                        if self._download_media_file(snap, output_path):
                            self._set_metadata(output_path, snap)
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            self.log(f"Downloaded video: {filename}", "info")

                            self._record_download(
                                username=username,
                                url=highlight.url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'highlight_id': highlight.collection_id,
                                    'content_type': 'highlight_video'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )
                except Exception as e:
                    # One bad highlight must not abort the whole run
                    self.log(f"Error processing highlight: {e}", "error")

                # NOTE(review): assumed loop-level placement, see the spotlight loop above.
                checkpoint.mark_completed(url)
    except Exception as e:
        # Failures outside the per-item handlers (e.g. profile fetch) end the run early
        self.log(f"Error during download: {e}", "error")

    # Reached after both normal completion and a logged error
    checkpoint.finish()

    self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
    return downloaded_count

985
modules/snapchat_scraper.py Normal file
View File

@@ -0,0 +1,985 @@
#!/usr/bin/env python3
"""
Snapchat Direct Scraper Module - Scrapes directly from Snapchat.com
Uses Playwright to scrape profiles and extract:
- Spotlight videos (540x960)
- Stories/Highlights (480x852, stitched into single videos)
Full metadata extraction including timestamps, media IDs, descriptions.
Follows the same interface as the original snapchat_module.py
"""
import os
import json
import re
import tempfile
import subprocess
import shutil
import platform
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, List, Any, Set
from dataclasses import dataclass, field
# Set environment for Playwright
os.environ.setdefault('PLAYWRIGHT_BROWSERS_PATH', '/root/.cache/ms-playwright')
from modules.base_module import LoggingMixin
from modules.cloudflare_handler import (
get_playwright_context_options,
get_playwright_stealth_scripts,
get_flaresolverr_user_agent
)
@dataclass
class SnapMedia:
    """Represents a single snap media item"""
    # ID used for duplicate detection and embedded in generated filenames
    media_id: str
    media_type: str  # 'video' or 'image'
    # Download URL of the media file
    media_url: str
    # Post time; the scraper builds it with datetime.fromtimestamp (naive, local time)
    timestamp: datetime
    # Position of the snap inside its collection (read from 'snapIndex')
    index: int = 0
    # Preview image URL (read from 'mediaPreviewUrl')
    thumbnail_url: str = ""
    # The scraper fills the next five fields for spotlights only (from 'videoMetadata')
    duration_ms: int = 0
    description: str = ""
    view_count: int = 0
    width: int = 0
    height: int = 0
    # Optional coordinates; the scraper fills them for highlights only
    lat: Optional[float] = None
    lng: Optional[float] = None
@dataclass
class SnapCollection:
    """Represents a spotlight or highlight collection"""
    # Story ID (spotlight) or highlight ID (highlight)
    collection_id: str
    collection_type: str  # 'spotlight' or 'highlight'
    # Spotlight description or highlight story title
    title: str = ""
    # Owner of the collection
    username: str = ""
    # Snaps in the order they were parsed; default_factory gives each instance its own list
    snaps: List[SnapMedia] = field(default_factory=list)
    # Page URL the collection was scraped from
    url: str = ""
class SnapchatDirectScraper(LoggingMixin):
"""
Scrapes Snapchat profiles directly for media content.
Follows the same interface as SnapchatDownloader for compatibility
with the media-downloader system.
"""
def __init__(self,
             headless: bool = True,
             show_progress: bool = True,
             use_database: bool = True,
             log_callback=None,
             unified_db=None):
    """Initialize scraper compatible with media-downloader system

    Args:
        headless: Passed to the Chromium launch call in ``_start_browser``.
        show_progress: Stored on the instance; not read in this method.
        use_database: Enable download recording; forced to False when no
            ``unified_db`` is supplied.
        log_callback: Forwarded to the logging mixin.
        unified_db: Shared database object used for cookies, proxy
            configuration and download records (optional).
    """
    self.headless = headless
    self.show_progress = show_progress
    self.use_database = use_database
    self.unified_db = unified_db
    self.scraper_id = 'snapchat_direct'
    self.download_count = 0
    self.downloaded_files: Set[str] = set()
    self.pending_downloads = []
    # download() overwrites this on every call; set here so the attribute
    # exists even before the first download
    self.defer_database = False

    # Initialize logging via mixin
    self._init_logger('SnapchatDirect', log_callback, default_module='Download')

    # User-Agent to match FlareSolverr (dynamically fetched for consistency)
    self.user_agent = get_flaresolverr_user_agent()

    # Browser state (created lazily by _start_browser)
    self._playwright = None
    self.browser = None
    self.context = None

    # Database adapter
    if unified_db and use_database:
        from modules.unified_database import SnapchatDatabaseAdapter
        self.db = SnapchatDatabaseAdapter(unified_db)
    else:
        self.db = None
        self.use_database = False

    # Activity status manager (optional component)
    try:
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)
    except ImportError:
        self.activity_manager = None

    # Load cookies from database
    self.cookies = self._load_cookies_from_db()

    # Load proxy configuration from database
    self.proxy_url = None
    if unified_db:
        try:
            scraper_config = unified_db.get_scraper('snapchat')
            if scraper_config and scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                self.proxy_url = scraper_config['proxy_url']
                # The URL may carry "user:pass@"; never write credentials to the log
                safe_proxy = re.sub(r'//[^/]*@', '//***@', self.proxy_url, count=1)
                self.log(f"Using proxy: {safe_proxy}", "info")
        except Exception as e:
            self.log(f"Could not load proxy config: {e}", "debug")
def _load_cookies_from_db(self) -> List[Dict]:
"""Load cookies from database"""
if not self.unified_db:
return self._get_default_cookies()
try:
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
if cookies:
self.log(f"Loaded {len(cookies)} cookies from database", "debug")
return cookies
except Exception as e:
self.log(f"Error loading cookies from database: {e}", "warning")
# Try loading from original snapchat scraper
try:
cookies = self.unified_db.get_scraper_cookies('snapchat')
if cookies:
self.log(f"Using cookies from 'snapchat' scraper", "debug")
return cookies
except Exception as e:
self.log(f"Error loading cookies from snapchat scraper: {e}", "debug")
return self._get_default_cookies()
def _get_default_cookies(self) -> List[Dict]:
"""Get default cookies for Snapchat"""
return [
{"name": "sc-cookies-accepted", "value": "true", "domain": "www.snapchat.com", "path": "/"},
]
def _save_cookies_to_db(self, cookies: List[Dict], user_agent: str = None):
"""Save cookies to database
Args:
cookies: List of cookie dictionaries
user_agent: User agent to associate with cookies (important for cf_clearance).
If not provided, uses self.user_agent as fallback.
"""
if not self.unified_db:
return
try:
# Use provided user_agent or fall back to self.user_agent
ua = user_agent or self.user_agent
self.unified_db.save_scraper_cookies(
self.scraper_id,
cookies,
user_agent=ua,
merge=True
)
self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug")
except Exception as e:
self.log(f"Error saving cookies to database: {e}", "warning")
def _parse_proxy_url(self, proxy_url: str) -> Optional[Dict]:
"""
Parse proxy URL into Playwright proxy config.
Supports: protocol://user:pass@host:port or protocol://host:port
"""
import re
try:
# Match: protocol://[user:pass@]host:port
match = re.match(
r'^(https?|socks[45]?)://(?:([^:]+):([^@]+)@)?([^:]+):(\d+)$',
proxy_url
)
if match:
protocol, username, password, host, port = match.groups()
config = {'server': f'{protocol}://{host}:{port}'}
if username and password:
config['username'] = username
config['password'] = password
return config
except Exception as e:
self.log(f"Failed to parse proxy URL: {e}", "warning")
return None
def __enter__(self):
    """Context manager entry

    Returns the scraper itself; no browser is started here.
    """
    return self
def __exit__(self, exc_type, exc_val, exc_tb):
    """Context manager exit

    Closes the browser resources and returns False so that an exception
    raised inside the ``with`` body is not suppressed.
    """
    self._close_browser()
    return False
def _start_browser(self):
    """Start Playwright browser

    Launches Chromium, creates a browser context (fingerprint options,
    stored cookie user agent, optional proxy, stealth init script) and
    loads ``self.cookies`` into it. Does nothing when a browser is already
    running.

    Raises:
        Exception: Any error from Playwright start-up. Whatever was already
            started is closed first, so a later call can retry from a
            clean state instead of finding a browser without a context.
    """
    if self.browser is not None:
        return

    # NOTE(review): forces X display ':100' — presumably a virtual display
    # for non-headless runs; confirm against the deployment.
    os.environ['DISPLAY'] = ':100'

    from playwright.sync_api import sync_playwright

    try:
        self._playwright = sync_playwright().start()
        self.browser = self._playwright.chromium.launch(
            headless=self.headless,
            args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
        )

        # Build context options - use dynamic fingerprinting from FlareSolverr
        context_options = get_playwright_context_options()

        # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent
        # Cloudflare cf_clearance cookies are fingerprinted to the browser that solved the challenge
        try:
            stored_user_agent = None
            if self.unified_db:
                stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)

            if stored_user_agent:
                self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug", module="Browser")
                context_options['user_agent'] = stored_user_agent
            else:
                fingerprint = context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]
                self.log(f"Using fingerprint: Chrome {fingerprint}...", "debug", module="Browser")
        except Exception as e:
            self.log(f"Error getting stored user_agent, using default: {e}", "debug", module="Browser")

        # Add proxy if configured
        if self.proxy_url:
            proxy_config = self._parse_proxy_url(self.proxy_url)
            if proxy_config:
                context_options['proxy'] = proxy_config
                # Only the server part is logged; credentials stay out of the log
                self.log(f"Browser using proxy: {proxy_config.get('server')}", "info", module="Browser")

        self.context = self.browser.new_context(**context_options)

        # Add anti-detection scripts to all pages in this context
        self.context.add_init_script(get_playwright_stealth_scripts())

        # Add cookies
        if self.cookies:
            # Clean cookies for Playwright and convert expiry->expires
            cleaned = []
            for c in self.cookies:
                clean = {k: v for k, v in c.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
                # FlareSolverr uses 'expiry' but Playwright uses 'expires'
                if 'expiry' in clean and 'expires' not in clean:
                    clean['expires'] = clean.pop('expiry')
                cleaned.append(clean)

            # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
            try:
                self.context.clear_cookies()
            except Exception as e:
                # Best effort: a failed clear must not prevent loading the cookies
                self.log(f"Could not clear existing cookies: {e}", "debug", module="Browser")

            self.context.add_cookies(cleaned)
    except Exception:
        # Do not leave a half-initialised browser behind (browser set but no
        # context would make every later call skip start-up and then fail)
        self._close_browser()
        raise

    self.log("Browser started", "info", module="Browser")
def _close_browser(self):
"""Close browser and cleanup"""
if self.context:
try:
self.context.close()
except Exception as e:
self.log(f"Error closing browser context: {e}", "debug")
self.context = None
if self.browser:
try:
self.browser.close()
except Exception as e:
self.log(f"Error closing browser: {e}", "debug")
self.browser = None
if self._playwright:
try:
self._playwright.stop()
except Exception as e:
self.log(f"Error stopping playwright: {e}", "debug")
self._playwright = None
def _get_next_data(self, page) -> Optional[Dict]:
"""Extract __NEXT_DATA__ JSON from page"""
try:
next_data_elem = page.locator('script#__NEXT_DATA__').first
if next_data_elem.count() > 0:
return json.loads(next_data_elem.inner_text())
except Exception as e:
self.log(f"Error extracting __NEXT_DATA__: {e}", "debug")
return None
def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
"""Set EXIF metadata and file timestamp"""
try:
date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
desc = description or snap.description or ""
if snap.view_count:
desc += f" [Views: {snap.view_count}]"
desc = desc.strip()
ext = os.path.splitext(file_path)[1].lower()
is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']
exif_args = [
'exiftool', '-overwrite_original', '-ignoreMinorErrors',
f'-FileModifyDate={date_str}',
]
if is_image:
exif_args.extend([
f'-DateTimeOriginal={date_str}',
f'-CreateDate={date_str}',
f'-ModifyDate={date_str}',
f'-MetadataDate={date_str}',
])
if desc:
exif_args.extend([
f'-ImageDescription={desc}',
f'-XPComment={desc}',
f'-UserComment={desc}',
])
if snap.lat and snap.lng:
lat_ref = 'N' if snap.lat >= 0 else 'S'
lng_ref = 'E' if snap.lng >= 0 else 'W'
exif_args.extend([
f'-GPSLatitude={abs(snap.lat)}',
f'-GPSLatitudeRef={lat_ref}',
f'-GPSLongitude={abs(snap.lng)}',
f'-GPSLongitudeRef={lng_ref}',
])
elif is_video:
exif_args.extend([
f'-CreateDate={date_str}',
f'-ModifyDate={date_str}',
f'-MediaCreateDate={date_str}',
f'-MediaModifyDate={date_str}',
f'-TrackCreateDate={date_str}',
f'-TrackModifyDate={date_str}',
])
if desc:
exif_args.extend([
f'-Description={desc}',
f'-Comment={desc}',
])
exif_args.append(file_path)
subprocess.run(exif_args, capture_output=True, timeout=30)
# Set filesystem modification time
ts = snap.timestamp.timestamp()
os.utime(file_path, (ts, ts))
except Exception as e:
self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")
def get_profile_content(self, username: str) -> Dict[str, List[str]]:
    """Get all spotlight and highlight URLs from a profile

    Opens the public profile page, collects spotlight links from the page
    HTML, then clicks the "Stories" tab (when present) and collects
    highlight links. URLs are de-duplicated and kept in the order they
    first appear in the page, so the result is deterministic and a
    ``[:max_downloads]`` slice by the caller is stable between runs.

    Args:
        username: Snapchat username (without the leading '@').

    Returns:
        {'spotlights': [...], 'highlights': [...]} with absolute URLs.
        Errors are logged and whatever was collected so far is returned.
    """
    import time

    if not self.browser:
        self._start_browser()

    page = self.context.new_page()
    result = {'spotlights': [], 'highlights': []}

    try:
        url = f"https://www.snapchat.com/@{username}"
        self.log(f"Navigating to profile @{username}", "info")
        page.goto(url, wait_until='networkidle', timeout=30000)
        # Fixed pause after load before reading the HTML
        time.sleep(2)

        # The username goes into regexes below; escape it so characters
        # such as '.' only match themselves
        user_pattern = re.escape(username)

        content = page.content()

        # Extract spotlight URLs
        spotlight_pattern = rf'/@{user_pattern}/spotlight/([A-Za-z0-9_-]+)'
        # dict.fromkeys removes duplicates while keeping first-seen order
        spotlight_ids = list(dict.fromkeys(re.findall(spotlight_pattern, content)))
        result['spotlights'] = [
            f"https://www.snapchat.com/@{username}/spotlight/{sid}"
            for sid in spotlight_ids
        ]
        self.log(f"Found {len(result['spotlights'])} spotlights", "info")

        # Click Stories tab to get highlights
        stories_tab = page.locator('[role="tab"]:has-text("Stories")').first
        if stories_tab.count() > 0:
            stories_tab.click()
            time.sleep(2)

            content = page.content()
            highlight_pattern = rf'/@{user_pattern}/highlight/([A-Za-z0-9-]+)'
            highlight_ids = list(dict.fromkeys(re.findall(highlight_pattern, content)))
            result['highlights'] = [
                f"https://www.snapchat.com/@{username}/highlight/{hid}"
                for hid in highlight_ids
            ]
            self.log(f"Found {len(result['highlights'])} highlights", "info")
    except Exception as e:
        self.log(f"Error getting profile content: {e}", "error")
    finally:
        page.close()

    return result
def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
    """Load a spotlight page and turn its __NEXT_DATA__ payload into a SnapCollection.

    Only the first entry of ``spotlightStories`` is used. Returns None when
    the page has no usable data or when anything raises (logged as error).
    The page is always closed.
    """
    import time

    if not self.browser:
        self._start_browser()

    page = self.context.new_page()
    try:
        page.goto(url, wait_until='domcontentloaded', timeout=60000)
        time.sleep(2)

        next_data = self._get_next_data(page)
        if not next_data:
            return None

        # Every level may be missing or null, hence the "or {}" / "or []" fallbacks
        page_props = (next_data.get('props') or {}).get('pageProps') or {}
        spotlight_stories = (page_props.get('spotlightFeed') or {}).get('spotlightStories') or []
        if not spotlight_stories:
            return None

        first_entry = spotlight_stories[0]
        story = first_entry.get('story') or {}
        video_meta = (first_entry.get('metadata') or {}).get('videoMetadata') or {}

        story_id = (story.get('storyId') or {}).get('value', '')
        person = (video_meta.get('creator') or {}).get('personCreator') or {}

        spotlight = SnapCollection(
            collection_id=story_id,
            collection_type='spotlight',
            title=video_meta.get('description', ''),
            username=person.get('username', ''),
            url=url
        )

        for entry in story.get('snapList') or []:
            fallback_id = (entry.get('snapId') or {}).get('value', '')
            urls = entry.get('snapUrls') or {}
            media_url = urls.get('mediaUrl', '')

            # Media ID = URL segment after '/d/' up to the first '.'
            if '/d/' in media_url:
                media_id = media_url.split('/d/')[1].split('.')[0]
            else:
                media_id = ''

            raw_ts = (entry.get('timestampInSec') or {}).get('value', '0')
            if raw_ts:
                posted_at = datetime.fromtimestamp(int(raw_ts))
            else:
                posted_at = datetime.now()

            if entry.get('snapMediaType') == 1:
                kind = 'video'
            else:
                kind = 'image'

            # Duration, description, views and size come from the shared video metadata
            spotlight.snaps.append(SnapMedia(
                media_id=media_id or fallback_id,
                media_type=kind,
                media_url=media_url,
                timestamp=posted_at,
                index=entry.get('snapIndex', 0),
                thumbnail_url=(urls.get('mediaPreviewUrl') or {}).get('value', ''),
                duration_ms=int(video_meta.get('durationMs', 0)),
                description=video_meta.get('description', ''),
                view_count=int(video_meta.get('viewCount', 0)),
                width=int(video_meta.get('width', 540)),
                height=int(video_meta.get('height', 960))
            ))

        return spotlight
    except Exception as e:
        self.log(f"Error getting spotlight metadata: {e}", "error")
        return None
    finally:
        page.close()
def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
    """Load a highlight page and turn its __NEXT_DATA__ payload into a SnapCollection.

    The username is taken from the '@name' part of the URL. Returns None
    when the page has no highlight data or when anything raises (logged as
    error). The page is always closed.
    """
    import time

    def unwrap(raw):
        # Values arrive either plain or wrapped as {'value': ...}
        raw = raw or {}
        if isinstance(raw, dict):
            return raw.get('value', '')
        return raw

    if not self.browser:
        self._start_browser()

    page = self.context.new_page()
    try:
        page.goto(url, wait_until='domcontentloaded', timeout=60000)
        time.sleep(2)

        next_data = self._get_next_data(page)
        if not next_data:
            return None

        page_props = (next_data.get('props') or {}).get('pageProps') or {}
        highlight = page_props.get('highlight') or {}
        if not highlight:
            return None

        highlight_id = unwrap(highlight.get('highlightId'))

        owner_match = re.search(r'@([^/]+)', url)
        if owner_match:
            owner = owner_match.group(1)
        else:
            owner = ''

        story_title = unwrap(highlight.get('storyTitle'))

        result = SnapCollection(
            collection_id=highlight_id,
            collection_type='highlight',
            title=story_title or 'Untitled Highlight',
            username=owner,
            url=url
        )

        for entry in highlight.get('snapList') or []:
            urls = entry.get('snapUrls') or {}
            media_url = urls.get('mediaUrl', '')

            # Media ID = URL segment after '/d/' up to the first '.'
            if '/d/' in media_url:
                media_id = media_url.split('/d/')[1].split('.')[0]
            else:
                media_id = ''

            raw_ts = (entry.get('timestampInSec') or {}).get('value', '0')
            if raw_ts:
                posted_at = datetime.fromtimestamp(int(raw_ts))
            else:
                posted_at = datetime.now()

            raw_lat = entry.get('lat')
            raw_lng = entry.get('lng')

            if entry.get('snapMediaType') == 1:
                kind = 'video'
            else:
                kind = 'image'

            # Falsy coordinates (missing, null or zero) are stored as None
            result.snaps.append(SnapMedia(
                media_id=media_id,
                media_type=kind,
                media_url=media_url,
                timestamp=posted_at,
                index=entry.get('snapIndex', 0),
                thumbnail_url=(urls.get('mediaPreviewUrl') or {}).get('value', ''),
                lat=float(raw_lat) if raw_lat else None,
                lng=float(raw_lng) if raw_lng else None
            ))

        return result
    except Exception as e:
        self.log(f"Error getting highlight metadata: {e}", "error")
        return None
    finally:
        page.close()
def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
"""Download a single media file"""
try:
url = snap.media_url.replace('&amp;', '&')
result = subprocess.run([
'curl', '-sL', '-o', output_path,
'-H', 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
url
], capture_output=True, timeout=60)
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
self._set_metadata(output_path, snap)
return True
return False
except Exception as e:
self.log(f"Error downloading media: {e}", "error")
return False
def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
"""Generate filename with timestamp and media ID (FastDL format)"""
date_str = snap.timestamp.strftime('%Y%m%d_%H%M%S')
return f"{username}_{date_str}_{snap.media_id}.{ext}"
def _record_download(self, username: str, url: str, filename: str,
post_date=None, metadata: dict = None, file_path: str = None,
deferred: bool = False):
"""Record a download in the database"""
if deferred:
self.pending_downloads.append({
'username': username,
'url': url,
'filename': filename,
'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
'file_path': file_path,
'metadata': metadata
})
return True
if not self.db:
return
try:
self.db.mark_downloaded(
username=username,
url=url,
filename=filename,
post_date=post_date,
metadata=metadata,
file_path=file_path
)
except Exception as e:
self.log(f"Failed to record download: {e}", "debug")
def get_pending_downloads(self):
    """Return a shallow copy of the deferred download records."""
    return list(self.pending_downloads)
def clear_pending_downloads(self):
    """Clear the pending downloads list

    Rebinds the attribute to a new empty list; a list object obtained
    earlier directly from ``self.pending_downloads`` is left unchanged.
    """
    self.pending_downloads = []
def _get_processed_posts(self, username: str) -> Set[str]:
"""Get set of media IDs that have been processed"""
processed = set()
if not self.db:
return processed
try:
with self.db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute('''
SELECT filename, metadata FROM downloads
WHERE platform = 'snapchat'
AND source = ?
''', (username,))
for row in cursor.fetchall():
filename, metadata_str = row
if filename:
parts = filename.split('_')
if len(parts) >= 4:
media_id = '_'.join(parts[3:]).split('.')[0]
processed.add(media_id)
if metadata_str:
try:
metadata = json.loads(metadata_str)
if 'media_id' in metadata:
processed.add(metadata['media_id'])
except (json.JSONDecodeError, TypeError, KeyError):
pass # Invalid metadata, skip
except Exception as e:
self.log(f"Error loading processed posts: {e}", "debug")
return processed
def download(self, username: str, content_type: str = "all", days_back: int = 14,
             max_downloads: int = 50, output_dir: str = None,
             spotlight_dir: str = None, stories_dir: str = None,
             stitch_highlights: bool = True, defer_database: bool = False,
             phrase_config: dict = None) -> int:
    """
    Download content from a user - compatible with media-downloader interface

    Starts the browser, collects the profile's spotlight and highlight
    URLs and downloads every snap that is newer than the cutoff date and
    not yet known. Progress is reported through ``self.activity_manager``
    (when set) and a ``TaskCheckpoint`` tracks which URLs were handled.

    An error while handling one spotlight/highlight is logged and the loop
    moves on; an error in the overall run is logged and the count reached
    so far is returned. The browser is left running; it is closed by
    ``_close_browser`` / the context manager exit.

    Args:
        username: Snapchat username
        content_type: "spotlight", "stories", "highlights", or "all"
        days_back: How many days back to download (filters by post date)
        max_downloads: Maximum items to download per content type
        output_dir: Default output directory (used if specific dirs not set)
        spotlight_dir: Output directory for spotlights
        stories_dir: Output directory for stories/highlights
        stitch_highlights: Ignored (kept for backwards compatibility)
        defer_database: If True, defer database recording
        phrase_config: Not used (for interface compatibility)

    Returns:
        Number of files downloaded
    """
    self.defer_database = defer_database
    # Per-run de-duplication set; IDs from earlier runs come from `processed` below
    self.downloaded_files.clear()

    # Set output directories
    # If specific dirs provided, use them directly
    # If only output_dir provided, use it directly (caller handles structure)
    # If nothing provided, use default with subdirectories
    if spotlight_dir:
        spotlight_output = Path(spotlight_dir)
    elif output_dir:
        spotlight_output = Path(output_dir)
    else:
        spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat/spotlight/{username}")

    if stories_dir:
        stories_output = Path(stories_dir)
    elif output_dir:
        stories_output = Path(output_dir)
    else:
        stories_output = Path(f"/opt/media-downloader/downloads/snapchat/stories/{username}")

    spotlight_output.mkdir(parents=True, exist_ok=True)
    stories_output.mkdir(parents=True, exist_ok=True)

    # Update activity status
    if self.activity_manager:
        self.activity_manager.update_status("Checking Snapchat")

    # Get processed posts
    processed = self._get_processed_posts(username)
    self.log(f"Loaded {len(processed)} processed posts from database", "debug")

    # Snaps with a timestamp before this moment are skipped
    cutoff_date = datetime.now() - timedelta(days=days_back)
    downloaded_count = 0

    # Crash recovery checkpoint
    from modules.task_checkpoint import TaskCheckpoint
    checkpoint = TaskCheckpoint(f'snapchat:{username}', 'scraping')

    try:
        # Start browser
        self._start_browser()

        # Get profile content
        content = self.get_profile_content(username)

        # Count total items for checkpoint (each content type is capped at max_downloads)
        total_items = 0
        if content_type in ['spotlight', 'all'] and content['spotlights']:
            total_items += min(len(content['spotlights']), max_downloads)
        if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
            total_items += min(len(content['highlights']), max_downloads)
        checkpoint.start(total_items=total_items)
        if checkpoint.is_recovering():
            self.log(f"Snapchat @{username}: recovering — skipping already-processed URLs", "info")

        # Download spotlights
        if content_type in ['spotlight', 'all'] and content['spotlights']:
            spotlight_items = content['spotlights'][:max_downloads]
            self.log(f"Processing {len(spotlight_items)} spotlights...", "info")
            if self.activity_manager:
                self.activity_manager.update_status(
                    "Downloading spotlights",
                    progress_current=0,
                    progress_total=len(spotlight_items)
                )

            for spot_idx, url in enumerate(spotlight_items):
                # Update progress at start of each iteration (fires even on skips)
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=spot_idx + 1,
                        progress_total=len(spotlight_items)
                    )

                if checkpoint.is_completed(url):
                    continue
                checkpoint.set_current(url)

                try:
                    spotlight = self.get_spotlight_metadata(url)
                    if not spotlight or not spotlight.snaps:
                        continue

                    # Only the first snap of a spotlight is downloaded
                    snap = spotlight.snaps[0]

                    # Check date filter
                    if snap.timestamp < cutoff_date:
                        self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                        continue

                    # Check if already processed
                    if snap.media_id in processed or snap.media_id in self.downloaded_files:
                        self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                        continue

                    # Download
                    ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                    filename = self._generate_filename(username, snap, ext)
                    output_path = str(spotlight_output / filename)

                    if self._download_media_file(snap, output_path):
                        self.downloaded_files.add(snap.media_id)
                        downloaded_count += 1
                        self.log(f"Downloaded spotlight: {filename}", "info")

                        self._record_download(
                            username=username,
                            url=url,
                            filename=filename,
                            post_date=snap.timestamp,
                            metadata={
                                'media_id': snap.media_id,
                                'description': snap.description,
                                'view_count': snap.view_count,
                                'content_type': 'spotlight'
                            },
                            file_path=output_path,
                            deferred=defer_database
                        )
                except Exception as e:
                    # One bad spotlight must not abort the whole run
                    self.log(f"Error processing spotlight: {e}", "error")

                # NOTE(review): source indentation was lost; this call is assumed to sit
                # at loop level after the try/except. Iterations that `continue` inside
                # the try never reach it — confirm that is intended.
                checkpoint.mark_completed(url)

        # Download highlights (stories)
        if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
            highlight_items = content['highlights'][:max_downloads]
            self.log(f"Processing {len(highlight_items)} highlights...", "info")
            if self.activity_manager:
                self.activity_manager.update_status(
                    "Downloading highlights",
                    progress_current=0,
                    progress_total=len(highlight_items)
                )

            for hi_idx, url in enumerate(highlight_items):
                # Update progress at start of each iteration (fires even on skips)
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=hi_idx + 1,
                        progress_total=len(highlight_items)
                    )

                if checkpoint.is_completed(url):
                    continue
                checkpoint.set_current(url)

                try:
                    highlight = self.get_highlight_metadata(url)
                    if not highlight or not highlight.snaps:
                        continue

                    # Check if any snap is within date range
                    newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                    if newest_snap.timestamp < cutoff_date:
                        self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                        continue

                    # Check if already processed
                    if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                        self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                        continue

                    # Separate videos and images
                    videos = [s for s in highlight.snaps if s.media_type == 'video']
                    images = [s for s in highlight.snaps if s.media_type == 'image']

                    # Download images individually (date and duplicate filters apply per snap)
                    for snap in images:
                        if snap.timestamp < cutoff_date:
                            continue
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            continue

                        filename = self._generate_filename(username, snap, 'jpg')
                        output_path = str(stories_output / filename)

                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            self.log(f"Downloaded image: {filename}", "info")

                            self._record_download(
                                username=username,
                                url=highlight.url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'highlight_id': highlight.collection_id,
                                    'content_type': 'highlight_image'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )

                    # Handle videos - download each clip individually
                    for snap in videos:
                        if snap.timestamp < cutoff_date:
                            continue
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            continue

                        filename = self._generate_filename(username, snap, 'mp4')
                        output_path = str(stories_output / filename)

                        # _download_media_file already writes the metadata on success,
                        # so no second _set_metadata (exiftool) run is needed here
                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            self.log(f"Downloaded video: {filename}", "info")

                            self._record_download(
                                username=username,
                                url=highlight.url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'highlight_id': highlight.collection_id,
                                    'content_type': 'highlight_video'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )
                except Exception as e:
                    # One bad highlight must not abort the whole run
                    self.log(f"Error processing highlight: {e}", "error")

                # NOTE(review): assumed loop-level placement, see the spotlight loop above.
                checkpoint.mark_completed(url)
    except Exception as e:
        # Failures outside the per-item handlers (e.g. browser start) end the run early
        self.log(f"Error during download: {e}", "error")

    # Reached after both normal completion and a logged error
    checkpoint.finish()

    self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
    return downloaded_count
def test_scraper():
    """Run a small live download through SnapchatDirectScraper and list the results.

    Downloads at most five items from the last 30 days for a fixed account
    into ``/tmp/snap_test`` and prints each resulting file with its size in KB.
    Intended for manual runs only; it needs network access.
    """
    import os

    banner = "=" * 60
    test_root = "/tmp/snap_test"

    print(banner)
    print("SNAPCHAT DIRECT SCRAPER TEST")
    print(banner)

    with SnapchatDirectScraper(headless=True) as scraper:
        username = "evalongoria"

        # Capped download covering every content type
        count = scraper.download(
            username=username,
            content_type="all",
            days_back=30,
            max_downloads=5,
            spotlight_dir="/tmp/snap_test/spotlight",
            stories_dir="/tmp/snap_test/stories",
            stitch_highlights=True
        )
        print(f"\nDownloaded {count} files")

        # Report everything that ended up under the test directory
        for root, dirs, files in os.walk(test_root):
            for f in files:
                path = os.path.join(root, f)
                size = os.path.getsize(path) / 1024
                print(f"  {path}: {size:.1f}KB")

    print(banner)
    print("TEST COMPLETE")
    print(banner)


if __name__ == "__main__":
    test_scraper()

391
modules/taddy_client.py Normal file
View File

@@ -0,0 +1,391 @@
"""Taddy Podcast API client for finding podcast appearances"""
import asyncio
import re
from html import unescape
from datetime import datetime, timedelta
from typing import Dict, List, Optional
from web.backend.core.http_client import http_client
from modules.universal_logger import get_logger
logger = get_logger('Taddy')
def strip_html(text: str) -> str:
    """Return *text* without HTML tags, with entities decoded and whitespace collapsed.

    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not text:
        return text

    # Tags are replaced by a space so words separated only by markup do not fuse.
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    # Entities are decoded after tag removal, so "&lt;b&gt;" survives as "<b>".
    decoded = unescape(without_tags)
    # Collapse every whitespace run to one space and trim the ends.
    return re.sub(r'\s+', ' ', decoded).strip()
class TaddyClient:
    """Client for interacting with the Taddy Podcast API (GraphQL).

    Supports primary and fallback accounts for quota management.
    When the primary account fails (500 error / quota exceeded),
    automatically switches to the fallback account.
    """

    BASE_URL = "https://api.taddy.org"

    # Substrings of a podcast series name that cause an episode to be rejected
    # when the celebrity is only found via the episode title.
    _NOISE_PODCAST_KEYWORDS = (
        'news', 'gossip', 'rumor', 'daily', 'trending', 'tmz', 'variety',
        'march madness', 'cruz show', 'aesthetic arrest', 'devious maids',
    )

    def __init__(self, user_id: str, api_key: str,
                 user_id_2: Optional[str] = None, api_key_2: Optional[str] = None):
        """Store credentials and build the request headers for the primary account.

        Args:
            user_id: Primary account user id (sent as ``X-USER-ID``).
            api_key: Primary account API key (sent as ``X-API-KEY``).
            user_id_2: Optional fallback account user id.
            api_key_2: Optional fallback account API key.
        """
        # Primary account
        self.user_id = user_id
        self.api_key = api_key
        # Fallback account (optional) - only usable when both values are set
        self.user_id_2 = user_id_2
        self.api_key_2 = api_key_2
        self.has_fallback = bool(user_id_2 and api_key_2)
        # Track which account is active
        self.using_fallback = False
        self._update_headers()

    def _update_headers(self):
        """Rebuild ``self.headers`` for whichever account is currently active."""
        if self.using_fallback and self.has_fallback:
            active_user, active_key = self.user_id_2, self.api_key_2
        else:
            active_user, active_key = self.user_id, self.api_key
        self.headers = {
            "Content-Type": "application/json",
            "X-USER-ID": active_user,
            "X-API-KEY": active_key
        }

    def _switch_to_fallback(self) -> bool:
        """Switch to fallback account if available. Returns True if switched."""
        if self.has_fallback and not self.using_fallback:
            self.using_fallback = True
            self._update_headers()
            logger.info("Switched to fallback Taddy account")
            return True
        return False

    async def _graphql_query(self, query: str, variables: Optional[Dict] = None,
                             retry_on_fallback: bool = True) -> Optional[Dict]:
        """Execute a GraphQL query against the Taddy API.

        If the request fails with what looks like a 500 error (treated as
        quota exceeded), the query is retried once on the fallback account
        when one is configured.

        Args:
            query: GraphQL document.
            variables: Optional GraphQL variables.
            retry_on_fallback: Whether a fallback retry is still allowed.

        Returns:
            The ``data`` member of the response, or None on any error.
        """
        try:
            payload = {"query": query}
            if variables:
                payload["variables"] = variables
            response = await http_client.post(
                self.BASE_URL,
                json=payload,
                headers=self.headers
            )
            data = response.json()
            if "errors" in data:
                logger.error(f"Taddy API error: {data['errors']}")
                return None
            return data.get("data")
        except Exception as e:
            error_str = str(e).lower()
            # Check for 500 error (quota exceeded) - http_client raises ServiceError
            if "500" in error_str or "server error" in error_str:
                account_type = "fallback" if self.using_fallback else "primary"
                logger.warning(f"Taddy API returned 500 on {account_type} account (likely quota exceeded)")
                # Try fallback if available and we haven't already
                if retry_on_fallback and self._switch_to_fallback():
                    logger.info("Retrying with fallback Taddy account...")
                    return await self._graphql_query(query, variables, retry_on_fallback=False)
            logger.error(f"Taddy API request failed: {e}")
            return None

    @staticmethod
    def _credit_type_from_role(role) -> str:
        """Map a Taddy person role to a credit type.

        Roles containing "host" or "guest" map to those values; any other
        non-empty role is returned lower-cased; a missing role means "guest".
        """
        role = (role or "").lower()
        if "host" in role:
            return "host"
        if "guest" in role:
            return "guest"
        return role or "guest"

    @classmethod
    def _match_person(cls, persons, name_lower: str) -> Optional[str]:
        """Return the credit type of the first person entry matching the name, else None.

        A person matches when either name contains the other, or - for
        multi-word names whose last word has at least four characters - when
        the person name contains the first or the last word.
        """
        name_parts = name_lower.split()
        for person in persons:
            person_name = ((person or {}).get("name") or "").lower()
            if not person_name:
                # "" is a substring of every string, so a nameless entry would
                # otherwise match every celebrity.
                continue
            matched = name_lower in person_name or person_name in name_lower
            if not matched and len(name_parts) >= 2:
                last_name = name_parts[-1]
                first_name = name_parts[0]
                matched = len(last_name) >= 4 and (
                    last_name in person_name or first_name in person_name
                )
            if matched:
                return cls._credit_type_from_role(person.get("role"))
        return None

    @staticmethod
    def _is_host_by_series_name(podcast_name_lower: str, name_lower: str) -> bool:
        """Return True when the series title itself names the celebrity."""
        if name_lower in podcast_name_lower:
            return True
        name_parts = name_lower.split()
        if len(name_parts) < 2:
            return False
        first_name, last_name = name_parts[0], name_parts[-1]
        if len(last_name) < 4:
            return False
        return (f"with {last_name}" in podcast_name_lower or
                f"with {first_name}" in podcast_name_lower or
                f"{first_name} {last_name}" in podcast_name_lower)

    @staticmethod
    def _build_interview_patterns(name_lower: str) -> list:
        """Compile the title patterns that indicate an interview/guest appearance.

        Built once per search so the regexes are not recompiled per episode.
        """
        n = re.escape(name_lower)
        return [re.compile(pattern) for pattern in (
            # Direct interview indicators
            rf'(interview|interviews|interviewing)\s+(with\s+)?{n}',
            rf'{n}\s+(interview|interviewed)',
            # Guest indicators
            rf'(guest|featuring|feat\.?|ft\.?|with guest|special guest)[:\s]+{n}',
            rf'{n}\s+(joins|joined|stops by|sits down|talks|speaks|discusses|shares|reveals|opens up|gets real|gets honest)',
            # "Name on Topic" format (common interview title)
            rf'^{n}\s+on\s+',
            # Episode number + name format ("Ep 123: Name...")
            rf'^(ep\.?|episode|#)\s*\d+[:\s]+{n}',
            # Name at start followed by colon or dash (interview format)
            rf'^{n}\s*[:\-–—]\s*',
            # "Conversation with Name"
            rf'(conversation|chat|talk|talking|speaking)\s+with\s+{n}',
            # "Name Returns" / "Name is Back"
            rf'{n}\s+(returns|is back|comes back)',
            # Q&A format
            rf'(q&a|q\s*&\s*a|ama)\s+(with\s+)?{n}',
            # Podcast-specific patterns
            rf'{n}\s+(live|in studio|in the studio|on the show|on the pod)',
        )]

    @classmethod
    def _guest_rejection_reason(cls, ep_name_lower: str, podcast_name_lower: str,
                                name_lower: str, interview_patterns) -> Optional[str]:
        """Whitelist check for episodes without persons metadata.

        Returns None when the title clearly indicates an interview with the
        celebrity, otherwise a short reason used in the skip log message.
        """
        if name_lower not in ep_name_lower:
            return "name not in title"
        # Check podcast name for news/gossip shows first
        if any(word in podcast_name_lower for word in cls._NOISE_PODCAST_KEYWORDS):
            return "podcast name suggests news/gossip"
        # Reject listicles (multiple comma-separated topics)
        comma_count = ep_name_lower.count(',')
        if comma_count >= 3:
            return f"listicle format ({comma_count} commas)"
        if not any(pattern.search(ep_name_lower) for pattern in interview_patterns):
            return "no interview pattern match (name just mentioned)"
        return None

    @staticmethod
    def _parse_publish_date(date_published, now: datetime):
        """Convert a Unix timestamp (seconds) into ``(appearance_date, status)``.

        Returns ``(None, "aired")`` when the timestamp is missing or cannot be
        converted; status is "upcoming" only for dates after today's date.
        """
        if not date_published:
            return None, "aired"
        try:
            pub_date = datetime.fromtimestamp(date_published)
        except (ValueError, TypeError, OverflowError, OSError):
            # Out-of-range values raise OverflowError/OSError rather than ValueError
            return None, "aired"
        status = "upcoming" if pub_date.date() > now.date() else "aired"
        return pub_date.strftime("%Y-%m-%d"), status

    async def search_podcast_appearances(
        self,
        celebrity_name: str,
        lookback_days: int = 730,  # 2 years
        lookahead_days: int = 30,
        limit: int = 25,
        max_pages: int = 10
    ) -> List[Dict]:
        """
        Search for podcast episodes featuring a celebrity.

        Args:
            celebrity_name: Name of the celebrity to search for
            lookback_days: How many days back to search
            lookahead_days: How many days forward to search (for scheduled
                releases). Accepted for interface compatibility; this
                implementation does not apply it to the query.
            limit: Maximum results per page
            max_pages: Maximum number of result pages to request

        Returns:
            List of podcast appearance dicts
        """
        now = datetime.now()
        # Lower bound of the search window as a Unix timestamp (seconds)
        start_timestamp = int((now - timedelta(days=lookback_days)).timestamp())

        query = """
        query SearchPodcastEpisodes($term: String!, $limitPerPage: Int, $page: Int, $filterForPublishedAfter: Int) {
            search(
                term: $term,
                filterForTypes: PODCASTEPISODE,
                matchBy: EXACT_PHRASE,
                limitPerPage: $limitPerPage,
                page: $page,
                filterForPublishedAfter: $filterForPublishedAfter
            ) {
                searchId
                podcastEpisodes {
                    uuid
                    name
                    description
                    datePublished
                    audioUrl
                    persons {
                        uuid
                        name
                        role
                    }
                    podcastSeries {
                        uuid
                        name
                        imageUrl
                    }
                    websiteUrl
                }
            }
        }
        """

        # Paginate through results (max 20 pages API limit, 25 per page = 500 max)
        # max_pages passed as parameter from config
        all_episodes = []
        for page in range(1, max_pages + 1):
            variables = {
                "term": celebrity_name,
                "limitPerPage": limit,
                "page": page,
                "filterForPublishedAfter": start_timestamp
            }
            data = await self._graphql_query(query, variables)
            if not data or not data.get("search"):
                break
            episodes = data["search"].get("podcastEpisodes", [])
            if not episodes:
                break  # No more results
            all_episodes.extend(episodes)
            # If we got fewer than limit, we've reached the end
            if len(episodes) < limit:
                break
            # Small delay between pages
            await asyncio.sleep(0.2)

        name_lower = celebrity_name.lower()
        interview_patterns = self._build_interview_patterns(name_lower)

        appearances = []
        for ep in all_episodes:
            try:
                # The API may send explicit nulls, so default after the lookup
                podcast_series = ep.get("podcastSeries") or {}
                ep_name = ep.get("name") or ""
                podcast_name_lower = (podcast_series.get("name") or "").lower()

                # Prefer the persons metadata: it carries an explicit role
                credit_type = self._match_person(ep.get("persons") or [], name_lower)
                if credit_type:
                    logger.debug(f"Accepting '{ep_name}' - {celebrity_name} listed as {credit_type} in persons metadata")
                elif self._is_host_by_series_name(podcast_name_lower, name_lower):
                    # Fallback: they're the host if the series is named after them
                    credit_type = "host"
                    logger.debug(f"Accepting '{ep_name}' - host podcast (name in series title)")
                else:
                    # No persons metadata - only accept clear interview titles
                    reason = self._guest_rejection_reason(
                        ep_name.lower(), podcast_name_lower, name_lower, interview_patterns
                    )
                    if reason:
                        logger.debug(f"Skipping '{ep_name}' - {reason}")
                        continue
                    logger.debug(f"Accepting '{ep_name}' - matches interview pattern")
                    credit_type = "guest"

                appearance_date, status = self._parse_publish_date(ep.get("datePublished"), now)

                appearance = {
                    "appearance_type": "Podcast",
                    "show_name": podcast_series.get("name") or "Unknown Podcast",
                    "episode_title": ep.get("name"),
                    "appearance_date": appearance_date,
                    "status": status,
                    "description": strip_html(ep.get("description")),
                    "poster_url": podcast_series.get("imageUrl"),
                    "audio_url": ep.get("audioUrl"),
                    "url": ep.get("websiteUrl"),
                    "credit_type": credit_type,
                    "character_name": "Self",
                    "taddy_episode_uuid": ep.get("uuid"),
                    "taddy_podcast_uuid": podcast_series.get("uuid"),
                    "duration_seconds": None,  # Duration removed from query to reduce complexity
                }
                appearances.append(appearance)
                logger.info(f"Found podcast appearance: {celebrity_name} on '{podcast_series.get('name')}' - {ep.get('name')}")
            except Exception as e:
                logger.error(f"Error parsing Taddy episode: {e}")
                continue
        return appearances

    async def test_connection(self) -> bool:
        """Test if the API credentials are valid"""
        query = """
        query TestConnection {
            search(term: "test", filterForTypes: PODCASTSERIES, limitPerPage: 1) {
                searchId
            }
        }
        """
        data = await self._graphql_query(query)
        return data is not None

295
modules/task_checkpoint.py Normal file
View File

@@ -0,0 +1,295 @@
"""
Task Checkpoint Module for Crash Recovery
Tracks progress of long-running scheduler tasks so that if the scheduler
crashes mid-task, it can resume from where it left off instead of
re-processing everything from scratch.
Uses the scheduler_state database (PostgreSQL via pgadapter).
"""
import json
import sqlite3
import threading
import time
from contextlib import closing
from datetime import datetime
from pathlib import Path
from typing import Callable, List, Optional, Set
from modules.universal_logger import get_logger
logger = get_logger('TaskCheckpoint')
# Path to the scheduler state database
_SCHEDULER_DB_PATH = Path(__file__).parent.parent / 'database' / 'scheduler_state.db'
# How many items to buffer before flushing to DB
_FLUSH_INTERVAL = 5
# Stale checkpoint threshold (hours) — abandon checkpoints older than this
STALE_THRESHOLD_HOURS = 48
class TaskCheckpoint:
    """Progress tracker that lets a scheduler task resume after a crash.

    Completed item ids are kept in memory and periodically written to the
    ``scheduler_task_checkpoints`` table; the row is deleted when the task
    finishes normally, so a surviving 'running' row marks an interrupted task.

    Usage::

        checkpoint = TaskCheckpoint('instagram_unified:all')
        checkpoint.start(total_items=len(accounts))
        for account in accounts:
            if checkpoint.is_completed(account['username']):
                continue
            checkpoint.set_current(account['username'])
            process(account)
            checkpoint.mark_completed(account['username'])
        checkpoint.finish()
    """

    def __init__(self, task_id: str, task_type: str = 'scraping'):
        self.task_id = task_id
        self.task_type = task_type
        self._lock = threading.Lock()
        self._started = False
        self._recovering = False
        self._total_items: int = 0
        self._current_item: Optional[str] = None
        self._completed_items: Set[str] = set()
        self._pending_flush: List[str] = []  # completed since the last DB write

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def start(self, total_items: int = 0):
        """Begin tracking: resume a leftover checkpoint or create a new one.

        When a 'running' row already exists for this task_id, its completed
        items are loaded and recovery mode is switched on; otherwise a fresh
        row is written.
        """
        self._total_items = total_items
        self._started = True

        previously_done = self._load_existing()
        if previously_done is None:
            # Nothing to resume - fresh run
            self._completed_items = set()
            self._recovering = False
            self._create_record(total_items)
            return

        # A previous run left a checkpoint behind
        self._completed_items = previously_done
        self._recovering = True
        logger.info(
            f"Resuming checkpoint for {self.task_id}: "
            f"{len(self._completed_items)}/{total_items} items already completed",
            module='Checkpoint',
        )

    def is_recovering(self) -> bool:
        """Return True when start() resumed an existing checkpoint."""
        return self._recovering

    def is_completed(self, item_id: str) -> bool:
        """Return True if *item_id* is already recorded as completed."""
        return str(item_id) in self._completed_items

    def get_remaining(self, items: list, key_fn: Callable) -> list:
        """Filter *items* down to those not yet completed.

        Args:
            items: Full list of items.
            key_fn: Function that extracts the item key from each element.
        """
        remaining = []
        for candidate in items:
            if str(key_fn(candidate)) not in self._completed_items:
                remaining.append(candidate)
        return remaining

    def set_current(self, item_id: str):
        """Remember and persist the item being processed (for crash diagnostics)."""
        self._current_item = str(item_id)
        self._update_current_item()

    def mark_completed(self, item_id: str):
        """Record an item as done; the DB is written every _FLUSH_INTERVAL items."""
        key = str(item_id)
        with self._lock:
            self._completed_items.add(key)
            self._pending_flush.append(key)
            flush_due = len(self._pending_flush) >= _FLUSH_INTERVAL
        # The DB write happens outside the lock
        if flush_due:
            self._flush()

    def finish(self):
        """Task completed successfully - flush and delete the checkpoint record."""
        if self._started:
            self._flush()  # write out whatever is still buffered
            self._delete_record()
            self._started = False

    def finish_if_started(self):
        """Call finish() only when start() has been called."""
        if not self._started:
            return
        self.finish()

    # ------------------------------------------------------------------
    # Class methods for discovery
    # ------------------------------------------------------------------
    @classmethod
    def get_interrupted(cls) -> list:
        """List checkpoint rows still marked 'running'.

        Returns a list of dicts with keys:
        task_id, task_type, started_at, completed_count, total_items, current_item
        """
        try:
            with cls._connect() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT task_id, task_type, started_at, completed_items, "
                    "total_items, current_item FROM scheduler_task_checkpoints "
                    "WHERE status = 'running'"
                )
                rows = cursor.fetchall()
            return [
                {
                    'task_id': task_id,
                    'task_type': task_type,
                    'started_at': started_at,
                    'completed_count': len(cls._parse_completed_json(completed_json)),
                    'total_items': total_items or 0,
                    'current_item': current_item,
                }
                for task_id, task_type, started_at, completed_json, total_items, current_item in rows
            ]
        except Exception as e:
            # A missing table is not worth a warning
            if 'no such table' not in str(e).lower():
                logger.warning(f"Error reading interrupted checkpoints: {e}", module='Checkpoint')
            return []

    @classmethod
    def abandon(cls, task_id: str):
        """Set a checkpoint's status to 'abandoned' (e.g. task no longer registered)."""
        try:
            with cls._connect() as conn:
                conn.execute(
                    "UPDATE scheduler_task_checkpoints SET status = 'abandoned', "
                    "updated_at = ? WHERE task_id = ?",
                    (datetime.now().isoformat(), task_id),
                )
                conn.commit()
        except Exception as e:
            logger.warning(f"Error abandoning checkpoint {task_id}: {e}", module='Checkpoint')

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _connect():
        """Open the scheduler state DB wrapped so that ``with`` closes it."""
        return closing(sqlite3.connect(str(_SCHEDULER_DB_PATH), timeout=10))

    def _load_existing(self) -> Optional[Set[str]]:
        """Return completed items of an existing 'running' checkpoint, else None."""
        try:
            with self._connect() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT completed_items FROM scheduler_task_checkpoints "
                    "WHERE task_id = ? AND status = 'running'",
                    (self.task_id,),
                )
                row = cursor.fetchone()
            return None if row is None else self._parse_completed_json(row[0])
        except Exception as e:
            if 'no such table' not in str(e).lower():
                logger.warning(f"Error loading checkpoint for {self.task_id}: {e}", module='Checkpoint')
            return None

    def _create_record(self, total_items: int):
        """Write a new 'running' row, replacing any existing row for this task_id."""
        try:
            with self._connect() as conn:
                conn.execute(
                    "INSERT OR REPLACE INTO scheduler_task_checkpoints "
                    "(task_id, task_type, started_at, completed_items, current_item, "
                    "total_items, status, updated_at) "
                    "VALUES (?, ?, ?, '[]', NULL, ?, 'running', ?)",
                    (
                        self.task_id,
                        self.task_type,
                        datetime.now().isoformat(),
                        total_items,
                        datetime.now().isoformat(),
                    ),
                )
                conn.commit()
        except Exception as e:
            logger.warning(f"Error creating checkpoint for {self.task_id}: {e}", module='Checkpoint')

    def _flush(self):
        """Persist the full completed set if anything is waiting to be written."""
        with self._lock:
            if not self._pending_flush:
                return
            # The whole completed set is written, not just the pending delta
            done_so_far = list(self._completed_items)
            self._pending_flush.clear()
        try:
            payload = json.dumps(done_so_far)
            with self._connect() as conn:
                conn.execute(
                    "UPDATE scheduler_task_checkpoints "
                    "SET completed_items = ?, total_items = ?, updated_at = ? "
                    "WHERE task_id = ?",
                    (
                        payload,
                        self._total_items,
                        datetime.now().isoformat(),
                        self.task_id,
                    ),
                )
                conn.commit()
        except Exception as e:
            logger.warning(f"Error flushing checkpoint for {self.task_id}: {e}", module='Checkpoint')

    def _update_current_item(self):
        """Write the current_item column; failures are ignored (diagnostics only)."""
        try:
            with self._connect() as conn:
                conn.execute(
                    "UPDATE scheduler_task_checkpoints "
                    "SET current_item = ?, updated_at = ? WHERE task_id = ?",
                    (self._current_item, datetime.now().isoformat(), self.task_id),
                )
                conn.commit()
        except Exception:
            # Non-critical - just diagnostics
            pass

    def _delete_record(self):
        """Delete this task's checkpoint row after successful completion."""
        try:
            with self._connect() as conn:
                conn.execute(
                    "DELETE FROM scheduler_task_checkpoints WHERE task_id = ?",
                    (self.task_id,),
                )
                conn.commit()
        except Exception as e:
            logger.warning(f"Error deleting checkpoint for {self.task_id}: {e}", module='Checkpoint')

    @staticmethod
    def _parse_completed_json(raw: str) -> Set[str]:
        """Decode a JSON array of item ids into a set of strings.

        Empty input, undecodable input and non-list JSON all yield an empty set.
        """
        if not raw:
            return set()
        try:
            decoded = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            logger.warning("Corrupted checkpoint data — starting fresh (scrapers deduplicate)", module='Checkpoint')
            return set()
        if isinstance(decoded, list):
            return {str(entry) for entry in decoded}
        return set()

View File

@@ -0,0 +1,639 @@
#!/usr/bin/env python3
"""
Background worker to pre-generate thumbnails and cache metadata for all media files.
This improves performance by generating thumbnails in advance rather than on-demand.
"""
import sys
import os
import time
import hashlib
from pathlib import Path
from datetime import datetime
from PIL import Image
import io
# Add parent directory to path so we can import modules
sys.path.insert(0, str(Path(__file__).parent.parent))
# Bootstrap database backend (must be before any database imports)
import modules.db_bootstrap # noqa: E402,F401
import sqlite3
from modules.universal_logger import get_logger
logger = get_logger('ThumbnailCacheBuilder')
class ThumbnailCacheBuilder:
"""Build and maintain thumbnail and metadata cache for media files"""
def __init__(self):
    """Set scan locations, database paths, extension sets and counters, then
    make sure the metadata cache database exists."""
    database_dir = Path(__file__).parent.parent / 'database'
    media_root = Path('/opt/immich')

    # Directories walked by the filesystem fallback scan
    self.scan_dirs = [media_root / sub for sub in ('md', 'review', 'recycle')]

    self.db_path = database_dir / 'thumbnails.db'
    self.metadata_db_path = database_dir / 'media_metadata.db'
    self.unified_db_path = database_dir / 'media_downloader.db'

    # Bounding box for generated thumbnails (width, height)
    self.max_thumb_size = (300, 300)

    # Image and video extensions
    self.image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp'}
    self.video_extensions = {'.mp4', '.mov', '.webm', '.avi', '.mkv', '.flv', '.m4v'}

    # Run counters, all starting at zero
    self.stats = dict.fromkeys(
        ('processed', 'thumbnails_created', 'thumbnails_cached',
         'metadata_cached', 'errors', 'skipped'),
        0,
    )

    self._init_metadata_db()
def _init_metadata_db(self):
    """Create the metadata cache database, its table and path index if missing.

    The parent directory is created when needed. Errors from the database
    propagate to the caller; the connection is closed either way.
    """
    self.metadata_db_path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
    try:
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute("""
            CREATE TABLE IF NOT EXISTS media_metadata (
                file_hash TEXT PRIMARY KEY,
                file_path TEXT NOT NULL,
                width INTEGER,
                height INTEGER,
                file_size INTEGER,
                duration REAL,
                format TEXT,
                created_at TEXT,
                file_mtime DOUBLE PRECISION
            )
        """)
        conn.execute("CREATE INDEX IF NOT EXISTS idx_meta_file_path ON media_metadata(file_path)")
        conn.commit()
    finally:
        # Close even when a statement fails so the handle is not leaked
        conn.close()
    logger.info(f"Metadata database initialized at {self.metadata_db_path}", module="Database")
def _get_file_hash(self, file_path: Path, content_hash: str = None) -> str:
"""Generate hash for file path or use content hash
Args:
file_path: Path to the file
content_hash: Optional SHA256 content hash from database (preferred for recycle bin)
"""
if content_hash:
# Use first 64 chars of content hash (full SHA256 for cache key)
return content_hash[:64]
# Fall back to path-based hash
return hashlib.sha256(str(file_path).encode()).hexdigest()
def _generate_image_thumbnail(self, file_path: Path) -> tuple:
    """Build a JPEG thumbnail for an image and report its original properties.

    Returns: (thumbnail_data, width, height, format); all four are None
    when the image cannot be processed.
    """
    try:
        with Image.open(file_path) as source:
            # Dimensions and format are taken before any conversion or resize
            orig_width, orig_height = source.size
            source_format = source.format

            if source.mode == 'RGBA':
                # Flatten transparency onto a white canvas (band 3 is alpha)
                rgb_image = Image.new('RGB', source.size, (255, 255, 255))
                rgb_image.paste(source, mask=source.split()[3])
            elif source.mode != 'RGB':
                rgb_image = source.convert('RGB')
            else:
                rgb_image = source

            # Shrink in place to fit the thumbnail bounding box
            rgb_image.thumbnail(self.max_thumb_size, Image.Resampling.LANCZOS)

            encoded = io.BytesIO()
            rgb_image.save(encoded, format='JPEG', quality=85, optimize=True)
            return encoded.getvalue(), orig_width, orig_height, source_format
    except Exception as e:
        logger.error(f"Error generating image thumbnail for {file_path}: {e}", module="Error")
        return None, None, None, None
def _generate_video_thumbnail(self, file_path: Path) -> tuple:
    """Generate thumbnail and extract metadata for video using ffprobe/ffmpeg.

    Returns: (thumbnail_data, width, height, duration)
        - all None when probing fails or no video stream is found
        - thumbnail_data None but metadata set when only the frame grab fails
    """
    try:
        import json
        import subprocess
        import tempfile

        # Get video metadata using ffprobe
        probe_cmd = [
            'ffprobe',
            '-v', 'quiet',
            '-print_format', 'json',
            '-show_format',
            '-show_streams',
            str(file_path)
        ]
        result = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=30)
        if result.returncode != 0:
            logger.error(f"ffprobe failed for {file_path}", module="Error")
            return None, None, None, None
        metadata = json.loads(result.stdout)

        # Extract video stream info
        video_stream = next((s for s in metadata.get('streams', []) if s.get('codec_type') == 'video'), None)
        if not video_stream:
            return None, None, None, None
        width = video_stream.get('width')
        height = video_stream.get('height')
        duration = float(metadata.get('format', {}).get('duration', 0))

        # Generate thumbnail - seek to 1s or 0s for very short videos
        seek_time = '00:00:01' if duration > 1.5 else '00:00:00'

        # A private temporary directory gives each call its own output path
        # (no collisions between concurrent calls) and is removed on every
        # exit path, including failures.
        with tempfile.TemporaryDirectory(prefix='thumb_') as temp_dir:
            temp_output = Path(temp_dir) / 'thumb.jpg'
            thumb_cmd = [
                'ffmpeg',
                '-ss', seek_time,
                '-i', str(file_path),
                '-vframes', '1',
                '-vf', f'scale={self.max_thumb_size[0]}:{self.max_thumb_size[1]}:force_original_aspect_ratio=decrease',
                '-y',
                str(temp_output)
            ]
            result = subprocess.run(thumb_cmd, capture_output=True, timeout=30)
            if result.returncode != 0 or not temp_output.exists():
                logger.error(f"ffmpeg thumbnail generation failed for {file_path}", module="Error")
                return None, width, height, duration
            thumbnail_data = temp_output.read_bytes()

        return thumbnail_data, width, height, duration
    except Exception as e:
        logger.error(f"Error generating video thumbnail for {file_path}: {e}", module="Error")
        return None, None, None, None
def _cache_thumbnail(self, file_path: Path, thumbnail_data: bytes, content_hash: str = None):
"""Store thumbnail in cache database
Args:
file_path: Path to the file
thumbnail_data: JPEG thumbnail data
content_hash: Optional SHA256 content hash from database
"""
try:
file_hash = self._get_file_hash(file_path, content_hash)
file_mtime = file_path.stat().st_mtime
conn = sqlite3.connect(str(self.db_path), timeout=30.0)
conn.execute('PRAGMA journal_mode=WAL')
conn.execute("""
INSERT OR REPLACE INTO thumbnails
(file_hash, file_path, thumbnail_data, created_at, file_mtime)
VALUES (?, ?, ?, ?, ?)
""", (file_hash, str(file_path), thumbnail_data, datetime.now().isoformat(), file_mtime))
conn.commit()
conn.close()
return True
except Exception as e:
logger.error(f"Error caching thumbnail for {file_path}: {e}", module="Error")
return False
def _cache_metadata(self, file_path: Path, width: int, height: int, duration: float = None, format_type: str = None, content_hash: str = None):
"""Store metadata in cache database
Args:
file_path: Path to the file
width: Image/video width
height: Image/video height
duration: Video duration (seconds)
format_type: Media format
content_hash: Optional SHA256 content hash from database
"""
try:
file_hash = self._get_file_hash(file_path, content_hash)
file_mtime = file_path.stat().st_mtime
file_size = file_path.stat().st_size
conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
conn.execute('PRAGMA journal_mode=WAL')
conn.execute("""
INSERT OR REPLACE INTO media_metadata
(file_hash, file_path, width, height, file_size, duration, format, created_at, file_mtime)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (file_hash, str(file_path), width, height, file_size, duration, format_type,
datetime.now().isoformat(), file_mtime))
conn.commit()
conn.close()
return True
except Exception as e:
logger.error(f"Error caching metadata for {file_path}: {e}", module="Error")
return False
def _is_cached_valid(self, file_path: Path, content_hash: str = None) -> bool:
"""Check if file already has valid cached thumbnail and metadata
Args:
file_path: Path to the file
content_hash: Optional SHA256 content hash from database
"""
try:
file_hash = self._get_file_hash(file_path, content_hash)
file_mtime = file_path.stat().st_mtime
# Check thumbnail cache
conn = sqlite3.connect(str(self.db_path), timeout=30.0)
conn.execute('PRAGMA journal_mode=WAL')
cursor = conn.execute(
"SELECT file_mtime FROM thumbnails WHERE file_hash = ?",
(file_hash,)
)
thumb_result = cursor.fetchone()
conn.close()
if not thumb_result or abs(thumb_result[0] - file_mtime) > 1:
return False
# Check metadata cache
conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
conn.execute('PRAGMA journal_mode=WAL')
cursor = conn.execute(
"SELECT file_mtime FROM media_metadata WHERE file_hash = ?",
(file_hash,)
)
meta_result = cursor.fetchone()
conn.close()
if not meta_result or abs(meta_result[0] - file_mtime) > 1:
return False
return True
except Exception as e:
logger.error(f"Error checking cache for {file_path}: {e}", module="Error")
return False
def process_file(self, file_path: Path, content_hash: str = None) -> bool:
"""Process a single file - generate thumbnail and cache metadata
Args:
file_path: Path to the file
content_hash: Optional SHA256 content hash from database (preferred for cache key)
"""
try:
if not file_path.exists():
self.stats['skipped'] += 1
return True
# Check if already cached and up-to-date
if self._is_cached_valid(file_path, content_hash):
self.stats['skipped'] += 1
return True
file_ext = file_path.suffix.lower()
if file_ext in self.image_extensions:
# Process image
thumbnail_data, width, height, format_type = self._generate_image_thumbnail(file_path)
if thumbnail_data and width and height:
# Cache thumbnail
if self._cache_thumbnail(file_path, thumbnail_data, content_hash):
self.stats['thumbnails_created'] += 1
# Cache metadata
if self._cache_metadata(file_path, width, height, format_type=format_type, content_hash=content_hash):
self.stats['metadata_cached'] += 1
return True
else:
self.stats['errors'] += 1
return False
elif file_ext in self.video_extensions:
# Process video
thumbnail_data, width, height, duration = self._generate_video_thumbnail(file_path)
# Cache thumbnail if generated
if thumbnail_data:
if self._cache_thumbnail(file_path, thumbnail_data, content_hash):
self.stats['thumbnails_created'] += 1
# Cache metadata if we have dimensions
if width and height:
if self._cache_metadata(file_path, width, height, duration=duration, format_type='video', content_hash=content_hash):
self.stats['metadata_cached'] += 1
# Consider successful even if thumbnail failed (metadata might still be cached)
if width and height:
return True
else:
self.stats['errors'] += 1
return False
return True
except Exception as e:
logger.error(f"Error processing file {file_path}: {e}", module="Error")
self.stats['errors'] += 1
return False
def _get_files_from_inventory(self) -> list:
"""Query file_inventory table for all media files (database-first)
Returns: List of tuples (file_path, content_hash or None)
"""
try:
conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
# Query all files from file_inventory (any location: final, review, recycle)
# Include file_hash from recycle_bin if file is in recycle location
cursor.execute("""
SELECT
fi.file_path,
fi.content_type,
fi.location,
rb.file_hash as content_hash
FROM file_inventory fi
LEFT JOIN recycle_bin rb ON fi.file_path = rb.recycle_path
ORDER BY fi.created_date DESC
""")
rows = cursor.fetchall()
conn.close()
# Convert to Path objects and filter by extension
all_extensions = list(self.image_extensions) + list(self.video_extensions)
files = []
for row in rows:
file_path = Path(row['file_path'])
if file_path.suffix.lower() in all_extensions and file_path.exists():
# Return tuple: (file_path, content_hash or None)
content_hash = row['content_hash'] if row['content_hash'] else None
files.append((file_path, content_hash))
return files
except Exception as e:
logger.error(f"Error querying file_inventory: {e}", module="Error")
# Fallback to filesystem scan if database query fails
logger.warning("Falling back to filesystem scan...", module="Warning")
return self._fallback_filesystem_scan()
def _fallback_filesystem_scan(self) -> list:
    """Fallback: Scan filesystem if database query fails

    Each existing directory in self.scan_dirs is walked once and every
    regular file whose lower-cased suffix is a known image/video extension
    is collected. The suffix test mirrors the one used for file_inventory
    rows, so files such as "photo.JPG" are found here as well.

    Returns: List of tuples (file_path, None) - no content_hash available from filesystem
    """
    media_extensions = set(self.image_extensions) | set(self.video_extensions)
    all_files = []
    for scan_dir in self.scan_dirs:
        if not scan_dir.exists():
            continue
        # One traversal per directory instead of one rglob per extension
        all_files.extend(
            (candidate, None)
            for candidate in scan_dir.rglob('*')
            if candidate.suffix.lower() in media_extensions and candidate.is_file()
        )
    return all_files
def scan_and_process(self):
    """Query file_inventory and process all files (database-first)

    Fetches the (file_path, content_hash) list from
    _get_files_from_inventory(), calls process_file() on each entry,
    increments self.stats['processed'] per file, logs progress every 100
    files (and on the last file), and finally logs a summary of self.stats.
    """
    logger.info("Starting thumbnail and metadata cache build...", module="Core")
    logger.info("Querying file_inventory table (database-first architecture)...", module="Core")
    start_time = time.time()

    # Query file_inventory instead of scanning filesystem
    # Returns list of tuples: (file_path, content_hash or None)
    all_files = self._get_files_from_inventory()
    total_files = len(all_files)
    logger.info(f"Found {total_files} media files to process from file_inventory", module="Core")

    # Count how many have content hashes (from recycle bin)
    files_with_hash = sum(1 for _, content_hash in all_files if content_hash)
    if files_with_hash > 0:
        logger.info(f"  - {files_with_hash} files have content hash (from recycle bin - cache survives moves)", module="Core")

    # Process files with progress updates
    for i, (file_path, content_hash) in enumerate(all_files, 1):
        self.process_file(file_path, content_hash)
        self.stats['processed'] += 1

        # Progress update every 100 files
        if i % 100 == 0 or i == total_files:
            elapsed = time.time() - start_time
            rate = i / elapsed if elapsed > 0 else 0
            eta = (total_files - i) / rate if rate > 0 else 0
            logger.info(f"Progress: {i}/{total_files} ({i/total_files*100:.1f}%) - "
                        f"Rate: {rate:.1f} files/sec - ETA: {eta/60:.1f} min", module="Core")

    # Final statistics
    elapsed = time.time() - start_time
    # Same guard as the progress line: an empty or instantaneous run can
    # leave elapsed at 0 on coarse clocks, which would divide by zero
    average_rate = self.stats['processed'] / elapsed if elapsed > 0 else 0
    logger.info("=" * 60, module="Core")
    logger.info("Thumbnail and Metadata Cache Build Complete", module="Core")
    logger.info("=" * 60, module="Core")
    logger.info(f"Total files processed: {self.stats['processed']}", module="Core")
    logger.info(f"Thumbnails created: {self.stats['thumbnails_created']}", module="Core")
    logger.info(f"Metadata cached: {self.stats['metadata_cached']}", module="Core")
    logger.info(f"Files skipped (already cached): {self.stats['skipped']}", module="Core")
    logger.info(f"Errors: {self.stats['errors']}", module="Core")
    logger.info(f"Total time: {elapsed/60:.1f} minutes", module="Core")
    logger.info(f"Average rate: {average_rate:.1f} files/sec", module="Core")
    logger.info("=" * 60, module="Core")
def cleanup_orphaned_records(self):
    """Clean up orphaned database records for files that no longer exist

    A record is orphaned when its file_path has no matching row in
    file_inventory. Four stores are cleaned:
      * face_recognition_scans and downloads in the unified database
      * media_metadata in the metadata cache database
      * thumbnails in database/thumbnails.db

    Failures in the two cache databases are non-critical: they are logged
    (or, for schema errors on the thumbnail cache, ignored) and the cleanup
    continues. A failure in the unified database aborts the cleanup.

    Returns:
        Dict mapping store name to the number of orphaned rows removed
        (zeros for anything not reached or not cleaned).
    """
    logger.info("Starting database cleanup for orphaned records...", module="Cleanup")
    cleanup_stats = {
        'face_recognition_scans': 0,
        'downloads': 0,
        'media_metadata': 0,
        'thumbnail_cache': 0
    }
    conn = None
    try:
        conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
        cursor = conn.cursor()

        # Clean up face_recognition_scans for files not in file_inventory.
        # rowcount reports what was actually deleted, so no separate COUNT.
        cursor.execute("""
            DELETE FROM face_recognition_scans
            WHERE NOT EXISTS (
                SELECT 1 FROM file_inventory fi WHERE fi.file_path = face_recognition_scans.file_path
            )
        """)
        orphaned_count = cursor.rowcount
        conn.commit()
        if orphaned_count > 0:
            cleanup_stats['face_recognition_scans'] = orphaned_count
            logger.info(f"Removed {orphaned_count} orphaned face_recognition_scans records", module="Cleanup")

        # Clean up downloads for files not in file_inventory
        cursor.execute("""
            DELETE FROM downloads
            WHERE file_path IS NOT NULL AND file_path != ''
            AND NOT EXISTS (
                SELECT 1 FROM file_inventory fi WHERE fi.file_path = downloads.file_path
            )
        """)
        orphaned_downloads = cursor.rowcount
        conn.commit()
        if orphaned_downloads > 0:
            cleanup_stats['downloads'] = orphaned_downloads
            logger.info(f"Removed {orphaned_downloads} orphaned downloads records", module="Cleanup")

        # The cache databases are separate files, so orphan detection for
        # them is done in Python against this set (read once, used twice)
        cursor.execute("SELECT file_path FROM file_inventory")
        valid_paths = {row[0] for row in cursor.fetchall()}
        conn.close()
        conn = None

        thumb_db_path = Path(__file__).parent.parent / 'database' / 'thumbnails.db'
        # (stats key, db path, table, select, skip empty paths,
        #  tolerate schema errors, label used in the log line)
        cache_targets = (
            ('media_metadata', self.metadata_db_path, 'media_metadata',
             "SELECT file_path FROM media_metadata", False, False, "media_metadata"),
            # Thumbnails use file_hash as key; the table may not have a
            # file_path column at all, which surfaces as OperationalError
            ('thumbnail_cache', thumb_db_path, 'thumbnails',
             "SELECT file_path FROM thumbnails WHERE file_path IS NOT NULL", True, True, "thumbnail"),
        )
        for stats_key, db_path, table, select_sql, skip_empty, tolerate_schema, label in cache_targets:
            try:
                removed = self._delete_orphans_by_path(db_path, table, select_sql, valid_paths, skip_empty)
            except sqlite3.OperationalError as e:
                if not tolerate_schema:
                    logger.warning(f"Skipping {label} cleanup (non-critical): {e}", module="Cleanup")
                continue
            except Exception as e:
                # Cache cleanup is non-critical, but say why it was skipped
                logger.warning(f"Skipping {label} cleanup (non-critical): {e}", module="Cleanup")
                continue
            if removed:
                cleanup_stats[stats_key] = removed
                logger.info(f"Removed {removed} orphaned {label} records", module="Cleanup")

        # Log summary
        total_cleaned = sum(cleanup_stats.values())
        logger.info("=" * 60, module="Cleanup")
        logger.info("Database Cleanup Complete", module="Cleanup")
        logger.info("=" * 60, module="Cleanup")
        logger.info(f"Total orphaned records removed: {total_cleaned}", module="Cleanup")
        for table, count in cleanup_stats.items():
            if count > 0:
                logger.info(f"  - {table}: {count}", module="Cleanup")
        logger.info("=" * 60, module="Cleanup")
        return cleanup_stats
    except Exception as e:
        logger.error(f"Error during database cleanup: {e}", exc_info=True, module="Error")
        return cleanup_stats
    finally:
        # Ensure the unified database connection is closed
        if conn:
            try:
                conn.close()
            except Exception:
                pass  # Best effort cleanup

def _delete_orphans_by_path(self, db_path, table, select_sql, valid_paths, skip_empty=False, chunk_size=500):
    """Delete rows of `table` in the database at db_path whose file_path is not in valid_paths.

    Args:
        db_path: Path of the SQLite database holding the table
        table: Table name (internal constant, interpolated into the SQL)
        select_sql: Query returning the candidate file_path values
        valid_paths: Set of paths present in file_inventory
        skip_empty: If True, NULL/empty paths are never treated as orphans
        chunk_size: Maximum number of bound parameters per DELETE

    Returns:
        Number of orphaned paths found (and submitted for deletion)
    """
    cache_conn = sqlite3.connect(str(db_path), timeout=30.0)
    try:
        cursor = cache_conn.cursor()
        cursor.execute(select_sql)
        orphaned = [
            row[0] for row in cursor.fetchall()
            if row[0] not in valid_paths and (row[0] or not skip_empty)
        ]
        # SQLite caps the number of bound parameters per statement, so a
        # single IN (...) over every orphan can fail on large backlogs
        for start in range(0, len(orphaned), chunk_size):
            chunk = orphaned[start:start + chunk_size]
            placeholders = ','.join('?' * len(chunk))
            cursor.execute(f"DELETE FROM {table} WHERE file_path IN ({placeholders})", chunk)
        if orphaned:
            cache_conn.commit()
        return len(orphaned)
    finally:
        cache_conn.close()
def main():
    """Run the cache builder: orphan cleanup first, then the cache build.

    Returns:
        0 when both phases finish, 1 if any exception escapes them
        (the exception is logged, not re-raised).
    """
    logger.info("Thumbnail Cache Builder starting...", module="Core")
    exit_code = 1
    try:
        cache_builder = ThumbnailCacheBuilder()
        # Cleanup runs before processing so stale rows are gone first
        phases = (
            ("Phase 1: Database cleanup for orphaned records", cache_builder.cleanup_orphaned_records),
            ("Phase 2: Thumbnail and metadata cache building", cache_builder.scan_and_process),
        )
        for banner, run_phase in phases:
            logger.info(banner, module="Core")
            run_phase()
        logger.info("Thumbnail Cache Builder completed successfully", module="Core")
        exit_code = 0
    except Exception as e:
        logger.error(f"Fatal error in Thumbnail Cache Builder: {e}", exc_info=True, module="Error")
    return exit_code


if __name__ == '__main__':
    sys.exit(main())

102
modules/tiktok_db_adapter.py Executable file
View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""
TikTok Database Adapter for Unified Database
Provides compatibility layer between TikTok module and unified database
"""
from typing import Optional, Dict
from datetime import datetime
import json
class TikTokDatabaseAdapter:
    """Adapter to make unified database work with TikTok module

    Downloads are stored under URLs of the form
    https://www.tiktok.com/@<username>/video/<video_id>#<filename>
    so that every file of a carousel post gets its own row.
    """

    def __init__(self, unified_db):
        """Initialize adapter with unified database instance"""
        self.unified_db = unified_db
        self.platform = 'tiktok'

    def get_file_hash(self, file_path: str) -> Optional[str]:
        """Calculate SHA256 hash of a file (delegates to UnifiedDatabase)"""
        return self.unified_db.get_file_hash(file_path)

    def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
        """Get download record by file hash (delegates to UnifiedDatabase)"""
        return self.unified_db.get_download_by_file_hash(file_hash)

    def record_download(self, video_id: str, username: str, filename: str,
                        post_date: Optional[datetime] = None, metadata: Optional[Dict] = None,
                        file_path: Optional[str] = None):
        """Record a TikTok download in the unified database

        Args:
            video_id: TikTok post ID
            username: Account the post belongs to (stored as the source)
            filename: Name of the downloaded file; its extension decides
                whether the row is recorded as 'image' or 'video'
            post_date: Post timestamp, if known
            metadata: Extra metadata passed through unchanged
            file_path: Location of the file; when it exists on disk its
                hash is computed and stored with the record

        Returns:
            Whatever UnifiedDatabase.record_download returns
        """
        from pathlib import Path

        # Convert TikTok's video_id to a URL format for unified database
        # For carousel photos, append filename to make URL unique (otherwise url_hash collision)
        url = f"https://www.tiktok.com/@{username}/video/{video_id}#{filename}"

        # Calculate file hash if file_path provided
        file_hash = None
        if file_path:
            try:
                if Path(file_path).exists():
                    file_hash = self.unified_db.get_file_hash(file_path)
            except Exception:
                pass  # If hash fails, continue without it

        # Detect content type from file extension
        ext = Path(filename).suffix.lower()
        image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp', '.bmp', '.tiff'}
        content_type = 'image' if ext in image_exts else 'video'

        return self.unified_db.record_download(
            url=url,
            platform=self.platform,
            source=username,
            content_type=content_type,
            filename=filename,
            post_date=post_date,
            metadata=metadata,
            file_hash=file_hash,
            file_path=file_path
        )

    @staticmethod
    def _escape_like(text: str) -> str:
        """Escape LIKE wildcards so text only matches itself (ESCAPE '\\')"""
        return text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')

    def is_downloaded(self, video_id: str, username: str = None) -> bool:
        """Check if a video has been downloaded

        True when ANY file of the post is recorded (carousel rows carry a
        "#filename" suffix). The ID must be followed by the end of the URL,
        "#" or "?", so ID "123" no longer matches a row for "1234", and
        "_"/"%" in the username or ID are matched literally.

        Returns False if the lookup itself fails.
        """
        try:
            escaped_id = self._escape_like(str(video_id))
            if username:
                base = f"https://www.tiktok.com/@{self._escape_like(username)}/video/{escaped_id}"
            else:
                base = f"%/video/{escaped_id}"
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "SELECT 1 FROM downloads WHERE platform = ? AND ("
                    "url LIKE ? ESCAPE '\\' OR url LIKE ? ESCAPE '\\' OR url LIKE ? ESCAPE '\\'"
                    ") LIMIT 1",
                    (self.platform, base, base + "#%", base + "?%")
                )
                return cursor.fetchone() is not None
        except Exception:
            # Best effort: an unreadable database is treated as "not downloaded"
            return False

    def is_already_downloaded(self, video_id: str) -> bool:
        """Check if a video has already been downloaded (alias for compatibility)"""
        return self.is_downloaded(video_id)

    def get_download_info(self, video_id: str) -> Optional[Dict]:
        """Get download information for a video

        Only the rows returned by get_downloads(platform, limit=1000) are
        searched, and the match is a substring test on the stored URL.
        """
        # This is a simplified lookup - may need to search by video_id in URL
        results = self.unified_db.get_downloads(platform=self.platform, limit=1000)
        for download in results:
            # `or ''` guards rows whose url is stored as NULL
            if video_id in (download.get('url') or ''):
                return download
        return None

    def cleanup_old_downloads(self, days: int = 180):
        """Clean up old download records"""
        return self.unified_db.cleanup_old_downloads(days=days, platform=self.platform)

603
modules/tiktok_module.py Executable file
View File

@@ -0,0 +1,603 @@
#!/usr/bin/env python3
"""
TikTok Download Module - Downloads TikTok videos with proper timestamp extraction
"""
import os
import re
import json
import subprocess
import sqlite3
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from modules.base_module import LoggingMixin
class TikTokDownloader(LoggingMixin):
"""Downloads TikTok videos and extracts metadata including timestamps"""
def __init__(self, base_path: Path = None, log_callback=None, use_database=True, unified_db=None):
    """
    Initialize TikTok downloader

    Args:
        base_path: Base path for downloads (defaults to the current working directory)
        log_callback: Optional callback for logging (tag, level, message)
        use_database: Whether to use database for tracking downloads
        unified_db: UnifiedDatabase instance (required)

    Raises:
        ValueError: If unified_db is not provided
    """
    # Initialize logging via mixin
    self._init_logger('TikTok', log_callback, default_module='Download')
    self.base_path = Path(base_path) if base_path else Path.cwd()
    self.file_timestamps = {}  # Map of filename -> datetime
    self.use_database = use_database
    # Overwritten by download_profile(); defined here so the attribute
    # exists on every instance, not only after the first profile download
    self.defer_database = False

    # Always use unified database adapter
    if not unified_db:
        raise ValueError("TikTok module requires unified_db - standalone database is no longer supported")
    from modules.tiktok_db_adapter import TikTokDatabaseAdapter
    self.db = TikTokDatabaseAdapter(unified_db)
    self.use_unified_db = True

    # Initialize activity status manager for real-time updates
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)

    self.pending_downloads = []  # Track downloads for deferred database recording
def _is_already_downloaded(self, video_id: str, username: str = None) -> bool:
    """Report whether the database adapter already has this video.

    Always False when database tracking is disabled. With a username the
    lookup is scoped to that account; without one the adapter's
    username-less alias is used.
    """
    if self.use_database:
        adapter = self.db
        if username:
            # Pass username for proper database lookup
            return adapter.is_downloaded(video_id, username)
        return adapter.is_already_downloaded(video_id)
    return False
def _record_download(self, video_id: str, username: str, filename: str,
                     post_date: Optional[datetime] = None, metadata: Dict = None,
                     deferred: bool = False):
    """Record a successful download, either now or later.

    Args:
        video_id: TikTok post ID
        username: Account the post belongs to
        filename: Path (or bare name) of the downloaded file; the string
            form is kept as file_path and its final component as filename
        post_date: Post timestamp, if known
        metadata: Extra metadata stored with the record
        deferred: If True, don't record to database now - add to pending_downloads list
            for later recording after file move is complete

    Returns:
        True when the record was queued, the adapter's result when it was
        written immediately, or None when database tracking is disabled.
    """
    full_path = str(filename)
    bare_name = Path(filename).name

    if deferred:
        # Queue a serialisable entry (ISO date string) for later recording
        iso_date = post_date.isoformat() if post_date else None
        entry = {
            'video_id': video_id,
            'username': username,
            'filename': bare_name,
            'post_date': iso_date,
            'file_path': full_path,
            'metadata': metadata
        }
        self.pending_downloads.append(entry)
        self.log(f"Deferred recording for {video_id}", "debug")
        return True

    if self.use_database:
        return self.db.record_download(
            video_id=video_id,
            username=username,
            filename=bare_name,
            post_date=post_date,
            metadata=metadata,
            file_path=full_path
        )
    return None
def get_pending_downloads(self):
    """Return a shallow copy of the downloads queued for deferred recording"""
    return list(self.pending_downloads)
def clear_pending_downloads(self):
    """Reset the deferred-recording queue once its entries have been recorded"""
    # Rebinds to a fresh list; copies handed out earlier are left untouched
    self.pending_downloads = list()
def extract_date_from_info(self, info_dict: Dict) -> Optional[datetime]:
    """
    Extract upload date from yt-dlp info dictionary

    The Unix-timestamp fields are tried in order ('timestamp',
    'release_timestamp', 'modified_timestamp'); the first one that is set
    and converts cleanly wins. Only if none does is the date-only
    'upload_date' (YYYYMMDD) used, which loses the time of day.

    NOTE(review): timestamp-derived values are naive datetimes holding the
    UTC wall-clock time (tzinfo is stripped without converting to local
    time) - confirm this is what callers expect.

    Args:
        info_dict: yt-dlp info dictionary

    Returns:
        datetime object or None
    """
    from datetime import timezone

    # (field name, log prefix) in priority order
    epoch_fields = (
        ('timestamp', "Extracted full timestamp (UTC)"),
        ('release_timestamp', "Extracted release timestamp (UTC)"),
        ('modified_timestamp', "Extracted modified timestamp (UTC)"),
    )
    for field, label in epoch_fields:
        epoch = info_dict.get(field)
        if not epoch:
            continue
        try:
            naive_utc = datetime.fromtimestamp(epoch, tz=timezone.utc).replace(tzinfo=None)
            self.log(f"{label}: {naive_utc}", "debug")
            return naive_utc
        except Exception:
            # Unusable value: fall through to the next candidate field
            continue

    # Last resort: date only, no time information
    upload_date = info_dict.get('upload_date')
    if upload_date and len(upload_date) == 8:
        try:
            day_only = datetime.strptime(upload_date, '%Y%m%d')
            self.log(f"Only date available (no time): {day_only.date()}", "warning")
            return day_only
        except Exception:
            pass
    return None
def download_profile(self,
                     username: str,
                     number_of_days: int = 7,
                     full_profile: bool = False,
                     output_dir: Path = None,
                     defer_database: bool = False) -> Tuple[Dict[str, datetime], List[Path]]:
    """
    Download TikTok profile videos

    HYBRID APPROACH: yt-dlp lists the profile's post IDs (fast), then
    gallery-dl downloads each post individually (handles carousels).
    Afterwards audio-only files are removed, over-long filenames are
    truncated, each media file is matched with its metadata JSON, recorded
    (or queued) in the database and stamped with the post time.

    Args:
        username: TikTok username (without @)
        number_of_days: Number of days to download (ignored if full_profile=True)
        full_profile: If True, download entire profile
        output_dir: Output directory (uses base_path/username if not specified)
        defer_database: If True, don't record to database immediately - store in
            pending_downloads for later recording after file move is complete

    Returns:
        Tuple of (file_timestamps dict, list of downloaded files);
        ({}, []) when the listing fails or yields no posts
    """
    self.defer_database = defer_database  # Store for use in _record_download
    username = username.lstrip('@')
    output_dir = output_dir or self.base_path / username
    output_dir.mkdir(parents=True, exist_ok=True)
    self.log(f"Downloading TikTok profile: @{username}", "info")
    self.activity_manager.update_status("Checking videos")

    # Step 1: Use yt-dlp to quickly get list of video IDs with dates
    video_ids = self._list_profile_video_ids(username, number_of_days, full_profile)
    if video_ids is None:
        return {}, []
    if not video_ids:
        self.log("No videos found matching criteria", "info")
        return {}, []

    # Step 2: Download each video individually with gallery-dl
    self._download_posts(username, video_ids, output_dir)

    # Step 3: Clean up what gallery-dl wrote, then match media with metadata
    self._tidy_downloaded_files(output_dir)
    file_timestamps, downloaded_files = self._collect_downloaded_media(username, output_dir)
    self.log(f"Downloaded {len(downloaded_files)} files from @{username}", "info")

    self._apply_file_timestamps(downloaded_files, file_timestamps)
    # Store timestamps for later use
    self.file_timestamps.update(file_timestamps)
    return file_timestamps, downloaded_files

def _list_profile_video_ids(self, username, number_of_days, full_profile):
    """List the profile's post IDs with yt-dlp; returns a list, or None if the listing failed."""
    profile_url = f"https://www.tiktok.com/@{username}"
    list_cmd = [
        "yt-dlp",
        "--flat-playlist",  # Don't download, just list
        "--print", "%(upload_date)s %(id)s",  # Print date and ID
        "--quiet",
        "--no-warnings",
        profile_url
    ]
    self.log("Getting video list with yt-dlp...", "debug")
    try:
        result = subprocess.run(list_cmd, capture_output=True, text=True, timeout=60)
        if result.returncode != 0 and result.stderr:
            # Surface the reason instead of silently reporting "no videos"
            self.log(f"yt-dlp exited with code {result.returncode}: {result.stderr.strip()[:200]}", "warning")

        # Each useful line is "<upload_date> <id>"; anything shorter is ignored
        entries = [parts for parts in (line.split() for line in result.stdout.splitlines())
                   if len(parts) >= 2]
        if not full_profile and number_of_days:
            from datetime import timedelta
            cutoff_date = datetime.now() - timedelta(days=number_of_days)
            cutoff_str = cutoff_date.strftime('%Y%m%d')
            # YYYYMMDD strings compare chronologically; only keep posts on/after the cutoff
            video_ids = [parts[1] for parts in entries if parts[0] >= cutoff_str]
        else:
            # No filter, take all
            video_ids = [parts[1] for parts in entries]
        self.log(f"Found {len(video_ids)} posts to download", "info")
        return video_ids
    except Exception as e:
        self.log(f"Failed to get video list: {e}", "error")
        return None

def _download_posts(self, username, video_ids, output_dir):
    """Run gallery-dl once per post ID, reporting progress and checkpointing each ID."""
    total = len(video_ids)
    # Set initial progress so dashboard shows 0/N immediately
    self.activity_manager.update_status(
        "Downloading videos",
        progress_current=0,
        progress_total=total
    )

    # Crash recovery checkpoint
    from modules.task_checkpoint import TaskCheckpoint
    checkpoint = TaskCheckpoint(f'tiktok:{username}', 'scraping')
    checkpoint.start(total_items=total)
    if checkpoint.is_recovering():
        self.log(f"TikTok @{username}: recovering — skipping already-downloaded videos", "info")

    for i, video_id in enumerate(video_ids, 1):
        # Update progress at start of each iteration (fires even on skips)
        self.activity_manager.update_status(
            "Downloading videos",
            progress_current=i,
            progress_total=total
        )

        # Skip if already completed in a previous crashed run
        if checkpoint.is_completed(video_id):
            continue
        checkpoint.set_current(video_id)

        # Skip if already downloaded
        if self._is_already_downloaded(video_id, username):
            self.log(f"[{i}/{total}] Skipping already downloaded: {video_id}", "debug")
            checkpoint.mark_completed(video_id)
            continue

        video_url = f"https://www.tiktok.com/@{username}/video/{video_id}"
        self.log(f"[{i}/{total}] Downloading {video_id}", "debug")
        cmd = [
            "gallery-dl",
            "--write-metadata",
            "-D", str(output_dir),
            "-f", "{date:%Y%m%d}_{desc}_{id}_{num}.{extension}",
            video_url
        ]
        try:
            self.log(f"Calling gallery-dl for {video_id}", "debug")
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
            self.log(f"gallery-dl returned: code={result.returncode}, stdout lines={len(result.stdout.splitlines()) if result.stdout else 0}", "debug")
            if result.returncode != 0 and result.stderr:
                stderr = result.stderr
                if "not available" in stderr.lower() or "404" in stderr:
                    self.log(f"Video {video_id} not available (deleted or private)", "warning")
                else:
                    self.log(f"Failed to download {video_id}: {stderr[:100]}", "warning")
        except subprocess.TimeoutExpired:
            self.log(f"Timeout downloading {video_id}", "warning")
        except Exception as e:
            self.log(f"Error downloading {video_id}: {e}", "warning")
        # Marked completed whether or not gallery-dl succeeded
        checkpoint.mark_completed(video_id)
    checkpoint.finish()

def _tidy_downloaded_files(self, output_dir):
    """Remove audio-only downloads and shorten filenames longer than 200 characters."""
    audio_suffixes = ('.mp3', '.m4a', '.aac', '.wav', '.ogg')
    # Snapshot the listing: files are deleted/renamed while we walk it
    for file in list(output_dir.glob("*")):
        if not file.is_file() or file.suffix == '.json':
            continue

        # Remove audio-only files
        if file.suffix.lower() in audio_suffixes:
            self.log(f"Removing audio-only file: {file.name}", "debug")
            file.unlink()
            # Also remove corresponding JSON
            json_file = file.with_suffix(file.suffix + '.json')
            if json_file.exists():
                json_file.unlink()
            continue

        # Truncate long filenames (max 255 chars for Linux)
        if len(file.name) <= 200:  # Leave some margin
            continue
        # Parse filename: YYYYMMDD_description_ID_NUM.ext
        parts = file.name.rsplit('_', 2)  # Split from right to preserve ID and num
        if len(parts) != 3:
            continue
        date_and_desc, video_id, num_and_ext = parts
        date_part = date_and_desc[:8]  # YYYYMMDD
        desc_part = date_and_desc[9:]  # Everything after date_
        # Format: DATE_DESC_ID_NUM.EXT
        fixed_length = len(date_part) + len(video_id) + len(num_and_ext) + 3  # 3 underscores
        max_desc_len = 200 - fixed_length
        if len(desc_part) <= max_desc_len:
            continue
        truncated_desc = desc_part[:max_desc_len-3] + "..."
        new_name = f"{date_part}_{truncated_desc}_{video_id}_{num_and_ext}"
        new_path = file.parent / new_name
        self.log(f"Truncating long filename: {file.name[:50]}... -> {new_name[:50]}...", "debug")
        file.rename(new_path)
        # Rename corresponding JSON file too
        json_file = Path(str(file) + '.json')
        if json_file.exists():
            json_file.rename(Path(str(new_path) + '.json'))

def _collect_downloaded_media(self, username, output_dir):
    """Match each metadata JSON with its media file, record it, and return (file_timestamps, downloaded_files)."""
    downloaded_files = []
    file_timestamps = {}
    processed_ids = set()  # IDs already in the DB before this run, or dropped as duplicates
    started_ids = set()    # IDs we've started processing in THIS run
    # Snapshot the listing: every JSON is deleted as it is handled
    for json_file in list(output_dir.glob("*.json")):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                info = json.load(f)
            video_id = info.get('id', '')

            # Extract timestamp from gallery-dl's createTime field (needed for all files)
            timestamp = None
            create_time = info.get('createTime')
            if create_time:
                try:
                    timestamp = datetime.fromtimestamp(int(create_time))
                    self.log(f"Extracted timestamp {timestamp} from createTime", "debug")
                except Exception:
                    timestamp = None
            if timestamp is None:
                # Fall back to old yt-dlp method whenever createTime is
                # missing or unusable (previously only when parsing failed)
                timestamp = self.extract_date_from_info(info)

            # gallery-dl names JSON files as: filename.ext.json
            # So we need to remove the .json extension to get the media file
            video_file = Path(str(json_file)[:-5])
            if not video_file.exists():
                self.log(f"Media file not found for {json_file.name}", "warning")
                json_file.unlink()
                continue

            # Check if already downloaded - but only check ONCE per video_id per run
            # (Don't check again for carousel photos #2, #3 after we've started processing #1)
            if video_id and video_id not in started_ids:
                if self._is_already_downloaded(video_id, username):
                    self.log(f"Skipping already downloaded post: {video_id}", "debug")
                    # Mark as processed so we don't check again for this ID's other files
                    processed_ids.add(video_id)
                    # Just remove JSON file, keep media files (they're already processed)
                    json_file.unlink()
                    continue
                # Mark that we've started processing this video_id
                started_ids.add(video_id)

            # Skip if this video_id was marked as already downloaded
            if video_id in processed_ids:
                json_file.unlink()
                continue

            # ALWAYS add file to downloaded list and apply timestamp (even for carousel photos #2, #3)
            downloaded_files.append(video_file)
            if timestamp:
                file_timestamps[video_file.name] = timestamp
                self.log(f"Extracted timestamp {timestamp} for {video_file.name}", "debug")

            # Check for duplicate hash before recording (hash blacklist persists even if original deleted)
            file_hash = self.db.get_file_hash(str(video_file)) if self.db else None
            if file_hash:
                existing = self.db.get_download_by_file_hash(file_hash)
                if existing and existing.get('file_path') and str(video_file) != existing.get('file_path'):
                    # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
                    self.log(f"⚠ Duplicate content detected (hash match): {video_file.name} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                    # Delete the duplicate regardless of whether original file still exists
                    try:
                        video_file.unlink()
                        self.log(f"Deleted duplicate (hash blacklist): {video_file.name}", "debug")
                        # The file is gone: it must not be reported as
                        # downloaded or have a timestamp applied later
                        downloaded_files.pop()
                        file_timestamps.pop(video_file.name, None)
                        # Mark as processed so we don't try to download again
                        processed_ids.add(video_id)
                        json_file.unlink()
                        continue
                    except Exception as e:
                        self.log(f"Failed to delete duplicate {video_file.name}: {e}", "warning")

            # Record in database (each file gets its own entry, even for carousels)
            if video_id:
                self._record_download(
                    video_id=video_id,
                    username=username,
                    filename=video_file.name,
                    post_date=timestamp,
                    metadata={"title": info.get('desc', ''), "description": info.get('desc', '')},
                    deferred=self.defer_database
                )

            # Remove JSON file after processing
            json_file.unlink()
        except Exception as e:
            self.log(f"Failed to process {json_file}: {e}", "error")
    return file_timestamps, downloaded_files

def _apply_file_timestamps(self, downloaded_files, file_timestamps):
    """Set access and modification time of each file to its extracted post timestamp."""
    for file_path in downloaded_files:
        filename = file_path.name
        if filename not in file_timestamps:
            continue
        timestamp = file_timestamps[filename]
        try:
            # Convert datetime to unix timestamp
            unix_time = timestamp.timestamp()
            # Set both access time and modification time
            os.utime(str(file_path), (unix_time, unix_time))
            self.log(f"Applied timestamp {timestamp} to {filename}", "debug")
        except Exception as e:
            self.log(f"Failed to apply timestamp to {filename}: {e}", "warning")
def download_video(self, url: str, output_dir: Path = None) -> Tuple[Optional[datetime], Optional[Path]]:
    """
    Download a single TikTok video with yt-dlp

    The post's metadata is fetched first (yt-dlp --dump-json); posts whose
    formats all report vcodec 'none' are treated as photo posts and skipped.
    The video is then downloaded and located on disk, and its timestamp is
    remembered in self.file_timestamps.

    Args:
        url: TikTok video URL
        output_dir: Output directory (defaults to base_path)

    Returns:
        Tuple of (timestamp, downloaded file path). The path is None when
        nothing was downloaded or the file could not be found; both are
        None when the metadata lookup fails or an exception occurs.
    """
    target_dir = output_dir or self.base_path
    target_dir.mkdir(parents=True, exist_ok=True)
    self.log(f"Downloading video: {url}", "info")

    quiet_flags = ["--no-warnings", "--quiet"]
    # First, get video info without downloading
    cmd_info = ["yt-dlp", "--dump-json", *quiet_flags, url]
    try:
        probe = subprocess.run(cmd_info, capture_output=True, text=True)
        if probe.returncode != 0:
            self.log(f"Failed to get video info: {probe.stderr}", "error")
            return None, None
        info = json.loads(probe.stdout)
        timestamp = self.extract_date_from_info(info)

        # Check if this is a photo post (no video, only audio)
        formats = info.get('formats', [])
        has_video = any(fmt.get('vcodec') != 'none' for fmt in formats)
        if len(formats) > 0 and not has_video:
            # This is a photo/image post - skip it
            self.log("Skipping TikTok photo post (only videos are downloaded)", "info")
            return timestamp, None

        # Download video
        output_template = str(target_dir / "%(upload_date)s_%(title)s_%(id)s.%(ext)s")
        cmd_download = [
            "yt-dlp",
            "--format", "best",  # Explicitly request best video+audio format
            *quiet_flags,
            "-o", output_template,
            url
        ]
        fetch = subprocess.run(cmd_download, capture_output=True, text=True)
        if fetch.returncode != 0:
            self.log(f"Failed to download video: {fetch.stderr}", "error")
            return timestamp, None

        # Rebuild the name yt-dlp should have produced from the template
        substitutions = (
            ('%(upload_date)s', info.get('upload_date', 'unknown')),
            ('%(title)s', info.get('title', 'video')),
            ('%(id)s', info.get('id', '')),
            ('%(ext)s', info.get('ext', 'mp4')),
        )
        expected_name = output_template
        for placeholder, value in substitutions:
            expected_name = expected_name.replace(placeholder, value)
        downloaded_file = Path(expected_name)

        if not downloaded_file.exists():
            # Try to find it by pattern
            candidates = list(target_dir.glob(f"*{info.get('id', '')}*.mp4"))
            if candidates:
                downloaded_file = candidates[0]

        if not downloaded_file.exists():
            return timestamp, None
        if timestamp:
            self.file_timestamps[downloaded_file.name] = timestamp
        return timestamp, downloaded_file
    except Exception as e:
        self.log(f"Failed to download video: {e}", "error")
        return None, None
def get_file_timestamps(self) -> Dict[str, datetime]:
    """Return a shallow copy of the filename -> timestamp map collected so far"""
    return dict(self.file_timestamps)
def clear_timestamps(self):
    """Empty the stored filename -> timestamp map"""
    # Cleared in place, so the same dict object stays bound to the attribute
    collected = self.file_timestamps
    collected.clear()
def download_tiktok_profile(username: str,
                            days: int = 7,
                            base_path: Path = None,
                            log_callback=None,
                            unified_db=None) -> Dict[str, datetime]:
    """
    Convenience wrapper: build a TikTokDownloader and download one profile

    Args:
        username: TikTok username
        days: Number of days to download
        base_path: Base download path
        log_callback: Optional logging callback
        unified_db: UnifiedDatabase instance (required)

    Returns:
        Dictionary mapping filenames to timestamps

    Raises:
        ValueError: If unified_db is not provided
    """
    if not unified_db:
        raise ValueError("unified_db is required for TikTok downloads")

    downloader_options = {
        'base_path': base_path,
        'log_callback': log_callback,
        'unified_db': unified_db,
    }
    profile_downloader = TikTokDownloader(**downloader_options)
    # The list of downloaded files is not part of this function's result
    timestamps, _downloaded = profile_downloader.download_profile(username, number_of_days=days)
    return timestamps
if __name__ == "__main__":
    # Manual smoke test for the module.
    # TikTokDownloader refuses to start without a UnifiedDatabase, so a bare
    # run reports that instead of ending in an unhandled ValueError.
    import tempfile
    print("TikTok Downloader Module Test")
    print("="*60)
    # Test with a small profile
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            downloader = TikTokDownloader(base_path=Path(tmpdir))
        except ValueError as e:
            print(f"Standalone test skipped: {e}")
        else:
            # You can test with a real TikTok username
            # timestamps, files = downloader.download_profile("username", number_of_days=1)
            print("Module ready for integration")

1512
modules/tmdb_client.py Normal file

File diff suppressed because it is too large Load Diff

1116
modules/toolzu_module.py Normal file

File diff suppressed because it is too large Load Diff

6350
modules/unified_database.py Executable file

File diff suppressed because it is too large Load Diff

348
modules/universal_logger.py Normal file
View File

@@ -0,0 +1,348 @@
#!/usr/bin/env python3
"""
Universal Logging Module for Media Downloader
Provides consistent logging across all components with automatic rotation and 7-day retention
"""
import logging
import logging.handlers
from pathlib import Path
from datetime import datetime, timedelta
import os
import glob
import sys
class UniversalLogger:
    """
    Per-component logger writing to a date-stamped file and to the console

    Features:
    - Consistent log format across all components
    - One log file per component per day, named ``YYYYMMDD_<component>.log``
      (the file name is chosen once, when the logger is constructed)
    - Cleanup, at construction time, of this component's log files whose
      modification time is older than ``retention_days``
    - Separate log files per component
    - Console and file output, each with its own level
    - ERROR and CRITICAL messages are additionally recorded in the
      ``error_log`` database table and broadcast to WebSocket clients
      (both best-effort; failures there never interrupt logging)
    """

    # Third-party loggers that are raised to a quieter level on construction
    _NOISY_LOGGERS = {
        'asyncio': logging.WARNING,
        'selenium': logging.WARNING,
        'urllib3': logging.WARNING,
        'websocket': logging.WARNING,
        'requests': logging.WARNING,
        'PIL': logging.WARNING,
        'instaloader': logging.WARNING,
        'tensorflow': logging.ERROR,
        'deepface': logging.WARNING,
    }

    class _MicrosecondFormatter(logging.Formatter):
        """Formatter whose timestamp includes microseconds for proper log sorting"""

        def formatTime(self, record, datefmt=None):
            created = datetime.fromtimestamp(record.created)
            return created.strftime('%Y-%m-%d %H:%M:%S.%f')

    def __init__(
        self,
        component_name: str,
        log_dir: str = None,
        retention_days: int = 7,
        console_level: str = 'INFO',
        file_level: str = 'DEBUG'
    ):
        """
        Initialize universal logger for a component

        Args:
            component_name: Name of the component (e.g., 'API', 'Scheduler', 'MediaDownloader')
            log_dir: Directory to store logs (default: the ``logs`` directory
                beside the directory that contains this module)
            retention_days: Number of days to keep logs (default: 7)
            console_level: Logging level name for console output (default: INFO)
            file_level: Logging level name for file output (default: DEBUG)

        Raises:
            AttributeError: If a level name is not an attribute of ``logging``
        """
        # Resolve level names first so a bad name fails before any file is opened
        file_level_no = getattr(logging, file_level.upper())
        console_level_no = getattr(logging, console_level.upper())

        self.component_name = component_name
        self.retention_days = retention_days

        # Set up log directory
        if log_dir is None:
            self.log_dir = Path(__file__).parent.parent / 'logs'
        else:
            self.log_dir = Path(log_dir)
        self.log_dir.mkdir(exist_ok=True, parents=True)

        # Create logger
        self.logger = logging.getLogger(f'MediaDownloader.{component_name}')
        self.logger.setLevel(logging.DEBUG)

        # Remove existing handlers to prevent duplicates. They are closed as
        # well: merely dropping them would leave the previous log file open.
        for stale_handler in list(self.logger.handlers):
            self.logger.removeHandler(stale_handler)
            stale_handler.close()

        # Line format: <timestamp with microseconds> [<logger name>] <message>
        # where <message> is "[Module] [LEVEL] text" as built by _format_message
        formatter = self._MicrosecondFormatter('%(asctime)s [%(name)s] %(message)s')

        # File handler with date-stamped filename, e.g. 20251113_component.log
        date_stamp = datetime.now().strftime('%Y%m%d')
        log_file = self.log_dir / f'{date_stamp}_{component_name.lower()}.log'
        file_handler = logging.FileHandler(
            filename=str(log_file),
            mode='a',  # Append mode - preserves logs across restarts
            encoding='utf-8'
        )
        file_handler.setLevel(file_level_no)
        file_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setLevel(console_level_no)
        console_handler.setFormatter(formatter)
        self.logger.addHandler(console_handler)

        # Suppress noisy third-party loggers
        for noisy_name, noisy_level in self._NOISY_LOGGERS.items():
            logging.getLogger(noisy_name).setLevel(noisy_level)

        # Clean up old logs on initialization
        self._cleanup_old_logs()

    def _cleanup_old_logs(self):
        """
        Remove this component's log files older than retention_days

        Age is judged by file modification time. Files are matched with the
        pattern ``*_<component>.log`` inside the log directory. The whole
        operation is best-effort: a failure is logged but never raised.
        """
        try:
            cutoff_date = datetime.now() - timedelta(days=self.retention_days)

            # Match pattern: YYYYMMDD_component.log
            # glob.escape keeps special characters in the directory or the
            # component name from being interpreted as wildcards.
            pattern = os.path.join(
                glob.escape(str(self.log_dir)),
                f'*_{glob.escape(self.component_name.lower())}.log'
            )

            cleaned_count = 0
            for log_file in glob.glob(pattern):
                file_path = Path(log_file)
                try:
                    # Check file modification time
                    mtime = datetime.fromtimestamp(file_path.stat().st_mtime)
                    if mtime < cutoff_date:
                        file_path.unlink()
                        cleaned_count += 1
                except (OSError, OverflowError, ValueError) as exc:
                    # Don't fail if we can't clean up a single file
                    self.debug(f"Could not clean up {file_path}: {exc}", module='LogCleanup')

            if cleaned_count > 0:
                # Handlers are already attached, so this goes through the logger itself
                self.info(
                    f"Cleaned up {cleaned_count} old {self.component_name} log file(s)",
                    module='LogCleanup'
                )
        except Exception as exc:
            # Don't fail initialization if cleanup fails
            self.warning(f"Log cleanup failed: {exc}", module='LogCleanup')

    def _format_message(self, module: str, level: str, message: str) -> str:
        """
        Format message to match media-downloader.py style

        Args:
            module: Module name (e.g., 'Core', 'Forum', 'Instagram')
            level: Log level (e.g., 'INFO', 'ERROR', 'DEBUG'); upper-cased in the output
            message: Log message

        Returns:
            Formatted message: [Module] [LEVEL] message
        """
        return f"[{module}] [{level.upper()}] {message}"

    def _broadcast_error(self, message: str, module: str, level: str = 'ERROR'):
        """
        Broadcast error to connected WebSocket clients for real-time notifications.
        Fails silently to not disrupt logging.

        Args:
            message: Error text; only the first 200 characters are sent
            module: Module name reported with the error
            level: Level name reported with the error
        """
        try:
            # Try to import the WebSocket manager from the API
            # This will only work when the API is running
            from web.backend.api import manager

            if manager and manager.active_connections:
                manager.broadcast_sync({
                    'type': 'error_alert',
                    'error': {
                        'module': module,
                        'level': level,
                        'message': message[:200],  # Truncate for notification
                        'timestamp': datetime.now().isoformat(),
                        'component': self.component_name
                    }
                })
        except Exception:
            # Fail silently - API may not be running or manager not available
            pass

    @staticmethod
    def _normalize_error_message(message: str) -> str:
        """
        Replace variable parts of an error message with placeholders

        Media file paths become ``{file}``, URLs ``{url}``, UUIDs ``{uuid}``
        and standalone numbers ``{n}``, so that recurrences of the same error
        normalize to the same text.

        Args:
            message: Raw error message

        Returns:
            Normalized message used for de-duplication
        """
        import re

        normalized = re.sub(r'/[\w/\-\.]+\.(jpg|png|mp4|webp|gif|heic|mov)', '{file}', message)
        normalized = re.sub(r'https?://[^\s]+', '{url}', normalized)
        # UUIDs must be replaced before bare numbers: an all-digit UUID segment
        # would otherwise turn into {n} and the UUID pattern would stop matching.
        normalized = re.sub(
            r'[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}',
            '{uuid}',
            normalized
        )
        normalized = re.sub(r'\b\d+\b', '{n}', normalized)
        return normalized

    def _record_error_to_db(self, message: str, module: str, level: str = 'ERROR'):
        """
        Record error to error_log database table for dashboard display.
        Uses a separate connection to avoid circular dependencies.
        Fails silently to not disrupt logging.

        Nothing is recorded or broadcast when the database file does not
        exist. After a successful write the error is also broadcast to
        WebSocket clients.

        Args:
            message: Error text; stored truncated to 500 characters
            module: Module name; part of the de-duplication key
            level: Level name stored with the error
        """
        try:
            import hashlib
            import sqlite3

            # Get database path
            db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
            if not db_path.exists():
                return

            # Create error hash for deduplication (module + normalized message)
            normalized = self._normalize_error_message(message)
            error_key = f"{module}:{normalized[:200]}"
            error_hash = hashlib.sha256(error_key.encode()).hexdigest()
            now = datetime.now().isoformat()

            # Use a quick connection with short timeout
            conn = sqlite3.connect(str(db_path), timeout=2.0)
            try:
                conn.execute("PRAGMA busy_timeout = 2000")
                # Upsert: insert new error or update occurrence count
                # Reset viewed_at and dismissed_at to NULL when error recurs
                # so it shows as "new" on dashboard
                conn.execute('''
                    INSERT INTO error_log (error_hash, module, level, message, first_seen, last_seen, occurrence_count, log_file)
                    VALUES (?, ?, ?, ?, ?, ?, 1, ?)
                    ON CONFLICT(error_hash) DO UPDATE SET
                        last_seen = excluded.last_seen,
                        occurrence_count = error_log.occurrence_count + 1,
                        viewed_at = NULL,
                        dismissed_at = NULL
                ''', (error_hash, module, level, message[:500], now, now, self.component_name))
                conn.commit()
            finally:
                # Always release the connection, even when the write fails
                conn.close()

            # Broadcast to WebSocket clients for real-time notification
            self._broadcast_error(message, module, level)
        except Exception:
            # Fail silently - don't let error logging break the main logging
            pass

    def debug(self, message: str, module: str = 'Core'):
        """Log debug message"""
        self.logger.debug(self._format_message(module, 'DEBUG', message))

    def info(self, message: str, module: str = 'Core'):
        """Log info message"""
        self.logger.info(self._format_message(module, 'INFO', message))

    def warning(self, message: str, module: str = 'Core'):
        """Log warning message"""
        self.logger.warning(self._format_message(module, 'WARNING', message))

    def error(self, message: str, module: str = 'Core'):
        """Log error message and record to error_log database"""
        self.logger.error(self._format_message(module, 'ERROR', message))
        # Record error to database for dashboard display
        self._record_error_to_db(message, module)

    def critical(self, message: str, module: str = 'Core'):
        """Log critical message and record to error_log database"""
        self.logger.critical(self._format_message(module, 'CRITICAL', message))
        # Record critical errors to database for dashboard display
        self._record_error_to_db(message, module, level='CRITICAL')

    def success(self, message: str, module: str = 'Core'):
        """Log success message (emitted at INFO level, tagged SUCCESS)"""
        self.logger.info(self._format_message(module, 'SUCCESS', message))

    def log(self, message: str, level: str = 'INFO', module: str = 'Core'):
        """
        Generic log method supporting all levels

        Args:
            message: Log message
            level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL, SUCCESS),
                case-insensitive; anything else is logged as INFO
            module: Module name
        """
        level_map = {
            'DEBUG': self.debug,
            'INFO': self.info,
            'WARNING': self.warning,
            'ERROR': self.error,
            'CRITICAL': self.critical,
            'SUCCESS': self.success
        }
        # A non-string level (e.g. None) falls back to INFO instead of raising
        level_name = level.upper() if isinstance(level, str) else 'INFO'
        log_func = level_map.get(level_name, self.info)
        log_func(message, module)

    def get_callback(self):
        """
        Get a callback function compatible with existing module signatures

        Returns:
            Callback function that can be passed to modules expecting log_callback
        """
        def callback(*args):
            """
            Flexible callback that handles multiple signature formats:
            - callback(message)
            - callback(message, level)
            - callback(message, level, module)
            """
            if len(args) == 3:
                message, level, module = args
                self.log(message, level, module)
            elif len(args) == 2:
                message, level = args
                # Extract module from a leading "[Module]" tag if present
                if isinstance(message, str) and message.startswith('[') and ']' in message:
                    end_bracket = message.index(']')
                    module = message[1:end_bracket]
                    message = message[end_bracket + 1:].strip()
                    # Remove level tag if present
                    if message.startswith('[') and ']' in message:
                        message = message[message.index(']') + 1:].strip()
                    self.log(message, level, module)
                else:
                    self.log(message, level)
            elif len(args) == 1:
                # Log the message itself rather than the tuple's repr
                self.info(str(args[0]))
            else:
                # Default: treat as simple message
                self.info(str(args))

        return callback
# Cache of already-built loggers, keyed by component name
_logger_instances = {}


def get_logger(
    component_name: str,
    log_dir: str = None,
    retention_days: int = 7,
    console_level: str = 'INFO',
    file_level: str = 'DEBUG'
) -> UniversalLogger:
    """
    Return the shared logger for a component, building it on first request

    The first call for a given ``component_name`` constructs a
    ``UniversalLogger`` with the supplied settings and caches it. Later
    calls with the same name return that cached instance; the other
    arguments are not applied again.

    Args:
        component_name: Name of the component (also the cache key)
        log_dir: Directory to store logs
        retention_days: Number of days to keep logs
        console_level: Console logging level
        file_level: File logging level

    Returns:
        UniversalLogger instance
    """
    try:
        return _logger_instances[component_name]
    except KeyError:
        pass

    created = UniversalLogger(
        component_name=component_name,
        log_dir=log_dir,
        retention_days=retention_days,
        console_level=console_level,
        file_level=file_level
    )
    _logger_instances[component_name] = created
    return created

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff