491
modules/activity_status.py
Normal file
491
modules/activity_status.py
Normal file
@@ -0,0 +1,491 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Activity Status Manager
|
||||
Centralized module for tracking and updating real-time download activity status
|
||||
Stores status in database for reliable, concurrent access
|
||||
|
||||
Supports:
|
||||
- Single main activity (scheduler) via activity_status table
|
||||
- Multiple background tasks (YouTube monitor, etc.) via background_task_status table
|
||||
"""
|
||||
|
||||
import json
import threading
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, Any, List

from modules.universal_logger import get_logger
|
||||
|
||||
# Module-level logger shared by everything in this module; obtained from the
# project's universal logger under the name 'ActivityStatus'.
logger = get_logger('ActivityStatus')
|
||||
|
||||
|
||||
class ActivityStatusManager:
    """Manage real-time activity status stored in the database.

    Two tables are maintained:

    - ``activity_status``: a single row (``id = 1``) describing the main
      scheduler activity.
    - ``background_task_status``: one row per background task, keyed by
      ``task_id``, so several tasks can be reported at the same time.

    All status methods catch database errors and log them instead of
    raising; readers fall back to an "idle" result, ``None`` or ``[]``.
    """

    # Shared SELECT prefix so both background-task readers return columns in
    # the order expected by _task_row_to_dict().
    _BACKGROUND_TASK_SELECT = '''
        SELECT task_id, active, task_type, display_name, start_time,
               status, detailed_status, progress_current, progress_total,
               extra_data, updated_at
        FROM background_task_status
    '''

    def __init__(self, unified_db=None):
        """
        Initialize activity status manager.

        Args:
            unified_db: UnifiedDatabase instance (optional, will create if needed)
        """
        self.db = unified_db
        if not self.db:
            # Imported lazily so the database module is only loaded when no
            # instance was supplied by the caller.
            from modules.unified_database import UnifiedDatabase
            self.db = UnifiedDatabase()

        self._ensure_table()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _now() -> str:
        """Return the current local time as an ISO-8601 string."""
        return datetime.now().isoformat()

    @staticmethod
    def _idle_activity() -> Dict[str, Any]:
        """Return the result used when no activity row can be read."""
        return {
            'active': False,
            'task_id': None,
            'platform': None,
            'account': None,
            'start_time': None,
            'status': None,
        }

    @staticmethod
    def _task_row_to_dict(row) -> Dict[str, Any]:
        """Convert a row selected with _BACKGROUND_TASK_SELECT into a dict.

        Optional keys (``detailed_status``, ``progress``, ``extra_data``) are
        only present when the corresponding columns hold a value.
        """
        task = {
            'task_id': row[0],
            'active': bool(row[1]),
            'task_type': row[2],
            'display_name': row[3],
            'start_time': row[4],
            'status': row[5],
            'updated_at': row[10],
        }

        if row[6]:  # detailed_status
            task['detailed_status'] = row[6]
        if row[7] is not None and row[8] is not None:  # progress
            task['progress'] = {'current': row[7], 'total': row[8]}
        if row[9]:  # extra_data (JSON text)
            try:
                task['extra_data'] = json.loads(row[9])
            except (json.JSONDecodeError, TypeError, ValueError) as e:
                # Unreadable JSON degrades to an empty dict rather than
                # failing the whole read.
                logger.debug(f"Failed to parse extra_data for task {row[0]}: {e}")
                task['extra_data'] = {}

        return task

    def _ensure_table(self):
        """Ensure activity_status and background_task_status tables exist.

        Also adds the ``account_current`` / ``account_total`` columns when
        they are missing and inserts the single default row (``id = 1``).
        Failures are logged, not raised.
        """
        try:
            # This method creates tables, alters them and inserts a row, so it
            # requests a write connection like every other writer in the class.
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()

                # Main scheduler activity table (single row)
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS activity_status (
                        id INTEGER PRIMARY KEY CHECK (id = 1),
                        active INTEGER NOT NULL DEFAULT 0,
                        task_id TEXT,
                        platform TEXT,
                        account TEXT,
                        start_time TEXT,
                        status TEXT,
                        detailed_status TEXT,
                        progress_current INTEGER,
                        progress_total INTEGER,
                        updated_at TEXT DEFAULT CURRENT_TIMESTAMP
                    )
                ''')

                # Add account progress columns if missing
                cursor.execute("PRAGMA table_info(activity_status)")
                columns = [col[1] for col in cursor.fetchall()]
                if 'account_current' not in columns:
                    cursor.execute('ALTER TABLE activity_status ADD COLUMN account_current INTEGER')
                if 'account_total' not in columns:
                    cursor.execute('ALTER TABLE activity_status ADD COLUMN account_total INTEGER')

                # Insert default row if doesn't exist
                cursor.execute('''
                    INSERT OR IGNORE INTO activity_status (id, active)
                    VALUES (1, 0)
                ''')

                # Background tasks table (multiple concurrent tasks like YouTube monitor)
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS background_task_status (
                        task_id TEXT PRIMARY KEY,
                        active INTEGER NOT NULL DEFAULT 0,
                        task_type TEXT,
                        display_name TEXT,
                        start_time TEXT,
                        status TEXT,
                        detailed_status TEXT,
                        progress_current INTEGER,
                        progress_total INTEGER,
                        extra_data TEXT,
                        updated_at TEXT DEFAULT CURRENT_TIMESTAMP
                    )
                ''')

                conn.commit()
        except Exception as e:
            logger.error(f"Failed to create activity tables: {e}")

    # ------------------------------------------------------------------
    # Main activity (single row)
    # ------------------------------------------------------------------

    def start_activity(self, task_id: str, platform: str, account: str, status: str = "Running"):
        """
        Mark activity as started and clear any previous progress.

        Args:
            task_id: Unique task identifier
            platform: Platform name (instagram, snapchat, etc)
            account: Account/username being processed
            status: Initial status message
        """
        try:
            # One timestamp so start_time and updated_at agree exactly.
            now = self._now()
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE activity_status
                    SET active = 1,
                        task_id = ?,
                        platform = ?,
                        account = ?,
                        start_time = ?,
                        status = ?,
                        detailed_status = NULL,
                        progress_current = NULL,
                        progress_total = NULL,
                        account_current = NULL,
                        account_total = NULL,
                        updated_at = ?
                    WHERE id = 1
                ''', (task_id, platform, account, now, status, now))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to start activity: {e}")

    def update_status(self, detailed_status: str, progress_current: Optional[int] = None,
                      progress_total: Optional[int] = None):
        """Update detailed status message and progress.

        Progress values left as ``None`` keep their stored value (COALESCE).
        Nothing is changed unless the activity is currently active.
        """
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE activity_status
                    SET detailed_status = ?,
                        progress_current = COALESCE(?, progress_current),
                        progress_total = COALESCE(?, progress_total),
                        updated_at = ?
                    WHERE id = 1 AND active = 1
                ''', (detailed_status, progress_current, progress_total, self._now()))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to update status: {e}")

    def update_account_name(self, account: str):
        """Update the current account name being processed (active activity only)."""
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE activity_status
                    SET account = ?,
                        updated_at = ?
                    WHERE id = 1 AND active = 1
                ''', (account, self._now()))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to update account name: {e}")

    def update_account_progress(self, account_current: int, account_total: int):
        """Update account-level progress and reset file-level progress for the new account."""
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE activity_status
                    SET account_current = ?,
                        account_total = ?,
                        progress_current = NULL,
                        progress_total = NULL,
                        updated_at = ?
                    WHERE id = 1 AND active = 1
                ''', (account_current, account_total, self._now()))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to update account progress: {e}")

    def stop_activity(self):
        """Mark activity as stopped and clear detailed status and progress."""
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE activity_status
                    SET active = 0,
                        detailed_status = NULL,
                        progress_current = NULL,
                        progress_total = NULL,
                        account_current = NULL,
                        account_total = NULL,
                        updated_at = ?
                    WHERE id = 1
                ''', (self._now(),))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to stop activity: {e}")

    def get_current_activity(self) -> Dict[str, Any]:
        """
        Get current activity status.

        Returns:
            Dict with keys ``active``, ``task_id``, ``platform``, ``account``,
            ``start_time`` and ``status``; ``detailed_status``, ``progress``
            and ``account_progress`` are added only when stored. An idle
            result (``active`` False, everything else None) is returned when
            the row is missing or the read fails.
        """
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT active, task_id, platform, account, start_time, status,
                           detailed_status, progress_current, progress_total,
                           account_current, account_total
                    FROM activity_status
                    WHERE id = 1
                ''')
                row = cursor.fetchone()

                if not row:
                    return self._idle_activity()

                result = {
                    'active': bool(row[0]),
                    'task_id': row[1],
                    'platform': row[2],
                    'account': row[3],
                    'start_time': row[4],
                    'status': row[5],
                }

                # Add optional fields only if they exist
                if row[6]:  # detailed_status
                    result['detailed_status'] = row[6]
                if row[7] is not None and row[8] is not None:  # progress
                    result['progress'] = {'current': row[7], 'total': row[8]}
                if row[9] is not None and row[10] is not None:  # account_progress
                    result['account_progress'] = {'current': row[9], 'total': row[10]}

                return result
        except Exception as e:
            logger.error(f"Failed to get current activity: {e}")
            return self._idle_activity()

    # =========================================================================
    # BACKGROUND TASK METHODS (for concurrent tasks like YouTube monitor)
    # =========================================================================

    def start_background_task(self, task_id: str, task_type: str, display_name: str,
                              status: str = "Running", extra_data: Optional[Dict] = None):
        """
        Start a background task (doesn't interfere with main activity).

        If a row for ``task_id`` is already marked active, nothing is changed.

        Args:
            task_id: Unique task identifier (e.g., 'youtube_monitor')
            task_type: Type of task (e.g., 'youtube_monitor', 'video_processor')
            display_name: Human-readable name for display
            status: Initial status message
            extra_data: Optional extra data to store as JSON
        """
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()

                # Check if task is already running - don't reset if so
                cursor.execute('''
                    SELECT active FROM background_task_status WHERE task_id = ?
                ''', (task_id,))
                row = cursor.fetchone()
                if row and row[0] == 1:
                    logger.debug(f"Background task {task_id} already running, not resetting")
                    return

                # One timestamp so start_time and updated_at agree exactly.
                now = self._now()
                cursor.execute('''
                    INSERT OR REPLACE INTO background_task_status
                    (task_id, active, task_type, display_name, start_time, status,
                     detailed_status, progress_current, progress_total, extra_data, updated_at)
                    VALUES (?, 1, ?, ?, ?, ?, NULL, NULL, NULL, ?, ?)
                ''', (task_id, task_type, display_name, now, status,
                      json.dumps(extra_data) if extra_data else None, now))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to start background task {task_id}: {e}")

    def update_background_task(self, task_id: str, detailed_status: str,
                               progress_current: Optional[int] = None,
                               progress_total: Optional[int] = None,
                               extra_data: Optional[Dict] = None):
        """Update a background task's status.

        ``progress_current`` and ``progress_total`` are written as given
        (``None`` clears them). ``extra_data`` is only replaced when it is not
        ``None``. Nothing is changed unless the task is currently active.
        """
        try:
            # COALESCE keeps the stored extra_data when no new value is given.
            extra_json = json.dumps(extra_data) if extra_data is not None else None
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE background_task_status
                    SET detailed_status = ?,
                        progress_current = ?,
                        progress_total = ?,
                        extra_data = COALESCE(?, extra_data),
                        updated_at = ?
                    WHERE task_id = ? AND active = 1
                ''', (detailed_status, progress_current, progress_total,
                      extra_json, self._now(), task_id))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to update background task {task_id}: {e}")

    def stop_background_task(self, task_id: str):
        """Mark a background task as stopped."""
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE background_task_status
                    SET active = 0,
                        updated_at = ?
                    WHERE task_id = ?
                ''', (self._now(), task_id))
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to stop background task {task_id}: {e}")

    def stop_all_background_tasks(self):
        """Mark all background tasks as stopped (used on scheduler startup to clear stale state)."""
        try:
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE background_task_status
                    SET active = 0,
                        updated_at = ?
                    WHERE active = 1
                ''', (self._now(),))
                count = cursor.rowcount
                conn.commit()
                if count > 0:
                    logger.info(f"Cleared {count} stale background task(s) from previous run")
        except Exception as e:
            logger.error(f"Failed to stop all background tasks: {e}")

    def get_background_task(self, task_id: str) -> Optional[Dict[str, Any]]:
        """
        Get a specific background task's status.

        Args:
            task_id: Task identifier

        Returns:
            Dict with task information, or None when the task is unknown or
            the read fails.
        """
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(self._BACKGROUND_TASK_SELECT + ' WHERE task_id = ?', (task_id,))
                row = cursor.fetchone()
                return self._task_row_to_dict(row) if row else None
        except Exception as e:
            logger.error(f"Failed to get background task {task_id}: {e}")
            return None

    def get_active_background_tasks(self) -> List[Dict[str, Any]]:
        """
        Get all active background tasks, most recently started first.

        Returns:
            List of active task dictionaries (empty when the read fails).
        """
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    self._BACKGROUND_TASK_SELECT + ' WHERE active = 1 ORDER BY start_time DESC'
                )
                return [self._task_row_to_dict(row) for row in cursor.fetchall()]
        except Exception as e:
            logger.error(f"Failed to get active background tasks: {e}")
            return []
|
||||
|
||||
|
||||
# Global instance with thread-safe initialization
_activity_manager = None
_activity_manager_lock = threading.Lock()


def get_activity_manager(unified_db=None):
    """Get or create the global activity manager instance (thread-safe).

    Args:
        unified_db: Database instance handed to ActivityStatusManager when the
            global instance is first created. Once the instance exists this
            argument is ignored and the existing manager is returned.

    Returns:
        The shared ActivityStatusManager instance.
    """
    global _activity_manager
    if _activity_manager is None:
        with _activity_manager_lock:
            # Double-check inside lock to prevent race condition: another
            # thread may have created the instance while we waited.
            if _activity_manager is None:
                _activity_manager = ActivityStatusManager(unified_db)
    return _activity_manager
|
||||
478
modules/base_module.py
Normal file
478
modules/base_module.py
Normal file
@@ -0,0 +1,478 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Base Module - Shared functionality for all media downloader modules
|
||||
|
||||
Provides:
|
||||
- LoggingMixin: Consistent logging with universal logger and backwards-compatible callback support
|
||||
- CookieManagerMixin: Centralized cookie loading/saving for scrapers
|
||||
- RateLimitMixin: Smart delay handling for rate limiting
|
||||
- DeferredDownloadsMixin: Track downloads for batch database recording
|
||||
"""
|
||||
|
||||
import random
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
|
||||
class LoggingMixin:
    """
    Mixin providing consistent logging across all modules.

    Uses the universal logger for all logging, with optional callback support
    for backwards compatibility with existing code.

    Usage:
        class MyModule(LoggingMixin):
            def __init__(self, log_callback=None):
                self._init_logger('MyModule', log_callback)
                # ... rest of init

            def do_something(self):
                self.log("Starting operation", "info")
                # ...
                self.log("Operation complete", "success")
    """

    # Class-level defaults so the attributes exist even when _init_logger()
    # has not been called yet.
    _logger_name: str = 'Unknown'
    _default_module: str = 'Core'
    logger = None
    log_callback = None
    show_debug: bool = True

    def _init_logger(self, logger_name: str, log_callback=None, default_module: str = 'Core', show_debug: bool = True):
        """
        Initialize logging for this module.

        Args:
            logger_name: Name for the logger (e.g., 'Instagram', 'TikTok', 'Forum')
            log_callback: Optional callback function for backwards compatibility
            default_module: Default module name for log messages (default: 'Core')
            show_debug: Whether to show debug messages (default: True)
        """
        self._logger_name = logger_name
        self._default_module = default_module
        self.log_callback = log_callback
        self.show_debug = show_debug
        self.logger = get_logger(logger_name)

    def log(self, message: str, level: str = "info", module: Optional[str] = None):
        """
        Log a message using universal logger with optional callback.

        Args:
            message: The message to log
            level: Log level ('debug', 'info', 'warning', 'error', 'success', 'critical')
            module: Module name for the log entry (default: uses _default_module)
        """
        level_lower = level.lower()

        # Skip debug messages if show_debug is False
        if level_lower == "debug" and not self.show_debug:
            return

        # log() may be reached before _init_logger() (the class default for
        # `logger` is None); create the logger on demand instead of failing
        # with AttributeError.
        if self.logger is None:
            self.logger = get_logger(self._logger_name)

        # Use universal logger (always log here first)
        actual_module = module or self._default_module
        self.logger.log(message, level.upper(), module=actual_module)

        # Call log_callback for backwards compatibility
        if self.log_callback:
            self.log_callback(f"[{self._logger_name}] {message}", level_lower)
|
||||
|
||||
|
||||
class CookieManagerMixin:
    """
    Mixin that centralizes cookie persistence for scrapers.

    Cookies are read from and written to the database object supplied via
    _init_cookie_manager(). Messages are emitted through ``self.log`` when the
    host class provides one (for example by also using LoggingMixin).

    Usage:
        class MyScraper(LoggingMixin, CookieManagerMixin):
            def __init__(self, unified_db=None):
                self._init_logger('MyScraper')
                self._init_cookie_manager(unified_db, 'my_scraper')
                self._load_cookies_from_db()

            def after_auth(self, cookies):
                self._save_cookies_to_db(cookies)
    """

    unified_db = None
    scraper_id: str = ''
    cf_handler = None  # CloudflareHandler if used
    user_agent: str = ''

    def _init_cookie_manager(self, unified_db, scraper_id: str, cf_handler=None, user_agent: str = ''):
        """
        Store the collaborators used by the cookie helpers.

        Args:
            unified_db: UnifiedDatabase instance
            scraper_id: ID for this scraper in database
            cf_handler: Optional CloudflareHandler instance
            user_agent: User agent string
        """
        self.unified_db, self.scraper_id = unified_db, scraper_id
        self.cf_handler, self.user_agent = cf_handler, user_agent

    def _load_cookies_from_db(self) -> Optional[List[Dict]]:
        """
        Fetch this scraper's cookies from the database.

        When cookies are found they are also assigned to the handler's
        ``_cookies`` attribute if a handler is configured.

        Returns:
            List of cookie dicts, or None when there is no database, nothing
            is stored, or the lookup raised.
        """
        if not self.unified_db:
            return None

        try:
            stored = self.unified_db.get_scraper_cookies(self.scraper_id)
            if not stored:
                return None

            # Load into CloudflareHandler if available
            if self.cf_handler:
                self.cf_handler._cookies = stored
            if hasattr(self, 'log'):
                self.log(f"Loaded {len(stored)} cookies from database", "debug")
            return stored
        except Exception as e:
            # Best effort: a failed lookup is reported and treated as "none".
            if hasattr(self, 'log'):
                self.log(f"Error loading cookies from database: {e}", "warning")

        return None

    def _save_cookies_to_db(self, cookies: List[Dict], merge: bool = True, user_agent: str = None):
        """
        Persist cookies for this scraper.

        Args:
            cookies: List of cookie dicts
            merge: Whether to merge with existing cookies
            user_agent: User agent to associate with cookies (important for cf_clearance).
                        If not provided, uses self.user_agent as fallback.
        """
        if not self.unified_db:
            return

        # Use provided user_agent or fall back to self.user_agent
        ua = user_agent or self.user_agent
        try:
            self.unified_db.save_scraper_cookies(self.scraper_id, cookies, user_agent=ua, merge=merge)
            if hasattr(self, 'log'):
                self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50] if ua else 'None'}...)", "debug")
        except Exception as e:
            # Best effort: a failed save is reported, not raised.
            if hasattr(self, 'log'):
                self.log(f"Error saving cookies to database: {e}", "warning")

    def _cookies_expired(self) -> bool:
        """
        Report whether cookies are expired.

        Returns:
            The handler's ``cookies_expired()`` result, or True when no
            handler is configured.
        """
        return self.cf_handler.cookies_expired() if self.cf_handler else True

    def _get_cookies_for_requests(self) -> Dict[str, str]:
        """
        Get cookies in format for requests library.

        Returns:
            The handler's ``get_cookies_dict()`` result (cookie name -> value),
            or an empty dict when no handler is configured.
        """
        if not self.cf_handler:
            return {}
        return self.cf_handler.get_cookies_dict()
|
||||
|
||||
|
||||
class RateLimitMixin:
    """
    Mixin that inserts pauses between scraper requests.

    Normal and batch pauses are drawn uniformly at random from a configured
    range; the pause after an error is a fixed duration.

    Usage:
        class MyScraper(LoggingMixin, RateLimitMixin):
            def __init__(self):
                self._init_logger('MyScraper')
                self._init_rate_limiter(min_delay=5, max_delay=15, batch_delay_min=30)

            def download_batch(self, items):
                for i, item in enumerate(items):
                    self.download_item(item)
                    is_batch_end = (i + 1) % 10 == 0
                    self._smart_delay(is_batch_end)
    """

    # Defaults (seconds) used when _init_rate_limiter() is never called.
    min_delay: float = 5.0
    max_delay: float = 15.0
    batch_delay_min: float = 30.0
    batch_delay_max: float = 60.0
    error_delay: float = 120.0

    def _init_rate_limiter(
        self,
        min_delay: float = 5.0,
        max_delay: float = 15.0,
        batch_delay_min: float = 30.0,
        batch_delay_max: float = 60.0,
        error_delay: float = 120.0
    ):
        """
        Configure the delay ranges.

        Args:
            min_delay: Minimum delay between requests (seconds)
            max_delay: Maximum delay between requests (seconds)
            batch_delay_min: Minimum delay between batches (seconds)
            batch_delay_max: Maximum delay between batches (seconds)
            error_delay: Delay after errors (seconds)
        """
        self.min_delay, self.max_delay = min_delay, max_delay
        self.batch_delay_min, self.batch_delay_max = batch_delay_min, batch_delay_max
        self.error_delay = error_delay

    def _smart_delay(self, is_batch_end: bool = False, had_error: bool = False):
        """
        Sleep for the delay appropriate to the situation.

        Args:
            is_batch_end: True if this is the end of a batch
            had_error: True if there was an error (uses the fixed error delay
                and takes precedence over is_batch_end)
        """
        if had_error:
            pause = self.error_delay
        else:
            if is_batch_end:
                low, high = self.batch_delay_min, self.batch_delay_max
            else:
                low, high = self.min_delay, self.max_delay
            pause = random.uniform(low, high)

        # Logging is optional: only available when the host class has log().
        if hasattr(self, 'log'):
            self.log(f"Waiting {pause:.1f}s before next request", "debug")

        time.sleep(pause)

    def _delay_after_error(self):
        """Apply error delay."""
        self._smart_delay(had_error=True)

    def _delay_between_items(self):
        """Apply normal delay between items."""
        self._smart_delay(is_batch_end=False)

    def _delay_between_batches(self):
        """Apply batch delay."""
        self._smart_delay(is_batch_end=True)
|
||||
|
||||
|
||||
class DeferredDownloadsMixin:
    """
    Mixin that collects download records so they can be written in one batch.

    Usage:
        class MyScraper(LoggingMixin, DeferredDownloadsMixin):
            def __init__(self):
                self._init_logger('MyScraper')
                self._init_deferred_downloads()

            def download_file(self, url, path):
                # ... download logic ...
                self._add_pending_download({
                    'platform': 'my_platform',
                    'source': 'username',
                    'file_path': str(path),
                    # ... other fields ...
                })

            def finish_batch(self):
                downloads = self.get_pending_downloads()
                self.db.record_downloads_batch(downloads)
                self.clear_pending_downloads()
    """

    # None until _init_deferred_downloads() or the first add creates the
    # per-instance list, so instances never share one list.
    pending_downloads: List[Dict] = None

    def _init_deferred_downloads(self):
        """Start with an empty pending list."""
        self.pending_downloads = []

    def _add_pending_download(self, download_info: Dict[str, Any]):
        """
        Queue one download record.

        Args:
            download_info: Dict with download metadata
        """
        queue = self.pending_downloads
        if queue is None:
            # First use without explicit initialisation: create the list now.
            queue = self.pending_downloads = []
        queue.append(download_info)

    def get_pending_downloads(self) -> List[Dict[str, Any]]:
        """
        Return the queued download records.

        Returns:
            The pending list itself when it has entries, otherwise a new
            empty list.
        """
        if self.pending_downloads:
            return self.pending_downloads
        return []

    def clear_pending_downloads(self):
        """Replace the pending list with a new empty one."""
        self.pending_downloads = []

    def has_pending_downloads(self) -> bool:
        """Return True when at least one download is queued."""
        return True if self.pending_downloads else False
|
||||
|
||||
|
||||
class BaseDatabaseAdapter:
    """
    Base class for platform-specific database adapters.

    Wraps a UnifiedDatabase instance and fills in the platform/method for the
    recording and lookup calls. Platform-specific adapters should inherit
    from this class.

    Usage:
        class MyPlatformAdapter(BaseDatabaseAdapter):
            def __init__(self, unified_db):
                super().__init__(unified_db, platform='my_platform')

            def record_download(self, content_id, username, filename, **kwargs):
                # Platform-specific URL construction
                url = f"https://my_platform.com/{username}/{content_id}"
                return self._record_download_internal(
                    url=url,
                    source=username,
                    filename=filename,
                    **kwargs
                )
    """

    # File extensions classified as images; every other extension is 'video'.
    _IMAGE_EXTENSIONS = frozenset(
        {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp', '.bmp', '.tiff'}
    )

    def __init__(self, unified_db, platform: str, method: str = None):
        """
        Initialize base adapter.

        Args:
            unified_db: UnifiedDatabase instance
            platform: Platform name (e.g., 'instagram', 'tiktok')
            method: Optional method identifier for multi-method platforms;
                falls back to the platform name
        """
        # `unified_db` is kept as an alias of `db` for compatibility.
        self.db = self.unified_db = unified_db
        self.platform = platform
        self.method = method if method else platform

    def get_connection(self, for_write: bool = False):
        """Get database connection (delegates to UnifiedDatabase)."""
        return self.db.get_connection(for_write)

    def get_file_hash(self, file_path: str) -> Optional[str]:
        """Return the file hash computed by the database layer."""
        return self.db.get_file_hash(file_path)

    def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
        """Get download record by file hash."""
        return self.db.get_download_by_file_hash(file_hash)

    def get_download_by_media_id(self, media_id: str) -> Optional[Dict]:
        """Get download record by media_id for this adapter's platform and method."""
        return self.db.get_download_by_media_id(media_id, self.platform, self.method)

    def is_already_downloaded_by_hash(self, file_path: str) -> bool:
        """Check if file is already downloaded by comparing file hash."""
        digest = self.get_file_hash(file_path)
        if digest:
            return self.get_download_by_file_hash(digest) is not None
        return False

    def is_already_downloaded_by_media_id(self, media_id: str) -> bool:
        """Check if a download with this media_id exists for the platform."""
        with self.db.get_connection() as conn:
            cur = conn.cursor()
            cur.execute('''
                SELECT 1 FROM downloads
                WHERE platform = ?
                AND media_id = ?
                LIMIT 1
            ''', (self.platform, media_id))
            found = cur.fetchone()
            return found is not None

    def _calculate_file_hash(self, file_path: str) -> Optional[str]:
        """Return the hash of an existing file, or None if it cannot be computed."""
        if not file_path:
            return None
        try:
            from pathlib import Path
            if Path(file_path).exists():
                return self.get_file_hash(file_path)
        except Exception:
            # Best effort: any failure while hashing yields None below.
            pass
        return None

    def _detect_content_type(self, filename: str) -> str:
        """Return 'image' for known image extensions, otherwise 'video'."""
        from pathlib import Path
        suffix = Path(filename).suffix.lower()
        if suffix in self._IMAGE_EXTENSIONS:
            return 'image'
        return 'video'

    def _record_download_internal(
        self,
        url: str,
        source: str,
        filename: str,
        content_type: str = None,
        file_path: str = None,
        post_date=None,
        metadata: Dict = None,
        file_hash: str = None,
        **extra_kwargs
    ) -> bool:
        """
        Internal method to record a download.

        Args:
            url: Unique URL/identifier for the content
            source: Username or source identifier
            filename: Downloaded filename
            content_type: 'image' or 'video' (auto-detected if not provided)
            file_path: Full path to downloaded file
            post_date: Original post date
            metadata: Additional metadata dict
            file_hash: Pre-computed file hash (computed if not provided and file_path exists)
            **extra_kwargs: Additional arguments passed to unified_db.record_download

        Returns:
            Whatever ``unified_db.record_download`` returns.
        """
        # Auto-detect content type if not provided
        resolved_type = content_type or self._detect_content_type(filename)

        # Calculate file hash if not provided
        resolved_hash = file_hash
        if not resolved_hash and file_path:
            resolved_hash = self._calculate_file_hash(file_path)

        return self.db.record_download(
            url=url,
            platform=self.platform,
            source=source,
            content_type=resolved_type,
            filename=filename,
            file_path=file_path,
            file_hash=resolved_hash,
            post_date=post_date,
            metadata=metadata,
            method=self.method,
            **extra_kwargs
        )
|
||||
1024
modules/cloudflare_handler.py
Normal file
1024
modules/cloudflare_handler.py
Normal file
File diff suppressed because it is too large
Load Diff
873
modules/coppermine_module.py
Normal file
873
modules/coppermine_module.py
Normal file
@@ -0,0 +1,873 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Coppermine Photo Gallery Downloader Module
|
||||
Downloads full-resolution images from Coppermine-based galleries
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import hashlib
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Optional, Set
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlparse, parse_qs
|
||||
from modules.base_module import LoggingMixin
|
||||
from modules.cloudflare_handler import CloudflareHandler, SiteStatus, get_flaresolverr_user_agent
|
||||
|
||||
|
||||
class CoppermineDownloader(LoggingMixin):
|
||||
"""
|
||||
Coppermine Photo Gallery downloader
|
||||
|
||||
Example usage:
|
||||
from coppermine_module import CoppermineDownloader
|
||||
|
||||
downloader = CoppermineDownloader()
|
||||
count = downloader.download(
|
||||
gallery_url="https://hqdiesel.net/thumbnails.php?album=lastup&cat=123",
|
||||
output_dir="downloads/coppermine",
|
||||
days_back=7
|
||||
)
|
||||
print(f"Downloaded {count} items")
|
||||
"""
|
||||
|
||||
def __init__(self, show_progress=True, use_database=True,
|
||||
log_callback=None, unified_db=None, config=None):
|
||||
"""
|
||||
Initialize the downloader
|
||||
|
||||
Args:
|
||||
show_progress: Print progress messages
|
||||
use_database: Use database to track downloads
|
||||
log_callback: Optional callback function for logging
|
||||
unified_db: Optional UnifiedDatabase instance
|
||||
config: Optional config dict with flaresolverr settings
|
||||
"""
|
||||
# Initialize logging via mixin
|
||||
self._init_logger('Coppermine', log_callback, default_module='Download')
|
||||
|
||||
self.show_progress = show_progress
|
||||
self.use_database = use_database
|
||||
self.downloaded_files = set()
|
||||
self.download_count = 0
|
||||
self.unified_db = unified_db # Store for scraper config access
|
||||
self.scraper_id = 'coppermine' # Scraper ID in database
|
||||
|
||||
# Use unified database if provided
|
||||
if unified_db and use_database:
|
||||
from modules.unified_database import CoppermineDatabaseAdapter
|
||||
self.db = CoppermineDatabaseAdapter(unified_db)
|
||||
else:
|
||||
self.db = None
|
||||
self.use_database = False
|
||||
|
||||
# Initialize activity status manager for real-time updates
|
||||
from modules.activity_status import get_activity_manager
|
||||
self.activity_manager = get_activity_manager(unified_db)
|
||||
|
||||
# Rate limiting
|
||||
self.min_delay = 1
|
||||
self.max_delay = 3
|
||||
|
||||
self.pending_downloads = [] # Track downloads for deferred database recording
|
||||
|
||||
# Load scraper configuration from database if available
|
||||
self.proxy_url = None
|
||||
self.cookie_file = None # Default to None (use database)
|
||||
|
||||
if unified_db:
|
||||
scraper_config = unified_db.get_scraper(self.scraper_id)
|
||||
if scraper_config:
|
||||
# Get proxy configuration
|
||||
if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
|
||||
self.proxy_url = scraper_config['proxy_url']
|
||||
self.log(f"Using proxy: {self.proxy_url}", "info")
|
||||
|
||||
# Fall back to config file for cookie_file if database not available
|
||||
if not unified_db and config:
|
||||
self.cookie_file = config.get('cookie_file', '/opt/media-downloader/cookies/coppermine_cookies.json')
|
||||
|
||||
# Session with proper headers
|
||||
self.session = requests.Session()
|
||||
self.user_agent = get_flaresolverr_user_agent()
|
||||
self.session.headers.update({
|
||||
'User-Agent': self.user_agent,
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1'
|
||||
})
|
||||
|
||||
# Configure session proxy if available
|
||||
if self.proxy_url:
|
||||
self.session.proxies = {
|
||||
'http': self.proxy_url,
|
||||
'https': self.proxy_url
|
||||
}
|
||||
|
||||
# Initialize universal Cloudflare handler with conservative expiry
|
||||
# Pass proxy_url if configured, and cookie_file=None for database storage
|
||||
self.cf_handler = CloudflareHandler(
|
||||
module_name="Coppermine",
|
||||
cookie_file=self.cookie_file, # None when using database
|
||||
user_agent=self.user_agent,
|
||||
logger=self.logger,
|
||||
aggressive_expiry=False, # Conservative mode for Coppermine
|
||||
proxy_url=self.proxy_url # Pass proxy to FlareSolverr
|
||||
)
|
||||
|
||||
# Keep for backwards compatibility
|
||||
self.flaresolverr_url = self.cf_handler.flaresolverr_url
|
||||
self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled
|
||||
|
||||
# Load cookies from file if exists
|
||||
self._load_cookies()
|
||||
|
||||
def _record_download(self, url: str, platform: str, source: str, content_type: str,
|
||||
filename: str, file_path: str, file_size: int, file_hash: str,
|
||||
post_date=None, metadata: dict = None, deferred: bool = False):
|
||||
"""Record a download in the database
|
||||
|
||||
Args:
|
||||
deferred: If True, don't record to database now - add to pending_downloads list
|
||||
for later recording after file move is complete
|
||||
"""
|
||||
# If deferred, store for later recording instead of recording now
|
||||
if deferred:
|
||||
self.pending_downloads.append({
|
||||
'url': url,
|
||||
'platform': platform,
|
||||
'source': source,
|
||||
'content_type': content_type,
|
||||
'filename': filename,
|
||||
'file_path': file_path,
|
||||
'file_size': file_size,
|
||||
'file_hash': file_hash,
|
||||
'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
|
||||
'metadata': metadata
|
||||
})
|
||||
self.log(f"Deferred recording for {filename}", "debug")
|
||||
return True
|
||||
|
||||
if not self.use_database or not self.db:
|
||||
return
|
||||
|
||||
try:
|
||||
self.db.add_download(
|
||||
url=url,
|
||||
platform=platform,
|
||||
source=source,
|
||||
content_type=content_type,
|
||||
filename=filename,
|
||||
file_path=file_path,
|
||||
file_size=file_size,
|
||||
file_hash=file_hash,
|
||||
post_date=post_date,
|
||||
metadata=metadata
|
||||
)
|
||||
except Exception as e:
|
||||
self.log(f"Failed to record download: {e}", "debug")
|
||||
|
||||
def get_pending_downloads(self):
|
||||
"""Get list of downloads that were deferred for later recording"""
|
||||
return self.pending_downloads.copy()
|
||||
|
||||
def clear_pending_downloads(self):
|
||||
"""Clear the pending downloads list after they've been recorded"""
|
||||
self.pending_downloads = []
|
||||
|
||||
def _load_cookies(self):
|
||||
"""Load cookies from database or file"""
|
||||
# Try database first if available
|
||||
if self.unified_db:
|
||||
try:
|
||||
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
||||
if cookies:
|
||||
cf_clearance_found = False
|
||||
for cookie in cookies:
|
||||
try:
|
||||
self.session.cookies.set(
|
||||
cookie['name'],
|
||||
cookie['value'],
|
||||
domain=cookie.get('domain', ''),
|
||||
path=cookie.get('path', '/')
|
||||
)
|
||||
if cookie['name'] == 'cf_clearance':
|
||||
cf_clearance_found = True
|
||||
except Exception as e:
|
||||
self.log(f"Error setting cookie {cookie.get('name')}: {e}", "warning")
|
||||
|
||||
if cf_clearance_found:
|
||||
self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from database", "info")
|
||||
else:
|
||||
self.log(f"⚠ Loaded {len(cookies)} cookies from database but cf_clearance NOT found", "warning")
|
||||
|
||||
# Also load cookies into CloudflareHandler for consistency
|
||||
self.cf_handler._cookies = cookies
|
||||
return
|
||||
else:
|
||||
self.log("No cookies found in database", "debug")
|
||||
except Exception as e:
|
||||
self.log(f"Error loading cookies from database: {e}", "warning")
|
||||
|
||||
# Fall back to cookie file if no database
|
||||
if not self.cookie_file:
|
||||
self.log("No cookie file configured", "debug")
|
||||
return
|
||||
|
||||
cookie_path = Path(self.cookie_file)
|
||||
if not cookie_path.exists():
|
||||
self.log(f"Cookie file does not exist: {self.cookie_file}", "info")
|
||||
return
|
||||
|
||||
try:
|
||||
import json
|
||||
with open(cookie_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
|
||||
# Handle both old format (list) and new format (dict with 'cookies' and 'timestamp')
|
||||
if isinstance(data, dict) and 'cookies' in data:
|
||||
cookies = data['cookies']
|
||||
elif isinstance(data, list):
|
||||
cookies = data
|
||||
else:
|
||||
self.log(f"Invalid cookie file format", "warning")
|
||||
return
|
||||
|
||||
# Count critical cookies
|
||||
cf_clearance_found = False
|
||||
for cookie in cookies:
|
||||
try:
|
||||
# Set cookie with basic attributes (requests.Session compatible)
|
||||
self.session.cookies.set(
|
||||
cookie['name'],
|
||||
cookie['value'],
|
||||
domain=cookie.get('domain', ''),
|
||||
path=cookie.get('path', '/')
|
||||
)
|
||||
if cookie['name'] == 'cf_clearance':
|
||||
cf_clearance_found = True
|
||||
except Exception as e:
|
||||
self.log(f"Error setting cookie {cookie.get('name')}: {e}", "warning")
|
||||
|
||||
if cf_clearance_found:
|
||||
self.log(f"✓ Loaded {len(cookies)} cookies including cf_clearance from {self.cookie_file}", "info")
|
||||
else:
|
||||
self.log(f"⚠ Loaded {len(cookies)} cookies but cf_clearance NOT found", "warning")
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error loading cookies: {e}", "warning")
|
||||
|
||||
def _cookies_expired(self):
|
||||
"""Check if cookies are expired - delegates to CloudflareHandler"""
|
||||
return self.cf_handler.cookies_expired()
|
||||
|
||||
def _save_cookies(self, cookies: list, user_agent: str = None):
|
||||
"""Save cookies to database or file with timestamp
|
||||
|
||||
Args:
|
||||
cookies: List of cookie dictionaries
|
||||
user_agent: User agent to associate with cookies (important for cf_clearance).
|
||||
If not provided, uses self.user_agent as fallback.
|
||||
"""
|
||||
# Use provided user_agent or fall back to self.user_agent
|
||||
ua = user_agent or self.user_agent
|
||||
|
||||
# Try database first if available
|
||||
if self.unified_db:
|
||||
try:
|
||||
self.unified_db.save_scraper_cookies(
|
||||
self.scraper_id,
|
||||
cookies,
|
||||
user_agent=ua,
|
||||
merge=True # Merge with existing cookies
|
||||
)
|
||||
self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50] if ua else 'None'}...)", "debug")
|
||||
return
|
||||
except Exception as e:
|
||||
self.log(f"Error saving cookies to database: {e}", "warning")
|
||||
|
||||
# Fall back to file
|
||||
if not self.cookie_file:
|
||||
return
|
||||
|
||||
try:
|
||||
import json
|
||||
from datetime import datetime
|
||||
cookie_path = Path(self.cookie_file)
|
||||
cookie_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
storage_data = {
|
||||
'cookies': cookies,
|
||||
'timestamp': datetime.now().isoformat()
|
||||
}
|
||||
|
||||
with open(cookie_path, 'w') as f:
|
||||
json.dump(storage_data, f, indent=2)
|
||||
self.log(f"Saved {len(cookies)} cookies to {self.cookie_file}", "debug")
|
||||
except Exception as e:
|
||||
self.log(f"Error saving cookies: {e}", "warning")
|
||||
|
||||
def _get_cookies_via_flaresolverr(self, url: str, max_retries: int = 2) -> bool:
|
||||
"""Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler
|
||||
|
||||
Args:
|
||||
url: URL to fetch
|
||||
max_retries: Maximum number of retry attempts (default: 2)
|
||||
|
||||
Returns:
|
||||
True if cookies obtained successfully, False otherwise
|
||||
"""
|
||||
# Delegate to CloudflareHandler
|
||||
success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)
|
||||
|
||||
# If successful, also load cookies into the session and save to database
|
||||
if success:
|
||||
cookies_dict = self.cf_handler.get_cookies_dict()
|
||||
for name, value in cookies_dict.items():
|
||||
# Extract domain from URL
|
||||
from urllib.parse import urlparse
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc
|
||||
self.session.cookies.set(name, value, domain=domain, path='/')
|
||||
|
||||
# Save cookies to database (the handler already saved to file if configured)
|
||||
if self.unified_db:
|
||||
cookies_list = self.cf_handler.get_cookies_list()
|
||||
if cookies_list:
|
||||
# CRITICAL: Get the user_agent from FlareSolverr solution, not self.user_agent
|
||||
# cf_clearance cookies are fingerprinted to the browser that solved the challenge
|
||||
flaresolverr_ua = self.cf_handler.get_user_agent()
|
||||
self._save_cookies(cookies_list, user_agent=flaresolverr_ua)
|
||||
|
||||
return success
|
||||
|
||||
def _request_with_retry(self, url: str, timeout: int = 30, max_attempts: int = 2):
|
||||
"""Make HTTP request with automatic Cloudflare challenge retry
|
||||
|
||||
Args:
|
||||
url: URL to fetch
|
||||
timeout: Request timeout in seconds
|
||||
max_attempts: Maximum number of attempts (default: 2)
|
||||
|
||||
Returns:
|
||||
requests.Response object
|
||||
|
||||
Raises:
|
||||
Exception if all retry attempts fail
|
||||
"""
|
||||
last_error = None
|
||||
|
||||
for attempt in range(1, max_attempts + 1):
|
||||
try:
|
||||
response = self.session.get(url, timeout=timeout)
|
||||
|
||||
# Detect Cloudflare challenges
|
||||
is_cloudflare = False
|
||||
if response.status_code in [403, 503]:
|
||||
is_cloudflare = True
|
||||
self.log(f"Cloudflare challenge detected (HTTP {response.status_code})", "warning")
|
||||
elif len(response.text) < 1000:
|
||||
is_cloudflare = True
|
||||
self.log(f"Cloudflare challenge detected (short response: {len(response.text)} bytes)", "warning")
|
||||
elif 'challenge' in response.text.lower()[:500]:
|
||||
is_cloudflare = True
|
||||
self.log("Cloudflare challenge detected in HTML", "warning")
|
||||
|
||||
# If Cloudflare detected and we have retry attempts left
|
||||
if is_cloudflare and attempt < max_attempts:
|
||||
if self.flaresolverr_enabled:
|
||||
self.log(f"Attempt {attempt}/{max_attempts}: Refreshing cookies via FlareSolverr...", "info")
|
||||
if self._get_cookies_via_flaresolverr(url):
|
||||
self.log("Cookies refreshed, retrying request...", "info")
|
||||
continue # Retry the request
|
||||
else:
|
||||
raise Exception("Failed to refresh cookies via FlareSolverr")
|
||||
else:
|
||||
raise Exception("Cloudflare challenge detected but FlareSolverr is disabled")
|
||||
|
||||
# No Cloudflare challenge or final attempt - check status and return
|
||||
response.raise_for_status()
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
if attempt < max_attempts:
|
||||
self.log(f"Attempt {attempt}/{max_attempts} failed: {e}", "warning")
|
||||
else:
|
||||
self.log(f"All {max_attempts} attempts failed", "error")
|
||||
|
||||
# All attempts failed
|
||||
raise last_error
|
||||
|
||||
def _parse_date(self, date_str: str) -> Optional[datetime]:
|
||||
"""
|
||||
Parse Coppermine date format: 'Date added=Sep 29, 2025'
|
||||
|
||||
Args:
|
||||
date_str: Date string from Coppermine
|
||||
|
||||
Returns:
|
||||
datetime object or None
|
||||
"""
|
||||
try:
|
||||
# Extract date from "Date added=Sep 29, 2025" format
|
||||
match = re.search(r'Date added=([A-Za-z]+ \d+, \d{4})', date_str)
|
||||
if match:
|
||||
date_part = match.group(1)
|
||||
return datetime.strptime(date_part, '%b %d, %Y')
|
||||
except Exception as e:
|
||||
self.log(f"Error parsing date '{date_str}': {e}", "debug")
|
||||
return None
|
||||
|
||||
def _extract_full_image_url(self, base_url: str, thumbnail_url: str) -> str:
|
||||
"""
|
||||
Convert thumbnail URL to full-resolution URL
|
||||
|
||||
Pattern:
|
||||
Thumbnail: albums/userpics/1052219/thumb_1000523798.jpg
|
||||
Normal: albums/userpics/1052219/normal_1000523798.jpg
|
||||
Full: albums/userpics/1052219/1000523798.jpg
|
||||
|
||||
Args:
|
||||
base_url: Base URL of the gallery (e.g., https://hqdiesel.net)
|
||||
thumbnail_url: Relative thumbnail URL
|
||||
|
||||
Returns:
|
||||
Full-resolution image URL
|
||||
"""
|
||||
# Remove thumb_ or normal_ prefix
|
||||
full_path = re.sub(r'/(thumb_|normal_)', '/', thumbnail_url)
|
||||
return urljoin(base_url, full_path)
|
||||
|
||||
def _parse_gallery_page(self, html: str, base_url: str) -> List[Dict]:
|
||||
"""
|
||||
Parse a Coppermine gallery page to extract image information
|
||||
|
||||
Args:
|
||||
html: HTML content of the page
|
||||
base_url: Base URL of the gallery
|
||||
|
||||
Returns:
|
||||
List of dicts with image info
|
||||
"""
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
images = []
|
||||
|
||||
# Find all thumbnail cells
|
||||
thumbnail_cells = soup.find_all('td', class_='thumbnails')
|
||||
self.log(f"Found {len(thumbnail_cells)} thumbnail cells on page", "debug")
|
||||
|
||||
for cell in thumbnail_cells:
|
||||
try:
|
||||
# Find image link
|
||||
link = cell.find('a', href=re.compile(r'displayimage\.php'))
|
||||
if not link:
|
||||
continue
|
||||
|
||||
# Extract PID from URL
|
||||
href = link.get('href', '')
|
||||
parsed = parse_qs(urlparse(href).query)
|
||||
pid = parsed.get('pid', [None])[0]
|
||||
|
||||
if not pid:
|
||||
continue
|
||||
|
||||
# Find thumbnail image
|
||||
img = link.find('img')
|
||||
if not img:
|
||||
continue
|
||||
|
||||
thumbnail_url = img.get('src', '')
|
||||
if not thumbnail_url:
|
||||
continue
|
||||
|
||||
# Get image title (contains metadata)
|
||||
title = img.get('title', '')
|
||||
|
||||
# Extract filename
|
||||
filename_match = re.search(r'Filename=([^\s]+)', title)
|
||||
filename = filename_match.group(1) if filename_match else None
|
||||
|
||||
# Extract date from dedicated span (more reliable)
|
||||
upload_date = None
|
||||
date_span = cell.find('span', class_='thumb_caption_ctime')
|
||||
if date_span and date_span.text.strip():
|
||||
try:
|
||||
upload_date = datetime.strptime(date_span.text.strip(), '%b %d, %Y')
|
||||
except Exception:
|
||||
# Fallback to title parsing
|
||||
upload_date = self._parse_date(title)
|
||||
else:
|
||||
upload_date = self._parse_date(title)
|
||||
|
||||
# Extract uploader
|
||||
uploader = None
|
||||
uploader_link = cell.find('a', href=re.compile(r'profile\.php'))
|
||||
if uploader_link:
|
||||
uploader = uploader_link.text.strip()
|
||||
|
||||
# Extract dimensions
|
||||
dimensions_match = re.search(r'Dimensions=(\d+x\d+)', title)
|
||||
dimensions = dimensions_match.group(1) if dimensions_match else None
|
||||
|
||||
# Extract filesize
|
||||
filesize_match = re.search(r'Filesize=([^\s]+)', title)
|
||||
filesize = filesize_match.group(1) if filesize_match else None
|
||||
|
||||
# Extract views
|
||||
views = None
|
||||
views_span = cell.find('span', class_='thumb_title_views')
|
||||
if views_span:
|
||||
views_match = re.search(r'(\d+)\s+views?', views_span.text)
|
||||
if views_match:
|
||||
views = int(views_match.group(1))
|
||||
|
||||
# Construct full-resolution URL
|
||||
full_url = self._extract_full_image_url(base_url, thumbnail_url)
|
||||
|
||||
images.append({
|
||||
'pid': pid,
|
||||
'filename': filename,
|
||||
'thumbnail_url': urljoin(base_url, thumbnail_url),
|
||||
'full_url': full_url,
|
||||
'upload_date': upload_date,
|
||||
'dimensions': dimensions,
|
||||
'filesize': filesize,
|
||||
'uploader': uploader,
|
||||
'views': views,
|
||||
'title': title
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error parsing thumbnail cell: {e}", "debug")
|
||||
continue
|
||||
|
||||
return images
|
||||
|
||||
def _get_total_pages(self, html: str) -> int:
|
||||
"""
|
||||
Extract total number of pages from gallery
|
||||
|
||||
Args:
|
||||
html: HTML content
|
||||
|
||||
Returns:
|
||||
Number of pages
|
||||
"""
|
||||
try:
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
# Look for pagination info like "2005 files on 20 page(s)"
|
||||
text = soup.get_text()
|
||||
match = re.search(r'(\d+)\s+files?\s+on\s+(\d+)\s+page', text)
|
||||
if match:
|
||||
return int(match.group(2))
|
||||
except Exception as e:
|
||||
self.log(f"Error extracting page count: {e}", "debug")
|
||||
return 1
|
||||
|
||||
def _download_image(self, image_info: Dict, output_dir: Path,
|
||||
gallery_name: str) -> Optional[str]:
|
||||
"""
|
||||
Download a single image
|
||||
|
||||
Args:
|
||||
image_info: Image information dict
|
||||
output_dir: Output directory
|
||||
gallery_name: Name of gallery for database tracking
|
||||
|
||||
Returns:
|
||||
Path to downloaded file or None
|
||||
"""
|
||||
try:
|
||||
url = image_info['full_url']
|
||||
pid = image_info['pid']
|
||||
filename = image_info['filename']
|
||||
|
||||
# Check if already downloaded
|
||||
if self.use_database and self.db:
|
||||
if self.db.is_downloaded(url, platform='coppermine'):
|
||||
self.log(f"Already downloaded (database): {filename} (PID: {pid})", "info")
|
||||
return None
|
||||
|
||||
# Create output directory
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Construct output filename
|
||||
output_file = output_dir / filename
|
||||
|
||||
# Skip if file exists
|
||||
if output_file.exists():
|
||||
self.log(f"File already exists: {filename}", "info")
|
||||
return str(output_file)
|
||||
|
||||
# Download image
|
||||
self.log(f"Downloading: {filename} (PID: {pid})", "info")
|
||||
|
||||
response = self._request_with_retry(url, timeout=30)
|
||||
|
||||
# Save image
|
||||
with open(output_file, 'wb') as f:
|
||||
f.write(response.content)
|
||||
|
||||
# Check for duplicate hash before recording
|
||||
if self.db and hasattr(self.db, 'unified_db'):
|
||||
from pathlib import Path as PathLib
|
||||
# Check for duplicate hash (hash blacklist persists even if original deleted)
|
||||
file_hash_check = self.db.unified_db.get_file_hash(str(output_file))
|
||||
if file_hash_check:
|
||||
existing = self.db.unified_db.get_download_by_file_hash(file_hash_check)
|
||||
if existing and existing.get('file_path') and str(output_file) != existing.get('file_path'):
|
||||
# Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
|
||||
self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
|
||||
# Delete the duplicate regardless of whether original file still exists
|
||||
try:
|
||||
output_file.unlink()
|
||||
self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
|
||||
return
|
||||
except Exception as e:
|
||||
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
|
||||
return
|
||||
|
||||
# Calculate SHA256 file hash from saved file (consistent with other modules)
|
||||
file_hash = None
|
||||
if self.db and hasattr(self.db, 'unified_db'):
|
||||
try:
|
||||
file_hash = self.db.unified_db.get_file_hash(str(output_file))
|
||||
except Exception as e:
|
||||
self.log(f"Failed to calculate file hash: {e}", "warning")
|
||||
|
||||
# Track timestamp for this file
|
||||
if image_info.get('upload_date'):
|
||||
self.file_timestamps[filename] = image_info['upload_date']
|
||||
|
||||
# Record in database
|
||||
self._record_download(
|
||||
url=url,
|
||||
platform='coppermine',
|
||||
source=gallery_name,
|
||||
content_type='image',
|
||||
filename=filename,
|
||||
file_path=str(output_file),
|
||||
file_size=len(response.content),
|
||||
file_hash=file_hash,
|
||||
post_date=image_info.get('upload_date'),
|
||||
metadata={
|
||||
'pid': pid,
|
||||
'dimensions': image_info.get('dimensions'),
|
||||
'filesize': image_info.get('filesize')
|
||||
},
|
||||
deferred=getattr(self, 'defer_database', False)
|
||||
)
|
||||
|
||||
self.download_count += 1
|
||||
time.sleep(self.min_delay + (self.max_delay - self.min_delay) * __import__('random').random())
|
||||
|
||||
return str(output_file)
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error downloading {image_info.get('filename', 'unknown')}: {e}", "error")
|
||||
return None
|
||||
|
||||
    def download(self, gallery_url: str, output_dir: str,
                 days_back: Optional[int] = None, max_pages: Optional[int] = None,
                 gallery_name: Optional[str] = None, defer_database: bool = False) -> tuple:
        """
        Download images from a Coppermine gallery

        Flow: check site status, validate existing cookies (or obtain new ones
        via FlareSolverr), read the page count from the first page, then walk
        the pages and download every image that passes the date filter.

        Args:
            gallery_url: URL to the gallery page (e.g., thumbnails.php?album=lastup&cat=123)
            output_dir: Directory to save images
            days_back: Only download images from last N days (None = all;
                       0 is also treated as "no filter" by the truthiness checks)
            max_pages: Maximum number of pages to process (None = all)
            gallery_name: Name for database tracking (built from the URL's host,
                          ``cat`` and ``album`` parameters if not provided)
            defer_database: If True, don't record to database immediately - store in
                           pending_downloads for later recording after file move is complete

        Returns:
            Tuple of (file_timestamps dict, download_count)
            file_timestamps: Dict mapping filename -> upload_date
            Returns ({}, 0) when the site is unavailable, Cloudflare cannot be
            bypassed, or the first page cannot be fetched.
        """
        self.defer_database = defer_database  # Read by _download_image via getattr
        # Clear downloaded_files cache between galleries to prevent memory growth
        self.downloaded_files.clear()

        # Check site status before doing anything else
        self.log("Checking Coppermine gallery site status...", "debug")
        site_status, error_msg = self.cf_handler.check_site_status(gallery_url, timeout=10)

        if self.cf_handler.should_skip_download(site_status):
            self.log(f"Skipping download - Coppermine gallery is unavailable: {error_msg}", "warning")
            return ({}, 0)
        elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
            self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

        # Reset per-run counters (only reached when the site is usable)
        self.download_count = 0
        self.file_timestamps = {}  # Track timestamps for each file
        output_path = Path(output_dir)

        # Extract base URL and gallery name
        parsed_url = urlparse(gallery_url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

        if not gallery_name:
            # Build a name from the host plus the URL's cat/album parameters
            query_params = parse_qs(parsed_url.query)
            cat = query_params.get('cat', ['unknown'])[0]
            album = query_params.get('album', ['unknown'])[0]
            gallery_name = f"{parsed_url.netloc}_cat{cat}_{album}"

        self.log(f"Starting download from: {gallery_url}", "info")
        self.activity_manager.update_status(f"Checking gallery: {gallery_name}")
        self.log(f"Gallery: {gallery_name}", "info")
        if days_back:
            self.log(f"Filtering: Last {days_back} days", "info")

        # Calculate cutoff date (naive local time, compared with parsed upload dates)
        cutoff_date = None
        if days_back:
            cutoff_date = datetime.now() - timedelta(days=days_back)

        # Check if cookies have expired before testing
        cookies_valid = False
        cookie_count = len(self.session.cookies)

        # Check for short-lived session cookies that may have expired
        if self.cf_handler.cookies_expired():
            self.log(f"Cookies expired, skipping test and refreshing via FlareSolverr", "info")
        else:
            self.log(f"Testing with {cookie_count} existing cookies...", "info")

            try:
                # Try with existing cookies first (short timeout for fast fail)
                test_response = self.session.get(gallery_url, timeout=5)

                # Check if we got a Cloudflare challenge or error
                if test_response.status_code == 403 or test_response.status_code == 503:
                    self.log(f"Existing cookies failed (HTTP {test_response.status_code}), need FlareSolverr", "info")
                elif len(test_response.text) < 1000:
                    self.log(f"Response too short ({len(test_response.text)} bytes), likely Cloudflare challenge", "info")
                elif 'challenge' in test_response.text.lower()[:500]:
                    self.log("Cloudflare challenge detected in response", "info")
                else:
                    # Cookies work (or no challenge presented)!
                    cookies_valid = True
                    self.log(f"✓ Existing cookies valid ({cookie_count} cookies, skipped FlareSolverr)", "info")
                    # Keep this response so the first page is not fetched twice
                    response = test_response
            except Exception as e:
                self.log(f"Test request failed ({type(e).__name__}: {e}), need FlareSolverr", "info")

        # Only call FlareSolverr if existing cookies don't work
        if not cookies_valid:
            if self.flaresolverr_enabled:
                self.log("Calling FlareSolverr to get fresh cookies...", "info")
                if not self._get_cookies_via_flaresolverr(gallery_url):
                    self.log("Failed to bypass Cloudflare", "error")
                    return ({}, 0)
            else:
                self.log("FlareSolverr disabled and cookies invalid", "error")
                return ({}, 0)

        # Fetch first page to get total pages (reuse response if cookies were valid)
        try:
            if not cookies_valid:
                response = self._request_with_retry(gallery_url, timeout=30)

            total_pages = self._get_total_pages(response.text)

            if max_pages:
                total_pages = min(total_pages, max_pages)

            self.log(f"Total pages to process: {total_pages}", "info")

        except Exception as e:
            self.log(f"Error fetching gallery: {e}", "error")
            return ({}, 0)

        # Set initial progress so dashboard shows 0/N immediately
        self.activity_manager.update_status(
            "Downloading images",
            progress_current=0,
            progress_total=total_pages
        )

        # Process each page
        for page_num in range(1, total_pages + 1):
            try:
                # Construct page URL; page 1 is the gallery URL itself
                if page_num == 1:
                    page_url = gallery_url
                else:
                    separator = '&' if '?' in gallery_url else '?'
                    page_url = f"{gallery_url}{separator}page={page_num}"

                self.log(f"Processing page {page_num}/{total_pages}...", "info")

                # Fetch page with automatic Cloudflare retry
                # (page 1 is fetched again here; the earlier response is not reused)
                response = self._request_with_retry(page_url, timeout=30)

                # Debug: Check what we received
                self.log(f"Fetched page, status: {response.status_code}, length: {len(response.text)} bytes", "debug")
                if len(response.text) < 10000:
                    self.log(f"WARNING: Response seems too short! First 1000 chars: {response.text[:1000]}", "warning")

                # Parse images
                images = self._parse_gallery_page(response.text, base_url)
                self.log(f"Found {len(images)} images on page {page_num}", "info")

                # Track if we found any new images on this page
                found_new_images = False
                skipped_old_images = 0

                # Filter by date and download
                for image_info in images:
                    # Apply date filter; images without a parsed date are never filtered out
                    if cutoff_date and image_info.get('upload_date'):
                        if image_info['upload_date'] < cutoff_date:
                            skipped_old_images += 1
                            self.log(f"Skipping old image: {image_info['filename']} "
                                   f"(uploaded {image_info['upload_date'].date()})", "debug")
                            continue

                    # Log image being processed
                    upload_date_str = image_info.get('upload_date').strftime('%Y-%m-%d') if image_info.get('upload_date') else 'unknown'
                    self.log(f"Processing image: {image_info['filename']} (uploaded {upload_date_str})", "info")

                    # This image is within date range
                    found_new_images = True

                    # Download image (return value unused; counts are kept on self)
                    self._download_image(image_info, output_path, gallery_name)

                # If using date filter and ALL images on this page were too old, stop processing
                # (assumes gallery is sorted newest-first, which is true for album=lastup)
                if cutoff_date and not found_new_images and len(images) > 0:
                    self.log(f"All {skipped_old_images} images on page {page_num} are older than {days_back} days. "
                           f"Stopping pagination (assuming chronological order).", "info")
                    break

                # Update activity status with page progress
                self.activity_manager.update_status(
                    "Downloading images",
                    progress_current=page_num,
                    progress_total=total_pages
                )

                # Rate limiting between pages
                if page_num < total_pages:
                    time.sleep(self.min_delay)

            except Exception as e:
                # A failed page is logged and skipped; later pages are still tried
                self.log(f"Error processing page {page_num}: {e}", "error")
                continue

        self.log(f"Download complete! Total: {self.download_count} images", "info")
        return (self.file_timestamps, self.download_count)
|
||||
|
||||
def cleanup(self):
    """Release resources held by this instance.

    Closes ``self.session`` when one is set; does nothing otherwise.
    """
    if not self.session:
        return
    self.session.close()
|
||||
473
modules/date_utils.py
Executable file
473
modules/date_utils.py
Executable file
@@ -0,0 +1,473 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Shared date utilities module for media downloaders
|
||||
Provides comprehensive date extraction and timestamp updating
|
||||
|
||||
Features:
|
||||
- Extract dates from text/titles (multiple formats)
|
||||
- Extract TV show season/episode info and lookup air dates via OMDB
|
||||
- Update filesystem timestamps (mtime, atime)
|
||||
- Update creation time (platform-specific)
|
||||
- Update EXIF metadata for images
|
||||
- Update video metadata
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import platform
|
||||
import subprocess
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union, Tuple
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('DateUtils')
|
||||
|
||||
|
||||
class DateHandler:
    """Comprehensive date extraction and timestamp updating.

    Every operation is a classmethod, so the class is used as a namespace
    (``DateHandler.extract_date_from_text(...)``) and never instantiated.
    The only mutable class state is ``OMDB_API_KEY``.
    """

    # OMDB API key (should be set by user via set_omdb_api_key)
    OMDB_API_KEY = None

    # TV show season/episode patterns (searched case-insensitively, in order)
    TV_PATTERNS = [
        r'S(\d{1,2})E(\d{1,2})',  # S01E01
        r'Season\s+(\d{1,2})\s+Episode\s+(\d{1,2})',  # Season 1 Episode 1
        r'(\d{1,2})x(\d{1,2})',  # 1x01
        r's(\d{1,2})\s*e(\d{1,2})',  # s01 e01 or s01e01
    ]

    # Year pattern for fallback
    YEAR_PATTERN = r'\b(19\d{2}|20\d{2})\b'

    # Date patterns for extraction from text; tried in order, first parse wins
    DATE_PATTERNS = [
        # Instagram filename format: YYYYMMDD_HHMMSS (e.g., "20251027_155842")
        (r'(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})', 'instagram'),
        # DD.MM.YYYY or DD/MM/YYYY or DD-MM-YYYY or DD_MM_YYYY (underscore for forum titles)
        (r'(\d{1,2})[\.\/\-_](\d{1,2})[\.\/\-_](\d{4})', 'dmy'),
        # YYYY-MM-DD or YYYY/MM/DD or YYYY_MM_DD
        (r'(\d{4})[\-\/_](\d{1,2})[\-\/_](\d{1,2})', 'ymd'),
        # Month DD, YYYY (e.g., "August 15, 2025")
        (r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),?\s+(\d{4})', 'mdy_name'),
        # Month YYYY (e.g., "April 2025") - use first day of month
        (r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{4})', 'my_name'),
        # DD Mon YYYY (e.g., "15 Aug 2025")
        (r'(\d{1,2})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(\d{4})', 'dmy_abbr'),
        # Mon DD, YYYY (e.g., "Aug 15, 2025")
        (r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(\d{1,2}),?\s+(\d{4})', 'mdy_abbr'),
        # Mon YYYY (e.g., "Apr 2025") - use first day of month
        (r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(\d{4})', 'my_abbr'),
    ]

    # Capitalized full and abbreviated month names -> month number.
    # ('May' is both the full name and the abbreviation, so it appears once.)
    MONTH_MAP = {
        'January': 1, 'February': 2, 'March': 3, 'April': 4,
        'May': 5, 'June': 6, 'July': 7, 'August': 8,
        'September': 9, 'October': 10, 'November': 11, 'December': 12,
        'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
        'Jun': 6, 'Jul': 7, 'Aug': 8,
        'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
    }

    @classmethod
    def set_omdb_api_key(cls, api_key: str):
        """Set OMDB API key for TV show lookups"""
        cls.OMDB_API_KEY = api_key

    @classmethod
    def extract_tv_info(cls, text: str) -> Optional[Tuple[str, int, int]]:
        """
        Extract TV show name, season, and episode from text

        Args:
            text: Title or caption that may contain a season/episode marker

        Returns:
            Tuple of (show_name, season, episode) or None
        """
        for pattern in cls.TV_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue

            season = int(match.group(1))
            episode = int(match.group(2))

            # Show name is everything before the season/episode marker
            show_part = text[:match.start()].strip()

            # Common pattern: "Actor Name & Actor Name - Show Name S01E01"
            if ' - ' in show_part:
                # Split on dash and take the last part as show name
                show_name = show_part.split(' - ')[-1].strip()
            else:
                # Clean up common separators
                show_name = re.sub(r'[-_.]', ' ', show_part)
                show_name = re.sub(r'\s+', ' ', show_name).strip()

            # Remove trailing "Season" or similar words.
            # flags= must be passed by keyword: the 4th positional argument
            # of re.sub is `count`, not `flags`.
            show_name = re.sub(r'\s+(Season|Series|S)\s*$', '', show_name,
                               flags=re.IGNORECASE)

            if show_name:
                return (show_name, season, episode)

        return None

    @classmethod
    def _omdb_query(cls, params: dict) -> Optional[dict]:
        """Run one OMDB request; return its JSON payload only when OMDB reports success."""
        response = requests.get("http://www.omdbapi.com/", params=params, timeout=5)
        if response.status_code != 200:
            return None
        payload = response.json()
        if payload.get('Response') != 'True':
            return None
        return payload

    @classmethod
    def lookup_tv_episode_date(cls, show_name: str, season: int, episode: int) -> Optional[datetime]:
        """
        Lookup TV episode air date using OMDB API

        Args:
            show_name: Name of the TV show
            season: Season number
            episode: Episode number

        Returns:
            Air date of the episode, or None when no API key is set, the
            lookup fails, or the release date cannot be parsed
        """
        if not cls.OMDB_API_KEY:
            logger.debug("OMDB API key not set")
            return None

        try:
            # First, search for the show to obtain its IMDB ID
            show_data = cls._omdb_query({
                'apikey': cls.OMDB_API_KEY,
                't': show_name,
                'type': 'series'
            })
            if show_data is None:
                return None

            imdb_id = show_data.get('imdbID')
            if not imdb_id:
                return None

            # Now get the specific episode
            episode_data = cls._omdb_query({
                'apikey': cls.OMDB_API_KEY,
                'i': imdb_id,
                'Season': season,
                'Episode': episode
            })
            if episode_data is None:
                return None

            # Parse the release date
            release_date = episode_data.get('Released')
            if release_date and release_date != 'N/A':
                # Try different date formats
                for fmt in ('%d %b %Y', '%Y-%m-%d', '%d %B %Y'):
                    try:
                        return datetime.strptime(release_date, fmt)
                    except ValueError:
                        continue

        except Exception as e:
            # Best-effort lookup: network/JSON problems must never break date extraction
            logger.debug(f"OMDB lookup failed: {e}")

        return None

    @classmethod
    def _month_number(cls, name: str) -> int:
        """Map a month name/abbreviation (any letter case) to 1-12, or 0 if unknown."""
        # Patterns are matched with re.IGNORECASE, so normalize to the
        # capitalized form used by MONTH_MAP keys before looking up.
        return cls.MONTH_MAP.get(name.capitalize(), 0)

    @classmethod
    def _date_from_match(cls, match, format_type: str) -> Optional[datetime]:
        """Build a datetime from a DATE_PATTERNS match.

        Returns None when the month name is unknown or the format type is
        not recognized. Raises ValueError for impossible dates (e.g. month 13).
        """
        if format_type == 'instagram':
            # Instagram format: YYYYMMDD_HHMMSS
            year, month, day, hour, minute, second = (int(g) for g in match.groups())
            return datetime(year, month, day, hour, minute, second)

        if format_type == 'dmy':
            first, second, year = (int(g) for g in match.groups())
            # Dots mark the European DD.MM.YYYY form; a first number above 12
            # can only be a day. Everything else is read as US MM/DD/YYYY.
            if '.' in match.group(0) or first > 12:
                return datetime(year, second, first)
            return datetime(year, first, second)

        if format_type == 'ymd':
            year, month, day = (int(g) for g in match.groups())
            return datetime(year, month, day)

        if format_type in ('mdy_name', 'mdy_abbr'):
            month = cls._month_number(match.group(1))
            day, year = int(match.group(2)), int(match.group(3))
            return datetime(year, month, day) if month else None

        if format_type in ('my_name', 'my_abbr'):
            # No day given - use first day of month
            month = cls._month_number(match.group(1))
            year = int(match.group(2))
            return datetime(year, month, 1) if month else None

        if format_type == 'dmy_abbr':
            day, year = int(match.group(1)), int(match.group(3))
            month = cls._month_number(match.group(2))
            return datetime(year, month, day) if month else None

        return None

    @classmethod
    def extract_date_from_text(cls, text: str, fallback_date: Optional[datetime] = None, use_omdb: bool = True) -> Optional[datetime]:
        """
        Extract date from text using multiple format patterns

        Args:
            text: Text to search for dates (e.g., post title, caption)
            fallback_date: Date to use if no date found in text
            use_omdb: Whether to try OMDB lookup for TV shows

        Returns:
            Extracted datetime or fallback_date if no date found
        """
        if not text:
            return fallback_date

        # First, try TV show lookup if enabled
        if use_omdb:
            tv_info = cls.extract_tv_info(text)
            if tv_info:
                show_name, season, episode = tv_info
                tv_date = cls.lookup_tv_episode_date(show_name, season, episode)
                if tv_date:
                    logger.info(f"Found TV episode date via OMDB: {show_name} S{season:02d}E{episode:02d} -> {tv_date}")
                    return tv_date

        # Try standard date patterns
        for pattern, format_type in cls.DATE_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            try:
                parsed = cls._date_from_match(match, format_type)
            except (ValueError, IndexError) as e:
                # Digits matched but do not form a real date; try the next pattern
                logger.debug(f"Failed to parse date from pattern {pattern}: {e}")
                continue
            if parsed is not None:
                return parsed

        # Don't use year-only as fallback - it's too unreliable
        # Examples: "Moments of 2025" shouldn't default to Jan 1, 2025
        # Instead, use the actual post date from the forum
        return fallback_date

    @classmethod
    def update_file_timestamps(cls, filepath: Union[str, Path], date: datetime) -> bool:
        """
        Update all timestamps for a file: filesystem, creation time, and EXIF data

        Args:
            filepath: Path to the file to update
            date: DateTime to set

        Returns:
            True if the filesystem timestamps were set, False if the file is
            missing, no date was given, or os.utime failed. Metadata and
            creation-time failures are logged but do not affect the result.
        """
        filepath = Path(filepath)
        if not filepath.exists():
            logger.error(f"File not found: {filepath}")
            return False

        if not date:
            logger.warning(f"No date provided for {filepath}")
            return False

        success = True
        suffix = filepath.suffix.lower()

        # 1. Update EXIF data for images FIRST (this modifies the file)
        if suffix in ('.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif'):
            try:
                cls._update_exif_data(filepath, date)
            except Exception as e:
                # Don't mark as failure since not all images support EXIF
                logger.debug(f"Failed to update EXIF data: {e}")

        # 2. Update video metadata SECOND (this also modifies the file)
        if suffix in ('.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v'):
            try:
                cls._update_video_metadata(filepath, date)
            except Exception as e:
                # Don't mark as failure since this requires ffmpeg
                logger.debug(f"Failed to update video metadata: {e}")

        # 3. Update creation time (platform-specific)
        try:
            system = platform.system()
            if system == 'Darwin':  # macOS
                cls._update_macos_creation_time(filepath, date)
            elif system == 'Windows':
                cls._update_windows_creation_time(filepath, date)
            # Linux doesn't have a reliable way to set creation time
        except Exception as e:
            # Don't mark as failure since this is platform-specific
            logger.debug(f"Failed to update creation time: {e}")

        # 4. Update filesystem timestamps LAST (mtime and atime)
        # This must be last because EXIF/video updates modify the file and change mtime
        try:
            timestamp = date.timestamp()
            os.utime(filepath, (timestamp, timestamp))
            logger.debug(f"Updated filesystem timestamps for {filepath}")
        except Exception as e:
            logger.error(f"Failed to update filesystem timestamps: {e}")
            success = False

        return success

    @classmethod
    def _update_macos_creation_time(cls, filepath: Path, date: datetime):
        """Update creation time on macOS using SetFile"""
        date_str = date.strftime("%m/%d/%Y %H:%M:%S")
        try:
            result = subprocess.run(
                ['SetFile', '-d', date_str, str(filepath)],
                capture_output=True,
                text=True,
                check=False
            )
            if result.returncode == 0:
                logger.debug(f"Updated macOS creation time for {filepath}")
            else:
                logger.debug(f"SetFile failed: {result.stderr}")
        except FileNotFoundError:
            logger.debug("SetFile not found (Xcode Command Line Tools not installed)")

    @classmethod
    def _update_windows_creation_time(cls, filepath: Path, date: datetime):
        """Update creation time on Windows using PowerShell"""
        date_str = date.strftime("%Y-%m-%d %H:%M:%S")
        # The path and date travel through environment variables instead of
        # being interpolated into the script text: downloaded file names may
        # contain quotes, `$` or backticks that PowerShell would evaluate.
        # -LiteralPath also stops `[` and `]` being treated as wildcards.
        ps_command = (
            '(Get-Item -LiteralPath $env:MD_TIMESTAMP_PATH).CreationTime = '
            '[datetime]$env:MD_TIMESTAMP_VALUE'
        )
        env = dict(os.environ,
                   MD_TIMESTAMP_PATH=str(filepath),
                   MD_TIMESTAMP_VALUE=date_str)
        try:
            result = subprocess.run(
                ['powershell', '-Command', ps_command],
                capture_output=True,
                text=True,
                check=False,
                env=env
            )
            if result.returncode == 0:
                logger.debug(f"Updated Windows creation time for {filepath}")
            else:
                logger.debug(f"PowerShell failed: {result.stderr}")
        except FileNotFoundError:
            logger.debug("PowerShell not available")

    @classmethod
    def _update_exif_data(cls, filepath: Path, date: datetime):
        """Update EXIF metadata using exiftool

        Sets all date fields comprehensively to ensure consistent timestamps
        across all metadata readers (including Immich):
        - AllDates (DateTimeOriginal, CreateDate, ModifyDate)
        - MetadataDate (used by some photo managers)
        - FileModifyDate (filesystem modification time)
        - Clears HistoryWhen to avoid conflicting timestamps
        """
        date_str = date.strftime("%Y:%m:%d %H:%M:%S")
        try:
            result = subprocess.run([
                'exiftool',
                '-overwrite_original',
                f'-AllDates={date_str}',
                f'-MetadataDate={date_str}',
                '-HistoryWhen=',
                f'-FileModifyDate={date_str}',
                str(filepath)
            ], capture_output=True, text=True, check=False)

            if result.returncode == 0:
                logger.debug(f"Updated EXIF data for {filepath}")
            else:
                logger.debug(f"exiftool failed: {result.stderr}")
        except FileNotFoundError:
            logger.debug("exiftool not found")

    @classmethod
    def _update_video_metadata(cls, filepath: Path, date: datetime):
        """Update video metadata using ffmpeg

        ffmpeg cannot edit in place, so the stream is copied (no re-encode)
        into a sibling temp file which then replaces the original.
        """
        date_str = date.strftime("%Y-%m-%d %H:%M:%S")
        temp_file = filepath.with_suffix('.tmp' + filepath.suffix)

        try:
            result = subprocess.run([
                'ffmpeg', '-i', str(filepath),
                '-c', 'copy',
                '-metadata', f'creation_time={date_str}',
                '-y', str(temp_file)
            ], capture_output=True, text=True, check=False)

            if result.returncode == 0 and temp_file.exists():
                # Replace original with updated file
                temp_file.replace(filepath)
                logger.debug(f"Updated video metadata for {filepath}")
            else:
                if temp_file.exists():
                    temp_file.unlink()
                logger.debug(f"ffmpeg failed: {result.stderr}")
        except FileNotFoundError:
            logger.debug("ffmpeg not found")
        except Exception as e:
            # Never leave a half-written temp file next to the original
            if temp_file.exists():
                temp_file.unlink()
            logger.debug(f"Video metadata update failed: {e}")
|
||||
|
||||
|
||||
# Convenience functions for direct use
|
||||
def extract_date(text: str, fallback: Optional[datetime] = None) -> Optional[datetime]:
    """Module-level shortcut for ``DateHandler.extract_date_from_text``.

    ``fallback`` is handed through as the date to return when ``text``
    yields nothing.
    """
    found = DateHandler.extract_date_from_text(text, fallback)
    return found
|
||||
|
||||
|
||||
def update_timestamps(filepath: Union[str, Path], date: datetime) -> bool:
    """Module-level shortcut for ``DateHandler.update_file_timestamps``.

    Returns whatever the handler reports for ``filepath`` and ``date``.
    """
    outcome = DateHandler.update_file_timestamps(filepath, date)
    return outcome
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: print what the extractor finds in a few sample titles
    sample_titles = (
        "Eva Longoria - 15.08.2025 Event Photos",
        "Photos from 08/15/2025",
        "August 15, 2025 - Red Carpet",
        "15 Aug 2025 Photoshoot",
        "Event 2025-08-15",
    )

    print("Date extraction tests:")
    for title in sample_titles:
        print(f" '{title}' -> {extract_date(title)}")

    # Only touch timestamps when a sample image sits in the working directory
    sample_image = Path("test_image.jpg")
    if sample_image.exists() and update_timestamps(sample_image, datetime(2025, 8, 15, 18, 30, 0)):
        print(f"\nSuccessfully updated timestamps for {sample_image}")
|
||||
27
modules/db_bootstrap.py
Normal file
27
modules/db_bootstrap.py
Normal file
@@ -0,0 +1,27 @@
|
||||
"""
|
||||
Database Backend Bootstrap
|
||||
|
||||
Import this module before any other imports that use sqlite3.
|
||||
When DATABASE_BACKEND=postgresql, it monkey-patches sys.modules['sqlite3']
|
||||
with pg_adapter so every subsequent `import sqlite3` gets the PostgreSQL adapter.
|
||||
|
||||
Default is 'sqlite' (no change — original behavior preserved).
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Load .env BEFORE checking DATABASE_BACKEND — systemd services don't set
|
||||
# this env var, so .env is the primary source of truth.
|
||||
# Populate the environment from the project-root .env before reading
# DATABASE_BACKEND. python-dotenv is optional; without it only variables
# already present in the process environment are consulted.
try:
    from dotenv import load_dotenv
except ImportError:
    pass  # rely on system env vars
else:
    _env_path = Path(__file__).resolve().parents[1] / '.env'
    if _env_path.exists():
        load_dotenv(_env_path)

_backend = os.getenv('DATABASE_BACKEND', 'sqlite').lower()
if _backend == 'postgresql':
    import sys
    from modules import pg_adapter

    # Every later `import sqlite3` now resolves to the PostgreSQL adapter
    sys.modules['sqlite3'] = pg_adapter
|
||||
634
modules/dependency_updater.py
Normal file
634
modules/dependency_updater.py
Normal file
@@ -0,0 +1,634 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dependency Updater - Automatically updates critical dependencies
|
||||
Only runs in scheduler mode, once per day
|
||||
|
||||
Version Compatibility:
|
||||
- bcrypt <5.0 required for passlib 1.7.4 compatibility
|
||||
- passlib 1.7.4 requires bcrypt 4.x (not 5.x)
|
||||
- uvicorn <0.35.0 required (0.40.0+ has breaking loop_factory changes)
|
||||
- Pinned packages are skipped during auto-updates to prevent incompatibilities
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
|
||||
class DependencyUpdater:
|
||||
"""Manages automatic updates for critical dependencies"""
|
||||
|
||||
def __init__(self,
             state_file: str = "/opt/media-downloader/database/dependency_updates.json",
             config: dict = None,
             pushover_notifier = None,
             scheduler_mode: bool = False):
    """
    Initialize dependency updater

    Args:
        state_file: Path to JSON file storing update state
        config: Configuration dict from settings.json; merged recursively
            over the built-in defaults
        pushover_notifier: Instance of PushoverNotifier for alerts
        scheduler_mode: Only run updates when True (scheduler mode)
    """
    # Setup logging FIRST: _load_state() below reports a corrupt state file
    # through self.logger, so the logger must exist before it is called.
    self.logger = get_logger('DependencyUpdater')

    self.state_file = Path(state_file)
    self.state_file.parent.mkdir(parents=True, exist_ok=True)
    self.pushover = pushover_notifier
    self.scheduler_mode = scheduler_mode

    # Derive venv paths from module location (more portable than hardcoded path)
    import sys
    self._base_dir = Path(__file__).parent.parent
    self._venv_pip = self._base_dir / 'venv' / 'bin' / 'pip'
    self._venv_python = self._base_dir / 'venv' / 'bin' / 'python'
    # Fallback to sys.executable's directory if venv not found
    if not self._venv_pip.exists():
        self._venv_pip = Path(sys.executable).parent / 'pip'
    if not self._venv_python.exists():
        self._venv_python = Path(sys.executable)

    # Default configuration
    self.config = {
        'enabled': True,
        'check_interval_hours': 24,
        'auto_install': True,
        'components': {
            'flaresolverr': {
                'enabled': True,
                'notify_on_update': True
            },
            'playwright': {
                'enabled': True,
                'notify_on_update': False
            },
            'yt_dlp': {
                'enabled': True,
                'notify_on_update': False
            },
            'python_packages': {
                'enabled': True,
                'notify_on_update': True,
                'packages': [
                    # Core API framework
                    'fastapi',
                    'uvicorn',
                    'pydantic',
                    'python-jose',
                    'passlib',
                    'slowapi',
                    'starlette',
                    'python-multipart',
                    'websockets',
                    # Security & Auth
                    'bcrypt',
                    'cryptography',
                    'certifi',
                    '2captcha-python',
                    'duo-universal',
                    # Image processing
                    'pillow',
                    'numpy',
                    # Face recognition
                    'insightface',
                    'onnxruntime',
                    'deepface',
                    'tensorflow',
                    'face-recognition',
                    'dlib',
                    # Web scraping & downloads
                    'requests',
                    'beautifulsoup4',
                    'selenium',
                    'playwright',
                    'playwright-stealth',
                    'instaloader',
                    'yt-dlp',
                    'curl-cffi',
                    'gallery-dl',
                    # Database
                    'psycopg2-binary',
                    # Utilities
                    'python-dotenv',
                    'python-dateutil',
                    'pyotp',
                    'click',
                    'attrs',
                    'charset-normalizer',
                    'idna',
                    'websocket-client',
                    'trio',
                    'typing_extensions'
                ]
            }
        },
        'pushover': {
            'enabled': True,
            'priority': -1,
            'sound': 'magic'
        }
    }

    # Merge user config
    if config:
        self._deep_update(self.config, config)

    # Load or initialize state
    self.state = self._load_state()

    # Known version incompatibilities and constraints
    # Format: package_name -> {'constraint': pip version specifier,
    #                          'reason': why the constraint exists,
    #                          plus optional 'incompatible_with' / 'requires' /
    #                          'known_working' details}
    self.version_constraints = {
        'bcrypt': {
            'constraint': '<5.0',
            'reason': 'bcrypt 5.x is incompatible with passlib 1.7.4',
            'incompatible_with': ['passlib>=1.7.4,<2.0']
        },
        'passlib': {
            'constraint': '>=1.7.4,<2.0',
            'reason': 'passlib 1.7.4 requires bcrypt <5.0',
            'requires': ['bcrypt>=4.0.0,<5.0']
        },
        'uvicorn': {
            'constraint': '<0.35.0',
            'reason': 'uvicorn 0.40.0+ has breaking changes with loop_factory parameter that crashes on startup',
            'known_working': '0.34.0'
        }
    }

    # Packages that should not be auto-updated (name -> reason)
    self.pinned_packages = {
        'bcrypt': 'Version constrained for passlib compatibility',
        'passlib': 'Version constrained for bcrypt compatibility',
        'uvicorn': 'Version 0.40.0+ has breaking changes with loop_factory parameter'
    }
|
||||
|
||||
def _deep_update(self, base: dict, update: dict):
|
||||
"""Deep update dict (recursive merge)"""
|
||||
for key, value in update.items():
|
||||
if isinstance(value, dict) and key in base and isinstance(base[key], dict):
|
||||
self._deep_update(base[key], value)
|
||||
else:
|
||||
base[key] = value
|
||||
|
||||
def _load_state(self) -> Dict:
|
||||
"""Load update state from file"""
|
||||
if self.state_file.exists():
|
||||
try:
|
||||
with open(self.state_file, 'r') as f:
|
||||
return json.load(f)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to load update state: {e}")
|
||||
|
||||
# Initialize empty state
|
||||
return {
|
||||
'last_check': None,
|
||||
'components': {}
|
||||
}
|
||||
|
||||
def _save_state(self):
|
||||
"""Save update state to file"""
|
||||
try:
|
||||
with open(self.state_file, 'w') as f:
|
||||
json.dump(self.state, f, indent=2, default=str)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to save update state: {e}")
|
||||
|
||||
def _should_check_updates(self, force: bool = False) -> bool:
|
||||
"""Check if enough time has passed since last update check
|
||||
|
||||
Args:
|
||||
force: If True, bypass all checks and return True
|
||||
|
||||
Returns:
|
||||
True if updates should be checked, False otherwise
|
||||
"""
|
||||
if force:
|
||||
return True
|
||||
|
||||
if not self.config.get('enabled', True):
|
||||
return False
|
||||
|
||||
# Allow manual checks even outside scheduler mode
|
||||
if not self.scheduler_mode:
|
||||
# In non-scheduler mode, only proceed if explicitly called
|
||||
# This allows manual force_update_check() to work
|
||||
return False
|
||||
|
||||
last_check = self.state.get('last_check')
|
||||
if not last_check:
|
||||
return True
|
||||
|
||||
try:
|
||||
last_check_time = datetime.fromisoformat(last_check)
|
||||
interval_hours = self.config.get('check_interval_hours', 24)
|
||||
return datetime.now() - last_check_time > timedelta(hours=interval_hours)
|
||||
except Exception:
|
||||
return True
|
||||
|
||||
def check_and_update_all(self, force: bool = False) -> Dict[str, bool]:
|
||||
"""
|
||||
Check and update all enabled components
|
||||
|
||||
Args:
|
||||
force: If True, bypass interval checks and update immediately
|
||||
|
||||
Returns:
|
||||
Dict mapping component name to update success status
|
||||
"""
|
||||
if not self._should_check_updates(force=force):
|
||||
return {}
|
||||
|
||||
# Check if auto_install is enabled (default: True)
|
||||
auto_install = self.config.get('auto_install', True)
|
||||
|
||||
if auto_install:
|
||||
self.logger.info("Checking for dependency updates...")
|
||||
else:
|
||||
self.logger.info("Checking for dependency updates (auto_install disabled - check only)...")
|
||||
return {} # Skip updates if auto_install is disabled
|
||||
|
||||
results = {}
|
||||
|
||||
# Update last check timestamp
|
||||
self.state['last_check'] = datetime.now().isoformat()
|
||||
self._save_state()
|
||||
|
||||
# Check each component
|
||||
components = self.config.get('components', {})
|
||||
|
||||
if components.get('flaresolverr', {}).get('enabled', True):
|
||||
results['flaresolverr'] = self._update_flaresolverr()
|
||||
|
||||
if components.get('playwright', {}).get('enabled', True):
|
||||
results['playwright'] = self._update_playwright()
|
||||
|
||||
if components.get('yt_dlp', {}).get('enabled', True):
|
||||
results['yt_dlp'] = self._update_yt_dlp()
|
||||
|
||||
if components.get('python_packages', {}).get('enabled', True):
|
||||
results['python_packages'] = self._update_python_packages()
|
||||
|
||||
# Send summary notification if any updates installed
|
||||
if any(results.values()) and self.pushover:
|
||||
self._send_update_notification(results)
|
||||
|
||||
return results
|
||||
|
||||
def _update_flaresolverr(self) -> bool:
    """
    Update FlareSolverr Docker container

    Pulls the ``latest`` image and, only when Docker reports that a newer
    image was downloaded, recreates the ``flaresolverr`` container (if one
    exists) so it runs the new image.

    Returns:
        True if update was installed, False otherwise
    """
    image = 'ghcr.io/flaresolverr/flaresolverr:latest'
    try:
        self.logger.info("Checking FlareSolverr for updates...")

        # Pull latest image
        result = subprocess.run(
            ['docker', 'pull', image],
            capture_output=True,
            text=True,
            timeout=300
        )

        if result.returncode != 0:
            self.logger.error(f"Failed to pull FlareSolverr image: {result.stderr}")
            return False

        # `docker pull` prints "Pulling from <repo>" on EVERY run, so that text
        # says nothing about whether anything changed. Only the final status
        # line distinguishes "Downloaded newer image" from "Image is up to date".
        output = result.stdout + result.stderr
        updated = "Downloaded newer image" in output

        if not updated:
            self.logger.info("FlareSolverr is already up to date")
            self._update_component_state('flaresolverr', False)
            return False

        # Image was updated - recreate the container if one exists
        self.logger.info("FlareSolverr image updated, restarting container...")

        # Check if container exists (-a includes stopped containers)
        check_result = subprocess.run(
            ['docker', 'ps', '-a', '--filter', 'name=flaresolverr', '--format', '{{.Names}}'],
            capture_output=True,
            text=True
        )

        if 'flaresolverr' in check_result.stdout:
            # Stop and remove old container
            subprocess.run(['docker', 'stop', 'flaresolverr'], capture_output=True)
            subprocess.run(['docker', 'rm', 'flaresolverr'], capture_output=True)

            # Start new container with latest image
            run_result = subprocess.run([
                'docker', 'run', '-d',
                '--name', 'flaresolverr',
                '-p', '8191:8191',
                '-e', 'LOG_LEVEL=info',
                '--restart', 'unless-stopped',
                image
            ], capture_output=True, text=True)

            if run_result.returncode == 0:
                self.logger.info("✓ FlareSolverr updated and restarted successfully")
            else:
                # The new image is installed, but the service is down: say so
                # instead of claiming a successful restart.
                self.logger.error(f"FlareSolverr image updated but container failed to start: {run_result.stderr}")
        else:
            self.logger.info("✓ FlareSolverr image updated (container not running)")

        self._update_component_state('flaresolverr', True)
        return True

    except subprocess.TimeoutExpired:
        self.logger.error("FlareSolverr update timed out")
        return False
    except Exception as e:
        self.logger.error(f"FlareSolverr update error: {e}")
        return False
|
||||
|
||||
def _update_playwright(self) -> bool:
    """
    Update Playwright browsers (Chromium and Firefox)

    Both installs are always attempted, using the venv interpreter with
    the project base directory as working directory.

    Returns:
        True if update was installed, False otherwise
    """
    try:
        self.logger.info("Checking Playwright browsers for updates...")

        # Use venv python for playwright commands
        interpreter = str(self._venv_python)

        # Chromium first, then Firefox
        runs = [
            subprocess.run(
                [interpreter, '-m', 'playwright', 'install', browser],
                capture_output=True,
                text=True,
                timeout=600,
                cwd=str(self._base_dir)
            )
            for browser in ('chromium', 'firefox')
        ]

        if any(run.returncode != 0 for run in runs):
            self.logger.error("Failed to update Playwright browsers")
            return False

        # The installer mentions downloading/installing only when it did work
        combined = ''.join(run.stdout for run in runs)
        if "Downloading" in combined or "Installing" in combined:
            self.logger.info("✓ Playwright browsers updated successfully")
            self._update_component_state('playwright', True)
            return True

        self.logger.info("Playwright browsers already up to date")
        self._update_component_state('playwright', False)
        return False

    except subprocess.TimeoutExpired:
        self.logger.error("Playwright update timed out")
        return False
    except Exception as e:
        self.logger.error(f"Playwright update error: {e}")
        return False
|
||||
|
||||
def _update_yt_dlp(self) -> bool:
    """
    Upgrade yt-dlp through the virtualenv's pip (critical for TikTok downloads).

    Returns:
        True if pip reported a fresh install, False if yt-dlp was already
        current or the upgrade failed, timed out or raised.
    """
    try:
        self.logger.info("Checking yt-dlp for updates...")

        # pip from the virtualenv, so the upgrade lands in the right environment
        pip_executable = str(self._venv_pip)

        pip_run = subprocess.run(
            [pip_executable, 'install', '--upgrade', 'yt-dlp'],
            capture_output=True,
            text=True,
            timeout=120
        )

        if pip_run.returncode != 0:
            self.logger.error(f"Failed to update yt-dlp: {pip_run.stderr}")
            return False

        # pip's combined output decides whether anything was installed
        pip_output = pip_run.stdout + pip_run.stderr
        if "Successfully installed" in pip_output and "yt-dlp" in pip_output:
            self.logger.info("✓ yt-dlp updated successfully")
            self._update_component_state('yt_dlp', True)
            return True

        self.logger.info("yt-dlp already up to date")
        self._update_component_state('yt_dlp', False)
        return False

    except subprocess.TimeoutExpired:
        self.logger.error("yt-dlp update timed out")
        return False
    except Exception as e:
        self.logger.error(f"yt-dlp update error: {e}")
        return False
|
||||
|
||||
def _update_python_packages(self) -> bool:
    """
    Upgrade the configured Python packages (FastAPI, Uvicorn, Pydantic, etc.).

    The package list is read from
    ``config['components']['python_packages']['packages']``. Packages listed
    in ``self.pinned_packages`` are skipped. Packages with a non-empty
    constraint in ``self.version_constraints`` are installed as
    ``<package><constraint>`` without ``--upgrade``; all others are upgraded
    normally. A failure or timeout for one package is logged and does not
    stop the remaining packages.

    Returns:
        True if any updates were installed, False otherwise
    """
    try:
        self.logger.info("Checking Python packages for updates...")

        package_cfg = self.config.get('components', {}).get('python_packages', {})
        packages = package_cfg.get('packages', [])
        if not packages:
            self.logger.info("No Python packages configured for updates")
            return False

        # pip from the virtualenv, so upgrades land in the right environment
        venv_pip = str(self._venv_pip)
        updated_packages = []

        for package in packages:
            try:
                # Pinned packages must never be auto-updated
                if package in self.pinned_packages:
                    self.logger.info(f"⚠ Skipping {package}: {self.pinned_packages[package]}")
                    continue

                # Default: plain upgrade; replaced below when a constraint applies
                install_args = ['--upgrade', package]
                if package in self.version_constraints:
                    constraint_info = self.version_constraints[package]
                    constraint = constraint_info.get('constraint', '')
                    reason = constraint_info.get('reason', 'Version constraint')
                    if constraint:
                        # Install with constraint instead of --upgrade
                        install_args = [f"{package}{constraint}"]
                        self.logger.info(f"📌 {package}: Applying constraint {constraint} ({reason})")

                result = subprocess.run(
                    [venv_pip, 'install', *install_args],
                    capture_output=True,
                    text=True,
                    timeout=120
                )

                if result.returncode != 0:
                    self.logger.warning(f"Failed to update {package}: {result.stderr}")
                    continue

                # pip's combined output decides whether the package changed
                output = result.stdout + result.stderr
                if "Successfully installed" in output and package in output:
                    updated_packages.append(package)
                    self.logger.info(f"✓ {package} updated")
                elif "Requirement already satisfied" in output:
                    self.logger.debug(f"  {package} already up to date")
                else:
                    self.logger.debug(f"  {package} checked")

            except subprocess.TimeoutExpired:
                self.logger.warning(f"{package} update timed out")
            except Exception as e:
                self.logger.warning(f"Error updating {package}: {e}")

        if not updated_packages:
            self.logger.info("All Python packages already up to date")
            self._update_component_state('python_packages', False)
            return False

        self.logger.info(f"✓ Updated {len(updated_packages)} Python package(s): {', '.join(updated_packages)}")
        self._update_component_state('python_packages', True)

        # Store list of updated packages in state
        components = self.state.setdefault('components', {})
        components.setdefault('python_packages', {})['updated_packages'] = updated_packages
        self._save_state()

        return True

    except Exception as e:
        self.logger.error(f"Python packages update error: {e}")
        return False
|
||||
|
||||
def _update_component_state(self, component: str, updated: bool):
    """
    Record the outcome of an update check for one component and persist it.

    Args:
        component: Key of the component inside ``self.state['components']``.
        updated: True if an update was installed during this check.

    The entry always receives a fresh ``last_check`` and a ``status`` of
    ``'updated'`` or ``'current'``. ``last_update`` is only advanced when
    ``updated`` is true; otherwise the previous value is kept (``None`` if
    the component has never been updated).
    """
    # One timestamp for the whole check, so last_update and last_check of
    # the same run are identical instead of microseconds apart.
    now = datetime.now().isoformat()

    entry = self.state.setdefault('components', {}).setdefault(component, {})
    entry['last_update'] = now if updated else entry.get('last_update')
    entry['last_check'] = now
    entry['status'] = 'updated' if updated else 'current'

    self._save_state()
|
||||
|
||||
def _send_update_notification(self, results: Dict[str, bool]):
    """
    Send a Pushover notification listing the components that were updated.

    Args:
        results: Mapping of component key to whether it was updated.

    Nothing is sent when Pushover is disabled in the config, when no
    component was updated, or when every updated component has
    ``notify_on_update`` set to a false value. Send failures are logged,
    not raised.
    """
    pushover_cfg = self.config.get('pushover', {})
    if not pushover_cfg.get('enabled', True):
        return

    # Updated components that have not opted out of notifications
    notify_components = [
        name
        for name, was_updated in results.items()
        if was_updated
        and self.config.get('components', {}).get(name, {}).get('notify_on_update', True)
    ]
    if not notify_components:
        return

    # Human-readable labels; unknown keys are shown as-is
    display_names = {
        'flaresolverr': 'FlareSolverr',
        'playwright': 'Playwright Browsers',
        'yt_dlp': 'yt-dlp',
        'python_packages': 'Python Packages'
    }
    formatted_names = [display_names.get(key, key) for key in notify_components]

    title = "🔄 Dependencies Updated"
    if len(formatted_names) == 1:
        message = f"{formatted_names[0]} has been updated to the latest version."
    else:
        bullet_lines = "".join(f"• {name}\n" for name in formatted_names)
        message = "The following components have been updated:\n\n" + bullet_lines

    message += f"\nUpdated at: {datetime.now().strftime('%b %d, %I:%M %p')}"

    try:
        self.pushover.send_notification(
            title=title,
            message=message,
            priority=pushover_cfg.get('priority', -1),
            sound=pushover_cfg.get('sound', 'magic')
        )
        self.logger.info(f"Sent update notification for: {', '.join(formatted_names)}")
    except Exception as e:
        self.logger.error(f"Failed to send update notification: {e}")
|
||||
|
||||
def get_update_status(self) -> Dict:
    """
    Get current update status for all components.

    Returns:
        An independent deep copy of ``self.state``. A shallow copy would
        still share the nested per-component dicts, letting callers modify
        the manager's internal state by accident.
    """
    import copy

    return copy.deepcopy(self.state)
|
||||
|
||||
def force_update_check(self) -> Dict[str, bool]:
    """
    Run an update check right now, regardless of interval or scheduler mode.

    Returns:
        Whatever ``check_and_update_all(force=True)`` returns.
    """
    forced_results = self.check_and_update_all(force=True)
    return forced_results
|
||||
1051
modules/discovery_system.py
Normal file
1051
modules/discovery_system.py
Normal file
File diff suppressed because it is too large
Load Diff
940
modules/download_manager.py
Executable file
940
modules/download_manager.py
Executable file
@@ -0,0 +1,940 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Multi-threaded Download Manager
|
||||
Handles concurrent downloads with rate limiting, retries, and progress tracking
|
||||
Can be used by forum_downloader, fastdl_module, and other downloaders
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import hashlib
|
||||
import requests
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any, Callable
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from threading import Lock, Semaphore
|
||||
from dataclasses import dataclass
|
||||
import sqlite3
|
||||
from urllib.parse import urlparse
|
||||
from modules.base_module import LoggingMixin
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('DownloadManager') # For standalone/example usage
|
||||
|
||||
|
||||
@dataclass
class DownloadItem:
    """
    Single download item: one URL plus the path it should be saved to.

    Attributes:
        url: Address to fetch; may be rewritten to a direct image URL by the
            download worker.
        save_path: Destination file path.
        referer: Optional Referer header value for the request.
        headers: Optional extra request headers.
        metadata: Optional free-form data attached to the item.
        post_date: Timestamp to set on the downloaded file.
        retry_count: Number of retries already attempted.
        max_retries: Maximum number of retries allowed.
    """

    # Required fields
    url: str
    save_path: Path

    # Optional request details
    referer: Optional[str] = None
    headers: Optional[Dict[str, str]] = None
    metadata: Optional[Dict[str, Any]] = None
    post_date: Optional[datetime] = None  # Timestamp to set on downloaded file

    # Retry bookkeeping
    retry_count: int = 0
    max_retries: int = 3
|
||||
|
||||
|
||||
@dataclass
class DownloadResult:
    """
    Result of a download attempt for one DownloadItem.

    Attributes:
        success: True if the download succeeded.
        item: The item this result belongs to.
        file_size: Size of the saved file in bytes, when known.
        download_time: Elapsed seconds for the attempt, when measured.
        error: Error description for a failed attempt.
        file_hash: Hex digest of the downloaded content, when computed.
    """

    # Always present
    success: bool
    item: DownloadItem

    # Filled in on success
    file_size: Optional[int] = None
    download_time: Optional[float] = None

    # Filled in on failure
    error: Optional[str] = None

    # Content digest of the saved file
    file_hash: Optional[str] = None
|
||||
|
||||
|
||||
class DownloadManager(LoggingMixin):
|
||||
"""
|
||||
Multi-threaded download manager with:
|
||||
- Concurrent downloads
|
||||
- Rate limiting
|
||||
- Automatic retries
|
||||
- Progress tracking
|
||||
- Database tracking
|
||||
- Playwright support for authenticated downloads
|
||||
"""
|
||||
|
||||
def __init__(self,
             max_workers: int = 5,
             rate_limit: float = 0.5,
             timeout: int = 30,
             chunk_size: int = 8192,
             use_database: bool = False,
             db_path: Optional[str] = None,
             show_progress: bool = True,
             show_debug: bool = False):
    """
    Initialize download manager

    Args:
        max_workers: Maximum concurrent downloads
        rate_limit: Seconds between downloads per thread
        timeout: Download timeout in seconds
        chunk_size: Chunk size for streaming downloads
        use_database: Track downloads in database
        db_path: Path to database file
        show_progress: Show download progress
        show_debug: Show debug messages
    """
    self.max_workers = max_workers
    self.rate_limit = rate_limit
    self.timeout = timeout
    self.chunk_size = chunk_size
    self.use_database = use_database
    self.db_path = db_path
    self.show_progress = show_progress

    # Initialize logging via mixin
    self._init_logger('DownloadManager', None, default_module='Download', show_debug=show_debug)

    # Thread synchronization
    self.download_lock = Lock()
    self.rate_limiter = Semaphore(max_workers)
    self.last_download_time = {}

    # Thread-local storage for ImageBam sessions (each thread gets its own session)
    self._imagebam_session_local = threading.local()

    # Time of the last ImageTwist request. Created here so worker threads
    # never have to create it lazily through a non-atomic hasattr() check.
    self._imagetwist_last_request = 0

    # Statistics
    self.stats = {
        'total': 0,
        'successful': 0,
        'failed': 0,
        'skipped': 0,
        'total_bytes': 0,
        'total_time': 0
    }

    # User agent
    self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

    # Playwright context for authenticated downloads, and the cookies
    # extracted from it (empty until set_playwright_context() is called)
    self.playwright_context = None
    self.cookies = {}

    # Initialize database only if explicitly enabled AND path provided
    if self.use_database:
        if self.db_path:
            self._init_database()
        else:
            # Disable database if no path provided to prevent creating files in CWD
            self.use_database = False
|
||||
|
||||
def _init_database(self):
    """
    Create the download tracking table and its indexes if they are missing.

    Does nothing when no database path is configured. The connection is
    always closed, even if a statement fails.
    """
    if not self.db_path:
        return

    schema_statements = (
        '''
        CREATE TABLE IF NOT EXISTS downloads (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE NOT NULL,
            file_path TEXT NOT NULL,
            file_hash TEXT,
            file_size INTEGER,
            download_date DATETIME DEFAULT CURRENT_TIMESTAMP,
            metadata TEXT
        )
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_downloads_url ON downloads(url)
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_downloads_hash ON downloads(file_hash)
        ''',
    )

    connection = sqlite3.connect(self.db_path)
    try:
        for statement in schema_statements:
            connection.execute(statement)
        connection.commit()
    finally:
        connection.close()
|
||||
|
||||
def set_playwright_context(self, context):
    """
    Set Playwright context for authenticated downloads.

    Args:
        context: Object exposing ``cookies()`` that returns dicts with
            ``'name'`` and ``'value'`` keys, or a falsy value to clear the
            context.

    ``self.cookies`` is rebuilt from the context on every call so the
    requests-based download path can reuse the session. Clearing the
    context also clears the cookies, so cookies from a previous context are
    never sent afterwards. If the cookies cannot be read, they are left
    empty and a warning is logged.
    """
    self.playwright_context = context

    # Always start from a clean slate so stale cookies cannot survive
    self.cookies = {}
    if not context:
        return

    # Extract cookies from context for requests library
    try:
        self.cookies = {cookie['name']: cookie['value'] for cookie in context.cookies()}
    except Exception as e:
        self.cookies = {}
        self.log(f"Could not read cookies from Playwright context: {e}", "warning")
|
||||
|
||||
def _is_already_downloaded(self, url: str, file_path: Path) -> bool:
    """
    Check if file was already downloaded.

    Args:
        url: Source URL used as the lookup key in the tracking database.
        file_path: Intended save path; only consulted when database
            tracking is off.

    Returns:
        Without a database: True if ``file_path`` exists and is non-empty.
        With a database: True if ``url`` has a record whose stored path
        still exists with exactly the recorded size.
    """
    if not self.use_database:
        return file_path.exists() and file_path.stat().st_size > 0

    connection = sqlite3.connect(self.db_path)
    try:
        row = connection.execute(
            "SELECT file_path, file_size FROM downloads WHERE url = ?",
            (url,)
        ).fetchone()
    finally:
        connection.close()

    if not row:
        return False

    # The record only counts if the file is still there with the expected size
    recorded_path, recorded_size = row
    on_disk = Path(recorded_path)
    return on_disk.exists() and on_disk.stat().st_size == recorded_size
|
||||
|
||||
def _apply_rate_limit(self, thread_id: int):
    """
    Apply rate limiting per thread.

    Waits until at least ``self.rate_limit`` seconds have passed since the
    previous call for the same ``thread_id``, then records the current
    time for that id.

    Args:
        thread_id: Key identifying the worker in ``self.last_download_time``.

    The shared lock only guards access to the timestamp dict. The wait
    itself happens outside the lock, so one worker's delay does not stall
    every other worker.
    """
    with self.download_lock:
        previous = self.last_download_time.get(thread_id)

    if previous is not None:
        remaining = self.rate_limit - (time.time() - previous)
        if remaining > 0:
            time.sleep(remaining)

    with self.download_lock:
        self.last_download_time[thread_id] = time.time()
|
||||
|
||||
def _extract_pixhost_direct_url(self, show_url: str) -> Optional[str]:
    """
    Extract direct image URL from pixhost show URL.

    The image id and filename are taken from the ``/show/<id>/<filename>``
    URL, then candidate ``img<N>.pixhost.to`` hosts are probed with HEAD
    requests until one answers 200. A fixed list of common host numbers is
    tried first (2 second timeout each), then the remaining numbers from
    1 to 120 (1 second timeout each).

    Args:
        show_url: pixhost "show" page URL.

    Returns:
        The first direct URL that answered 200, or None if the URL has the
        wrong shape, no host matched, or an unexpected error occurred.
    """
    try:
        # Pattern to extract ID and filename from show URL
        parsed = re.match(
            r"https?://(?:www\.)?pixhost\.to/show/(\d+)/([^/]+)$",
            show_url,
            re.IGNORECASE,
        )
        if parsed is None:
            return None

        img_id, filename = parsed.group(1), parsed.group(2)

        # Probe plan: common hosts first, then everything else up to 120
        common_hosts = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100]
        probe_plan = [(host_num, 2) for host_num in common_hosts]
        probe_plan += [
            (host_num, 1) for host_num in range(1, 121) if host_num not in common_hosts
        ]

        for host_num, probe_timeout in probe_plan:
            candidate = f"https://img{host_num}.pixhost.to/images/{img_id}/{filename}"
            try:
                # Quick HEAD request to check if URL exists
                probe = requests.head(candidate, timeout=probe_timeout, allow_redirects=False)
            except requests.RequestException:
                continue
            if probe.status_code == 200:
                return candidate

        return None
    except Exception as e:
        self.log(f"Error extracting pixhost URL: {e}", "error")
        return None
|
||||
|
||||
def _extract_imagebam_direct_url(self, imagebam_url: str) -> Optional[str]:
    """
    Extract direct image URL from ImageBam page.

    Each thread keeps its own ``requests.Session`` (stored on
    ``self._imagebam_session_local``) with cookies preset to skip the
    interstitial page. If the interstitial is still served, the cookies are
    set again and the page is requested a second time. The HTML is then
    searched for the full-resolution ``_o`` image first, followed by two
    looser fallback patterns.

    Args:
        imagebam_url: ImageBam page URL.

    Returns:
        The first matching direct image URL, or None on a non-200 response,
        no match, timeout or any other error.
    """
    try:
        # Get or create thread-local ImageBam session (thread-safe)
        thread_store = self._imagebam_session_local
        session = getattr(thread_store, 'session', None)
        if session is None:
            session = requests.Session()
            session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            # Set cookies to bypass the interstitial ad page (both old and new cookies)
            for cookie_name in ('nsfw_inter', 'sfw_inter'):
                session.cookies.set(cookie_name, '1', domain='.imagebam.com')
            thread_store.session = session

        # First request sets up the session
        response = session.get(imagebam_url, timeout=5)
        if response.status_code != 200:
            self.log(f"ImageBam page returned {response.status_code}", "warning")
            return None

        # Interstitial page served anyway: re-assert the bypass cookies and retry
        got_interstitial = (
            'Continue to your image' in response.text or 'Please wait' in response.text
        )
        if got_interstitial:
            for cookie_name in ('sfw_inter', 'nsfw_inter'):
                session.cookies.set(cookie_name, '1', domain='.imagebam.com')
            response = session.get(imagebam_url, timeout=5)

        # Ordered searches: full resolution (_o suffix) first, then fallbacks
        searches = [
            (
                r'(https?://images\d*\.imagebam\.com/[a-f0-9/]+/[A-Z0-9]+_o\.\w+)',
                "Extracted ImageBam direct URL: ",
            ),
            (
                r'<img[^>]+src="(https?://images\d*\.imagebam\.com/[^"]+)"',
                "Extracted ImageBam direct URL (fallback): ",
            ),
            (
                r'"(https?://images\d*\.imagebam\.com/[^"]+\.(?:jpg|jpeg|png|gif))"',
                "Extracted ImageBam direct URL (fallback): ",
            ),
        ]

        for pattern, log_prefix in searches:
            hit = re.search(pattern, response.text, re.IGNORECASE)
            if hit:
                direct_url = hit.group(1)
                self.log(f"{log_prefix}{direct_url}", "debug")
                return direct_url

        self.log("No direct image URL found in ImageBam HTML", "warning")
        return None

    except requests.Timeout:
        self.log(f"ImageBam extraction timed out for {imagebam_url}", "warning")
        return None
    except Exception as e:
        self.log(f"Error extracting ImageBam URL: {e}", "error")
        return None
|
||||
|
||||
def _download_with_gallery_dl(self, item: DownloadItem) -> DownloadResult:
    """
    Download using gallery-dl for supported hosts (ImageTwist, etc.).

    Runs the ``gallery-dl`` command with the item's parent directory as
    destination and the item's file name as output name, adding a Referer
    header when the item has one. The run counts as successful when
    gallery-dl exits with 0 and the target file exists.

    Args:
        item: The download to perform.

    Returns:
        A successful DownloadResult with size, elapsed time and SHA256
        hash, or a failed one carrying gallery-dl's stderr, a timeout note
        or the exception text.
    """
    import subprocess
    started = time.time()

    try:
        target = item.save_path

        # Ensure parent directory exists
        target.parent.mkdir(parents=True, exist_ok=True)

        # Build gallery-dl command
        command = [
            "gallery-dl",
            "--dest", str(target.parent),
            "--filename", target.name,
            "--no-skip",
            "--no-part",
            "--quiet"
        ]
        if item.referer:
            command += ["--header", f"Referer: {item.referer}"]
        command.append(item.url)

        # Run gallery-dl with timeout
        completed = subprocess.run(command, capture_output=True, text=True, timeout=60)

        if completed.returncode != 0 or not target.exists():
            failure_text = completed.stderr or "Unknown error"
            return DownloadResult(
                success=False,
                item=item,
                error=f"gallery-dl failed: {failure_text}"
            )

        file_size = target.stat().st_size
        download_time = time.time() - started

        # Calculate hash (SHA256 for consistency with unified database)
        file_hash = hashlib.sha256(target.read_bytes()).hexdigest()

        # Set file timestamp if we have a date
        if item.post_date:
            try:
                post_epoch = item.post_date.timestamp()
                os.utime(target, (post_epoch, post_epoch))
            except Exception as e:
                self.log(f"Failed to set timestamp: {e}", "warning")

        self.log(f"Downloaded via gallery-dl: {target.name}", "success")
        return DownloadResult(
            success=True,
            item=item,
            file_size=file_size,
            download_time=download_time,
            file_hash=file_hash
        )

    except subprocess.TimeoutExpired:
        return DownloadResult(success=False, item=item, error="gallery-dl timed out")
    except Exception as e:
        return DownloadResult(success=False, item=item, error=str(e))
|
||||
|
||||
def _download_from_imagetwist(self, item: DownloadItem) -> DownloadResult:
    """
    Download image from ImageTwist using gallery-dl for URL resolution.

    ``gallery-dl -g`` resolves the page URL to the direct image URL, which
    is then fetched with requests using the page URL as Referer. If
    gallery-dl fails, prints nothing, or returns a URL without
    "imagetwist" in it, the manual page-parsing fallback is used instead.
    ImageTwist requests are spaced at least 2 seconds apart across all
    threads.

    Args:
        item: The download to perform.

    Returns:
        A successful DownloadResult with size, elapsed time and SHA256
        hash, or a failed one describing the error (placeholder image,
        HTML instead of an image, or any exception).
    """
    import subprocess
    start_time = time.time()

    # Rate limiting for ImageTwist (they return error images if too fast)
    if not hasattr(self, '_imagetwist_last_request'):
        self._imagetwist_last_request = 0

    def wait_for_turn():
        # Global throttle: the sleep is deliberately done while holding the
        # lock so that all threads together keep the 2 second spacing.
        with self.download_lock:
            elapsed = time.time() - self._imagetwist_last_request
            if elapsed < 2.0:  # Minimum 2 seconds between ImageTwist requests
                time.sleep(2.0 - elapsed)
            self._imagetwist_last_request = time.time()

    wait_for_turn()

    try:
        # Use gallery-dl to get the actual image URL
        result = subprocess.run(
            ['/opt/media-downloader/venv/bin/gallery-dl', '-g', item.url],
            capture_output=True, text=True, timeout=30
        )

        if result.returncode != 0 or not result.stdout.strip():
            # Fallback to manual parsing
            return self._download_from_imagetwist_fallback(item, start_time)

        img_url = result.stdout.strip().split('\n')[0]
        if not img_url or 'imagetwist' not in img_url:
            return self._download_from_imagetwist_fallback(item, start_time)

        # Rate limit again before actual download
        wait_for_turn()

        # Download the actual image - use imagetwist page as Referer
        item.save_path.parent.mkdir(parents=True, exist_ok=True)
        headers = {
            'User-Agent': self.user_agent,
            'Referer': item.url  # Use imagetwist page URL as Referer
        }

        # The context manager closes the streamed response on every exit
        # path, including the early returns below.
        with requests.get(img_url, headers=headers, timeout=30, stream=True) as img_response:
            img_response.raise_for_status()

            # Check for ImageTwist error placeholder (8346 bytes - rate limited or deleted)
            if img_response.headers.get('Content-Length', '') == '8346':
                self.log(f"ImageTwist rate limited or unavailable: {item.url}", "warning")
                return DownloadResult(success=False, item=item, error="ImageTwist error image (rate limited)")

            # Validate it's an image, not HTML (only the first chunk is inspected)
            chunks = []
            for chunk in img_response.iter_content(chunk_size=8192):
                if not chunks:
                    head = chunk[:100].lower()
                    if b'<html' in head or b'<!doctype' in head:
                        return DownloadResult(
                            success=False,
                            item=item,
                            error="Got HTML instead of image"
                        )
                chunks.append(chunk)

        # Save the image and hash it in the same pass
        # (SHA256 for consistency with unified database)
        digest = hashlib.sha256()
        with open(item.save_path, 'wb') as f:
            for chunk in chunks:
                f.write(chunk)
                digest.update(chunk)

        file_size = item.save_path.stat().st_size
        download_time = time.time() - start_time
        file_hash = digest.hexdigest()

        # Set file timestamp if we have a date; a failure here must not fail
        # the download, but it is logged like in the other download paths
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
            except Exception as e:
                self.log(f"Failed to set timestamp: {e}", "warning")

        self.log(f"Downloaded ImageTwist: {item.save_path.name}", "success")
        return DownloadResult(
            success=True,
            item=item,
            file_size=file_size,
            download_time=download_time,
            file_hash=file_hash
        )

    except Exception as e:
        return DownloadResult(
            success=False,
            item=item,
            error=f"ImageTwist download failed: {e}"
        )
|
||||
|
||||
def _download_from_imagetwist_fallback(self, item: DownloadItem, start_time: float) -> DownloadResult:
    """
    Fallback method using manual page parsing.

    Fetches the ImageTwist page, looks for an ``<img class="pic">`` tag and,
    failing that, a regex match for an ``i*.imagetwist.com/i/`` URL, then
    downloads that image.

    Args:
        item: The download to perform.
        start_time: ``time.time()`` value from when the caller started, so
            the reported download time covers the whole attempt.

    Returns:
        A successful DownloadResult with size, elapsed time and SHA256
        hash, or a failed one describing the error.
    """
    from bs4 import BeautifulSoup

    try:
        headers = {
            'User-Agent': self.user_agent,
            'Referer': item.referer or 'https://forum.phun.org/'
        }

        response = requests.get(item.url, headers=headers, timeout=30)
        response.raise_for_status()

        page_content = response.text
        img_url = None

        # Method 1: Look for pic class
        soup = BeautifulSoup(page_content, 'html.parser')
        pic_img = soup.find('img', class_='pic')
        if pic_img and pic_img.get('src'):
            img_url = pic_img['src']

        # Method 2: Regex for i*.imagetwist.com/i/ pattern
        if not img_url:
            match = re.search(r'(https?://i\d*(?:phun)?\.imagetwist\.com/i/[^"\'>\s]+)', page_content)
            if match:
                img_url = match.group(1)

        if not img_url:
            return DownloadResult(
                success=False,
                item=item,
                error="Could not find direct image URL on ImageTwist page"
            )

        # Download the actual image
        item.save_path.parent.mkdir(parents=True, exist_ok=True)

        # The context manager closes the streamed response on every exit
        # path, including the early return below.
        with requests.get(img_url, headers=headers, timeout=30, stream=True) as img_response:
            img_response.raise_for_status()

            # Validate it's an image, not HTML (same check as the primary path)
            chunks = []
            for chunk in img_response.iter_content(chunk_size=8192):
                if not chunks:
                    head = chunk[:100].lower()
                    if b'<html' in head or b'<!doctype' in head:
                        return DownloadResult(success=False, item=item, error="Got HTML instead of image")
                chunks.append(chunk)

        # Save the image and hash it in the same pass
        # (SHA256 for consistency with unified database)
        digest = hashlib.sha256()
        with open(item.save_path, 'wb') as f:
            for chunk in chunks:
                f.write(chunk)
                digest.update(chunk)

        file_size = item.save_path.stat().st_size
        download_time = time.time() - start_time
        file_hash = digest.hexdigest()

        # Set file timestamp if we have a date, as the primary path does
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
            except Exception as e:
                self.log(f"Failed to set timestamp: {e}", "warning")

        self.log(f"Downloaded ImageTwist (fallback): {item.save_path.name}", "success")
        return DownloadResult(success=True, item=item, file_size=file_size, download_time=download_time, file_hash=file_hash)

    except Exception as e:
        return DownloadResult(success=False, item=item, error=f"ImageTwist fallback failed: {e}")
|
||||
|
||||
def _download_with_playwright(self, item: DownloadItem) -> DownloadResult:
    """
    Download using Playwright for authenticated sessions.

    Opens a new page in ``self.playwright_context``, navigates to the item
    URL and saves the response body. Falls back to the requests-based
    download when no context has been set.

    Args:
        item: The download to perform. Its ``headers`` dict is not modified.

    Returns:
        A successful DownloadResult with size, elapsed time and SHA256
        hash, or a failed one (HTTP error, HTML instead of a file, or any
        exception).
    """
    if not self.playwright_context:
        return self._download_with_requests(item)

    start_time = time.time()

    try:
        page = self.playwright_context.new_page()
        try:
            # Set headers. Work on a copy so the Referer is not written
            # back into the caller's item.headers dict.
            headers = dict(item.headers or {})
            if item.referer:
                headers['Referer'] = item.referer
            if headers:
                page.set_extra_http_headers(headers)

            # Direct download (pixhost should already be processed)
            response = page.goto(item.url, wait_until='networkidle',
                                 timeout=self.timeout * 1000)

            if not (response and response.ok):
                return DownloadResult(
                    success=False,
                    item=item,
                    error=f"HTTP {response.status if response else 'No response'}"
                )

            content = response.body()

            # Check for HTML error pages
            head = content[:1000].lower()
            if b'<!doctype' in head or b'<html' in head:
                return DownloadResult(
                    success=False,
                    item=item,
                    error="Got HTML instead of expected file"
                )

            # Save file
            item.save_path.parent.mkdir(parents=True, exist_ok=True)
            item.save_path.write_bytes(content)

            # Calculate hash (SHA256 for consistency with unified database)
            file_hash = hashlib.sha256(content).hexdigest()

            # Update timestamps if we have a date
            if item.post_date:
                try:
                    timestamp_unix = item.post_date.timestamp()
                    os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                    self.log(f"Set timestamp to {item.post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
                except Exception as e:
                    self.log(f"Failed to set timestamp: {e}", "warning")

            download_time = time.time() - start_time

            return DownloadResult(
                success=True,
                item=item,
                file_size=len(content),
                download_time=download_time,
                file_hash=file_hash
            )

        finally:
            # The page is closed on every path, success or failure
            page.close()

    except Exception as e:
        return DownloadResult(
            success=False,
            item=item,
            error=str(e)
        )
|
||||
|
||||
def _download_with_requests(self, item: DownloadItem) -> DownloadResult:
    """
    Download using requests library.

    The body is streamed into memory and only written to disk after the
    first chunk has been checked for HTML markers, so error pages are never
    saved as files.

    Args:
        item: The download to perform. Its ``headers`` dict is not modified.

    Returns:
        A successful DownloadResult with size, elapsed time and SHA256
        hash, or a failed one. On an exception, any file at the save path
        is removed before the failure is returned.
    """
    start_time = time.time()

    try:
        # Work on a copy so User-Agent/Referer are not written back into
        # the caller's item.headers dict.
        headers = dict(item.headers or {})
        headers['User-Agent'] = self.user_agent
        if item.referer:
            headers['Referer'] = item.referer

        # Use cookies if available
        cookies = getattr(self, 'cookies', {})

        # The context manager closes the streamed response on every exit
        # path, including the early return below.
        with requests.get(
            item.url,
            headers=headers,
            cookies=cookies if cookies else None,
            timeout=self.timeout,
            stream=True
        ) as response:
            response.raise_for_status()

            # Stream download to memory first to validate content
            item.save_path.parent.mkdir(parents=True, exist_ok=True)
            chunks = []

            for chunk in response.iter_content(chunk_size=self.chunk_size):
                if not chunk:
                    continue
                # Check first chunk for HTML error pages
                if not chunks:
                    head = chunk[:100].lower()
                    if b'<html' in head or b'<!doctype' in head or b'<head>' in head:
                        return DownloadResult(
                            success=False,
                            item=item,
                            error="Got HTML instead of image"
                        )
                chunks.append(chunk)

        # Join once instead of growing a bytes object chunk by chunk
        content = b''.join(chunks)

        # Save to file only after validation
        with open(item.save_path, 'wb') as f:
            f.write(content)

        # Calculate hash (SHA256 for consistency with unified database)
        file_hash = hashlib.sha256(content).hexdigest()

        # Set file timestamp if we have a date
        if item.post_date:
            try:
                timestamp_unix = item.post_date.timestamp()
                os.utime(item.save_path, (timestamp_unix, timestamp_unix))
                self.log(f"Set timestamp to {item.post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
            except Exception as e:
                self.log(f"Failed to set timestamp: {e}", "warning")

        download_time = time.time() - start_time

        return DownloadResult(
            success=True,
            item=item,
            file_size=len(content),
            download_time=download_time,
            file_hash=file_hash
        )

    except Exception as e:
        # Clean up partial download
        if item.save_path.exists():
            item.save_path.unlink()

        return DownloadResult(
            success=False,
            item=item,
            error=str(e)
        )
|
||||
|
||||
def _download_worker(self, item: DownloadItem, thread_id: int) -> DownloadResult:
    """Download one item, resolving host pages, retrying and recording stats.

    Args:
        item: The download to perform; ``item.url`` may be rewritten to a
            direct image URL and ``item.retry_count`` is incremented on retry.
        thread_id: Identifier handed to the rate limiter.

    Returns:
        The DownloadResult of the final attempt.
    """
    page_url = item.url

    # Image-host landing pages must be turned into direct file URLs first.
    if 'pixhost.to/show/' in page_url:
        resolved = self._extract_pixhost_direct_url(page_url)
        if not resolved:
            self.log(f"Failed to extract pixhost direct URL: {item.url}", "warning")
        else:
            self.log(f"Converted pixhost URL to direct: {resolved.split('/')[-1]}", "debug")
            item.url = resolved
    elif 'imagebam.com' in page_url:
        resolved = self._extract_imagebam_direct_url(page_url)
        if not resolved:
            self.log(f"Failed to extract ImageBam direct URL: {item.url}", "warning")
        else:
            self.log(f"Converted ImageBam URL to direct: {resolved.split('/')[-1]}", "debug")
            item.url = resolved
    elif 'imagetwist.com' in page_url:
        # ImageTwist has its own download routine; a success is returned as-is,
        # a failure falls through to the generic path below.
        twist_result = self._download_from_imagetwist(item)
        if twist_result.success:
            return twist_result
        self.log(f"ImageTwist download failed: {item.url}", "warning")

    # Nothing to do when this URL/path pair is already known.
    if self._is_already_downloaded(item.url, item.save_path):
        self.log(f"Already downloaded: {item.save_path.name}", "skip")
        existing_size = item.save_path.stat().st_size if item.save_path.exists() else 0
        return DownloadResult(
            success=True,
            item=item,
            file_size=existing_size
        )

    self._apply_rate_limit(thread_id)

    # Direct image downloads always go through requests.
    outcome = self._download_with_requests(item)

    # Failed attempt with retries left: back off, then run the whole worker again.
    if not outcome.success and item.retry_count < item.max_retries:
        item.retry_count += 1
        self.log(f"Retrying {item.url} ({item.retry_count}/{item.max_retries})", "warning")
        time.sleep(self.rate_limit * 2)
        return self._download_worker(item, thread_id)

    if outcome.success and self.use_database:
        self._save_to_database(outcome)

    # Counters are shared between worker threads, so update them under the lock.
    with self.download_lock:
        if not outcome.success:
            self.stats['failed'] += 1
        else:
            self.stats['successful'] += 1
            if outcome.file_size:
                self.stats['total_bytes'] += outcome.file_size
            if outcome.download_time:
                self.stats['total_time'] += outcome.download_time

    return outcome
|
||||
|
||||
def _save_to_database(self, result: DownloadResult):
|
||||
"""Save successful download to database"""
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
|
||||
metadata_str = None
|
||||
if result.item.metadata:
|
||||
import json
|
||||
metadata_str = json.dumps(result.item.metadata)
|
||||
|
||||
cursor.execute('''
|
||||
INSERT OR REPLACE INTO downloads
|
||||
(url, file_path, file_hash, file_size, metadata)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
''', (
|
||||
result.item.url,
|
||||
str(result.item.save_path),
|
||||
result.file_hash,
|
||||
result.file_size,
|
||||
metadata_str
|
||||
))
|
||||
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
def download_batch(self, items: List[DownloadItem],
                   progress_callback: Optional[Callable] = None) -> List[DownloadResult]:
    """
    Download multiple items concurrently

    Args:
        items: List of DownloadItem objects
        progress_callback: Optional callback for progress updates, called as
            ``progress_callback(completed, total, result)`` after each item

    Returns:
        List of DownloadResult objects, in completion order (not input order)
    """
    total = len(items)
    self.stats['total'] = total
    results = []

    self.log(f"Starting batch download of {total} items with {self.max_workers} workers", "info")

    with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
        # Submit all downloads; thread ids cycle through the worker count
        futures = {
            executor.submit(self._download_worker, item, i % self.max_workers): item
            for i, item in enumerate(items)
        }

        # Process completed downloads
        for completed, future in enumerate(as_completed(futures), start=1):
            try:
                result = future.result()
            except Exception as e:
                # A crashing worker must not abort the batch and discard the
                # results already collected: record it as a failed download.
                failed_item = futures[future]
                self.log(f"Download worker crashed for {failed_item.url}: {e}", "error")
                result = DownloadResult(
                    success=False,
                    item=failed_item,
                    error=str(e)
                )
                with self.download_lock:
                    self.stats['failed'] += 1

            results.append(result)

            # Progress update
            if progress_callback:
                progress_callback(completed, total, result)

            if self.show_progress:
                pct = (completed / total) * 100
                status = "✓" if result.success else "✗"
                self.log(
                    f"[{completed}/{total}] {pct:.1f}% - {status} {result.item.save_path.name}",
                    "success" if result.success else "error"
                )

    # Summary
    self.log(f"Batch complete: {self.stats['successful']} successful, {self.stats['failed']} failed", "info")

    # Skip the speed line when no time was accumulated (avoids dividing by zero)
    if self.stats['successful'] > 0 and self.stats['total_time'] > 0:
        avg_speed = self.stats['total_bytes'] / self.stats['total_time'] / 1024 / 1024
        self.log(f"Average speed: {avg_speed:.2f} MB/s", "info")

    return results
|
||||
|
||||
def download_urls(self, urls: List[str], base_path: Path,
                  referer: Optional[str] = None,
                  metadata: Optional[Dict] = None) -> List[DownloadResult]:
    """
    Build download items for plain URLs and run them as one batch.

    Args:
        urls: URLs to fetch
        base_path: Directory the files are saved into
        referer: Referer value attached to every item (optional)
        metadata: Metadata object attached to every item (optional)

    Returns:
        The results returned by download_batch
    """
    def _target_name(link: str) -> str:
        # Prefer the last path segment; fall back to a short hash of the URL
        # when the path has no file name.
        tail = os.path.basename(urlparse(link).path)
        if tail:
            return tail
        return f"download_{hashlib.sha256(link.encode()).hexdigest()[:8]}"

    batch = [
        DownloadItem(
            url=link,
            save_path=base_path / _target_name(link),
            referer=referer,
            metadata=metadata
        )
        for link in urls
    ]
    return self.download_batch(batch)
|
||||
|
||||
def get_statistics(self) -> Dict:
    """Return a shallow copy of the current download counters."""
    snapshot = self.stats.copy()
    return snapshot
|
||||
|
||||
def cleanup_old_downloads(self, days: int = 30):
    """Delete ``downloads`` rows older than ``days`` days.

    Args:
        days: Age threshold in days.

    Returns:
        Number of rows deleted; 0 when the database is not in use.
    """
    if not self.use_database:
        return 0

    connection = sqlite3.connect(self.db_path)
    try:
        purge = connection.cursor()
        # The bound value is negated so SQLite sees e.g. '-30 days'.
        age_modifier = (-days,)
        purge.execute('''
            DELETE FROM downloads
            WHERE download_date < datetime('now', ? || ' days')
        ''', age_modifier)
        removed = purge.rowcount
        connection.commit()
    finally:
        connection.close()

    self.log(f"Cleaned up {removed} old download records", "info")
    return removed
|
||||
|
||||
|
||||
# Example usage
|
||||
if __name__ == "__main__":
    from pathlib import Path

    # Manual smoke test: fetch a few public sample files into /tmp.
    sample_urls = [
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
        "https://sample-videos.com/img/Sample-jpg-image-50kb.jpg",
        "https://www.w3schools.com/html/img_girl.jpg"
    ]

    manager = DownloadManager(
        max_workers=3,
        rate_limit=0.5,
        show_progress=True
    )

    results = manager.download_urls(sample_urls, Path("/tmp/test-downloads"))

    # Report the outcome
    succeeded = sum(1 for r in results if r.success)
    logger.info(f"Downloaded {succeeded} of {len(results)} files")
    logger.info(f"Total bytes: {manager.stats['total_bytes'] / 1024:.1f} KB")
    logger.info(f"Total time: {manager.stats['total_time']:.2f} seconds")
|
||||
375
modules/downloader_monitor.py
Normal file
375
modules/downloader_monitor.py
Normal file
@@ -0,0 +1,375 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Downloader Monitoring Module
|
||||
Tracks download success/failure and sends alerts when downloaders are consistently failing
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
|
||||
class DownloaderMonitor:
    """Monitor downloader health and send alerts on persistent failures.

    Each download attempt is written to the ``download_monitor`` table. After a
    failed attempt the recent history for that downloader/username pair is
    inspected and, when enough consecutive failures are found, a Pushover
    notification is sent (at most once per failure window).
    """

    def __init__(self, unified_db=None, settings_manager=None):
        """
        Initialize monitor

        Args:
            unified_db: UnifiedDatabase instance
            settings_manager: SettingsManager instance for config
        """
        self.db = unified_db
        self.settings_manager = settings_manager
        self.logger = get_logger('DownloaderMonitor')

        # Default config
        self.config = {
            'enabled': True,
            'failure_window_hours': 3,
            'min_consecutive_failures': 2,
            'pushover': {
                'enabled': True,
                'priority': 1  # High priority
            },
            'downloaders': {
                'fastdl': True,
                'imginn': True,
                'toolzu': True,
                'instagram': True,
                'snapchat': True,
                'tiktok': True,
                'forums': True
            }
        }

        # Load config from settings manager (top-level keys override defaults)
        if self.settings_manager:
            try:
                monitoring_config = self.settings_manager.get('monitoring', {})
                if monitoring_config:
                    self.config.update(monitoring_config)
            except Exception as e:
                self.logger.warning(f"Could not load monitoring config: {e}")

    def log_download_attempt(self, downloader: str, username: str, success: bool,
                             file_count: int = 0, error_message: str = None):
        """
        Log a download attempt

        Args:
            downloader: Downloader name (fastdl, imginn, toolzu, etc.)
            username: Username being downloaded
            success: Whether download succeeded
            file_count: Number of files downloaded
            error_message: Error message if failed
        """
        if not self.config.get('enabled', True):
            return

        # Check if this downloader is being monitored (unknown names default to monitored)
        if not self.config.get('downloaders', {}).get(downloader, True):
            return

        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    INSERT INTO download_monitor
                    (downloader, username, timestamp, success, file_count, error_message, alert_sent)
                    VALUES (?, ?, ?, ?, ?, ?, 0)
                """, (
                    downloader,
                    username,
                    datetime.now().isoformat(),
                    1 if success else 0,
                    file_count,
                    error_message
                ))
                conn.commit()

            self.logger.debug(f"Logged {downloader}/{username}: {'success' if success else 'failure'} ({file_count} files)")

            # Check if we should send an alert
            if not success:
                self._check_and_alert(downloader, username)

        except Exception as e:
            # Monitoring is best-effort: never let it break the download itself
            self.logger.error(f"Failed to log download attempt: {e}")

    def _find_last_success(self, cursor, downloader: str, username: str) -> Optional[str]:
        """Return the newest successful attempt's timestamp over all stored rows, or None."""
        cursor.execute("""
            SELECT MAX(timestamp) FROM download_monitor
            WHERE downloader = ? AND username = ? AND success = 1
        """, (downloader, username))
        row = cursor.fetchone()
        return row[0] if row else None

    def _check_and_alert(self, downloader: str, username: str):
        """
        Check if downloader has been failing consistently and send alert

        Args:
            downloader: Downloader name
            username: Username
        """
        try:
            window_hours = self.config.get('failure_window_hours', 3)
            min_failures = self.config.get('min_consecutive_failures', 2)
            cutoff = (datetime.now() - timedelta(hours=window_hours)).isoformat()

            with self.db.get_connection() as conn:
                cursor = conn.cursor()

                # Get recent attempts within the window, newest first
                cursor.execute("""
                    SELECT timestamp, success, file_count, error_message, alert_sent
                    FROM download_monitor
                    WHERE downloader = ? AND username = ?
                    AND timestamp > ?
                    ORDER BY timestamp DESC
                    LIMIT 10
                """, (downloader, username, cutoff))
                attempts = cursor.fetchall()
                if not attempts:
                    return

                # Count consecutive failures from most recent; stop at the first success
                consecutive_failures = 0
                latest_error = None
                last_success_time = None
                for attempt in attempts:
                    if attempt['success'] != 0:
                        last_success_time = attempt['timestamp']
                        break
                    consecutive_failures += 1
                    if latest_error is None and attempt['error_message']:
                        latest_error = attempt['error_message']

                if consecutive_failures < min_failures:
                    return

                # Check if we already sent an alert recently
                cursor.execute("""
                    SELECT COUNT(*) FROM download_monitor
                    WHERE downloader = ? AND username = ?
                    AND alert_sent = 1
                    AND timestamp > ?
                """, (downloader, username, cutoff))
                result = cursor.fetchone()
                if result and result[0]:
                    return

                # The window query above cannot see a success older than the
                # window, which is the normal case for a long outage. Look it
                # up across all stored rows so the alert does not wrongly
                # report "Never".
                if last_success_time is None:
                    last_success_time = self._find_last_success(cursor, downloader, username)

                self._send_alert(
                    downloader,
                    username,
                    consecutive_failures,
                    last_success_time,
                    latest_error
                )

                # Mark most recent attempt as alerted.
                # NOTE(review): this happens even when _send_alert returned
                # without sending (disabled or failed) - confirm that is intended.
                cursor.execute("""
                    UPDATE download_monitor
                    SET alert_sent = 1
                    WHERE id = (
                        SELECT id FROM download_monitor
                        WHERE downloader = ? AND username = ?
                        ORDER BY timestamp DESC
                        LIMIT 1
                    )
                """, (downloader, username))
                conn.commit()

        except Exception as e:
            self.logger.error(f"Failed to check for alerts: {e}")

    def _describe_time_since(self, last_success_time: Optional[str]) -> str:
        """Render an ISO timestamp as 'N hours ago' / 'N days ago' for the alert text."""
        if not last_success_time:
            return "Never"
        try:
            elapsed = datetime.now() - datetime.fromisoformat(last_success_time)
        except (ValueError, TypeError) as e:
            self.logger.warning(f"Failed to parse last_success_time '{last_success_time}': {e}")
            return "Unknown (parse error)"
        hours = int(elapsed.total_seconds() / 3600)
        if hours < 24:
            return f"{hours} hours ago"
        return f"{hours // 24} days ago"

    def _send_alert(self, downloader: str, username: str, failure_count: int,
                    last_success_time: str, error_message: str):
        """
        Send Pushover alert for persistent failures

        Args:
            downloader: Downloader name
            username: Username
            failure_count: Number of consecutive failures
            last_success_time: Timestamp of last success (ISO format), or None
            error_message: Latest error message
        """
        if not self.config.get('pushover', {}).get('enabled', True):
            return

        try:
            from modules.pushover_notifier import PushoverNotifier

            # Get pushover config from settings; without it nothing can be sent
            pushover_config = {}
            if self.settings_manager:
                pushover_config = self.settings_manager.get('pushover', {})
            if not pushover_config.get('enabled'):
                return

            notifier = PushoverNotifier(
                api_token=pushover_config.get('api_token'),
                user_key=pushover_config.get('user_key')
            )

            time_since_success = self._describe_time_since(last_success_time)

            # Format downloader name nicely
            downloader_display = downloader.replace('_', ' ').title()

            # Build message
            window_hours = self.config.get('failure_window_hours', 3)
            title = f"🚨 {downloader_display} Failing"
            message = (
                f"Downloader has been failing for {window_hours}+ hours\n"
                "\n"
                f"Username: {username}\n"
                f"Consecutive Failures: {failure_count}\n"
                f"Last Success: {time_since_success}\n"
                f"Latest Error: {error_message or 'Unknown'}\n"
                "\n"
                "Check logs for details."
            )

            # Send notification with the configured priority
            notifier.send_notification(
                title=title,
                message=message,
                priority=self.config.get('pushover', {}).get('priority', 1)
            )

            self.logger.warning(f"Sent alert for {downloader}/{username} ({failure_count} failures)")

        except Exception as e:
            self.logger.error(f"Failed to send alert: {e}")

    def get_downloader_status(self, downloader: str = None, hours: int = 24) -> List[Dict]:
        """
        Get recent status for downloader(s)

        Args:
            downloader: Specific downloader (None = all)
            hours: How many hours to look back

        Returns:
            List of status dicts with stats per downloader; empty list on error
        """
        try:
            cutoff = datetime.now() - timedelta(hours=hours)

            # Only fixed SQL fragments are interpolated; all values stay bound parameters.
            conditions = ["timestamp > ?"]
            params = [cutoff.isoformat()]
            if downloader:
                conditions.insert(0, "downloader = ?")
                params.insert(0, downloader)
            where_clause = " AND ".join(conditions)

            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(f"""
                    SELECT
                        downloader,
                        COUNT(*) as total_attempts,
                        SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful,
                        SUM(CASE WHEN success = 0 THEN 1 ELSE 0 END) as failed,
                        SUM(file_count) as total_files,
                        MAX(CASE WHEN success = 1 THEN timestamp END) as last_success,
                        MAX(timestamp) as last_attempt
                    FROM download_monitor
                    WHERE {where_clause}
                    GROUP BY downloader
                    ORDER BY downloader
                """, params)

                results = []
                for row in cursor.fetchall():
                    attempts = row['total_attempts']
                    successful = row['successful'] or 0
                    results.append({
                        'downloader': row['downloader'],
                        'total_attempts': attempts,
                        'successful': successful,
                        'failed': row['failed'] or 0,
                        'total_files': row['total_files'] or 0,
                        'success_rate': round(successful / attempts * 100, 1) if attempts > 0 else 0,
                        'last_success': row['last_success'],
                        'last_attempt': row['last_attempt']
                    })

                return results

        except Exception as e:
            self.logger.error(f"Failed to get downloader status: {e}")
            return []

    def clear_old_logs(self, days: int = 30):
        """
        Clear monitoring logs older than specified days

        Args:
            days: How many days to keep
        """
        try:
            cutoff = datetime.now() - timedelta(days=days)

            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    DELETE FROM download_monitor
                    WHERE timestamp < ?
                """, (cutoff.isoformat(),))
                deleted = cursor.rowcount
                conn.commit()

            self.logger.info(f"Cleared {deleted} old monitoring logs (older than {days} days)")

        except Exception as e:
            self.logger.error(f"Failed to clear old logs: {e}")
|
||||
|
||||
|
||||
# Process-wide monitor instance and the lock guarding its creation
_monitor_instance = None
_monitor_instance_lock = __import__('threading').Lock()


def get_monitor(unified_db=None, settings_manager=None):
    """Return the shared DownloaderMonitor, creating it on first use.

    The arguments only matter for the call that creates the instance; missing
    ones are replaced by a default UnifiedDatabase / SettingsManager.
    """
    global _monitor_instance

    # Fast path: already built, no locking needed.
    if _monitor_instance is not None:
        return _monitor_instance

    with _monitor_instance_lock:
        # Re-check under the lock: another thread may have built it meanwhile.
        if _monitor_instance is None:
            database = unified_db
            if database is None:
                from modules.unified_database import UnifiedDatabase
                database = UnifiedDatabase()

            settings = settings_manager
            if settings is None:
                from modules.settings_manager import SettingsManager
                settings = SettingsManager('/opt/media-downloader/database/media_downloader.db')

            _monitor_instance = DownloaderMonitor(database, settings)

    return _monitor_instance
|
||||
502
modules/easynews_client.py
Normal file
502
modules/easynews_client.py
Normal file
@@ -0,0 +1,502 @@
|
||||
"""
|
||||
Easynews Client Module
|
||||
|
||||
Provides a client for interacting with the Easynews API to search for and download files.
|
||||
All connections use HTTPS with HTTP Basic Auth.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from urllib.parse import quote, urljoin
|
||||
|
||||
import requests
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('EasynewsClient')
|
||||
|
||||
|
||||
@dataclass
class EasynewsResult:
    """One file entry returned by an Easynews search."""
    filename: str
    download_url: str
    size_bytes: int
    post_date: Optional[str]
    subject: Optional[str]
    poster: Optional[str]
    newsgroup: Optional[str]
    extension: Optional[str]

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict, in declaration order."""
        field_names = (
            'filename',
            'download_url',
            'size_bytes',
            'post_date',
            'subject',
            'poster',
            'newsgroup',
            'extension',
        )
        return {name: getattr(self, name) for name in field_names}
|
||||
|
||||
|
||||
class EasynewsClient:
|
||||
"""
|
||||
Client for interacting with Easynews search and download APIs.
|
||||
|
||||
All connections use HTTPS with HTTP Basic Auth.
|
||||
Supports HTTP, HTTPS, SOCKS4, and SOCKS5 proxies.
|
||||
"""
|
||||
|
||||
BASE_URL = "https://members.easynews.com"
|
||||
SEARCH_URL = "https://members.easynews.com/2.0/search/solr-search/advanced"
|
||||
|
||||
# Quality patterns for parsing
|
||||
QUALITY_PATTERNS = [
|
||||
(r'2160p|4k|uhd', '2160p'),
|
||||
(r'1080p|fhd', '1080p'),
|
||||
(r'720p|hd', '720p'),
|
||||
(r'480p|sd', '480p'),
|
||||
(r'360p', '360p'),
|
||||
]
|
||||
|
||||
# Audio codec patterns (order matters - check combinations first)
|
||||
AUDIO_PATTERNS = [
|
||||
(r'truehd.*atmos|atmos.*truehd', 'Atmos'),
|
||||
(r'atmos', 'Atmos'),
|
||||
(r'truehd', 'TrueHD'),
|
||||
(r'dts[\.\-]?hd[\.\-]?ma', 'DTS-HD'),
|
||||
(r'dts[\.\-]?hd', 'DTS-HD'),
|
||||
(r'dts[\.\-]?x', 'DTS:X'),
|
||||
(r'dts', 'DTS'),
|
||||
(r'7[\.\-]?1', '7.1'),
|
||||
(r'ddp[\.\-\s]?5[\.\-]?1|eac3|e[\.\-]?ac[\.\-]?3|dd[\.\-]?5[\.\-]?1|ac3|5[\.\-]?1', '5.1'),
|
||||
(r'ddp|dd\+', '5.1'),
|
||||
(r'aac[\.\-]?5[\.\-]?1', '5.1'),
|
||||
(r'aac', 'AAC'),
|
||||
(r'flac', 'FLAC'),
|
||||
(r'mp3', 'MP3'),
|
||||
]
|
||||
|
||||
# Source/release type patterns
|
||||
SOURCE_PATTERNS = [
|
||||
(r'remux', 'Remux'),
|
||||
(r'blu[\.\-]?ray|bdrip|brrip', 'BluRay'),
|
||||
(r'web[\.\-]?dl', 'WEB-DL'),
|
||||
(r'webrip', 'WEBRip'),
|
||||
(r'web', 'WEB'),
|
||||
(r'hdtv', 'HDTV'),
|
||||
(r'dvdrip', 'DVDRip'),
|
||||
(r'dvd', 'DVD'),
|
||||
(r'hdcam|cam', 'CAM'),
|
||||
]
|
||||
|
||||
def __init__(
    self,
    username: str,
    password: str,
    proxy_enabled: bool = False,
    proxy_type: str = 'http',
    proxy_host: Optional[str] = None,
    proxy_port: Optional[int] = None,
    proxy_username: Optional[str] = None,
    proxy_password: Optional[str] = None,
):
    """
    Create a client bound to one Easynews account.

    Args:
        username: Easynews username
        password: Easynews password
        proxy_enabled: Whether to use a proxy
        proxy_type: Proxy type (http, https, socks4, socks5)
        proxy_host: Proxy hostname/IP
        proxy_port: Proxy port
        proxy_username: Proxy auth username (optional)
        proxy_password: Proxy auth password (optional)
    """
    self.username, self.password = username, password
    self.auth = HTTPBasicAuth(username, password)

    # One shared session so Basic Auth (and any proxy) applies to every request
    http_session = requests.Session()
    http_session.auth = self.auth
    self.session = http_session

    # A proxy is only configured when it is enabled AND host and port are given
    self.proxies = {}
    if not (proxy_enabled and proxy_host and proxy_port):
        return

    proxy_url = self._build_proxy_url(
        proxy_type, proxy_host, proxy_port,
        proxy_username, proxy_password
    )
    self.proxies = {scheme: proxy_url for scheme in ('http', 'https')}
    self.session.proxies.update(self.proxies)
    logger.info(f"Easynews client configured with {proxy_type} proxy: {proxy_host}:{proxy_port}")
|
||||
|
||||
def _build_proxy_url(
|
||||
self,
|
||||
proxy_type: str,
|
||||
host: str,
|
||||
port: int,
|
||||
username: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Build a proxy URL with optional authentication."""
|
||||
scheme = proxy_type.lower()
|
||||
if scheme not in ('http', 'https', 'socks4', 'socks5'):
|
||||
scheme = 'http'
|
||||
|
||||
if username and password:
|
||||
return f"{scheme}://{quote(username)}:{quote(password)}@{host}:{port}"
|
||||
return f"{scheme}://{host}:{port}"
|
||||
|
||||
def test_connection(self) -> Dict[str, Any]:
    """
    Test the connection to Easynews with current credentials.

    Returns:
        Dict with 'success' bool and 'message' string
    """
    def outcome(success: bool, message: str) -> Dict[str, Any]:
        # Single place that shapes the result dict
        return {'success': success, 'message': message}

    try:
        # Try to access the members area
        response = self.session.get(
            f"{self.BASE_URL}/",
            timeout=30,
        )

        if response.status_code == 200:
            # Check if we're actually authenticated (not redirected to login)
            if 'login' in response.url.lower() or 'sign in' in response.text.lower():
                return outcome(False, 'Invalid credentials - authentication failed')
            return outcome(True, 'Successfully connected to Easynews')
        if response.status_code == 401:
            return outcome(False, 'Invalid credentials - authentication failed')
        return outcome(False, f'Unexpected response: HTTP {response.status_code}')

    except requests.exceptions.ProxyError as e:
        # ProxyError is a ConnectionError subclass, so it must be caught first
        return outcome(False, f'Proxy connection failed: {str(e)}')
    except requests.exceptions.Timeout:
        # Caught before ConnectionError: ConnectTimeout derives from both, and
        # would otherwise be reported as a generic connection failure
        return outcome(False, 'Connection timed out')
    except requests.exceptions.ConnectionError as e:
        return outcome(False, f'Connection failed: {str(e)}')
    except Exception as e:
        logger.error(f"Easynews connection test failed: {e}")
        return outcome(False, f'Connection test failed: {str(e)}')
|
||||
|
||||
def search(
    self,
    query: str,
    page: int = 1,
    results_per_page: int = 50,
    file_types: Optional[List[str]] = None,
) -> List[EasynewsResult]:
    """
    Run a search against Easynews and return the parsed hits.

    Args:
        query: Search query string
        page: Page number (1-indexed)
        results_per_page: Number of results per page (capped at 250)
        file_types: Optional values for the 'fty[]' filter; defaults to ['VIDEO']

    Returns:
        List of EasynewsResult objects; empty on any error or empty response
    """
    try:
        # Flag meanings below are carried over from the original annotations;
        # they have not been verified against the Easynews API here.
        params = {
            'gps': query,
            'pby': min(results_per_page, 250),
            'pno': page,
            'sS': 1,  # Safe search off
            'saession': '',  # Session
            'sb': 1,  # Sort by date
            'sbj': 1,  # Subject search
            'fly': 2,  # File type filter mode
            'fex': 'mkv,mp4',  # Only mkv and mp4 files
        }
        # NOTE(review): 'fex' stays fixed even when file_types is supplied - confirm intended.
        params['fty[]'] = file_types if file_types else ['VIDEO']

        response = self.session.get(
            self.SEARCH_URL,
            params=params,
            timeout=60,
        )
        response.raise_for_status()

        # An empty (or whitespace-only) body means there is nothing to parse
        body = response.content
        if not body or not body.strip():
            logger.warning(f"Easynews search for '{query}' returned empty response (HTTP {response.status_code})")
            return []

        try:
            data = response.json()
        except Exception as json_err:
            logger.warning(f"Easynews search for '{query}' returned invalid JSON (HTTP {response.status_code}, body: {response.text[:200]}): {json_err}")
            return []

        # Hits live under the 'data' key; anything else counts as no results
        entries = data['data'] if 'data' in data and isinstance(data['data'], list) else []
        results = []
        for entry in entries:
            parsed = self._parse_search_result(entry)
            if parsed:
                results.append(parsed)

        logger.info(f"Easynews search for '{query}' returned {len(results)} results")
        return results

    except requests.exceptions.RequestException as e:
        logger.error(f"Easynews search failed: {e}")
        return []
    except Exception as e:
        logger.error(f"Error parsing Easynews search results: {e}")
        return []
|
||||
|
||||
def _parse_search_result(self, item: Dict[str, Any]) -> Optional[EasynewsResult]:
    """Parse a single search result from the API response.

    Args:
        item: One entry of the response's 'data' list.

    Returns:
        An EasynewsResult, or None when the entry has no filename, no usable
        download URL, or cannot be parsed.
    """
    try:
        # Extract the filename
        # NOTE(review): key '0' is the fallback for both filename and hash - confirm.
        filename = item.get('fn', '') or item.get('0', '')
        if not filename:
            return None

        # Build download URL
        # Format: https://username:password@members.easynews.com/dl/{hash}/{filename}
        file_hash = item.get('hash', '') or item.get('0', '')
        sig = item.get('sig', '')

        if file_hash and sig:
            # safe='' escapes '/', ':' and '@' inside the credentials; with the
            # default safe='/' a slash in the password would break the URL.
            credentials = f"{quote(self.username, safe='')}:{quote(self.password, safe='')}"
            download_path = f"/dl/{file_hash}/{quote(filename)}?sig={sig}"
            download_url = f"https://{credentials}@members.easynews.com{download_path}"
        else:
            # Fallback to the URL supplied by the API, made absolute if needed
            download_url = item.get('url', '') or item.get('rawURL', '')
            if download_url and not download_url.startswith('http'):
                download_url = urljoin(self.BASE_URL, download_url)

        if not download_url:
            return None

        # Parse size: numeric values are used directly, strings go through _parse_size
        size_bytes = 0
        raw_size = item.get('rawSize', '') or item.get('size', '')
        if isinstance(raw_size, (int, float)):
            size_bytes = int(raw_size)
        elif isinstance(raw_size, str):
            size_bytes = self._parse_size(raw_size)

        # Date is passed through as-is, only trimmed when it is a string
        post_date = item.get('date', '') or item.get('d', '')
        if isinstance(post_date, str):
            post_date = post_date.strip()

        # Prefer the extension field from the API, fall back to the filename.
        # Both sources are normalized to the same leading-dot form.
        extension = item.get('extension', '') or item.get('11', '') or item.get('2', '')
        if not extension:
            extension = self._get_extension(filename)
        if extension and not extension.startswith('.'):
            extension = '.' + extension

        return EasynewsResult(
            filename=filename,
            download_url=download_url,
            size_bytes=size_bytes,
            post_date=post_date if post_date else None,
            subject=item.get('subject', '') or item.get('s', ''),
            poster=item.get('poster', '') or item.get('p', ''),
            newsgroup=item.get('newsgroup', '') or item.get('ng', ''),
            extension=extension if extension else None,
        )
    except Exception as e:
        logger.debug(f"Failed to parse search result: {e}")
        return None
|
||||
|
||||
def _parse_size(self, size_str: str) -> int:
|
||||
"""Parse a size string like '1.5 GB' to bytes."""
|
||||
try:
|
||||
size_str = size_str.strip().upper()
|
||||
multipliers = {
|
||||
'B': 1,
|
||||
'KB': 1024,
|
||||
'MB': 1024 ** 2,
|
||||
'GB': 1024 ** 3,
|
||||
'TB': 1024 ** 4,
|
||||
}
|
||||
|
||||
for suffix, multiplier in multipliers.items():
|
||||
if size_str.endswith(suffix):
|
||||
value = float(size_str[:-len(suffix)].strip())
|
||||
return int(value * multiplier)
|
||||
|
||||
# Try to parse as plain number
|
||||
return int(float(size_str))
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
def _get_extension(self, filename: str) -> Optional[str]:
    """Return the lower-cased text after the last '.' in *filename*.

    The result carries no leading dot. None is returned when the name
    contains no dot at all.
    """
    _, dot, suffix = filename.rpartition('.')
    if not dot:
        return None
    return suffix.lower()
|
||||
|
||||
@staticmethod
|
||||
def detect_quality(filename: str) -> Optional[str]:
    """Return the quality label of the first matching QUALITY_PATTERNS entry.

    The filename is lower-cased before matching. None is returned when no
    pattern matches.
    """
    lowered = filename.lower()
    return next(
        (
            label
            for regex, label in EasynewsClient.QUALITY_PATTERNS
            if re.search(regex, lowered)
        ),
        None,
    )
|
||||
|
||||
@staticmethod
|
||||
def detect_audio(filename: str) -> Optional[str]:
    """Return the audio label of the first matching AUDIO_PATTERNS entry.

    The filename is lower-cased before matching. None is returned when no
    pattern matches.
    """
    lowered = filename.lower()
    return next(
        (
            label
            for regex, label in EasynewsClient.AUDIO_PATTERNS
            if re.search(regex, lowered)
        ),
        None,
    )
|
||||
|
||||
@staticmethod
|
||||
def detect_source(filename: str) -> Optional[str]:
    """Return the source label of the first matching SOURCE_PATTERNS entry.

    The filename is lower-cased before matching. None is returned when no
    pattern matches.
    """
    lowered = filename.lower()
    return next(
        (
            label
            for regex, label in EasynewsClient.SOURCE_PATTERNS
            if re.search(regex, lowered)
        ),
        None,
    )
|
||||
|
||||
def download_file(
    self,
    url: str,
    dest_path: str,
    progress_callback: Optional[Callable[[int, int], None]] = None,
    chunk_size: int = 8192,
) -> Dict[str, Any]:
    """
    Download a file from Easynews.

    The body is streamed to disk in chunks through ``self.session``.

    Args:
        url: Download URL (with authentication embedded or using session)
        dest_path: Destination file path
        progress_callback: Optional callback(downloaded_bytes, total_bytes),
            invoked after every written chunk. total_bytes is 0 when the
            response has no content-length header.
        chunk_size: Download chunk size in bytes

    Returns:
        On success: {'success': True, 'path': dest_path, 'size': bytes_written}.
        On failure: {'success': False, 'message': description}.
        Errors are reported through the dict rather than raised.
    """
    try:
        # Start the download with streaming
        response = self.session.get(
            url,
            stream=True,
            timeout=30,
        )
        try:
            response.raise_for_status()

            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0

            with open(dest_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    # Empty chunks carry no data; skip them.
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if progress_callback:
                            progress_callback(downloaded, total_size)
        finally:
            # A streamed response holds its connection until the body is
            # fully consumed or the response is closed; close explicitly so
            # an HTTP error, write failure or callback exception does not
            # leak the connection.
            response.close()

        logger.info(f"Downloaded file to {dest_path} ({downloaded} bytes)")
        return {
            'success': True,
            'path': dest_path,
            'size': downloaded,
        }

    except requests.exceptions.RequestException as e:
        logger.error(f"Download failed: {e}")
        return {
            'success': False,
            'message': f'Download failed: {str(e)}'
        }
    except IOError as e:
        logger.error(f"Failed to write file: {e}")
        return {
            'success': False,
            'message': f'Failed to write file: {str(e)}'
        }
    except Exception as e:
        logger.error(f"Unexpected error during download: {e}")
        return {
            'success': False,
            'message': f'Download error: {str(e)}'
        }
|
||||
|
||||
def get_file_info(self, url: str) -> Dict[str, Any]:
    """
    Look up file metadata with a HEAD request instead of downloading it.

    Args:
        url: File URL

    Returns:
        On success: {'success': True, 'size', 'content_type', 'last_modified'}
        taken from the response headers (size 0 / empty strings when a
        header is absent).
        On any failure: {'success': False, 'message': error text}.
    """
    try:
        head_response = self.session.head(url, timeout=30)
        head_response.raise_for_status()
        headers = head_response.headers

        info: Dict[str, Any] = {'success': True}
        info['size'] = int(headers.get('content-length', 0))
        info['content_type'] = headers.get('content-type', '')
        info['last_modified'] = headers.get('last-modified', '')
        return info
    except Exception as e:
        logger.error(f"Failed to get file info: {e}")
        failure = {'success': False, 'message': str(e)}
        return failure
|
||||
1650
modules/easynews_monitor.py
Normal file
1650
modules/easynews_monitor.py
Normal file
File diff suppressed because it is too large
Load Diff
1762
modules/face_recognition_module.py
Normal file
1762
modules/face_recognition_module.py
Normal file
File diff suppressed because it is too large
Load Diff
3776
modules/fastdl_module.py
Executable file
3776
modules/fastdl_module.py
Executable file
File diff suppressed because it is too large
Load Diff
382
modules/filename_parser.py
Normal file
382
modules/filename_parser.py
Normal file
@@ -0,0 +1,382 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Filename Parser Module for Manual Import
|
||||
Parses filenames based on configurable patterns to extract metadata
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
|
||||
class FilenameParser:
    """
    Parse filenames using configurable patterns to extract metadata.

    Supported pattern tokens:
    - {username} - Username/source (alphanumeric, underscores, periods)
    - {YYYYMMDD} - Date as 8 digits (20251127)
    - {HHMMSS} - Time as 6 digits (172753)
    - {YYYYMMDD_HHMMSS} - Combined date_time with underscore
    - {id} - Media ID (any characters until next separator)
    - {description} - Text content (any characters until next separator)
    - {num} - Sequence number (digits)
    - {ext} - File extension (optional, auto-handled)

    Example patterns:
    - Instagram Stories: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - Instagram Posts: "{username}_{YYYYMMDD}_{HHMMSS}_{id}"
    - TikTok: "{YYYYMMDD}_{description}_{id}_{num}"
    """

    # Token definitions: token_name -> (regex_pattern, is_greedy).
    # Tokens flagged is_greedy are compiled as the lazy '.+?' so they stop at
    # the next literal separator of the pattern.
    TOKEN_PATTERNS = {
        'username': (r'[a-zA-Z0-9_.]+', False),
        'YYYYMMDD': (r'\d{8}', False),
        'HHMMSS': (r'\d{6}', False),
        'YYYYMMDD_HHMMSS': (r'\d{8}_\d{6}', False),
        'id': (r'.+', True),  # Free text - compiled lazily, see above
        'description': (r'.+', True),  # Free text - compiled lazily
        'num': (r'\d+', False),
        'ext': (r'\.[a-zA-Z0-9]+', False),
    }

    # Matches one "{token}" placeholder inside a pattern string.
    _TOKEN_RE = re.compile(r'\{(\w+)\}')

    def __init__(self, pattern: str):
        """
        Initialize parser with a filename pattern.

        Args:
            pattern: Pattern string like "{username}-{YYYYMMDD}_{HHMMSS}-{id}"

        Raises:
            ValueError: If the pattern cannot be compiled to a regex.
        """
        self.pattern = pattern
        self.regex, self.token_order = self._compile_pattern(pattern)

    def _compile_pattern(self, pattern: str) -> tuple:
        """
        Convert pattern string to compiled regex.

        Literal text between tokens is regex-escaped; each token becomes one
        capture group. The regex is anchored at both ends and tolerates one
        trailing ".ext" after the pattern.

        Returns:
            Tuple of (compiled_regex, list_of_token_names)

        Raises:
            ValueError: If the assembled regex is invalid.
        """
        # re.split with a single capture group yields
        # [literal, token, literal, token, ..., literal]: even indexes are
        # literal text, odd indexes are token names. Working on the split
        # pieces avoids textual placeholders, which a pattern's own literal
        # text could collide with.
        pieces = self._TOKEN_RE.split(pattern)
        tokens = pieces[1::2]

        parts = ['^']
        for index, piece in enumerate(pieces):
            if index % 2 == 0:
                parts.append(re.escape(piece))
                continue

            if piece in self.TOKEN_PATTERNS:
                token_pattern, is_greedy = self.TOKEN_PATTERNS[piece]
                if is_greedy:
                    # Lazy match so the token stops at the next separator
                    token_pattern = r'.+?'
            else:
                # Unknown token - treat as any characters
                token_pattern = r'.+?'
            parts.append(f'({token_pattern})')

        # Allow an optional file extension after the pattern
        parts.append(r'(?:\.[a-zA-Z0-9]+)?$')
        regex_pattern = ''.join(parts)

        try:
            compiled = re.compile(regex_pattern)
        except re.error as e:
            raise ValueError(f"Invalid pattern '{pattern}': {e}") from e

        return compiled, tokens

    def parse(self, filename: str) -> Dict[str, Any]:
        """
        Parse a filename and extract metadata.

        Args:
            filename: Filename to parse (with or without extension)

        Returns:
            Dictionary with extracted metadata:
            - username: str (lower-cased) or None
            - datetime: datetime object or None
            - media_id: str or None
            - description: str or None
            - num: int or None
            - extension: lower-cased suffix including the dot, or None
            - valid: bool
            - error: str or None (set when valid is False)
            - raw_values: dict of token name -> matched text
        """
        result = {
            'username': None,
            'datetime': None,
            'media_id': None,
            'description': None,
            'num': None,
            'extension': None,
            'valid': False,
            'error': None,
            'raw_values': {}
        }

        # Extract extension
        path = Path(filename)
        result['extension'] = path.suffix.lower() if path.suffix else None

        # Try the name without its extension first, then the full name
        # (needed e.g. when the pattern itself contains {ext}).
        match = self.regex.match(path.stem) or self.regex.match(filename)

        if not match:
            result['error'] = f"Filename doesn't match pattern: {self.pattern}"
            return result

        # Extract values for each token, in pattern order
        for token, value in zip(self.token_order, match.groups()):
            result['raw_values'][token] = value

            # Map tokens to result fields
            if token == 'username':
                result['username'] = value.lower()
            elif token == 'id':
                result['media_id'] = value
            elif token == 'description':
                result['description'] = value
            elif token == 'num':
                try:
                    result['num'] = int(value)
                except ValueError:
                    result['num'] = value

        # Parse datetime from date/time tokens; an impossible date leaves
        # 'datetime' as None but the result is still reported as valid.
        result['datetime'] = self._parse_datetime(result['raw_values'])

        result['valid'] = True
        return result

    def _parse_datetime(self, raw_values: Dict[str, str]) -> Optional[datetime]:
        """
        Parse datetime from extracted raw values.

        Supports:
        - YYYYMMDD_HHMMSS combined
        - YYYYMMDD + HHMMSS separate
        - YYYYMMDD only (time defaults to 00:00:00)

        Returns:
            A naive datetime, or None when no date token was captured or the
            digits do not form a real date/time.
        """
        try:
            if 'YYYYMMDD_HHMMSS' in raw_values:
                return datetime.strptime(raw_values['YYYYMMDD_HHMMSS'], '%Y%m%d_%H%M%S')

            if 'YYYYMMDD' in raw_values:
                date_str = raw_values['YYYYMMDD']
                if 'HHMMSS' in raw_values:
                    time_str = raw_values['HHMMSS']
                    return datetime.strptime(f'{date_str}_{time_str}', '%Y%m%d_%H%M%S')
                # Date only, no time
                return datetime.strptime(date_str, '%Y%m%d')

            return None
        except ValueError:
            return None

    def validate_pattern(self) -> tuple:
        """
        Validate the pattern is properly formed.

        A pattern is valid when it contains at least one token and every
        token is listed in TOKEN_PATTERNS.

        Returns:
            Tuple of (is_valid: bool, error_message: str or None)
        """
        try:
            tokens = self._TOKEN_RE.findall(self.pattern)

            if not tokens:
                return False, "Pattern must contain at least one token"

            # Check all tokens are recognized
            unknown_tokens = [t for t in tokens if t not in self.TOKEN_PATTERNS]
            if unknown_tokens:
                return False, f"Unknown tokens: {', '.join(unknown_tokens)}"

            return True, None
        except Exception as e:
            return False, str(e)
|
||||
|
||||
|
||||
def create_parser(pattern: str) -> FilenameParser:
    """
    Build a FilenameParser for *pattern*.

    Thin factory wrapper around the FilenameParser constructor.

    Args:
        pattern: Pattern string

    Returns:
        FilenameParser instance
    """
    parser = FilenameParser(pattern)
    return parser
|
||||
|
||||
|
||||
def parse_with_fallbacks(filename: str, patterns: List[str]) -> Dict[str, Any]:
    """
    Parse *filename* with each pattern in turn and return the first match.

    Args:
        filename: Filename to parse
        patterns: List of pattern strings to try in order

    Returns:
        The FilenameParser.parse result of the first matching pattern, with
        an extra 'matched_pattern' key. When nothing matches, a result with
        valid=False whose 'error' is the last failure seen (or a generic
        message when there was none).
    """
    last_error = None
    for candidate in patterns:
        try:
            outcome = FilenameParser(candidate).parse(filename)
        except Exception as exc:
            # A pattern that cannot be compiled/applied just counts as a miss
            last_error = str(exc)
            continue

        if outcome['valid']:
            outcome['matched_pattern'] = candidate
            return outcome
        last_error = outcome.get('error')

    # Return failure with last error
    suffix = Path(filename).suffix
    fallback_message = f"Filename doesn't match any of {len(patterns)} patterns"
    return {
        'username': None,
        'datetime': None,
        'media_id': None,
        'description': None,
        'num': None,
        'extension': suffix.lower() if suffix else None,
        'valid': False,
        'error': last_error or fallback_message,
        'raw_values': {}
    }
|
||||
|
||||
|
||||
# Instagram filenames come in many shapes depending on the download tool.
# Order matters: consumers try these patterns first to last, so the most
# generic shortcode format is kept at the very end.
INSTAGRAM_PATTERNS = [
    # --- Standard gallery-dl formats ---
    # gallery-dl default (underscores)
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
    # alternative format (dashes around date)
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}',

    # --- Formats with _n suffix (common from some scrapers) ---
    # with _n suffix
    '{username}_{YYYYMMDD}_{HHMMSS}_{id}_n',
    # dashes + _n suffix
    '{username}-{YYYYMMDD}_{HHMMSS}-{id}_n',

    # --- Formats with hl=en language parameter (imginn/instaloader variants) ---
    # language tag + _n suffix
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n',
    # language tag, no _n suffix
    '{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}',

    # --- Formats with leading underscore (some scrapers prefix underscore) ---
    # leading underscore + _n suffix
    '_{username}_{YYYYMMDD}_{HHMMSS}_{id}_n',
    # leading underscore + lang + _n
    '_{username}_hl=en-{YYYYMMDD}_{HHMMSS}-{id}_n',

    # --- Media shortcode before date (some browser extensions / save tools) ---
    # username-video-shortcode-date_hash
    '{username}-video-{id}-{YYYYMMDD}_{HHMMSS}_{description}',
    # username-photo-shortcode-date_hash
    '{username}-photo-{id}-{YYYYMMDD}_{HHMMSS}_{description}',
    # username-shortcode-date_hash (no type prefix, must be last)
    '{username}-{id}-{YYYYMMDD}_{HHMMSS}_{description}',
]
|
||||
|
||||
|
||||
# Built-in filename presets, keyed by preset id. Each entry carries a display
# name, the primary pattern, an example filename and the platform/content
# type it applies to; Instagram presets also list alternative patterns.
PRESET_PATTERNS = {
    'instagram_stories': dict(
        name='Instagram Stories',
        pattern='{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        alt_patterns=INSTAGRAM_PATTERNS,
        example='evalongoria_20251127_172753_AQOGOcCUbrMy...',
        platform='instagram',
        content_type='stories',
    ),
    'instagram_posts': dict(
        name='Instagram Posts',
        pattern='{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        alt_patterns=INSTAGRAM_PATTERNS,
        example='evalongoria_20251127_172753_18538674661006538',
        platform='instagram',
        content_type='posts',
    ),
    'instagram_reels': dict(
        name='Instagram Reels',
        pattern='{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        alt_patterns=INSTAGRAM_PATTERNS,
        example='evalongoria_20251127_172753_18538674661006538',
        platform='instagram',
        content_type='reels',
    ),
    'tiktok_videos': dict(
        name='TikTok Videos',
        pattern='{YYYYMMDD}_{description}_{id}_{num}',
        example='20251127_beautiful_sunset_1234567890_1',
        platform='tiktok',
        content_type='videos',
    ),
    'snapchat_stories': dict(
        name='Snapchat Stories',
        pattern='{username}_{YYYYMMDD}_{HHMMSS}_{id}',
        example='username_20251127_172753_story123',
        platform='snapchat',
        content_type='stories',
    ),
    'youtube_videos': dict(
        name='YouTube Videos',
        pattern='{id}',
        example='dQw4w9WgXcQ',
        platform='youtube',
        content_type='videos',
        use_ytdlp=True,
    ),
}
|
||||
|
||||
|
||||
def get_preset_patterns() -> Dict[str, Dict]:
    """Get all predefined filename patterns.

    Returns:
        An independent deep copy of PRESET_PATTERNS. A shallow copy would
        still share every nested preset dict (and its 'alt_patterns' list)
        with the module-level constant, so a caller editing a preset would
        silently change it for everyone else.
    """
    import copy  # local import: only this function needs it

    return copy.deepcopy(PRESET_PATTERNS)
|
||||
|
||||
|
||||
# Manual demo: run this module directly to see how two sample filenames parse.
if __name__ == '__main__':
    demo_cases = [
        # The user's example (dashes around the date)
        (
            '{username}-{YYYYMMDD}_{HHMMSS}-{id}',
            'tiannahcgarcia-20251127_172753-AQOGOcCUbrMyAL0VXcQjnpHr6aY6U25C1SbaREqFJv7_MVXNVUvBd290MwlNFmwOTK5PuLx6DtK9cYoot0c5Y6a4vuDtOaug2heLank.jpg',
        ),
        # Instagram post format
        (
            '{username}_{YYYYMMDD}_{HHMMSS}_{id}',
            'evalongoria_20251027_155842_18538674661006538.jpg',
        ),
    ]

    for position, (demo_pattern, demo_filename) in enumerate(demo_cases):
        if position:
            # Blank line between cases, none after the last one
            print()
        demo_result = FilenameParser(demo_pattern).parse(demo_filename)
        print(f"Pattern: {demo_pattern}")
        print(f"Filename: {demo_filename}")
        print(f"Result: {demo_result}")
|
||||
485
modules/forum_db_adapter.py
Executable file
485
modules/forum_db_adapter.py
Executable file
@@ -0,0 +1,485 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Forum Database Adapter for Unified Database
|
||||
Provides compatibility layer for forum_downloader to use UnifiedDatabase
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
import time
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('ForumAdapter')
|
||||
|
||||
class ForumDatabaseAdapter:
|
||||
"""
|
||||
Adapter to allow forum_downloader to use UnifiedDatabase
|
||||
Mimics the original forum database interface
|
||||
"""
|
||||
|
||||
def __init__(self, unified_db, db_path=None):
    """
    Store the database handle the adapter delegates to.

    Args:
        unified_db: UnifiedDatabase instance
        db_path: Ignored - kept for compatibility
    """
    # Stored only so code written against the old interface can still read
    # the attribute; nothing in this method uses it.
    self.db_path = db_path
    self.unified_db = unified_db
|
||||
|
||||
def get_file_hash(self, file_path: str) -> Optional[str]:
    """Return the hash of *file_path* as computed by UnifiedDatabase.get_file_hash."""
    # Imported here rather than at module import time.
    from modules.unified_database import UnifiedDatabase as _UnifiedDatabase

    digest = _UnifiedDatabase.get_file_hash(file_path)
    return digest
|
||||
|
||||
def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
    """Look up a download record by file hash via the unified database."""
    record = self.unified_db.get_download_by_file_hash(file_hash)
    return record
|
||||
|
||||
def __enter__(self):
    """Enter a ``with`` block; the adapter itself is the managed object."""
    return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave a ``with`` block: nothing to release, exceptions are not suppressed."""
    return None
|
||||
|
||||
def _get_connection(self):
    """Ask the unified database for a connection opened with for_write=True."""
    write_connection = self.unified_db.get_connection(for_write=True)
    return write_connection
|
||||
|
||||
def _execute_with_retry(self, operation, retries: int = 3, for_write: bool = False):
    """
    Run *operation* on a fresh connection, retrying on lock/deadlock errors.

    Args:
        operation: A callable that takes a connection and returns a result
        retries: Maximum number of attempts
        for_write: Forwarded to ``unified_db.get_connection``

    Returns:
        Whatever *operation* returns.

    Raises:
        sqlite3.OperationalError: On a non-lock error, when the last attempt
            still fails, or when ``retries`` allows no attempt at all.
    """
    attempt = 0
    while attempt < retries:
        try:
            with self.unified_db.get_connection(for_write=for_write) as conn:
                return operation(conn)
        except sqlite3.OperationalError as e:
            message = str(e)
            lock_related = "locked" in message or "deadlock" in message.lower()
            attempts_left = attempt < retries - 1
            if not (lock_related and attempts_left):
                logger.error(f"Database operation failed after {attempt + 1} attempts: {e}")
                raise

            # Wait grows linearly with the attempt number: 1s, 3s, 5s, ...
            delay = 1 + attempt * 2
            logger.debug(f"Database locked, retrying in {delay} seconds...")
            time.sleep(delay)
        attempt += 1

    # Only reachable when the loop body never ran (retries <= 0).
    raise sqlite3.OperationalError("Database operation failed after all retries")
|
||||
|
||||
def db_add_thread(self, thread_id: str, forum_name: str, thread_url: str,
                  thread_title: str = None, monitor_until: datetime = None) -> bool:
    """Start tracking a forum thread.

    Inserts the thread with status 'active'; an insert that conflicts with
    an existing row is ignored.

    Returns:
        True when a row was inserted, False when it was ignored or an error
        occurred (errors are logged, not raised).
    """
    insert_sql = '''
        INSERT OR IGNORE INTO forum_threads
        (thread_id, forum_name, thread_url, thread_title,
        created_date, last_checked, status, monitor_until)
        VALUES (?, ?, ?, ?, ?, ?, 'active', ?)
    '''

    def _insert_thread(conn):
        cur = conn.cursor()
        # Timestamps are taken per attempt, inside the retried operation.
        row = (thread_id, forum_name, thread_url, thread_title,
               datetime.now(), datetime.now(), monitor_until)
        cur.execute(insert_sql, row)
        conn.commit()
        return cur.rowcount > 0

    try:
        return self._execute_with_retry(_insert_thread, for_write=True)
    except Exception as e:
        logger.error(f"Error adding thread: {e}")
        return False
|
||||
|
||||
def db_update_thread(self, thread_id: str, last_post_date: datetime = None,
                     post_count: int = None) -> bool:
    """Refresh a tracked thread.

    ``last_checked`` is always set to the current time; ``last_post_date``
    is written only when truthy and ``post_count`` only when not None.

    Returns:
        True when a row was updated, False when no row matched or an error
        occurred (errors are logged, not raised).
    """
    assignments = ["last_checked = ?"]
    values = [datetime.now()]

    optional_columns = (
        ("last_post_date = ?", last_post_date, bool(last_post_date)),
        ("post_count = ?", post_count, post_count is not None),
    )
    for clause, value, wanted in optional_columns:
        if wanted:
            assignments.append(clause)
            values.append(value)

    values.append(thread_id)

    # Only fixed column fragments are interpolated; all values stay bound
    # parameters.
    sql = f'UPDATE forum_threads SET {", ".join(assignments)} WHERE thread_id = ?'

    def _apply_update(conn):
        cur = conn.cursor()
        cur.execute(sql, values)
        conn.commit()
        return cur.rowcount > 0

    try:
        return self._execute_with_retry(_apply_update, for_write=True)
    except Exception as e:
        logger.error(f"Error updating thread {thread_id}: {e}")
        return False
|
||||
|
||||
def db_update_thread_last_checked(self, thread_id: str) -> bool:
    """Set ``last_checked`` of a forum thread to the current time.

    Returns:
        True when a row was updated, False when no row matched or an error
        occurred (errors are logged, not raised).
    """
    touch_sql = '''
        UPDATE forum_threads
        SET last_checked = ?
        WHERE thread_id = ?
    '''

    def _touch(conn):
        cur = conn.cursor()
        cur.execute(touch_sql, (datetime.now(), thread_id))
        conn.commit()
        return cur.rowcount > 0

    try:
        return self._execute_with_retry(_touch, for_write=True)
    except Exception as e:
        logger.error(f"Error updating last_checked for thread {thread_id}: {e}")
        return False
|
||||
|
||||
def db_get_thread(self, thread_id: str) -> Optional[Dict]:
    """Fetch one tracked thread.

    Returns:
        The ``forum_threads`` row as a dict, or None when the thread is
        unknown or the lookup failed (failures are logged, not raised).
    """
    def _fetch(conn):
        cur = conn.cursor()
        cur.execute(
            "SELECT * FROM forum_threads WHERE thread_id = ?",
            (thread_id,)
        )
        found = cur.fetchone()
        if not found:
            return None
        return dict(found)

    try:
        return self._execute_with_retry(_fetch, for_write=False)
    except Exception as e:
        logger.error(f"Error getting thread {thread_id}: {e}")
        return None
|
||||
|
||||
def db_add_post(self, post_id: str, thread_id: str, post_url: str = None,
                author: str = None, post_date: datetime = None,
                has_images: bool = False) -> bool:
    """Insert a forum post, or overwrite the stored row with the same post_id.

    ``content_hash`` is the SHA-256 of "<thread_id>:<post_id>".

    Returns:
        True on success, False when the statement failed (logged, not raised).
    """
    upsert_sql = '''
        INSERT INTO forum_posts
        (post_id, thread_id, post_url, author, post_date,
        content_hash, has_images)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT (post_id) DO UPDATE SET
        thread_id = EXCLUDED.thread_id,
        post_url = EXCLUDED.post_url,
        author = EXCLUDED.author,
        post_date = EXCLUDED.post_date,
        content_hash = EXCLUDED.content_hash,
        has_images = EXCLUDED.has_images
    '''
    with self._get_connection() as conn:
        cur = conn.cursor()
        try:
            identity = f"{thread_id}:{post_id}".encode()
            content_hash = hashlib.sha256(identity).hexdigest()
            row = (post_id, thread_id, post_url, author, post_date,
                   content_hash, has_images)
            cur.execute(upsert_sql, row)
            conn.commit()
            return True
        except Exception as e:
            logger.error(f"Error adding post: {e}")
            return False
|
||||
|
||||
def db_get_image_id(self, img_url: str) -> Optional[int]:
    """Find the ``downloads`` row id recorded for a forum image URL.

    The URL is matched through its ``unified_db.get_url_hash`` value and
    only rows with platform 'forums' are considered.

    Returns:
        The row id, or None when there is no such row or the lookup failed
        (failures are logged, not raised).
    """
    url_hash = self.unified_db.get_url_hash(img_url)

    def _lookup(conn):
        cur = conn.cursor()
        cur.execute(
            "SELECT id FROM downloads WHERE url_hash = ? AND platform = 'forums'",
            (url_hash,)
        )
        hit = cur.fetchone()
        if not hit:
            return None
        return hit[0]

    try:
        return self._execute_with_retry(_lookup, for_write=False)
    except Exception as e:
        logger.error(f"Error checking image existence: {e}")
        return None
|
||||
|
||||
def db_add_image(self, img_url: str, thread_id: str, post_id: str,
                 filename: str, file_path: str, forum_name: str) -> bool:
    """Record a downloaded forum image in the unified database.

    The record uses platform 'forums', content type 'image' and the forum
    name as source; thread, post and forum are attached as metadata.

    Returns:
        Whatever ``unified_db.record_download`` returns.
    """
    image_metadata = dict(
        thread_id=thread_id,
        post_id=post_id,
        forum_name=forum_name,
    )
    record_kwargs = dict(
        url=img_url,
        platform='forums',
        source=forum_name,
        content_type='image',
        filename=filename,
        file_path=file_path,
        metadata=image_metadata,
    )
    return self.unified_db.record_download(**record_kwargs)
|
||||
|
||||
def db_search_exists(self, search_id: str) -> bool:
    """Tell whether a search monitor with this id is stored.

    Returns:
        True when a ``search_monitors`` row exists, False when it does not
        or the lookup failed (failures are logged, not raised).
    """
    def _probe(conn):
        cur = conn.cursor()
        cur.execute(
            "SELECT 1 FROM search_monitors WHERE search_id = ?",
            (search_id,)
        )
        hit = cur.fetchone()
        return hit is not None

    try:
        return self._execute_with_retry(_probe, for_write=False)
    except Exception as e:
        logger.error(f"Error checking search existence: {e}")
        return False
|
||||
|
||||
def db_add_search(self, search_id: str, forum_name: str, search_query: str,
                  search_url: str = None, check_frequency_hours: int = 24) -> bool:
    """Create a search monitor, replacing any stored one with the same id.

    The row is written with platform 'forums', active = 1 and
    ``last_checked`` set to the current time.

    Returns:
        True on success, False when the statement failed (logged, not raised).
    """
    replace_sql = '''
        INSERT OR REPLACE INTO search_monitors
        (search_id, platform, source, search_query, search_url,
        last_checked, check_frequency_hours, active)
        VALUES (?, 'forums', ?, ?, ?, ?, ?, 1)
    '''
    with self._get_connection() as conn:
        cur = conn.cursor()
        try:
            row = (search_id, forum_name, search_query, search_url,
                   datetime.now(), check_frequency_hours)
            cur.execute(replace_sql, row)
            conn.commit()
            return True
        except Exception as e:
            logger.error(f"Error adding search: {e}")
            return False
|
||||
|
||||
def db_update_search_results(self, search_id: str, results_count: int) -> bool:
    """Store the latest result count of a search monitor.

    Also sets ``last_checked`` to the current time. Database errors are not
    caught here and propagate to the caller.

    Returns:
        True when a row was updated, False when no row matched.
    """
    update_sql = '''
        UPDATE search_monitors
        SET last_checked = ?, results_found = ?
        WHERE search_id = ?
    '''
    with self._get_connection() as conn:
        cur = conn.cursor()
        cur.execute(update_sql, (datetime.now(), results_count, search_id))
        conn.commit()
        changed = cur.rowcount
        return changed > 0
|
||||
|
||||
def add_to_download_queue(self, url: str, referer: str = None, save_path: str = None,
                          thread_id: str = None, post_id: str = None,
                          forum_name: str = None, metadata: Dict = None) -> bool:
    """Add item to download queue.

    The URL is skipped when it is already recorded as downloaded for the
    'forums' platform or when a queue row for it is 'completed' or
    'pending'.

    Args:
        url: URL to queue
        referer: Referer stored with the queue row
        save_path: Target path; stored as a string, or NULL when not given
        thread_id: Forum thread ID, stored in the row's metadata
        post_id: Forum post ID, stored in the row's metadata
        forum_name: Stored as the row's source and in its metadata
        metadata: Extra metadata for the row. The caller's dict is not
            modified.

    Returns:
        True when a new 'pending' row was inserted, False when the URL was
        skipped or an error occurred (errors are logged, not raised).
    """
    # Check if already downloaded
    if self.unified_db.is_downloaded(url, platform='forums'):
        return False

    # Check if already in queue (with retry logic)
    def check_queue(conn):
        cursor = conn.cursor()
        cursor.execute(
            "SELECT status FROM download_queue WHERE url = ?",
            (url,)
        )
        return cursor.fetchone()

    try:
        existing = self._execute_with_retry(check_queue, for_write=False)
        # 'completed' = already downloaded, 'pending' = already queued.
        # Any other status falls through to the insert below.
        if existing and existing[0] in ('completed', 'pending'):
            return False
    except Exception as e:
        logger.error(f"Error checking download queue: {e}")
        return False

    # Build the stored metadata on a copy: updating the caller's dict in
    # place would leak thread/post/forum keys into an object the caller may
    # reuse for other queue entries.
    queue_metadata = dict(metadata) if metadata else {}
    queue_metadata.update({
        'thread_id': thread_id,
        'post_id': post_id,
        'forum_name': forum_name
    })

    with self._get_connection() as conn:
        cursor = conn.cursor()
        try:
            cursor.execute('''
                INSERT INTO download_queue
                (url, platform, source, referer, save_path, status, metadata)
                VALUES (?, 'forums', ?, ?, ?, 'pending', ?)
            ''', (url, forum_name, referer, str(save_path) if save_path else None, json.dumps(queue_metadata)))
            conn.commit()
            return True
        except sqlite3.IntegrityError:
            return False  # URL already in queue
        except Exception as e:
            logger.error(f"Error adding to queue: {e}")
            return False
|
||||
|
||||
def is_in_download_queue(self, url: str) -> bool:
    """Tell whether *url* has a queue row whose status is 'pending'.

    Rows in any other status do not count. Database errors propagate to the
    caller.
    """
    pending_sql = "SELECT 1 FROM download_queue WHERE url = ? AND status = 'pending'"
    with self.unified_db.get_connection() as conn:
        cur = conn.cursor()
        cur.execute(pending_sql, (url,))
        hit = cur.fetchone()
        return hit is not None
|
||||
|
||||
def is_already_downloaded(self, url: str, forum_name: str = None) -> bool:
    """Check whether a thread URL is already tracked in ``forum_threads``.

    A thread matches when its ``thread_url`` equals *url* or its
    ``thread_id`` equals the SHA-256 hex digest of *url*. When *forum_name*
    is given, only threads of that forum are considered.

    Note that despite the method name, this checks thread tracking, not
    the downloads table. Database errors propagate to the caller.
    """
    url_digest = hashlib.sha256(url.encode()).hexdigest()

    if forum_name:
        # Check for specific forum
        sql = '''
            SELECT 1 FROM forum_threads
            WHERE forum_name = ? AND (thread_url = ? OR thread_id = ?)
            LIMIT 1
        '''
        params = (forum_name, url, url_digest)
    else:
        # Check any forum
        sql = '''
            SELECT 1 FROM forum_threads
            WHERE thread_url = ? OR thread_id = ?
            LIMIT 1
        '''
        params = (url, url_digest)

    with self.unified_db.get_connection() as conn:
        cur = conn.cursor()
        cur.execute(sql, params)
        return cur.fetchone() is not None
|
||||
|
||||
def mark_download_complete(self, url: str, filename: str = None,
                           file_path: str = None) -> bool:
    """Flag the queue row of *url* as 'completed' and stamp its download date.

    ``filename`` and ``file_path`` are accepted but not written by this
    method. Database errors propagate to the caller.

    Returns:
        True when a row was updated, False when no row matched.
    """
    complete_sql = '''
        UPDATE download_queue
        SET status = 'completed', download_date = ?
        WHERE url = ?
    '''
    with self._get_connection() as conn:
        cur = conn.cursor()
        cur.execute(complete_sql, (datetime.now(), url))
        conn.commit()
        changed = cur.rowcount
        return changed > 0
|
||||
|
||||
def mark_download_failed(self, url: str, error_message: str = None) -> bool:
    """Flag the queue row of *url* as 'failed'.

    Increments the row's ``attempts`` counter and stores *error_message*.
    Database errors propagate to the caller.

    Returns:
        True when a row was updated, False when no row matched.
    """
    failed_sql = '''
        UPDATE download_queue
        SET status = 'failed', attempts = attempts + 1, error_message = ?
        WHERE url = ?
    '''
    with self._get_connection() as conn:
        cur = conn.cursor()
        cur.execute(failed_sql, (error_message, url))
        conn.commit()
        changed = cur.rowcount
        return changed > 0
|
||||
|
||||
def record_download(self, url: str, thread_id: str = None, post_id: str = None,
                    filename: str = None, metadata: Dict = None, file_path: str = None,
                    post_date = None) -> bool:
    """Record a forum download in the unified database.

    The record is written with ``platform='forums'`` and
    ``content_type='image'``; the source is the ``forum_name`` entry of
    ``metadata`` or ``'unknown'`` when absent.

    Args:
        url: URL of the downloaded content
        thread_id: Forum thread ID (added to the stored metadata when truthy)
        post_id: Forum post ID (added to the stored metadata when truthy)
        filename: Name of downloaded file
        metadata: Additional metadata dict; it is copied, never modified
        file_path: Full path to downloaded file
        post_date: Date of the forum post (datetime or None)

    Returns:
        Whatever ``self.unified_db.record_download`` returns.
    """
    # Work on a copy: the IDs added below must not leak into the caller's dict.
    full_metadata = dict(metadata) if metadata else {}

    # Extract forum name from metadata if available
    forum_name = full_metadata.get('forum_name')

    if thread_id:
        full_metadata['thread_id'] = thread_id
    if post_id:
        full_metadata['post_id'] = post_id

    # Calculate file hash if file_path provided
    file_hash = None
    if file_path:
        try:
            from modules.unified_database import UnifiedDatabase
            file_hash = UnifiedDatabase.get_file_hash(file_path)
        except Exception:
            # Deliberate best effort: a missing/unreadable file must not
            # prevent the download from being recorded.
            file_hash = None

    # Record in unified database
    return self.unified_db.record_download(
        url=url,
        platform='forums',
        source=forum_name or 'unknown',
        content_type='image',
        filename=filename,
        file_path=file_path,
        file_hash=file_hash,
        post_date=post_date,
        metadata=full_metadata,
    )
|
||||
|
||||
def get_pending_downloads(self, limit: int = 100) -> List[Dict]:
    """Fetch pending forum downloads from ``download_queue``.

    Rows are restricted to ``platform = 'forums'`` and
    ``status = 'pending'`` and ordered by ``priority`` then
    ``created_date``.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        One dict per row. Each fetched row is passed to ``dict()``, so the
        connection must yield mapping-style rows.
    """
    query = '''
        SELECT * FROM download_queue
        WHERE platform = 'forums' AND status = 'pending'
        ORDER BY priority, created_date
        LIMIT ?
    '''
    with self.unified_db.get_connection() as conn:
        cur = conn.cursor()
        cur.execute(query, (limit,))
        pending = list(map(dict, cur.fetchall()))

    return pending
|
||||
|
||||
def cleanup_old_data(self, days: int = 180):
    """Purge aged forum records and expire finished thread monitors.

    Runs three statements on one connection and commits once:

    1. delete completed ``downloads`` rows of platform ``'forums'`` whose
       ``download_date`` is more than ``days`` days in the past;
    2. delete completed/failed ``download_queue`` rows of platform
       ``'forums'`` whose ``created_date`` is more than ``days`` days in
       the past;
    3. mark active ``forum_threads`` rows whose ``monitor_until`` has
       passed as ``'expired'`` (independent of ``days``).

    Args:
        days: Age threshold in days for the two DELETE statements.
    """
    with self._get_connection() as conn:
        cur = conn.cursor()

        # SQLite concatenates the bound value with ' days', e.g. '-180 days'
        age_offset = (-days,)

        statements = (
            # Clean old downloads
            ('''
                DELETE FROM downloads
                WHERE platform = 'forums'
                AND download_date < datetime('now', ? || ' days')
                AND status = 'completed'
            ''', age_offset),
            # Clean old queue items
            ('''
                DELETE FROM download_queue
                WHERE platform = 'forums'
                AND created_date < datetime('now', ? || ' days')
                AND status IN ('completed', 'failed')
            ''', age_offset),
            # Expire old monitors
            ('''
                UPDATE forum_threads
                SET status = 'expired'
                WHERE monitor_until < datetime('now')
                AND status = 'active'
            ''', ()),
        )

        for sql, params in statements:
            cur.execute(sql, params)

        conn.commit()
|
||||
5029
modules/forum_downloader.py
Executable file
5029
modules/forum_downloader.py
Executable file
File diff suppressed because it is too large
Load Diff
2019
modules/imginn_api_module.py
Normal file
2019
modules/imginn_api_module.py
Normal file
File diff suppressed because it is too large
Load Diff
3775
modules/imginn_module.py
Normal file
3775
modules/imginn_module.py
Normal file
File diff suppressed because it is too large
Load Diff
410
modules/immich_face_integration.py
Normal file
410
modules/immich_face_integration.py
Normal file
@@ -0,0 +1,410 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Immich Face Integration Module
|
||||
|
||||
Integrates with Immich's face recognition system to leverage its existing
|
||||
face clustering and recognition data for media-downloader files.
|
||||
|
||||
Immich uses:
|
||||
- InsightFace with buffalo_l model (same as media-downloader)
|
||||
- DBSCAN clustering for face grouping
|
||||
- 512-dimensional face embeddings
|
||||
- PostgreSQL for storage
|
||||
|
||||
Path mapping:
|
||||
- Media-downloader: /opt/immich/md/...
|
||||
- Immich sees: /mnt/media/md/...
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict, Any, Tuple
|
||||
from datetime import datetime
|
||||
import httpx
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('ImmichFace')
|
||||
|
||||
|
||||
class ImmichFaceIntegration:
    """Interface with Immich's face recognition system.

    Talks to the Immich HTTP API through a lazily created ``httpx.Client``
    and translates file paths between the local mount (``LOCAL_BASE``) and
    the mount Immich sees (``IMMICH_BASE``).
    """

    # Path mapping between systems
    LOCAL_BASE = '/opt/immich'
    IMMICH_BASE = '/mnt/media'

    def __init__(self, api_url: str = None, api_key: str = None):
        """
        Initialize Immich face integration.

        Args:
            api_url: Immich API URL (default: IMMICH_API_URL env var, else
                http://localhost:2283/api); a trailing slash is stripped
            api_key: Immich API key (default: IMMICH_API_KEY env var, else '')
        """
        self.api_url = (api_url or os.getenv('IMMICH_API_URL', 'http://localhost:2283/api')).rstrip('/')
        self.api_key = api_key or os.getenv('IMMICH_API_KEY', '')
        self._client = None
        self._people_cache = None
        self._people_cache_time = None
        self._cache_ttl = 300  # 5 minutes

    @property
    def is_configured(self) -> bool:
        """Check if Immich integration is properly configured (API key set)."""
        return bool(self.api_key)

    def _get_client(self) -> 'httpx.Client':
        """Get or create the shared HTTP client (30 s timeout, API key header)."""
        if self._client is None:
            self._client = httpx.Client(
                base_url=self.api_url,
                headers={
                    'x-api-key': self.api_key,
                    'Accept': 'application/json'
                },
                timeout=30.0
            )
        return self._client

    @staticmethod
    def _swap_base(path: str, old_base: str, new_base: str) -> str:
        """Replace ``old_base`` with ``new_base`` only as a leading path prefix.

        The prefix must end on a path-component boundary, so ``/opt/immich2``
        is not treated as being under ``/opt/immich``, and later occurrences
        of the base inside the path are left alone. Paths outside
        ``old_base`` are returned unchanged.
        """
        if path == old_base:
            return new_base
        prefix = old_base.rstrip('/') + '/'
        if path.startswith(prefix):
            return new_base.rstrip('/') + '/' + path[len(prefix):]
        return path

    def _local_to_immich_path(self, local_path: str) -> str:
        """
        Convert local path to Immich's path format.

        Example:
            /opt/immich/md/instagram/user/image.jpg
            -> /mnt/media/md/instagram/user/image.jpg
        """
        return self._swap_base(local_path, self.LOCAL_BASE, self.IMMICH_BASE)

    def _immich_to_local_path(self, immich_path: str) -> str:
        """
        Convert Immich's path to local path format.

        Example:
            /mnt/media/md/instagram/user/image.jpg
            -> /opt/immich/md/instagram/user/image.jpg
        """
        return self._swap_base(immich_path, self.IMMICH_BASE, self.LOCAL_BASE)

    def test_connection(self) -> Dict[str, Any]:
        """
        Test connection to Immich API.

        Pings ``/server/ping`` and, on success, also fetches
        ``/server/version``. Never raises; failures are reported in the
        returned dict.

        Returns:
            Dict with 'success', 'message', and optionally 'server_info'
        """
        if not self.is_configured:
            return {
                'success': False,
                'message': 'Immich API key not configured'
            }

        try:
            client = self._get_client()
            response = client.get('/server/ping')

            if response.status_code == 200:
                # Get server info
                info_response = client.get('/server/version')
                server_info = info_response.json() if info_response.status_code == 200 else {}

                return {
                    'success': True,
                    'message': 'Connected to Immich',
                    'server_info': server_info
                }
            else:
                return {
                    'success': False,
                    'message': f'Immich API returned status {response.status_code}'
                }
        except httpx.ConnectError as e:
            return {
                'success': False,
                'message': f'Cannot connect to Immich at {self.api_url}: {e}'
            }
        except Exception as e:
            return {
                'success': False,
                'message': f'Immich API error: {e}'
            }

    def get_all_people(self, force_refresh: bool = False) -> List[Dict[str, Any]]:
        """
        Get all people/faces from Immich.

        Results are cached for ``_cache_ttl`` seconds. Failures are logged
        and yield an empty list; they do not update the cache.

        Args:
            force_refresh: Bypass the cache and query the API.

        Returns:
            List of people with id, name, thumbnailPath, etc.
        """
        if not self.is_configured:
            return []

        # Check cache
        if not force_refresh and self._people_cache is not None:
            if self._people_cache_time:
                age = (datetime.now() - self._people_cache_time).total_seconds()
                if age < self._cache_ttl:
                    return self._people_cache

        try:
            client = self._get_client()
            response = client.get('/people')

            if response.status_code == 200:
                data = response.json()
                # Immich returns {'people': [...], 'total': N, ...}
                people = data.get('people', data) if isinstance(data, dict) else data

                # Cache the result
                self._people_cache = people
                self._people_cache_time = datetime.now()

                logger.info(f"Fetched {len(people)} people from Immich")
                return people
            else:
                logger.error(f"Failed to get people: {response.status_code}")
                return []
        except Exception as e:
            logger.error(f"Error getting people from Immich: {e}")
            return []

    def get_named_people(self) -> List[Dict[str, Any]]:
        """
        Get only people with names assigned in Immich.

        Returns:
            List of named people
        """
        people = self.get_all_people()
        return [p for p in people if p.get('name')]

    def get_asset_by_path(self, local_path: str) -> Optional[Dict[str, Any]]:
        """
        Find an Immich asset by its file path.

        Args:
            local_path: Local file path (e.g., /opt/immich/md/...)

        Returns:
            Asset dict (first search hit) or None if not found or on error
        """
        if not self.is_configured:
            return None

        immich_path = self._local_to_immich_path(local_path)

        try:
            client = self._get_client()

            # Search by original path
            response = client.post('/search/metadata', json={
                'originalPath': immich_path
            })

            if response.status_code == 200:
                data = response.json()
                assets = data.get('assets', {}).get('items', [])
                if assets:
                    return assets[0]

            return None
        except Exception as e:
            logger.error(f"Error searching asset by path: {e}")
            return None

    def get_faces_for_asset(self, asset_id: str) -> List[Dict[str, Any]]:
        """
        Get all detected faces for an asset.

        Args:
            asset_id: Immich asset ID

        Returns:
            List of face data including person info and bounding boxes;
            empty on a non-200 response or error
        """
        if not self.is_configured:
            return []

        try:
            client = self._get_client()
            response = client.get('/faces', params={'id': asset_id})

            if response.status_code == 200:
                return response.json()
            else:
                logger.warning(f"Failed to get faces for asset {asset_id}: {response.status_code}")
                return []
        except Exception as e:
            logger.error(f"Error getting faces for asset: {e}")
            return []

    def get_faces_for_file(self, local_path: str) -> Dict[str, Any]:
        """
        Get face recognition results for a local file using Immich.

        This is the main method for integration - given a local file path,
        it finds the asset in Immich and returns any detected faces.

        Args:
            local_path: Local file path (e.g., /opt/immich/md/...)

        Returns:
            Dict with:
            - found: bool - whether file exists in Immich
            - error: present only when found is False
            - faces: list of detected faces with person names
            - named_faces: subset of faces with a non-empty person name
            - face_count / named_count: sizes of the two lists
            - asset_id: Immich asset ID if found
        """
        if not self.is_configured:
            return {
                'found': False,
                'error': 'Immich not configured',
                'faces': []
            }

        # Find the asset
        asset = self.get_asset_by_path(local_path)
        if not asset:
            return {
                'found': False,
                'error': 'File not found in Immich',
                'faces': []
            }

        asset_id = asset.get('id')

        # Get faces for the asset
        faces_data = self.get_faces_for_asset(asset_id)

        # Process faces into a more usable format
        faces = []
        for face in faces_data:
            # 'person' may be present but null for faces not yet assigned to
            # anyone, so a .get() default alone is not enough.
            person = face.get('person') or {}
            faces.append({
                'face_id': face.get('id'),
                'person_id': person.get('id'),
                'person_name': person.get('name', ''),
                'bounding_box': {
                    'x1': face.get('boundingBoxX1'),
                    'y1': face.get('boundingBoxY1'),
                    'x2': face.get('boundingBoxX2'),
                    'y2': face.get('boundingBoxY2')
                },
                'image_width': face.get('imageWidth'),
                'image_height': face.get('imageHeight')
            })

        # Filter to only named faces
        named_faces = [f for f in faces if f['person_name']]

        return {
            'found': True,
            'asset_id': asset_id,
            'faces': faces,
            'named_faces': named_faces,
            'face_count': len(faces),
            'named_count': len(named_faces)
        }

    def get_person_by_name(self, name: str) -> Optional[Dict[str, Any]]:
        """
        Find a person in Immich by name (case-insensitive exact match).

        Args:
            name: Person name to search for

        Returns:
            Person dict or None
        """
        people = self.get_all_people()
        for person in people:
            # A null name must not break the comparison.
            if (person.get('name') or '').lower() == name.lower():
                return person
        return None

    def get_person_assets(self, person_id: str, limit: int = 1000) -> List[Dict[str, Any]]:
        """
        Get all assets containing a specific person using search API.

        Args:
            person_id: Immich person ID
            limit: Maximum number of assets to return

        Returns:
            List of assets; empty on a non-200 response or error
        """
        if not self.is_configured:
            return []

        try:
            client = self._get_client()
            # Use the search/metadata endpoint with personIds filter
            response = client.post('/search/metadata', json={
                'personIds': [person_id],
                'size': limit
            })

            if response.status_code == 200:
                data = response.json()
                return data.get('assets', {}).get('items', [])
            else:
                logger.warning(f"Failed to get assets for person {person_id}: {response.status_code}")
                return []
        except Exception as e:
            logger.error(f"Error getting person assets: {e}")
            return []

    def get_statistics(self) -> Dict[str, Any]:
        """
        Get Immich face recognition statistics.

        Returns:
            Dict with total people, named people, unnamed people and the
            top 20 named people ordered by their 'faces' value.
        """
        people = self.get_all_people()
        named = [p for p in people if p.get('name')]

        return {
            'total_people': len(people),
            'named_people': len(named),
            'unnamed_people': len(people) - len(named),
            'people_by_face_count': sorted(
                [{'name': p.get('name', 'Unnamed'), 'count': p.get('faces', 0)}
                 for p in named],
                key=lambda x: x['count'],
                reverse=True
            )[:20]
        }

    def close(self):
        """Close HTTP client."""
        if self._client:
            self._client.close()
            self._client = None
|
||||
|
||||
|
||||
# Singleton instance
_immich_integration = None


def get_immich_integration(api_url: str = None, api_key: str = None) -> ImmichFaceIntegration:
    """
    Get or create the Immich face integration instance.

    The shared instance is replaced (and its HTTP client closed) when an
    explicit override differs from what the current instance uses. An
    override that is omitted keeps the current instance's value, so
    changing only the key does not reset the URL and vice versa.

    Args:
        api_url: Optional API URL override
        api_key: Optional API key override

    Returns:
        ImmichFaceIntegration instance
    """
    global _immich_integration

    current = _immich_integration
    if current is None:
        _immich_integration = ImmichFaceIntegration(api_url, api_key)
        return _immich_integration

    key_changed = bool(api_key) and api_key != current.api_key
    # The instance stores its URL without a trailing slash; compare alike.
    url_changed = bool(api_url) and api_url.rstrip('/') != current.api_url

    if key_changed or url_changed:
        # Recreate if the API key or URL changed
        current.close()
        _immich_integration = ImmichFaceIntegration(
            api_url or current.api_url,
            api_key or current.api_key,
        )

    return _immich_integration
|
||||
2020
modules/instagram_client_module.py
Normal file
2020
modules/instagram_client_module.py
Normal file
File diff suppressed because it is too large
Load Diff
868
modules/instagram_perceptual_duplicate_detector.py
Normal file
868
modules/instagram_perceptual_duplicate_detector.py
Normal file
@@ -0,0 +1,868 @@
|
||||
"""
|
||||
Instagram Perceptual Duplicate Detector
|
||||
|
||||
Detects visually similar Instagram content (even with text overlays, stickers, etc.)
|
||||
and keeps the cleanest + highest quality version.
|
||||
|
||||
Priority: Clean (no overlays) > Quality (resolution/size)
|
||||
"""
|
||||
|
||||
import os
|
||||
import gc
|
||||
import json
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Tuple, List, TYPE_CHECKING
|
||||
from datetime import datetime
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
import cv2
|
||||
import numpy as np
|
||||
OPENCV_AVAILABLE = True
|
||||
except ImportError:
|
||||
OPENCV_AVAILABLE = False
|
||||
np = None # Define np as None when not available
|
||||
|
||||
try:
|
||||
import imagehash
|
||||
from PIL import Image
|
||||
IMAGEHASH_AVAILABLE = True
|
||||
except ImportError:
|
||||
IMAGEHASH_AVAILABLE = False
|
||||
|
||||
# OCR disabled — not currently needed
|
||||
EASYOCR_AVAILABLE = False
|
||||
TESSERACT_AVAILABLE = False
|
||||
|
||||
|
||||
class InstagramPerceptualDuplicateDetector:
|
||||
"""
|
||||
Detects perceptual duplicates in Instagram content and keeps cleanest + best quality
|
||||
"""
|
||||
|
||||
def __init__(self, unified_db, log_callback=None):
    """
    Set up the detector and report which optional dependencies are missing.

    Args:
        unified_db: UnifiedDatabase instance
        log_callback: Optional legacy callback (deprecated, uses universal logger);
            it is accepted but not used here
    """
    self.db = unified_db
    self.logger = get_logger('Perceptual_Duplicate_Detector')
    self.easyocr_reader = None

    # Build the EasyOCR reader only when the module-level flag enables it
    if EASYOCR_AVAILABLE:
        try:
            # Suppress PyTorch pin_memory warning (we're using CPU anyway)
            import warnings
            warnings.filterwarnings('ignore', category=UserWarning, module='torch.utils.data.dataloader')

            self.easyocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
            self.logger.debug("EasyOCR initialized for text overlay detection", module="Perceptual")
        except Exception as exc:
            self.logger.warning(f"Failed to initialize EasyOCR: {exc}, will use Tesseract fallback", module="Perceptual")
            self.easyocr_reader = None

    # Check dependencies: (is it available?, how to report, what to say)
    any_ocr = EASYOCR_AVAILABLE or TESSERACT_AVAILABLE
    dependency_notices = (
        (OPENCV_AVAILABLE, self.logger.warning,
         "OpenCV not available - perceptual duplicate detection disabled"),
        (IMAGEHASH_AVAILABLE, self.logger.warning,
         "imagehash not available - perceptual duplicate detection disabled"),
        (any_ocr, self.logger.debug,
         "No OCR available (EasyOCR or pytesseract) - text overlay detection disabled"),
    )
    for present, report, message in dependency_notices:
        if not present:
            report(message, module="Perceptual")

    # Hashing needs both OpenCV and imagehash; OCR is optional
    self.dependencies_available = OPENCV_AVAILABLE and IMAGEHASH_AVAILABLE
|
||||
|
||||
def check_and_handle_duplicate(self, file_path: str, platform: str, source: str, content_type: str = None) -> Optional[str]:
    """
    Decide whether a new file is a perceptual duplicate and act on it.

    When the required dependencies are available and a hash can be
    computed, the hash and scores are calculated even if detection is
    switched off, so the hash table keeps growing for later use. Duplicate
    handling (recycling the worse file) only happens when detection is
    enabled and ``platform`` is in the configured platform list.

    Args:
        file_path: Path of the newly downloaded file
        platform: Platform name, compared case-insensitively with the
            'platforms' setting
        source: Source/account the file came from
        content_type: Optional content type stored with the hash

    Returns:
        - None if not checked (dependencies missing, hash failed, detection
          or platform disabled) or if an exception occurred
        - "skip" if this file should be skipped (an existing file is better
          or of the same quality); the new file is moved to the recycle bin
        - file_path if this file should be kept (no duplicate, or it
          replaced a worse existing file)
    """
    filename = Path(file_path).name

    self.logger.debug("[PERCEPTUAL] ENTRY: check_and_handle_duplicate called", module="Perceptual")
    self.logger.debug(f"[PERCEPTUAL] File: {filename}", module="Perceptual")
    self.logger.debug(f"[PERCEPTUAL] Platform: {platform}", module="Perceptual")
    self.logger.debug(f"[PERCEPTUAL] Source: {source}", module="Perceptual")
    self.logger.debug(f"[PERCEPTUAL] Content Type: {content_type}", module="Perceptual")

    if not self.dependencies_available:
        self.logger.warning("[PERCEPTUAL] SKIP: Dependencies not available (OpenCV/ImageHash missing)", module="Perceptual")
        return None

    # Get settings
    settings = self._get_settings()
    detection_enabled = settings.get('enabled', False)

    self.logger.debug("[PERCEPTUAL] Settings loaded:", module="Perceptual")
    self.logger.debug(f"[PERCEPTUAL] Enabled: {detection_enabled}", module="Perceptual")
    self.logger.debug(f"[PERCEPTUAL] Platforms: {settings.get('platforms', [])}", module="Perceptual")
    self.logger.debug(f"[PERCEPTUAL] Threshold: {settings.get('perceptual_hash_threshold', 12)}", module="Perceptual")

    try:
        # The hash and scores are computed regardless of the 'enabled'
        # switch so that history is available once detection is turned on.
        self.logger.debug(f"[PERCEPTUAL] Calculating perceptual hash for {filename}...", module="Perceptual")
        phash = self._calculate_perceptual_hash(file_path)
        if not phash:
            self.logger.error(f"[PERCEPTUAL] FAILED: Could not calculate perceptual hash for {filename}", module="Perceptual")
            return None

        self.logger.debug(f"[PERCEPTUAL] Hash calculated: {phash[:32]}...", module="Perceptual")

        if settings.get('text_detection_enabled', True):
            text_count, text_chars = self._detect_text_overlays(file_path)
        else:
            text_count, text_chars = (0, 0)
        quality_metrics = self._get_quality_metrics(file_path)

        clean_score = self._calculate_clean_score(text_count, text_chars)
        quality_score = self._calculate_quality_score(quality_metrics)

        self.logger.debug("[PERCEPTUAL] Scores calculated:", module="Perceptual")
        self.logger.debug(f"[PERCEPTUAL] Clean Score: {clean_score:.2f}", module="Perceptual")
        self.logger.debug(f"[PERCEPTUAL] Quality Score: {quality_score:.2f}", module="Perceptual")
        self.logger.debug(f"[PERCEPTUAL] Text Overlays: {text_count} ({text_chars} chars)", module="Perceptual")

        # Positional arguments shared by every "store this file's hash" call
        hash_record = (
            file_path, platform, source, content_type,
            phash, text_count, text_chars, quality_score, clean_score, quality_metrics,
        )

        # Detection switched off: remember the hash, let the file through
        if not detection_enabled:
            self.logger.debug(f"[PERCEPTUAL] SKIP: Detection disabled - storing hash only for {filename}", module="Perceptual")
            self._store_perceptual_hash(*hash_record)
            return None

        # Is this platform covered by detection?
        wanted_platform = platform.lower()
        enabled_platforms = [name.lower() for name in settings.get('platforms', ['instagram'])]
        platform_enabled = wanted_platform in enabled_platforms
        self.logger.debug(f"[PERCEPTUAL] Platform check: {platform} enabled = {platform_enabled}", module="Perceptual")

        if not platform_enabled:
            self.logger.debug(f"[PERCEPTUAL] SKIP: Platform '{platform}' not in enabled list - storing hash only", module="Perceptual")
            self._store_perceptual_hash(*hash_record)
            return None

        # From here on duplicate checking is active
        self.logger.debug(f"[PERCEPTUAL] CHECKING FOR DUPLICATES: {filename}", module="Perceptual")
        self.logger.debug(f"[PERCEPTUAL] Platform: {platform}, Source: {source}", module="Perceptual")

        self.logger.log(
            f"[PERCEPTUAL] New file: {filename} | "
            f"Hash: {phash[:16]}... | Clean: {clean_score:.2f} | Quality: {quality_score:.2f}",
            "info"
        )

        # Look up perceptually similar files in the database
        threshold = settings.get('perceptual_hash_threshold', 12)
        self.logger.debug(f"[PERCEPTUAL] Searching for similar files (threshold: {threshold})...", module="Perceptual")

        similar_files = self._find_similar_files(phash, platform, source, threshold)

        self.logger.debug(f"[PERCEPTUAL] Similar files found: {len(similar_files)}", module="Perceptual")

        if not similar_files:
            # Nothing similar on record: store the hash and keep the file
            self.logger.debug(f"[PERCEPTUAL] NO DUPLICATES FOUND - keeping {filename}", module="Perceptual")
            self._store_perceptual_hash(*hash_record)
            return file_path

        for rank, candidate in enumerate(similar_files, 1):
            self.logger.debug(
                f"[PERCEPTUAL] #{rank}: {candidate['filename']} | "
                f"Distance: {candidate['hamming_distance']} | "
                f"Clean: {candidate['clean_score']:.2f} | "
                f"Quality: {candidate['quality_score']:.2f}",
                module="Perceptual"
            )

        # At least one similar file exists: pick the strongest and compare
        incumbent = self._get_best_existing_file(similar_files)

        self.logger.debug("[PERCEPTUAL] DUPLICATE DETECTED!", module="Perceptual")
        self.logger.debug(f"[PERCEPTUAL] Best existing file: {incumbent['filename']}", module="Perceptual")
        self.logger.debug(f"[PERCEPTUAL] Clean: {incumbent['clean_score']:.2f}", module="Perceptual")
        self.logger.debug(f"[PERCEPTUAL] Quality: {incumbent['quality_score']:.2f}", module="Perceptual")
        self.logger.debug(f"[PERCEPTUAL] Path: {incumbent['file_path']}", module="Perceptual")

        self.logger.debug("[PERCEPTUAL] Comparing new vs existing...", module="Perceptual")
        verdict = self._compare_files(
            new_clean=clean_score,
            new_quality=quality_score,
            existing_clean=incumbent['clean_score'],
            existing_quality=incumbent['quality_score'],
            settings=settings
        )

        self.logger.debug(f"[PERCEPTUAL] Comparison result: {verdict}", module="Perceptual")

        if verdict == "new_better":
            # The newcomer wins: recycle the incumbent and take over its row
            self.logger.info(
                f"[PERCEPTUAL] Replacing {incumbent['filename']} with cleaner version: {filename}",
                module="Perceptual"
            )

            self._move_to_recycle(
                incumbent['file_path'],
                reason='replaced_with_cleaner_duplicate',
                new_file=file_path
            )

            self._replace_perceptual_hash_entry(
                old_id=incumbent['id'],
                new_file_path=file_path,
                new_phash=phash,
                new_text_count=text_count,
                new_text_chars=text_chars,
                new_quality_score=quality_score,
                new_clean_score=clean_score,
                new_quality_metrics=quality_metrics
            )

            return file_path

        # Any other verdict keeps the incumbent; only the wording and the
        # recycle reason differ between "existing is better" and a tie.
        if verdict == "existing_better":
            skip_note = f"[PERCEPTUAL] Skipping {filename} (duplicate of {incumbent['filename']})"
            skip_reason = 'duplicate_lower_quality_or_has_overlays'
        else:
            skip_note = f"[PERCEPTUAL] Skipping {filename} (same quality as {incumbent['filename']})"
            skip_reason = 'duplicate_same_quality'

        self.logger.info(skip_note, module="Perceptual")
        self._move_to_recycle(
            file_path,
            reason=skip_reason,
            kept_file=incumbent['file_path']
        )
        return "skip"

    except Exception as exc:
        self.logger.error(f"[PERCEPTUAL] EXCEPTION: {exc}", module="Perceptual")
        import traceback
        self.logger.error(f"[PERCEPTUAL] Traceback:\n{traceback.format_exc()}", module="Perceptual")
        return None
|
||||
|
||||
def _get_settings(self) -> dict:
|
||||
"""Get Instagram perceptual duplicate settings from database"""
|
||||
try:
|
||||
with self.db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT value FROM settings WHERE key = 'instagram_perceptual_duplicates'")
|
||||
result = cursor.fetchone()
|
||||
if result:
|
||||
return json.loads(result[0])
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Failed to get perceptual duplicate settings: {e}", module="Perceptual")
|
||||
|
||||
return {'enabled': False}
|
||||
|
||||
def _calculate_perceptual_hash(self, file_path: str) -> Optional[str]:
    """Compute a difference hash (dHash, hash_size=16) for an image or video.

    Files ending in .mp4/.mov/.avi/.mkv (case-insensitive) are hashed from
    the frame returned by ``_extract_video_frame``; everything else is
    opened directly as an image.

    Args:
        file_path: Path of the media file.

    Returns:
        The hash as a string, or None when imagehash is unavailable, no
        video frame could be extracted, or any error occurred.
    """
    if not IMAGEHASH_AVAILABLE:
        return None

    video_suffixes = ('.mp4', '.mov', '.avi', '.mkv')
    frame = frame_rgb = pil_image = None

    try:
        if file_path.lower().endswith(video_suffixes):
            # Videos: hash a single representative frame
            frame = self._extract_video_frame(file_path)
            if frame is None:
                return None

            # OpenCV delivers BGR; PIL expects RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame_rgb)
        else:
            # Images: open the file as-is
            pil_image = Image.open(file_path)

        return str(imagehash.dhash(pil_image, hash_size=16))

    except Exception as exc:
        self.logger.debug(f"Failed to calculate perceptual hash: {exc}", module="Perceptual")
        return None
    finally:
        # Release the image and frame buffers, then collect garbage
        if pil_image is not None:
            pil_image.close()
        del pil_image, frame_rgb, frame
        gc.collect()
|
||||
|
||||
def _extract_video_frame(self, video_path: str, position: float = 0.5) -> Optional['np.ndarray']:
    """Extract a frame from video at given position (0.0 to 1.0).

    Args:
        video_path: Path of the video file.
        position: Relative position in the video; it is multiplied by the
            reported frame count and clamped to a valid frame index, so
            1.0 yields the last frame instead of seeking past the end.

    Returns:
        The decoded frame as returned by OpenCV, or None when OpenCV is
        unavailable, the file cannot be opened, no frame could be read,
        or an error occurred.
    """
    if not OPENCV_AVAILABLE:
        return None

    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return None

        # Translate the relative position into a frame index
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        last_index = max(total_frames - 1, 0)
        target_frame = min(max(int(total_frames * position), 0), last_index)
        cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame)

        ret, frame = cap.read()
        return frame if ret else None

    except Exception as e:
        self.logger.debug(f"Failed to extract video frame: {e}", module="Perceptual")
        return None
    finally:
        # Release the capture on every path (not opened, read error, success)
        if cap is not None:
            cap.release()
|
||||
|
||||
def _detect_text_overlays(self, file_path: str) -> Tuple[int, int]:
    """
    Detect text overlays in image/video using EasyOCR (primary) or Tesseract (fallback)

    Videos are analysed through the single frame returned by
    ``_extract_video_frame``; other files are loaded with ``cv2.imread``.

    Args:
        file_path: Path of the image or video file to inspect.

    Returns:
        (text_region_count, total_text_characters). (0, 0) is returned when
        no OCR engine or OpenCV is available, the media cannot be loaded,
        or detection fails unexpectedly.
    """
    if not self.easyocr_reader and not TESSERACT_AVAILABLE:
        return (0, 0)

    if not OPENCV_AVAILABLE:
        return (0, 0)

    image = None
    gray = None

    try:
        text_regions = 0
        total_chars = 0

        # Load image or extract video frame
        if file_path.lower().endswith(('.mp4', '.mov', '.avi', '.mkv')):
            image = self._extract_video_frame(file_path)
        else:
            image = cv2.imread(file_path)
        if image is None:
            return (0, 0)

        # Try EasyOCR first (better for Instagram overlays)
        if self.easyocr_reader:
            try:
                # EasyOCR works directly with image arrays
                results = self.easyocr_reader.readtext(image)

                # EasyOCR returns list of (bbox, text, confidence)
                for bbox, text, conf in results:
                    if conf > 0.5:  # Only use detections with >50% confidence
                        text_stripped = text.strip()
                        if text_stripped:
                            text_regions += 1
                            total_chars += len(text_stripped)

                if text_regions > 0:
                    self.logger.log(
                        f"[OVERLAY] EasyOCR detected {text_regions} text regions, {total_chars} chars in {Path(file_path).name}",
                        "debug"
                    )
                    return (text_regions, total_chars)

            except Exception as e:
                self.logger.debug(f"EasyOCR failed: {e}, falling back to Tesseract", module="Perceptual")

        # Fallback to Tesseract if EasyOCR didn't find anything or failed
        if TESSERACT_AVAILABLE:
            try:
                # Convert to grayscale for Tesseract
                gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

                # Run OCR with detailed data
                ocr_data = pytesseract.image_to_data(
                    gray,
                    output_type=pytesseract.Output.DICT,
                    config='--psm 11'  # Sparse text mode
                )

                # Count text regions and characters
                text_regions = 0
                total_chars = 0
                confidence_threshold = 30

                for i, conf in enumerate(ocr_data['conf']):
                    # Confidence may arrive as int, float or a string such as
                    # '-1' or '96.5'; float() accepts all of them, whereas
                    # int('96.5') raises and would abort the whole pass.
                    if float(conf) > confidence_threshold:
                        text = ocr_data['text'][i].strip()
                        if text:
                            text_regions += 1
                            total_chars += len(text)

                self.logger.log(
                    f"[OVERLAY] Tesseract (fallback) detected {text_regions} text regions, {total_chars} chars in {Path(file_path).name}",
                    "debug"
                )

            except Exception as e:
                self.logger.debug(f"Tesseract OCR failed: {e}", module="Perceptual")

        return (text_regions, total_chars)

    except Exception as e:
        self.logger.debug(f"Text overlay detection failed: {e}", module="Perceptual")
        return (0, 0)
    finally:
        # Clean up memory - these are large numpy arrays
        if gray is not None:
            del gray
        if image is not None:
            del image
        gc.collect()
|
||||
|
||||
def _get_quality_metrics(self, file_path: str) -> dict:
    """Collect file size and ffprobe-reported metrics for a media file.

    Returns:
        Dict with keys 'resolution' (width * height), 'width', 'height',
        'file_size' (bytes) and 'bitrate' (format bit rate divided by 1000).
        Values that could not be determined stay 0; failures are logged at
        debug level and never raised.
    """
    import subprocess

    metrics = {'resolution': 0, 'width': 0, 'height': 0, 'file_size': 0, 'bitrate': 0}

    try:
        metrics['file_size'] = Path(file_path).stat().st_size

        # Ask ffprobe for container and stream metadata as JSON
        probe = subprocess.run(
            [
                'ffprobe',
                '-v', 'quiet',
                '-print_format', 'json',
                '-show_format',
                '-show_streams',
                file_path,
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if probe.returncode != 0:
            return metrics

        info = json.loads(probe.stdout)

        # Only the first stream tagged as video is considered
        for stream in info.get('streams', []):
            if stream.get('codec_type') == 'video':
                metrics['width'] = int(stream.get('width', 0))
                metrics['height'] = int(stream.get('height', 0))
                metrics['resolution'] = metrics['width'] * metrics['height']
                break

        container = info.get('format', {})
        if 'bit_rate' in container:
            metrics['bitrate'] = int(container['bit_rate']) // 1000

    except Exception as exc:
        self.logger.debug(f"Failed to get quality metrics: {exc}", module="Perceptual")

    return metrics
|
||||
|
||||
def _calculate_clean_score(self, text_count: int, text_chars: int) -> float:
|
||||
"""
|
||||
Calculate cleanliness score (0-100)
|
||||
|
||||
Higher score = cleaner (less text/overlays)
|
||||
"""
|
||||
# Base score starts at 100 (perfectly clean)
|
||||
score = 100.0
|
||||
|
||||
# Penalize for text regions (each region -10 points, max -50)
|
||||
text_penalty = min(text_count * 10, 50)
|
||||
score -= text_penalty
|
||||
|
||||
# Penalize for character count (each 10 chars -5 points, max -40)
|
||||
char_penalty = min((text_chars // 10) * 5, 40)
|
||||
score -= char_penalty
|
||||
|
||||
# Ensure score is between 0-100
|
||||
return max(0.0, min(100.0, score))
|
||||
|
||||
def _calculate_quality_score(self, metrics: dict) -> float:
|
||||
"""
|
||||
Calculate quality score (0-100)
|
||||
|
||||
Based on resolution and file size
|
||||
"""
|
||||
score = 0.0
|
||||
|
||||
# Resolution score (0-60 points)
|
||||
# 1080p = 2,073,600 pixels = 60 points
|
||||
# 720p = 921,600 pixels = 27 points
|
||||
resolution = metrics.get('resolution', 0)
|
||||
if resolution > 0:
|
||||
resolution_score = min((resolution / 2_073_600) * 60, 60)
|
||||
score += resolution_score
|
||||
|
||||
# File size score (0-40 points)
|
||||
# 10MB = 40 points
|
||||
# 5MB = 20 points
|
||||
file_size = metrics.get('file_size', 0)
|
||||
if file_size > 0:
|
||||
size_mb = file_size / (1024 * 1024)
|
||||
size_score = min((size_mb / 10) * 40, 40)
|
||||
score += size_score
|
||||
|
||||
return min(100.0, score)
|
||||
|
||||
def _find_similar_files(self, phash: str, platform: str, source: str, threshold: int) -> List[dict]:
    """Return stored files whose perceptual hash is within ``threshold`` of ``phash``

    Every row of ``instagram_perceptual_hashes`` with platform 'instagram'
    is compared, regardless of the ``platform``/``source`` arguments
    (``source`` is only logged). Rows whose file no longer exists on disk
    are counted but not returned.

    Returns:
        List of dicts holding the selected columns plus 'hamming_distance';
        an empty list if anything fails.
    """
    columns = (
        'id', 'file_path', 'filename', 'perceptual_hash',
        'text_overlay_count', 'text_overlay_chars',
        'quality_score', 'clean_score', 'resolution', 'file_size',
    )

    try:
        with self.db.get_connection() as conn:
            cursor = conn.cursor()

            self.logger.debug("[PERCEPTUAL_SEARCH] Querying database:", module="Perceptual")
            self.logger.debug("[PERCEPTUAL_SEARCH] Platform: instagram", module="Perceptual")
            self.logger.debug(f"[PERCEPTUAL_SEARCH] Source: {source}", module="Perceptual")
            self.logger.debug(f"[PERCEPTUAL_SEARCH] Threshold: {threshold}", module="Perceptual")

            # All Instagram content is searched so reposts from other
            # accounts are caught as well.
            cursor.execute("""
                SELECT id, file_path, filename, perceptual_hash,
                text_overlay_count, text_overlay_chars,
                quality_score, clean_score, resolution, file_size
                FROM instagram_perceptual_hashes
                WHERE platform = 'instagram'
            """)

            candidates = cursor.fetchall()
            self.logger.debug(f"[PERCEPTUAL_SEARCH] Database returned {len(candidates)} existing files (checking across all sources)", module="Perceptual")

            matches = []
            missing_files = 0

            for record in candidates:
                stored_name = record[2]
                distance = self._hamming_distance(phash, record[3])
                if distance > threshold:
                    continue

                # Within threshold: only keep it if the file is still on disk
                if not Path(record[1]).exists():
                    missing_files += 1
                    self.logger.debug(
                        f"[PERCEPTUAL_SEARCH] MATCH but file missing: {stored_name} (distance: {distance})",
                        module="Perceptual"
                    )
                    continue

                self.logger.debug(
                    f"[PERCEPTUAL_SEARCH] MATCH: {stored_name} (distance: {distance})",
                    module="Perceptual"
                )
                entry = {name: record[idx] for idx, name in enumerate(columns)}
                entry['hamming_distance'] = distance
                matches.append(entry)

            checked_count = len(candidates)
            within_threshold = len(matches) + missing_files
            self.logger.debug(
                f"[PERCEPTUAL_SEARCH] Checked {checked_count} hashes, "
                f"{within_threshold} within threshold, "
                f"{missing_files} missing files, "
                f"{len(matches)} valid matches",
                module="Perceptual"
            )

            return matches

    except Exception as exc:
        self.logger.error(f"Failed to find similar files: {exc}", module="Perceptual")
        return []
|
||||
|
||||
def _hamming_distance(self, hash1: str, hash2: str) -> int:
|
||||
"""Calculate Hamming distance between two hashes"""
|
||||
if len(hash1) != len(hash2):
|
||||
return 999 # Invalid comparison
|
||||
|
||||
return sum(c1 != c2 for c1, c2 in zip(hash1, hash2))
|
||||
|
||||
def _get_best_existing_file(self, similar_files: List[dict]) -> dict:
|
||||
"""Get the best existing file from similar files (highest clean + quality score)"""
|
||||
if not similar_files:
|
||||
return None
|
||||
|
||||
# Sort by clean score (primary), then quality score (secondary)
|
||||
sorted_files = sorted(
|
||||
similar_files,
|
||||
key=lambda f: (f['clean_score'], f['quality_score']),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
return sorted_files[0]
|
||||
|
||||
def _compare_files(self, new_clean: float, new_quality: float,
|
||||
existing_clean: float, existing_quality: float,
|
||||
settings: dict) -> str:
|
||||
"""
|
||||
Compare new file vs existing file
|
||||
|
||||
Returns: "new_better", "existing_better", or "same"
|
||||
"""
|
||||
clean_weight = settings.get('clean_score_weight', 3)
|
||||
quality_weight = settings.get('quality_score_weight', 1)
|
||||
min_difference = settings.get('min_text_difference', 5)
|
||||
|
||||
# IMPORTANT: Check for extreme quality differences first
|
||||
# If one file has significantly higher quality, prefer it unless clean score is terrible
|
||||
# This prevents low-resolution files from winning just because they have less detected text
|
||||
min_acceptable_clean = settings.get('min_acceptable_clean', 30)
|
||||
quality_ratio_threshold = settings.get('quality_ratio_threshold', 2.0)
|
||||
|
||||
# Check if new file has dramatically better quality
|
||||
if new_quality > 0 and existing_quality > 0:
|
||||
quality_ratio = new_quality / existing_quality
|
||||
reverse_ratio = existing_quality / new_quality
|
||||
|
||||
# New file has 2x+ better quality and acceptable clean score
|
||||
if quality_ratio >= quality_ratio_threshold and new_clean >= min_acceptable_clean:
|
||||
self.logger.debug(
|
||||
f"[PERCEPTUAL] New file wins: {quality_ratio:.1f}x better quality "
|
||||
f"(new: Q={new_quality:.1f}/C={new_clean:.1f}, existing: Q={existing_quality:.1f}/C={existing_clean:.1f})",
|
||||
module="Perceptual"
|
||||
)
|
||||
return "new_better"
|
||||
|
||||
# Existing file has 2x+ better quality and acceptable clean score
|
||||
if reverse_ratio >= quality_ratio_threshold and existing_clean >= min_acceptable_clean:
|
||||
self.logger.debug(
|
||||
f"[PERCEPTUAL] Existing file wins: {reverse_ratio:.1f}x better quality "
|
||||
f"(existing: Q={existing_quality:.1f}/C={existing_clean:.1f}, new: Q={new_quality:.1f}/C={new_clean:.1f})",
|
||||
module="Perceptual"
|
||||
)
|
||||
return "existing_better"
|
||||
|
||||
# Standard weighted comparison for cases without extreme quality differences
|
||||
new_score = (new_clean * clean_weight) + (new_quality * quality_weight)
|
||||
existing_score = (existing_clean * clean_weight) + (existing_quality * quality_weight)
|
||||
|
||||
# Check if difference is significant
|
||||
score_diff = abs(new_score - existing_score)
|
||||
min_score_diff = min_difference * clean_weight # Scale by weight
|
||||
|
||||
if new_score > existing_score and score_diff >= min_score_diff:
|
||||
return "new_better"
|
||||
elif existing_score > new_score and score_diff >= min_score_diff:
|
||||
return "existing_better"
|
||||
else:
|
||||
return "same"
|
||||
|
||||
def _store_perceptual_hash(self, file_path: str, platform: str, source: str, content_type: str,
                           phash: str, text_count: int, text_chars: int,
                           quality_score: float, clean_score: float, quality_metrics: dict):
    """Insert or refresh the perceptual-hash record for ``file_path``

    If a row with the same file_path already exists it is updated in place
    (its created_at is reset to the current timestamp); otherwise a new row
    with a fresh UUID is inserted. Errors are logged and not raised.
    """
    try:
        with self.db.get_connection() as conn:
            cursor = conn.cursor()

            # Look for an existing record of this exact path
            cursor.execute("""
                SELECT id FROM instagram_perceptual_hashes
                WHERE file_path = ?
            """, (str(file_path),))
            existing = cursor.fetchone()

            # Column values shared by the UPDATE and INSERT statements,
            # in the column order both statements use.
            shared_values = (
                Path(file_path).name,
                platform,
                source,
                content_type or 'unknown',
                phash,
                text_count,
                text_chars,
                quality_score,
                clean_score,
                quality_metrics.get('resolution', 0),
                quality_metrics.get('file_size', 0),
                quality_metrics.get('width', 0),
                quality_metrics.get('height', 0),
            )

            if existing:
                cursor.execute("""
                    UPDATE instagram_perceptual_hashes
                    SET filename = ?,
                    platform = ?,
                    source = ?,
                    content_type = ?,
                    perceptual_hash = ?,
                    text_overlay_count = ?,
                    text_overlay_chars = ?,
                    quality_score = ?,
                    clean_score = ?,
                    resolution = ?,
                    file_size = ?,
                    width = ?,
                    height = ?,
                    created_at = CURRENT_TIMESTAMP
                    WHERE id = ?
                """, shared_values + (existing[0],))
                self.logger.debug(f"[PERCEPTUAL] Updated hash for {Path(file_path).name}", module="Perceptual")
            else:
                entry_id = str(uuid.uuid4())
                cursor.execute("""
                    INSERT INTO instagram_perceptual_hashes
                    (id, file_path, filename, platform, source, content_type,
                    perceptual_hash, text_overlay_count, text_overlay_chars,
                    quality_score, clean_score, resolution, file_size, width, height)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (entry_id, str(file_path)) + shared_values)
                self.logger.debug(f"[PERCEPTUAL] Stored hash for {Path(file_path).name}", module="Perceptual")

            conn.commit()

    except Exception as exc:
        self.logger.error(f"Failed to store perceptual hash: {exc}", module="Perceptual")
        # Note: Connection context manager handles rollback automatically on exception
|
||||
|
||||
def _replace_perceptual_hash_entry(self, old_id: str, new_file_path: str,
                                   new_phash: str, new_text_count: int, new_text_chars: int,
                                   new_quality_score: float, new_clean_score: float,
                                   new_quality_metrics: dict):
    """Point the hash record ``old_id`` at a new file and overwrite its metrics

    The row keeps its id; path, filename, hash, overlay counts, scores and
    dimensions are replaced and created_at is reset to the current
    timestamp. Errors are logged and not raised.
    """
    statement = """
        UPDATE instagram_perceptual_hashes
        SET file_path = ?,
        filename = ?,
        perceptual_hash = ?,
        text_overlay_count = ?,
        text_overlay_chars = ?,
        quality_score = ?,
        clean_score = ?,
        resolution = ?,
        file_size = ?,
        width = ?,
        height = ?,
        created_at = CURRENT_TIMESTAMP
        WHERE id = ?
    """

    try:
        with self.db.get_connection() as conn:
            values = (
                str(new_file_path),
                Path(new_file_path).name,
                new_phash,
                new_text_count,
                new_text_chars,
                new_quality_score,
                new_clean_score,
                new_quality_metrics.get('resolution', 0),
                new_quality_metrics.get('file_size', 0),
                new_quality_metrics.get('width', 0),
                new_quality_metrics.get('height', 0),
                old_id,
            )
            conn.cursor().execute(statement, values)
            conn.commit()

    except Exception as exc:
        self.logger.error(f"Failed to replace perceptual hash entry: {exc}", module="Perceptual")
|
||||
|
||||
def _move_to_recycle(self, file_path: str, reason: str, **metadata):
    """Send a file to the recycle bin, deleting it outright if that fails

    Args:
        file_path: File to remove from its current location.
        reason: Stored under the 'reason' key of the recycle metadata.
        **metadata: Extra key/value pairs stored alongside the reason.
    """
    recycle_metadata = {'reason': reason}
    recycle_metadata.update(metadata)

    try:
        self.db.move_to_recycle_bin(
            file_path=file_path,
            deleted_from='instagram_perceptual_duplicate_detection',
            deleted_by='system',
            metadata=recycle_metadata
        )
        self.logger.debug(f"[PERCEPTUAL] Moved to recycle: {Path(file_path).name}", module="Perceptual")

    except Exception as exc:
        self.logger.warning(f"Failed to move file to recycle: {exc}", module="Perceptual")
        # Best-effort fallback: remove the file directly, ignoring any error
        try:
            Path(file_path).unlink()
        except Exception:
            pass
|
||||
163
modules/instagram_rate_limiter.py
Normal file
163
modules/instagram_rate_limiter.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""
|
||||
Shared Instagram API rate limiter.
|
||||
|
||||
Tracks authenticated API calls in a rolling 1-hour window and enforces
|
||||
a configurable max rate. Both the main scraper and paid content modules
|
||||
use this to avoid exceeding Instagram's rate threshold.
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from collections import deque
|
||||
|
||||
logger = logging.getLogger('media_downloader')
|
||||
|
||||
# File holding the unix timestamp until which Instagram API calls stay paused.
# Written by pause_until(), read by _load_pause_state(), removed by resume().
_PAUSE_FILE = '/opt/media-downloader/data/.ig_paused_until'
|
||||
|
||||
|
||||
class InstagramBlockedError(Exception):
    """Signals that Instagram API calls are paused because the account is restricted."""
|
||||
|
||||
|
||||
class _InstagramRateLimiter:
    """Rolling-window limiter for authenticated Instagram API calls.

    Call timestamps are kept in a deque and entries older than ``window``
    seconds are discarded. A separate pause timestamp acts as a kill
    switch: while it lies in the future, ``wait_if_needed`` raises
    ``InstagramBlockedError``. The pause timestamp is mirrored to
    ``_PAUSE_FILE`` so it survives restarts.
    """

    def __init__(self, max_calls_per_hour=180, window_seconds=3600):
        """
        Args:
            max_calls_per_hour: Maximum number of calls allowed per window.
            window_seconds: Length of the rolling window in seconds.
        """
        self.max_calls = max_calls_per_hour
        self.window = window_seconds
        self._timestamps = deque()
        self._lock = threading.Lock()
        self._operation_lock = threading.Lock()  # Cross-module mutex
        self._paused_until = 0  # Unix timestamp — block all calls until this time
        self._load_pause_state()

    def _load_pause_state(self):
        """Load pause state from disk (survives restarts).

        An expired pause file is deleted. Failures are logged at debug level
        only, because this runs before every call while not paused.
        """
        try:
            if os.path.exists(_PAUSE_FILE):
                with open(_PAUSE_FILE) as f:
                    ts = float(f.read().strip())
                if ts > time.time():
                    self._paused_until = ts
                    logger.warning(
                        f"[IG-RateLimit] Loaded pause state — blocked until "
                        f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))}"
                    )
                else:
                    # Expired, clean up
                    os.remove(_PAUSE_FILE)
        except Exception as e:
            logger.debug(f"[IG-RateLimit] Could not load pause state: {e}")

    def pause_until(self, timestamp: float):
        """Block all Instagram API calls until the given unix timestamp.

        The pause always takes effect in memory; if writing the pause file
        fails, a warning is logged because the pause will then not survive
        a restart.
        """
        self._paused_until = timestamp
        try:
            os.makedirs(os.path.dirname(_PAUSE_FILE), exist_ok=True)
            with open(_PAUSE_FILE, 'w') as f:
                f.write(str(timestamp))
        except Exception as e:
            logger.warning(f"[IG-RateLimit] Could not persist pause state: {e}")
        logger.warning(
            f"[IG-RateLimit] All Instagram API calls PAUSED until "
            f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))}"
        )

    def resume(self):
        """Resume Instagram API calls and remove the pause file if present."""
        self._paused_until = 0
        try:
            if os.path.exists(_PAUSE_FILE):
                os.remove(_PAUSE_FILE)
        except Exception as e:
            # A leftover file would re-apply the pause on the next load.
            logger.warning(f"[IG-RateLimit] Could not remove pause file: {e}")
        logger.info("[IG-RateLimit] Instagram API calls RESUMED")

    @property
    def is_paused(self):
        """True while a pause timestamp is set and still in the future."""
        return bool(self._paused_until and time.time() < self._paused_until)

    def track_call(self):
        """Record an API call timestamp."""
        with self._lock:
            self._timestamps.append(time.time())

    def _purge_expired(self, now):
        """Drop timestamps older than the window. Caller must hold ``_lock``."""
        cutoff = now - self.window
        while self._timestamps and self._timestamps[0] < cutoff:
            self._timestamps.popleft()
        return cutoff

    def wait_if_needed(self):
        """Block if approaching rate limit. Call before each authenticated API request.

        At or above the limit this sleeps until the oldest call in the
        window expires; from 80% of the limit it sleeps a progressive delay.
        The call is then recorded in the window.

        Raises InstagramBlockedError if calls are paused due to account restriction.
        """
        # Check kill switch first (outside lock — fast path)
        # Re-check file if not paused in memory (another process may have set it)
        if not self._paused_until:
            self._load_pause_state()
        if self._paused_until:
            now = time.time()
            if now < self._paused_until:
                remaining = self._paused_until - now
                hours = remaining / 3600
                raise InstagramBlockedError(
                    f"Instagram API paused — account restricted. "
                    f"Resuming in {hours:.1f}h"
                )
            else:
                # Restriction expired, auto-resume
                self.resume()

        # Decide the delay while holding the lock, then sleep without it so
        # other threads are not blocked for the duration of the wait.
        delay = 0
        with self._lock:
            cutoff = self._purge_expired(time.time())
            count = len(self._timestamps)

            if count >= self.max_calls:
                # At limit — wait until the oldest call in window expires
                delay = self._timestamps[0] - cutoff + 1
                logger.warning(
                    f"[IG-RateLimit] At limit ({count}/{self.max_calls} calls/hr) — "
                    f"waiting {delay:.0f}s"
                )
            elif count >= self.max_calls * 0.8:
                # Approaching limit (80%+) — add progressive delay
                ratio = count / self.max_calls
                delay = 2 + (ratio - 0.8) * 40  # 2s at 80%, 10s at 100%
                logger.info(
                    f"[IG-RateLimit] Approaching limit ({count}/{self.max_calls} calls/hr) — "
                    f"adding {delay:.1f}s delay"
                )

        if delay:
            time.sleep(delay)

        with self._lock:
            self._timestamps.append(time.time())

    @property
    def operation_lock(self):
        """Lock for serializing Instagram operations (main scraper vs paid content)."""
        return self._operation_lock

    @property
    def calls_in_window(self):
        """Number of recorded calls that still fall inside the rolling window."""
        with self._lock:
            self._purge_expired(time.time())
            return len(self._timestamps)
|
||||
|
||||
|
||||
# Module-level singleton
|
||||
# Shared limiter instance created at import time, capped at 180 calls per
# window (the window length uses the constructor default of 3600 seconds).
rate_limiter = _InstagramRateLimiter(max_calls_per_hour=180)
|
||||
782
modules/instagram_repost_detector.py
Normal file
782
modules/instagram_repost_detector.py
Normal file
@@ -0,0 +1,782 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Instagram Story Repost Detector Module
|
||||
|
||||
Detects when Instagram stories are reposts/screenshots of other users' content,
|
||||
then replaces low-quality reposts with high-quality originals from the source.
|
||||
|
||||
Features:
|
||||
- OCR-based repost detection (@username extraction)
|
||||
- ImgInn for downloading both stories and posts
|
||||
- Perceptual hash matching for content identification
|
||||
- Smart account filtering (monitored vs non-monitored)
|
||||
- Automatic cleanup of temporary downloads
|
||||
- Database tracking of all replacements
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
from modules.base_module import LoggingMixin
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
# Module-level logger for import-time messages
|
||||
_module_logger = get_logger('RepostDetector')
|
||||
|
||||
# Optional imports - fail gracefully if not available
|
||||
from PIL import Image # Always needed
|
||||
|
||||
# OCR disabled — not currently needed
|
||||
EASYOCR_AVAILABLE = False
|
||||
TESSERACT_AVAILABLE = False
|
||||
|
||||
try:
|
||||
import cv2
|
||||
import numpy as np
|
||||
CV2_AVAILABLE = True
|
||||
except ImportError:
|
||||
CV2_AVAILABLE = False
|
||||
_module_logger.warning("opencv-python not available - video processing disabled", module='RepostDetector')
|
||||
|
||||
try:
|
||||
import imagehash
|
||||
IMAGEHASH_AVAILABLE = True
|
||||
except ImportError:
|
||||
IMAGEHASH_AVAILABLE = False
|
||||
_module_logger.warning("imagehash not available - perceptual hashing disabled", module='RepostDetector')
|
||||
|
||||
|
||||
class InstagramRepostDetector(LoggingMixin):
|
||||
"""
|
||||
Detects and replaces Instagram story reposts with original content
|
||||
"""
|
||||
|
||||
def __init__(self, unified_db, log_callback=None):
    """
    Set up logging, state, the temp directory and (if enabled) EasyOCR

    Args:
        unified_db: UnifiedDatabase instance
        log_callback: Optional logging callback function(message, level)
    """
    self._init_logger('RepostDetector', log_callback, default_module='RepostDetector')

    # Plain state
    self.easyocr_reader = None
    self.last_original_username = None
    self.db = unified_db
    self.temp_download_path = Path("/tmp/repost_detection")

    # The temp directory must exist before anything is downloaded into it
    self.temp_download_path.mkdir(parents=True, exist_ok=True)

    # The EasyOCR reader is only built when the module-level flag enables it
    if EASYOCR_AVAILABLE:
        import warnings
        try:
            # Suppress PyTorch pin_memory warning (we're using CPU anyway)
            warnings.filterwarnings('ignore', category=UserWarning, module='torch.utils.data.dataloader')

            self.easyocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
            self.log("EasyOCR initialized for text detection", "info")
        except Exception as exc:
            self.log(f"Failed to initialize EasyOCR: {exc}", "warning")
            self.easyocr_reader = None

    self._check_dependencies()
|
||||
|
||||
def _check_dependencies(self):
    """Log a warning listing every optional dependency that is unavailable"""
    requirements = (
        (TESSERACT_AVAILABLE, "pytesseract/PIL (pip3 install pytesseract pillow)"),
        (CV2_AVAILABLE, "opencv-python (pip3 install opencv-python)"),
        (IMAGEHASH_AVAILABLE, "imagehash (pip3 install imagehash)"),
    )
    missing = [hint for available, hint in requirements if not available]

    if not missing:
        return

    self.log(f"Missing dependencies: {', '.join(missing)}", "warning")
    self.log("Repost detection will be disabled until dependencies are installed", "warning")
|
||||
|
||||
def check_and_replace_repost(self, file_path: str, source_username: str) -> Optional[str]:
    """
    Check if story is a repost and replace with original

    Steps: extract the original @username via OCR, download that user's
    content (unless already fetched today), find the matching original by
    perceptual hash and replace the repost with it. Downloaded content is
    kept under the original user's directory whether or not a match is found.

    Args:
        file_path: Path to potential repost file
        source_username: Username who posted this story (e.g., evalongoria)

    Returns:
        Path to replacement file if found, None otherwise
    """
    # Any OCR engine is sufficient here: _extract_username_from_repost uses
    # EasyOCR when a reader is loaded and Tesseract otherwise, so Tesseract
    # must not be required on its own.
    ocr_available = bool(self.easyocr_reader) or TESSERACT_AVAILABLE
    if not all([ocr_available, CV2_AVAILABLE, IMAGEHASH_AVAILABLE]):
        self.log("Cannot process - missing dependencies", "debug")
        return None

    file_path = Path(file_path)
    if not file_path.exists():
        self.log(f"File not found: {file_path}", "error")
        return None

    self.log(f"Checking for repost: {file_path.name}", "info")

    # Step 1: OCR to extract original @username
    original_username = self._extract_username_from_repost(str(file_path))
    if not original_username:
        self.log("No @username detected - not a repost", "debug")
        return None

    # Check if user is reposting their own content
    if original_username.lower() == source_username.lower():
        self.log(f"@{source_username} is reposting their own content - skipping", "debug")
        return None

    self.log(f"Detected repost from @{original_username} in @{source_username}'s story", "info")
    self.last_original_username = original_username

    # Step 2: Check if original user is monitored (only affects logging below)
    is_monitored = self._is_monitored_account(original_username)

    # Step 3: Always save repost sources permanently (for face recognition + quality)
    # Even non-monitored accounts get saved - they were discovered via reposts
    download_path = Path("/opt/immich/md/instagram") / original_username
    add_to_database = True

    if is_monitored:
        self.log(f"@{original_username} is monitored - checking existing content", "info")
    else:
        self.log(f"@{original_username} NOT monitored - but saving permanently (discovered via repost)", "info")

    # Step 4: Check if we already fetched this user's content today
    if not self._already_fetched_today(original_username):
        # Step 5: Download stories + recent posts
        self.log(f"Downloading content from @{original_username} via ImgInn...", "info")
        success = self._download_content_via_imginn(
            username=original_username,
            destination=download_path,
            add_to_database=add_to_database
        )

        if not success:
            self.log(f"Failed to download content from @{original_username}", "error")
            return None
    else:
        self.log(f"Content from @{original_username} already fetched today - using cache", "info")

    # Step 6: Find matching original via perceptual hash
    original_file = self._find_matching_original(
        repost_path=str(file_path),
        search_dir=download_path
    )

    if not original_file:
        self.log(f"No matching original found for {file_path.name}", "warning")
        # Keep all downloaded files - they'll be processed by move manager (face recognition, etc.)
        self.log(f"Keeping all downloaded content from @{original_username} for processing", "info")
        return None

    # Step 7: Replace repost with original
    replacement = self._replace_repost_with_original(
        repost_path=str(file_path),
        original_path=original_file
    )

    # All files are kept permanently - move manager will process them
    self.log(f"All content from @{original_username} saved to {download_path}", "info")

    return replacement
|
||||
|
||||
def _extract_username_region(self, img: Image.Image) -> Image.Image:
    """Crop the top 8% strip of the image (where the username appears) for OCR

    With OpenCV available the strip is converted to grayscale and enlarged
    4x with cubic interpolation; without it the strip is just cropped with
    PIL. If the OpenCV path fails, the unmodified input image is returned.
    """
    if not CV2_AVAILABLE:
        # PIL-only fallback: plain crop, no grayscale or scaling
        full_width, full_height = img.size
        return img.crop((0, 0, full_width, int(full_height * 0.08)))

    try:
        # PIL images are RGB; OpenCV works on BGR arrays
        bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

        strip_height = int(bgr.shape[0] * 0.08)
        header_strip = bgr[0:strip_height, :]

        grayscale = cv2.cvtColor(header_strip, cv2.COLOR_BGR2GRAY)

        # Username text is small, so enlarge it before OCR
        enlarged = cv2.resize(grayscale, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)

        return Image.fromarray(enlarged)
    except Exception as exc:
        self.log(f"Username region extraction failed: {exc}", "debug")
        return img
|
||||
|
||||
def _extract_username_from_repost(self, file_path: str) -> Optional[str]:
    """
    Extract the @username from a repost overlay using OCR.

    EasyOCR (``self.easyocr_reader``) is tried first. Tesseract is the
    fallback, run on the cropped username region, when EasyOCR is
    unavailable, raises, or yields no text. Images are OCR'd once; for
    videos the frames at 0%, 10% and 50% of the clip are OCR'd and their
    text is concatenated.

    Args:
        file_path: Path to the repost image or video. The extension is
            matched case-insensitively to pick the image or video path.

    Returns:
        The username without the leading "@", or None when no OCR engine is
        available, the format is unsupported, the video cannot be read, an
        error occurs, or no plausible username is found.
    """
    # Check if we have any OCR available
    if not self.easyocr_reader and not TESSERACT_AVAILABLE:
        self.log("No OCR engine available", "warning")
        return None

    image_exts = ('.jpg', '.jpeg', '.png', '.webp', '.heic')
    video_exts = ('.mp4', '.mov', '.avi', '.mkv', '.webm')
    lowered = file_path.lower()

    def tesseract_text(pil_img) -> str:
        """Run Tesseract on the username region; return "" if nothing usable."""
        region = self._extract_username_region(pil_img)
        for config in ['--psm 7', '--psm 11', '--psm 6']:
            try:
                ocr_result = pytesseract.image_to_string(region, config=config)
            except Exception:
                continue
            if ocr_result and len(ocr_result.strip()) > 2:
                return ocr_result
        return ""

    try:
        if lowered.endswith(image_exts):
            text = ""

            # Try EasyOCR first (best for Instagram stories)
            if self.easyocr_reader:
                try:
                    results = self.easyocr_reader.readtext(file_path)
                    # EasyOCR returns (bbox, text, confidence); keep >50% confidence
                    text = " ".join(found for _bbox, found, conf in results if conf > 0.5)
                    if text.strip():
                        self.log(f"EasyOCR detected text: {text[:100]}", "debug")
                except Exception as e:
                    self.log(f"EasyOCR failed: {e}, falling back to Tesseract", "debug")
                    text = ""

            # Fallback to Tesseract if EasyOCR didn't find anything
            if not text.strip() and TESSERACT_AVAILABLE:
                with Image.open(file_path) as img:
                    fallback = tesseract_text(img)
                if fallback:
                    text = fallback
                    self.log(f"Tesseract (fallback) text: {text[:100]}", "debug")

        elif lowered.endswith(video_exts):
            if not CV2_AVAILABLE:
                self.log("OpenCV not available - cannot process video", "warning")
                return None

            video = cv2.VideoCapture(file_path)
            try:
                if not video.isOpened():
                    self.log(f"Failed to open video: {file_path}", "warning")
                    return None

                frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
                if frame_count == 0:
                    self.log(f"Video has no frames: {file_path}", "warning")
                    return None

                # Frames at 0%, 10% and 50%; de-duplicated so very short clips
                # do not OCR the same frame repeatedly.
                frames_to_check = list(dict.fromkeys([
                    0,
                    max(0, int(frame_count * 0.1)),
                    max(0, int(frame_count * 0.5)),
                ]))

                text = ""
                for frame_num in frames_to_check:
                    video.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
                    ret, frame = video.read()
                    if not ret:
                        continue

                    # Try EasyOCR first
                    if self.easyocr_reader:
                        try:
                            results = self.easyocr_reader.readtext(frame)
                            for _bbox, frame_text, conf in results:
                                if conf > 0.5:
                                    text += frame_text + " "
                        except Exception as e:
                            self.log(f"EasyOCR video frame failed: {e}", "debug")

                    # Tesseract only while nothing has been found so far
                    if not text.strip() and TESSERACT_AVAILABLE:
                        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        fallback = tesseract_text(Image.fromarray(frame_rgb))
                        if fallback:
                            text += fallback + "\n"
            finally:
                # Release the capture on every path, including early returns
                # and exceptions.
                video.release()

            self.log(f"OCR text (video, {len(frames_to_check)} frames): {text[:100]}...", "debug")

        else:
            self.log(f"Unsupported file format: {file_path}", "debug")
            return None

        # Pattern 1: "@" followed by username characters. Spaces are allowed
        # because OCR sometimes reads underscores as spaces.
        matches = re.findall(r'@([a-zA-Z0-9._ ]+)', text)
        if matches:
            # Trim, turn spaces back into underscores, drop invalid characters
            username = matches[0].strip().replace(' ', '_')
            username = re.sub(r'[^a-zA-Z0-9._]', '', username)
            username = username.rstrip('._')

            if len(username) >= 3:  # Valid Instagram username minimum
                self.log(f"Extracted username (with @): @{username}", "info")
                return username

        # Pattern 2: a whole line that looks like a username without "@"
        # (3-30 chars of lowercase letters, digits, dots, underscores).
        for line in text.split('\n'):
            line = line.strip().lower()
            if re.match(r'^[a-z0-9._]{3,30}$', line):
                # Must contain a letter and must not end with a dot
                if not line.endswith('.') and re.search(r'[a-z]', line):
                    self.log(f"Extracted username (without @): @{line}", "info")
                    return line

    except Exception as e:
        self.log(f"OCR extraction failed: {e}", "warning")

    return None
|
||||
|
||||
def _is_monitored_account(self, username: str) -> bool:
|
||||
"""
|
||||
Check if username is in search_monitors
|
||||
|
||||
Returns True if user is being actively monitored for downloads
|
||||
"""
|
||||
try:
|
||||
with self.db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Check search_monitors table
|
||||
cursor.execute("""
|
||||
SELECT 1 FROM search_monitors
|
||||
WHERE platform IN ('instagram', 'instaloader', 'fastdl', 'imginn')
|
||||
AND source = ?
|
||||
AND active = 1
|
||||
LIMIT 1
|
||||
""", (username,))
|
||||
|
||||
return cursor.fetchone() is not None
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error checking monitored status: {e}", "error")
|
||||
return False
|
||||
|
||||
def _already_fetched_today(self, username: str) -> bool:
|
||||
"""
|
||||
Check if we already downloaded this user's content today
|
||||
|
||||
Uses repost_fetch_cache table to track fetches
|
||||
"""
|
||||
try:
|
||||
with self.db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Create cache table if doesn't exist
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS repost_fetch_cache (
|
||||
username TEXT PRIMARY KEY,
|
||||
last_fetched TEXT NOT NULL,
|
||||
content_count INTEGER DEFAULT 0
|
||||
)
|
||||
""")
|
||||
|
||||
# Check if fetched in last 12 hours
|
||||
cursor.execute("""
|
||||
SELECT last_fetched FROM repost_fetch_cache
|
||||
WHERE username = ?
|
||||
AND datetime(last_fetched) > datetime('now', '-12 hours')
|
||||
""", (username,))
|
||||
|
||||
result = cursor.fetchone()
|
||||
return result is not None
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error checking fetch cache: {e}", "error")
|
||||
return False
|
||||
|
||||
def _mark_fetched(self, username: str, content_count: int = 0):
|
||||
"""Mark that we fetched this user's content"""
|
||||
try:
|
||||
with self.db.get_connection(for_write=True) as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("""
|
||||
INSERT OR REPLACE INTO repost_fetch_cache
|
||||
(username, last_fetched, content_count)
|
||||
VALUES (?, ?, ?)
|
||||
""", (username, datetime.now().isoformat(), content_count))
|
||||
|
||||
self.log(f"Marked @{username} as fetched ({content_count} items)", "debug")
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error marking fetch: {e}", "error")
|
||||
|
||||
def _download_content_via_imginn(self, username: str, destination: Path, add_to_database: bool) -> bool:
    """
    Download a user's stories and last-24-hours posts through ImgInn.

    Files go to ``destination/stories`` and ``destination/posts`` (both
    created if missing). The fetch is recorded via ``_mark_fetched`` even
    when nothing was downloaded.

    Args:
        username: Instagram username to download from.
        destination: Base directory for the downloaded files.
        add_to_database: When False the downloader gets no database and is
            told to skip database recording.

    Returns:
        True if at least one item was downloaded; False if nothing was
        downloaded or any step raised (the error is logged).
    """
    try:
        from modules.imginn_module import ImgInnDownloader

        skip_db = not add_to_database
        downloader = ImgInnDownloader(
            unified_db=self.db if add_to_database else None,
            log_callback=lambda msg, lvl: self.log(msg, lvl)
        )

        # Prepare the two target folders
        stories_dir = destination / "stories"
        posts_dir = destination / "posts"
        for folder in (stories_dir, posts_dir):
            folder.mkdir(parents=True, exist_ok=True)

        # Stories
        self.log(f"Downloading stories from @{username} via ImgInn...", "info")
        fetched_stories = downloader.download_stories(
            username=username,
            max_stories=50,
            output_dir=stories_dir,
            skip_database=skip_db
        )
        # Anything other than a list is counted as zero items
        stories_count = len(fetched_stories) if isinstance(fetched_stories, list) else 0

        # Recent posts (last 24 hours)
        self.log(f"Downloading recent posts from @{username} via ImgInn...", "info")
        fetched_posts = downloader.download_posts(
            username=username,
            max_posts=50,
            output_dir=posts_dir,
            max_age_hours=24,
            skip_database=skip_db
        )
        posts_count = len(fetched_posts) if isinstance(fetched_posts, list) else 0

        total_count = stories_count + posts_count
        self.log(f"Downloaded {total_count} items ({stories_count} stories, {posts_count} posts)", "info")

        # Remember this fetch in the cache
        self._mark_fetched(username, total_count)

        return total_count > 0

    except Exception as e:
        self.log(f"ImgInn download failed: {e}", "error")
        return False
|
||||
|
||||
def _find_matching_original(self, repost_path: str, search_dir: Path, threshold: int = 10) -> Optional[str]:
    """
    Find the downloaded file that best matches a repost via perceptual hashing.

    Every media file under ``search_dir/stories`` and ``search_dir/posts``
    (searched recursively) is hashed with ``_get_perceptual_hash`` and
    compared to the repost's hash. The candidate with the smallest distance
    strictly below ``threshold`` wins.

    Args:
        repost_path: Path to the repost file (e.g., evalongoria story).
        search_dir: Directory holding the candidate originals
            (e.g., /tmp/.../globalgiftfoundation/).
        threshold: Exclusive upper bound on the Hamming distance for a
            candidate to count as a match (0-64 scale). Defaults to 10.

    Returns:
        Path to the best matching original, or None when hashing is
        unavailable, the repost cannot be hashed, or nothing is close enough.
    """
    if not IMAGEHASH_AVAILABLE:
        self.log("imagehash not available - cannot match", "warning")
        return None

    # Calculate hash of repost
    repost_hash = self._get_perceptual_hash(repost_path)
    if not repost_hash:
        self.log(f"Failed to calculate hash for repost: {repost_path}", "warning")
        return None

    self.log(f"Repost hash: {repost_hash}", "debug")

    # Same extensions the hasher (_get_perceptual_hash) knows how to handle
    media_suffixes = {
        '.jpg', '.jpeg', '.png', '.webp', '.heic',
        '.mp4', '.mov', '.avi', '.mkv', '.webm',
    }

    best_match = None
    best_distance = threshold  # only strictly smaller distances qualify

    for subdir in ["stories", "posts"]:
        content_dir = search_dir / subdir
        if not content_dir.exists():
            self.log(f"Directory not found: {content_dir}", "debug")
            continue

        files = list(content_dir.rglob("*"))
        self.log(f"Checking {len(files)} files in {content_dir}", "debug")

        for file_path in files:
            if not file_path.is_file():
                continue

            # Skip non-media files
            if file_path.suffix.lower() not in media_suffixes:
                continue

            candidate_hash = self._get_perceptual_hash(str(file_path))
            if not candidate_hash:
                continue

            # Compare (Hamming distance)
            distance = repost_hash - candidate_hash

            self.log(f" {file_path.name}: distance={distance}", "debug")

            if distance < best_distance:
                best_distance = distance
                best_match = str(file_path)
                self.log(f"Better match found: {file_path.name} (distance: {distance})", "info")

    if best_match:
        self.log(f"✓ Found original: {Path(best_match).name} (distance: {best_distance})", "success")
        return best_match

    self.log(f"✗ No matching original found for {Path(repost_path).name}", "warning")
    return None
|
||||
|
||||
def _get_perceptual_hash(self, file_path: str):
    """Calculate a perceptual (difference) hash for an image or video.

    Images are hashed directly; for videos the middle frame is hashed. The
    file type is chosen from the extension, matched case-insensitively.

    Args:
        file_path: Path to the image or video file.

    Returns:
        The ``imagehash.dhash`` result, or None when imagehash is
        unavailable, the extension is unsupported, OpenCV is missing for a
        video, the video cannot be read, or hashing raises (logged at debug).
    """
    if not IMAGEHASH_AVAILABLE:
        return None

    lowered = file_path.lower()

    try:
        # Image: direct hash
        if lowered.endswith(('.jpg', '.jpeg', '.png', '.webp', '.heic')):
            with Image.open(file_path) as img:
                return imagehash.dhash(img)  # Difference hash (good for cropped/resized)

        # Video: hash middle frame
        if lowered.endswith(('.mp4', '.mov', '.avi', '.mkv', '.webm')):
            if not CV2_AVAILABLE:
                return None

            video = cv2.VideoCapture(file_path)
            try:
                if not video.isOpened():
                    return None

                frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
                if frame_count == 0:
                    return None

                video.set(cv2.CAP_PROP_POS_FRAMES, frame_count // 2)
                ret, frame = video.read()
            finally:
                # Release the capture on every path, including exceptions.
                video.release()

            if ret:
                # OpenCV frames are BGR; PIL expects RGB
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                img = Image.fromarray(frame_rgb)
                try:
                    return imagehash.dhash(img)
                finally:
                    img.close()

    except Exception as e:
        self.log(f"Hash calculation failed for {Path(file_path).name}: {e}", "debug")

    return None
|
||||
|
||||
def _replace_repost_with_original(self, repost_path: str, original_path: str) -> str:
|
||||
"""
|
||||
Replace repost file with original high-quality file
|
||||
|
||||
Workflow:
|
||||
1. Move repost to recycle bin (preserves it, not deleted)
|
||||
2. Return path to ORIGINAL file with its original filename/metadata
|
||||
3. Move module processes original as if it was downloaded directly
|
||||
|
||||
Args:
|
||||
repost_path: Path to repost (e.g., evalongoria_story6.mp4)
|
||||
original_path: Path to original (e.g., globalgiftfoundation_20251109_100000.jpg)
|
||||
|
||||
Returns:
|
||||
Path to original file (keeps original filename and metadata)
|
||||
"""
|
||||
import os
|
||||
repost_file = Path(repost_path)
|
||||
original_file = Path(original_path)
|
||||
|
||||
# Move repost to recycle bin (not delete - can recover if mistake)
|
||||
if self.db:
|
||||
try:
|
||||
recycle_id = self.db.move_to_recycle_bin(
|
||||
file_path=str(repost_file),
|
||||
deleted_from='repost_detection',
|
||||
deleted_by='system',
|
||||
metadata={
|
||||
'reason': 'replaced_with_original',
|
||||
'original_source': str(original_file),
|
||||
'original_username': self.last_original_username
|
||||
}
|
||||
)
|
||||
if recycle_id:
|
||||
self.log(f"Moved repost to recycle bin: {repost_file.name} (ID: {recycle_id[:8]}...)", "info")
|
||||
else:
|
||||
self.log(f"Failed to move repost to recycle bin, will delete instead", "warning")
|
||||
# Fallback: delete if recycle bin fails
|
||||
repost_file.unlink()
|
||||
except Exception as e:
|
||||
self.log(f"Recycle bin failed: {e}, deleting repost", "warning")
|
||||
try:
|
||||
repost_file.unlink()
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
# No database - just delete
|
||||
try:
|
||||
repost_file.unlink()
|
||||
self.log(f"Deleted repost: {repost_file.name}", "debug")
|
||||
except Exception as e:
|
||||
self.log(f"Failed to delete repost: {e}", "warning")
|
||||
|
||||
# Return path to ORIGINAL file with its original filename and metadata
|
||||
# Move module will process it as if it was downloaded directly from the original source
|
||||
self.log(f"Replacing repost with original: {repost_file.name} → {original_file.name}", "info")
|
||||
|
||||
# Update database to track replacement
|
||||
self._record_repost_replacement(
|
||||
repost_path=str(repost_file),
|
||||
original_path=str(original_file),
|
||||
replacement_path=str(original_file) # Same as original - keeps original filename
|
||||
)
|
||||
|
||||
return str(original_file)
|
||||
|
||||
def _record_repost_replacement(self, repost_path: str, original_path: str, replacement_path: str):
|
||||
"""
|
||||
Track repost replacements in database
|
||||
|
||||
Creates repost_replacements table to track what was replaced
|
||||
"""
|
||||
try:
|
||||
with self.db.get_connection(for_write=True) as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Create tracking table
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS repost_replacements (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
repost_path TEXT NOT NULL,
|
||||
repost_filename TEXT NOT NULL,
|
||||
repost_source TEXT NOT NULL,
|
||||
original_path TEXT NOT NULL,
|
||||
original_username TEXT NOT NULL,
|
||||
replacement_path TEXT NOT NULL,
|
||||
detected_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||
hash_distance INTEGER
|
||||
)
|
||||
""")
|
||||
|
||||
# Extract usernames
|
||||
repost_source = Path(repost_path).parent.name
|
||||
original_username = self.last_original_username or "unknown"
|
||||
|
||||
# Insert record
|
||||
cursor.execute("""
|
||||
INSERT INTO repost_replacements
|
||||
(repost_path, repost_filename, repost_source, original_path, original_username, replacement_path)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
repost_path,
|
||||
Path(repost_path).name,
|
||||
repost_source,
|
||||
original_path,
|
||||
original_username,
|
||||
replacement_path
|
||||
))
|
||||
|
||||
self.log(f"Recorded replacement: {repost_source} → @{original_username}", "debug")
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Failed to record replacement: {e}", "error")
|
||||
|
||||
def _cleanup_temp_downloads(self, temp_dir: Path, keep_file: str = None):
|
||||
"""
|
||||
Clean up temporary downloads for non-monitored accounts
|
||||
|
||||
Args:
|
||||
temp_dir: Directory to clean (e.g., /tmp/repost_detection/username/)
|
||||
keep_file: Optional file to preserve (the matched original)
|
||||
"""
|
||||
if not temp_dir.exists():
|
||||
return
|
||||
|
||||
keep_path = Path(keep_file) if keep_file else None
|
||||
deleted_count = 0
|
||||
|
||||
try:
|
||||
# Delete all files except the keeper
|
||||
for file_path in temp_dir.rglob("*"):
|
||||
if not file_path.is_file():
|
||||
continue
|
||||
|
||||
if keep_path and file_path == keep_path:
|
||||
continue # Skip the matched file
|
||||
|
||||
try:
|
||||
file_path.unlink()
|
||||
deleted_count += 1
|
||||
except Exception as e:
|
||||
self.log(f"Failed to delete temp file {file_path.name}: {e}", "debug")
|
||||
|
||||
# Remove empty directories
|
||||
for subdir in [temp_dir / "stories", temp_dir / "posts"]:
|
||||
if subdir.exists() and not any(subdir.iterdir()):
|
||||
subdir.rmdir()
|
||||
|
||||
if not any(temp_dir.iterdir()):
|
||||
temp_dir.rmdir()
|
||||
|
||||
self.log(f"Cleaned up {deleted_count} temporary files", "info")
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Failed to cleanup directories: {e}", "debug")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Informational banner only: this module is meant to be imported. It
    # reports which optional dependencies were detected at import time.
    print("Instagram Repost Detector Module")
    print("This module should be imported, not run directly")
    print("\nDependencies:")
    dependency_flags = (
        ("pytesseract/PIL", TESSERACT_AVAILABLE),
        ("opencv-python", CV2_AVAILABLE),
        ("imagehash", IMAGEHASH_AVAILABLE),
    )
    for label, available in dependency_flags:
        print(f" - {label}: {'✓' if available else '✗'}")
|
||||
461
modules/instagram_utils.py
Normal file
461
modules/instagram_utils.py
Normal file
@@ -0,0 +1,461 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Instagram Utilities Module
|
||||
|
||||
Shared utility functions for Instagram downloaders (imginn, fastdl, toolzu, instaloader).
|
||||
Centralizes common functionality like media ID extraction to avoid code duplication.
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, Set, Dict, Any
|
||||
|
||||
|
||||
def extract_instagram_media_id(filename_or_id: str) -> str:
    """Extract the Instagram media ID from a filename or ID string.

    Instagram image filenames follow the pattern:
        {user_id}_{media_id}_{post_id}_n.ext
    The media ID is recognised as a 17-19 digit number starting with "18"
    that is delimited by underscores or the string boundaries.

    Anything else - story keys (AQ... strings), short post codes such as
    DRkaDSFD-U2, or unrecognised text - is returned unchanged.

    Args:
        filename_or_id: A filename like '591164014_18551181784006538_2284814566270897032_n'
            or just a media ID string

    Returns:
        The extracted media ID, or the original value if no media ID is
        found (empty/None input is returned as-is).

    Examples:
        >>> extract_instagram_media_id('591164014_18551181784006538_2284814566270897032_n')
        '18551181784006538'
        >>> extract_instagram_media_id('18551181784006538')
        '18551181784006538'
        >>> extract_instagram_media_id('AQOOlj6M4PlGHBuYl02KzwUXefsdiou9q3ooFiNF4cUy3DEY6QKxROoUe9BKCeVJA4UF5BiVPIuqXheCU')
        'AQOOlj6M4PlGHBuYl02KzwUXefsdiou9q3ooFiNF4cUy3DEY6QKxROoUe9BKCeVJA4UF5BiVPIuqXheCU'
    """
    if not filename_or_id:
        return filename_or_id

    # Underscore or start/end act as boundaries (\b does not work here
    # because "_" is a word character). The ^...$ alternatives also cover a
    # bare media ID, so no separate check is needed for that case.
    match = re.search(r'(?:^|_)(18\d{15,17})(?:_|$)', filename_or_id)
    if match:
        return match.group(1)

    # Story keys, shortcodes and anything unrecognised pass through as-is.
    return filename_or_id
|
||||
|
||||
|
||||
def extract_media_id_from_url(url: str) -> Optional[str]:
    """Pull the Instagram media ID out of a CDN URL.

    CDN filenames look like:
        561378837_18538674661006538_479694548187839800_n.jpg
    and the second number (a 17-19 digit run) is the media ID.

    Args:
        url: Instagram CDN URL string

    Returns:
        The media ID of the first matching filename, or None if the URL is
        empty or contains no such filename.
    """
    if not url:
        return None

    # number_MEDIAID_number_n.<jpg|mp4|jpeg|png>
    cdn_name = re.search(r'(\d+)_(\d{17,19})_\d+_n\.(jpg|mp4|jpeg|png)', url)
    return cdn_name.group(2) if cdn_name else None
|
||||
|
||||
|
||||
def extract_media_ids_from_url(url: str) -> list:
    """Collect every Instagram media ID that appears in a URL.

    Same filename pattern as extract_media_id_from_url
    (number_MEDIAID_number_n.ext), but all matches are returned.

    Args:
        url: URL string that may contain Instagram media IDs

    Returns:
        Media IDs in order of appearance; an empty list when the URL is
        empty or nothing matches.
    """
    if not url:
        return []

    cdn_pattern = r'(\d+)_(\d{17,19})_\d+_n\.(jpg|mp4|jpeg|png)'
    # Group 2 of each match is the media ID
    return [hit.group(2) for hit in re.finditer(cdn_pattern, url)]
|
||||
|
||||
|
||||
def extract_post_shortcode(url: str) -> Optional[str]:
    """Extract Instagram post shortcode from a URL.

    The shortcode is the path segment right after ``/p/``. It ends at the
    next "/", "?" or "#", so a query string or fragment on a URL without a
    trailing slash is not included.

    Args:
        url: Instagram URL like https://www.instagram.com/p/ABC123/

    Returns:
        Shortcode string or None if not found
    """
    if not url:
        return None

    match = re.search(r'/p/([^/?#]+)', url)
    return match.group(1) if match else None
|
||||
|
||||
|
||||
def media_id_to_shortcode(media_id: str) -> str:
    """Encode a numeric Instagram media ID as a shortcode.

    The ID is written in base 64 using the alphabet A-Z, a-z, 0-9, "-", "_".

    Args:
        media_id: Numeric media ID string

    Returns:
        The shortcode; 'A' when the number is not positive; the input
        unchanged when it cannot be converted to an integer.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'

    try:
        remaining = int(media_id)
    except (ValueError, TypeError):
        # Not a number - hand it back untouched
        return media_id

    # Collect base-64 digits least-significant first, then flip them
    symbols = []
    while remaining > 0:
        remaining, index = divmod(remaining, 64)
        symbols.append(alphabet[index])

    if not symbols:
        return 'A'
    return ''.join(reversed(symbols))
|
||||
|
||||
|
||||
def scan_existing_files_for_media_ids(output_dir: Path, profile_name: str = None,
                                      min_file_size: int = 0, recursive: bool = True) -> Set[str]:
    """Scan existing files and extract media IDs for duplicate detection.

    Looks at jpg/jpeg/png/heic/mp4/mov files in the output directory and
    collects, for each, the media-ID part of the filename plus its
    normalized Instagram media ID when that differs.

    Filenames are expected to look like ``profile_YYYYMMDD_HHMMSS_mediaid``.
    The date/time pair is located by pattern, so profile names that contain
    underscores are handled. Names without a date/time pair fall back to
    positional splitting on the first three underscores.

    Args:
        output_dir: Directory to scan for existing files
        profile_name: Optional profile name to filter files
        min_file_size: Minimum file size in bytes (skip smaller files as corrupted)
        recursive: If True, search subdirectories (rglob), otherwise only top level (glob)

    Returns:
        Set of media IDs (both full and normalized) found in existing files
    """
    media_ids = set()

    if not output_dir.exists():
        return media_ids

    glob_func = output_dir.rglob if recursive else output_dir.glob

    # <profile>_<YYYYMMDD>_<HHMMSS>_<media id>; the lazy first group lets the
    # profile itself contain underscores (e.g. "eva_longoria").
    dated_name = re.compile(r'^(.+?)_(\d{8})_(\d{6})_(.+)$')

    for pattern in ["*.jpg", "*.jpeg", "*.png", "*.heic", "*.mp4", "*.mov"]:
        for filepath in glob_func(pattern):
            # Skip files smaller than min_file_size (likely corrupted/incomplete)
            if min_file_size > 0:
                try:
                    if filepath.stat().st_size < min_file_size:
                        continue
                except OSError:
                    continue

            filename = filepath.stem

            dated = dated_name.match(filename)
            if dated:
                if profile_name and dated.group(1) != profile_name:
                    continue
                media_id_full = dated.group(4)
            else:
                # No date/time pair: split positionally as
                # [profile, x, y, rest] and treat the rest as the media ID.
                parts = filename.split('_', 3)
                if len(parts) >= 4:
                    if profile_name and parts[0] != profile_name:
                        continue
                    media_id_full = parts[3]
                elif len(parts) > 1:
                    media_id_full = parts[-1]
                else:
                    media_id_full = filename

            if media_id_full:
                # Add the full media ID string
                media_ids.add(media_id_full)

                # Also add the normalized Instagram media ID when it differs
                normalized_id = extract_instagram_media_id(media_id_full)
                if normalized_id and normalized_id != media_id_full:
                    media_ids.add(normalized_id)

    return media_ids
|
||||
|
||||
|
||||
def parse_instagram_filename(filename: str) -> dict:
    """Parse an Instagram filename into its components.

    The expected layout is ``username_YYYYMMDD_HHMMSS_mediaid[_storyN].ext``.
    The date/time pair is located by pattern, so usernames containing
    underscores are kept whole. If no date/time pair is found, the first
    part is taken as the username and parts 2 and 3 are used as date/time
    only when they look like them.

    Args:
        filename: Filename like 'evalongoria_20251205_120406_591164014_18551181784006538_2284814566270897032_n_story1.jpg'

    Returns:
        Dictionary with parsed components:
        - username: str or None
        - date: str or None (YYYYMMDD format)
        - time: str or None (HHMMSS format)
        - media_id_full: str or None (full ID after date/time, story suffix removed)
        - media_id: str or None (normalized Instagram media ID)
        - suffix: str or None (e.g., 'story1')
        - extension: str or None (lower-cased, with leading dot)
        Names with fewer than four underscore-separated parts only get
        'extension' filled in.
    """
    result = {
        'username': None,
        'date': None,
        'time': None,
        'media_id_full': None,
        'media_id': None,
        'suffix': None,
        'extension': None
    }

    if not filename:
        return result

    path = Path(filename)
    result['extension'] = path.suffix.lower() if path.suffix else None

    parts = path.stem.split('_')
    if len(parts) < 4:
        return result

    def is_digits(part: str, width: int) -> bool:
        return len(part) == width and part.isdigit()

    # First position where an 8-digit date is followed by a 6-digit time and
    # at least one more part (the media ID) remains after them.
    stamp_at = next(
        (i for i in range(1, len(parts) - 2)
         if is_digits(parts[i], 8) and is_digits(parts[i + 1], 6)),
        None,
    )

    if stamp_at is not None:
        result['username'] = '_'.join(parts[:stamp_at])
        result['date'] = parts[stamp_at]
        result['time'] = parts[stamp_at + 1]
        media_id_full = '_'.join(parts[stamp_at + 2:])
    else:
        # No date/time pair: positional fallback
        result['username'] = parts[0]
        if is_digits(parts[1], 8):
            result['date'] = parts[1]
        if is_digits(parts[2], 6):
            result['time'] = parts[2]
        media_id_full = '_'.join(parts[3:])

    result['media_id_full'] = media_id_full

    # Split off a trailing "_storyN" marker, if any
    if '_story' in media_id_full:
        media_part, suffix_part = media_id_full.rsplit('_story', 1)
        result['media_id_full'] = media_part
        result['suffix'] = f'story{suffix_part}'

    # Extract normalized media ID
    result['media_id'] = extract_instagram_media_id(result['media_id_full'])

    return result
|
||||
|
||||
|
||||
def record_instagram_download(db, media_id: str, username: str, content_type: str,
                              filename: str, url: str = None, download_url: str = None,
                              post_date: datetime = None, file_path: str = None,
                              method: str = None, extra_metadata: Dict[str, Any] = None) -> bool:
    """Record an Instagram download in the database with a normalized media_id.

    Shared entry point for the Instagram downloader modules (imginn, fastdl,
    toolzu, instaloader). The media ID is normalized before storing so that
    duplicates can be detected across modules.

    Args:
        db: Database instance exposing ``record_download`` or, failing that,
            ``mark_downloaded``
        media_id: The media ID (will be normalized automatically)
        username: Instagram username
        content_type: Type of content (posts, stories, reels, highlights)
        filename: Filename of the downloaded file
        url: Original Instagram URL (e.g., https://instagram.com/p/ABC123/)
        download_url: Direct download URL (CDN URL)
        post_date: Post date/time
        file_path: Full file path on disk
        method: Download method (imginn, fastdl, toolzu, instaloader);
            only passed to ``record_download``
        extra_metadata: Additional metadata merged over the generated entries

    Returns:
        Whatever the database call returns; False when there is no database,
        the database has neither method, or the call raises.
    """
    if not db:
        return False

    # Normalize the ID so every module stores the same key
    canonical_id = extract_instagram_media_id(media_id) if media_id else media_id

    meta = {
        'media_id': canonical_id,
        'original_media_id': None if media_id == canonical_id else media_id,
    }
    if extra_metadata:
        meta.update(extra_metadata)
    # Entries without a value are not stored
    meta = {key: value for key, value in meta.items() if value is not None}

    # URL precedence: post URL, then CDN URL, then a synthetic instagram:// URL
    db_url = url or download_url or f"instagram://{canonical_id}"

    # Hash the file when a path is given; hashing problems are ignored
    file_hash = None
    if file_path:
        try:
            from modules.unified_database import UnifiedDatabase
            file_hash = UnifiedDatabase.get_file_hash(file_path)
        except Exception:
            pass

    try:
        if hasattr(db, 'record_download'):
            return db.record_download(
                url=db_url,
                platform='instagram',
                source=username,
                content_type=content_type,
                filename=filename,
                file_path=file_path,
                file_hash=file_hash,
                post_date=post_date,
                metadata=meta,
                method=method
            )

        # Adapter-style databases
        if hasattr(db, 'mark_downloaded'):
            return db.mark_downloaded(
                username=username,
                url=db_url,
                filename=filename,
                post_date=post_date,
                metadata=meta,
                file_path=file_path,
                content_type=content_type
            )

        return False
    except Exception:
        return False
|
||||
|
||||
|
||||
def is_instagram_downloaded(db, media_id: str, username: str = None) -> bool:
    """Check if Instagram content is already downloaded by media_id.

    Looks the content up under both the original and the normalized media_id
    so duplicate detection works across downloader modules.

    Args:
        db: Database instance. Objects exposing ``get_connection()`` are
            queried directly; objects exposing ``is_already_downloaded()``
            are used as a fallback adapter.
        media_id: The media ID to check (both original and normalized forms
            are checked).
        username: Accepted for interface compatibility; the lookup below does
            not use it, so results are not scoped to a user.

    Returns:
        True if a completed download (or a recycle-bin entry) is found,
        False otherwise. Any database error is treated as "not downloaded".
    """
    if not db or not media_id:
        return False

    # Normalize the media_id
    normalized_media_id = extract_instagram_media_id(media_id)

    # Check if this looks like a shortcode (10-15 URL-safe chars, no 18xxx pattern)
    is_shortcode = (normalized_media_id == media_id and
                    re.match(r'^[A-Za-z0-9_-]{10,15}$', media_id) and
                    not re.match(r'^18\d{15,17}$', media_id))

    def _like_literal(value) -> str:
        """Escape LIKE wildcards ('%', '_') using '!' as the escape character."""
        return (str(value)
                .replace('!', '!!')
                .replace('%', '!%')
                .replace('_', '!_'))

    try:
        # Check if db has get_connection (UnifiedDatabase) - query directly
        if hasattr(db, 'get_connection'):
            with db.get_connection() as conn:
                cursor = conn.cursor()

                # Check both normalized and original media_id.
                # Also verify file_path is set (download was actually completed).
                if normalized_media_id != media_id:
                    cursor.execute('''
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND (media_id = ? OR media_id = ?)
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    ''', (normalized_media_id, media_id))
                else:
                    cursor.execute('''
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND media_id = ?
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    ''', (normalized_media_id,))
                if cursor.fetchone() is not None:
                    return True

                # For shortcodes, also check the metadata JSON column.
                # The shortcode is escaped because '_' is a legal shortcode
                # character and would otherwise match any single character.
                if is_shortcode:
                    cursor.execute('''
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND metadata LIKE ? ESCAPE '!'
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    ''', (f'%"shortcode": "{_like_literal(media_id)}"%',))
                    if cursor.fetchone() is not None:
                        return True

                # Check recycle bin — files previously downloaded then deleted
                # should not be re-downloaded
                cursor.execute('''
                    SELECT 1 FROM recycle_bin
                    WHERE original_filename LIKE ? ESCAPE '!'
                    LIMIT 1
                ''', (f'%{_like_literal(normalized_media_id)}%',))
                if cursor.fetchone() is not None:
                    return True

            return False

        # Fallback for adapters with is_already_downloaded method
        elif hasattr(db, 'is_already_downloaded'):
            if db.is_already_downloaded(normalized_media_id):
                return True
            # Also check original if different
            if normalized_media_id != media_id and db.is_already_downloaded(media_id):
                return True

        return False
    except Exception:
        # Best-effort lookup: a failing query must not block the download path
        return False
|
||||
1259
modules/instaloader_module.py
Executable file
1259
modules/instaloader_module.py
Executable file
File diff suppressed because it is too large
Load Diff
535
modules/media_identifier.py
Normal file
535
modules/media_identifier.py
Normal file
@@ -0,0 +1,535 @@
|
||||
"""
|
||||
Media Identifier Module
|
||||
|
||||
Parses media filenames using guessit and matches them against TMDB for metadata enrichment.
|
||||
Generates organized file paths for TV Shows and Movies.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('MediaIdentifier')
|
||||
|
||||
# Try to import guessit, but gracefully handle if not installed
|
||||
try:
|
||||
import guessit
|
||||
GUESSIT_AVAILABLE = True
|
||||
except ImportError:
|
||||
GUESSIT_AVAILABLE = False
|
||||
logger.warning("guessit not installed - filename parsing will be limited")
|
||||
|
||||
|
||||
@dataclass
class ParsedMedia:
    """Metadata extracted from a single media filename."""

    # Required fields
    title: str
    media_type: str  # 'movie' or 'episode' (TV)

    # Optional details, left as None when the filename does not provide them
    year: Optional[int] = None
    season: Optional[int] = None
    episode: Optional[int] = None
    quality: Optional[str] = None
    source: Optional[str] = None
    codec: Optional[str] = None
    release_group: Optional[str] = None

    # The filename the values above were parsed from
    original_filename: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict, keyed by field name."""
        keys = (
            'title',
            'media_type',
            'year',
            'season',
            'episode',
            'quality',
            'source',
            'codec',
            'release_group',
            'original_filename',
        )
        return {key: getattr(self, key) for key in keys}
|
||||
|
||||
|
||||
@dataclass
class TMDBMatch:
    """A TMDB search hit selected for a parsed media item."""

    # Required fields
    tmdb_id: int
    title: str
    original_title: Optional[str]
    media_type: str  # 'movie' or 'tv'

    # Optional descriptive metadata
    year: Optional[int] = None
    poster_path: Optional[str] = None
    overview: Optional[str] = None

    # For TV episodes
    season_number: Optional[int] = None
    episode_number: Optional[int] = None
    episode_title: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict, keyed by field name."""
        keys = (
            'tmdb_id',
            'title',
            'original_title',
            'media_type',
            'year',
            'poster_path',
            'overview',
            'season_number',
            'episode_number',
            'episode_title',
        )
        return {key: getattr(self, key) for key in keys}
|
||||
|
||||
|
||||
class MediaIdentifier:
    """
    Identifies media from filenames and matches against TMDB.

    Uses guessit for filename parsing (with a regex fallback when guessit is
    not installed or raises) and the TMDB API for metadata enrichment.
    """

    TMDB_BASE_URL = "https://api.themoviedb.org/3"
    TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w500"

    # Seconds to wait for each TMDB HTTP request
    REQUEST_TIMEOUT = 30

    # Quality normalization patterns (release tag -> canonical resolution).
    # Order matters: _extract_quality returns the first key that is present.
    QUALITY_MAP = {
        '2160p': '2160p',
        '4k': '2160p',
        'uhd': '2160p',
        '1080p': '1080p',
        'fullhd': '1080p',
        'fhd': '1080p',
        '720p': '720p',
        'hd': '720p',
        '480p': '480p',
        'sd': '480p',
        '360p': '360p',
    }

    def __init__(self, tmdb_api_key: str):
        """
        Initialize the MediaIdentifier.

        Args:
            tmdb_api_key: TMDB API key for lookups
        """
        self.api_key = tmdb_api_key
        self.session = requests.Session()

    def parse_filename(self, filename: str) -> Optional["ParsedMedia"]:
        """
        Parse a media filename to extract metadata.

        Args:
            filename: The filename to parse (a leading path is stripped)

        Returns:
            ParsedMedia object with extracted information, or None if parsing fails
        """
        if not filename:
            return None

        # Strip path if present
        filename = Path(filename).name

        if GUESSIT_AVAILABLE:
            return self._parse_with_guessit(filename)
        return self._parse_fallback(filename)

    @staticmethod
    def _first_value(value: Any) -> Any:
        """Return the first element of a list/tuple value, else the value itself.

        guessit reports multi-valued properties (e.g. a double episode) as
        lists, while ParsedMedia and the path formatting expect scalars.
        """
        if isinstance(value, (list, tuple)):
            return value[0] if value else None
        return value

    @staticmethod
    def _year_from_date(date_str: Optional[str]) -> Optional[int]:
        """Extract the leading 4-digit year from a 'YYYY-MM-DD' style string."""
        if date_str and len(date_str) >= 4:
            try:
                return int(date_str[:4])
            except ValueError:
                return None
        return None

    def _parse_with_guessit(self, filename: str) -> Optional["ParsedMedia"]:
        """Parse filename using guessit library; falls back to regex parsing on error."""
        try:
            result = guessit.guessit(filename)

            # Anything guessit does not classify as an episode is treated as a movie
            media_type = 'episode' if result.get('type', 'movie') == 'episode' else 'movie'

            title = self._first_value(result.get('title', ''))
            if not title:
                return None

            # Normalize the reported screen size through QUALITY_MAP when possible
            quality = None
            screen_size = result.get('screen_size')
            if screen_size:
                quality = self.QUALITY_MAP.get(str(screen_size).lower(), str(screen_size))

            return ParsedMedia(
                title=title,
                media_type=media_type,
                year=self._first_value(result.get('year')),
                season=self._first_value(result.get('season')),
                episode=self._first_value(result.get('episode')),
                quality=quality,
                source=result.get('source'),
                codec=result.get('video_codec'),
                release_group=result.get('release_group'),
                original_filename=filename,
            )
        except Exception as e:
            logger.error(f"guessit parsing failed for '{filename}': {e}")
            return self._parse_fallback(filename)

    def _split_title_year(self, name: str) -> Tuple[str, Optional[int]]:
        """Split a separator-normalized movie name into (title, year).

        Only 19xx/20xx values that stand alone are accepted as a year, so a
        resolution tag such as '1080p' is not mistaken for one. The title is
        cut at the first quality tag (any QUALITY_MAP key) so release tags do
        not end up in the TMDB search query.
        """
        match = re.match(r'^(.+?)[\s\.]+\(?((?:19|20)\d{2})\)?(?![0-9A-Za-z])', name)
        if match:
            raw_title, year = match.group(1), int(match.group(2))
        else:
            raw_title, year = name, None

        kept = []
        for token in raw_title.split():
            if token.lower() in self.QUALITY_MAP:
                break
            kept.append(token)

        # If the name starts with a quality tag, keep the whole thing rather
        # than returning an empty title
        title = ' '.join(kept) if kept else raw_title.strip()
        return title, year

    def _parse_fallback(self, filename: str) -> Optional["ParsedMedia"]:
        """
        Fallback parser when guessit is not available.
        Uses regex patterns to extract common media info.
        """
        try:
            # Remove extension, then treat dots/underscores as word separators
            name = re.sub(r'[._]', ' ', Path(filename).stem)

            # TV patterns: "Show Name S01E02" first, then "Show Name 1x02"
            tv_match = (
                re.match(r'^(.+?)[\s\.]+[Ss](\d{1,2})[Ee](\d{1,2})', name)
                or re.match(r'^(.+?)[\s\.]+(\d{1,2})x(\d{1,2})', name)
            )
            if tv_match:
                return ParsedMedia(
                    title=tv_match.group(1).strip(),
                    media_type='episode',
                    season=int(tv_match.group(2)),
                    episode=int(tv_match.group(3)),
                    quality=self._extract_quality(name),
                    original_filename=filename,
                )

            # Assume movie - "Movie Title (2023)" or "Movie.Title.2023"
            title, year = self._split_title_year(name)
            if not title:
                return None

            return ParsedMedia(
                title=title,
                media_type='movie',
                year=year,
                quality=self._extract_quality(name),
                original_filename=filename,
            )

        except Exception as e:
            logger.error(f"Fallback parsing failed for '{filename}': {e}")
            return None

    def _extract_quality(self, text: str) -> Optional[str]:
        """Extract quality from text.

        Matches whole alphanumeric tokens only, so short tags such as 'hd' or
        'sd' are not found inside ordinary words ('Birthday', 'Tuesday').
        """
        tokens = set(re.findall(r'[a-z0-9]+', text.lower()))
        for pattern, quality in self.QUALITY_MAP.items():
            if pattern in tokens:
                return quality
        return None

    def match_tmdb(self, parsed: "ParsedMedia") -> Optional["TMDBMatch"]:
        """
        Match parsed media against TMDB.

        Args:
            parsed: ParsedMedia object from parse_filename

        Returns:
            TMDBMatch object if found, None otherwise
        """
        if not parsed:
            return None

        try:
            if parsed.media_type == 'episode':
                return self._match_tv_show(parsed)
            return self._match_movie(parsed)
        except Exception as e:
            logger.error(f"TMDB matching failed for '{parsed.title}': {e}")
            return None

    def _match_tv_show(self, parsed: "ParsedMedia") -> Optional["TMDBMatch"]:
        """Match a TV show episode against TMDB."""
        try:
            # Search for the TV show
            search_url = f"{self.TMDB_BASE_URL}/search/tv"
            params = {
                'api_key': self.api_key,
                'query': parsed.title,
                'page': 1,
            }
            if parsed.year:
                params['first_air_date_year'] = parsed.year

            response = self.session.get(search_url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()

            results = data.get('results', [])
            if not results:
                logger.debug(f"No TMDB results for TV show: {parsed.title}")
                return None

            # Use the first (best) result
            show = results[0]
            show_id = show['id']

            # Get episode details if we have season/episode. Compare against
            # None so season 0 (specials) and episode 0 are still looked up.
            episode_title = None
            if parsed.season is not None and parsed.episode is not None:
                episode_url = f"{self.TMDB_BASE_URL}/tv/{show_id}/season/{parsed.season}/episode/{parsed.episode}"
                ep_params = {'api_key': self.api_key}
                try:
                    ep_response = self.session.get(episode_url, params=ep_params, timeout=self.REQUEST_TIMEOUT)
                    if ep_response.status_code == 200:
                        episode_title = ep_response.json().get('name')
                except Exception as e:
                    # The episode title is optional; only the exception type is
                    # logged because request errors can embed the full URL
                    logger.debug(f"TMDB episode lookup failed for show {show_id}: {type(e).__name__}")

            return TMDBMatch(
                tmdb_id=show_id,
                title=show.get('name', parsed.title),
                original_title=show.get('original_name'),
                media_type='tv',
                year=self._year_from_date(show.get('first_air_date', '')),
                poster_path=show.get('poster_path'),
                overview=show.get('overview'),
                season_number=parsed.season,
                episode_number=parsed.episode,
                episode_title=episode_title,
            )

        except Exception as e:
            logger.error(f"TMDB TV show matching failed: {e}")
            return None

    def _match_movie(self, parsed: "ParsedMedia") -> Optional["TMDBMatch"]:
        """Match a movie against TMDB."""
        try:
            # Search for the movie
            search_url = f"{self.TMDB_BASE_URL}/search/movie"
            params = {
                'api_key': self.api_key,
                'query': parsed.title,
                'page': 1,
            }
            if parsed.year:
                params['year'] = parsed.year

            response = self.session.get(search_url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()

            results = data.get('results', [])
            if not results:
                logger.debug(f"No TMDB results for movie: {parsed.title}")
                return None

            # Use the first (best) result
            movie = results[0]

            return TMDBMatch(
                tmdb_id=movie['id'],
                title=movie.get('title', parsed.title),
                original_title=movie.get('original_title'),
                media_type='movie',
                year=self._year_from_date(movie.get('release_date', '')),
                poster_path=movie.get('poster_path'),
                overview=movie.get('overview'),
            )

        except Exception as e:
            logger.error(f"TMDB movie matching failed: {e}")
            return None

    def get_organized_path(
        self,
        match: "TMDBMatch",
        base_path: str,
        original_filename: str,
    ) -> str:
        """
        Generate an organized file path for the matched media.

        Args:
            match: TMDBMatch object with TMDB metadata
            base_path: Base directory for media storage
            original_filename: Original filename (for extension)

        Returns:
            Full organized path for the file
        """
        base = Path(base_path)

        # Get extension from original filename
        ext = Path(original_filename).suffix

        # Sanitize title for filesystem
        safe_title = self._sanitize_filename(match.title)

        if match.media_type == 'tv':
            # TV: {base}/TV Shows/{Show}/Season {XX}/{Show} - S{XX}E{XX} - {Episode Title}.{ext}
            show_dir = base / "TV Shows" / safe_title

            if match.season_number is not None:
                season_dir = show_dir / f"Season {match.season_number:02d}"
            else:
                season_dir = show_dir / "Season 01"

            # Build filename
            if match.season_number is not None and match.episode_number is not None:
                ep_part = f"S{match.season_number:02d}E{match.episode_number:02d}"
            else:
                ep_part = "S01E01"

            if match.episode_title:
                safe_ep_title = self._sanitize_filename(match.episode_title)
                filename = f"{safe_title} - {ep_part} - {safe_ep_title}{ext}"
            else:
                filename = f"{safe_title} - {ep_part}{ext}"

            return str(season_dir / filename)

        # Movie: {base}/Movies/{Title} ({Year})/{Title} ({Year}).{ext}
        if match.year:
            movie_folder = f"{safe_title} ({match.year})"
        else:
            movie_folder = safe_title

        movie_dir = base / "Movies" / movie_folder
        filename = f"{movie_folder}{ext}"

        return str(movie_dir / filename)

    def _sanitize_filename(self, name: str) -> str:
        """
        Sanitize a string for use as a filename.

        Removes characters that are invalid in filenames, collapses
        whitespace, limits the result to 100 characters and returns
        "Unknown" for empty or relative-directory ('.', '..') names.
        """
        if not name:
            return "Unknown"

        # Replace problematic characters
        name = re.sub(r'[<>:"/\\|?*]', '', name)
        name = re.sub(r'\s+', ' ', name)
        name = name.strip()

        # Limit length
        if len(name) > 100:
            name = name[:100].strip()

        # '.' and '..' would resolve to the current/parent directory when
        # joined into a path, so they are never usable as a folder name
        if name in ('', '.', '..'):
            return "Unknown"
        return name

    def identify_and_match(
        self,
        filename: str,
        base_path: str = "/media",
    ) -> Dict[str, Any]:
        """
        Convenience method to parse, match, and get organized path in one call.

        Args:
            filename: The media filename to process
            base_path: Base directory for organized media

        Returns:
            Dict with keys 'success', 'filename', 'parsed', 'match',
            'organized_path' and 'error'
        """
        result = {
            'success': False,
            'filename': filename,
            'parsed': None,
            'match': None,
            'organized_path': None,
            'error': None,
        }

        try:
            # Parse filename
            parsed = self.parse_filename(filename)
            if not parsed:
                result['error'] = 'Failed to parse filename'
                return result

            result['parsed'] = parsed.to_dict()

            # Match against TMDB
            match = self.match_tmdb(parsed)
            if match:
                result['match'] = match.to_dict()

                # Get organized path
                organized_path = self.get_organized_path(match, base_path, filename)
                result['organized_path'] = organized_path
                result['success'] = True
            else:
                result['error'] = 'No TMDB match found'

            return result

        except Exception as e:
            result['error'] = str(e)
            logger.error(f"identify_and_match failed for '{filename}': {e}")
            return result
|
||||
86
modules/monitor_wrapper.py
Normal file
86
modules/monitor_wrapper.py
Normal file
@@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Helper wrapper to integrate monitoring with downloaders
|
||||
"""
|
||||
|
||||
from functools import wraps
|
||||
from modules.downloader_monitor import get_monitor
|
||||
|
||||
|
||||
def monitor_download(downloader_name):
    """
    Decorator to monitor download attempts

    The wrapped function's outcome is reported to the downloader monitor:
    an int result is the file count (success when > 0), a dict result is read
    through its 'count' and 'success' keys, and any other result is reported
    as a failure with zero files. An exception raised by the wrapped function
    is reported as a failure and then re-raised unchanged.

    The username is taken from the 'username' keyword argument, else from the
    first positional argument, else 'unknown'.

    Usage:
        @monitor_download('fastdl')
        def download_function(username, ...):
            ...
            return count
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Extract username from args or kwargs
            username = kwargs.get('username') or (args[0] if args else 'unknown')

            # Only the download call itself is guarded: a monitor error on the
            # success path must not be recorded as a failed download.
            try:
                result = func(*args, **kwargs)
            except Exception as e:
                # Log failure, then let the original error propagate
                get_monitor().log_download_attempt(
                    downloader=downloader_name,
                    username=username,
                    success=False,
                    file_count=0,
                    error_message=str(e)
                )
                raise

            # Determine success based on result
            if isinstance(result, int):
                count = result
                success = count > 0
            elif isinstance(result, dict):
                count = result.get('count', 0)
                success = result.get('success', count > 0)
            else:
                count = 0
                success = False

            # Log to monitor
            get_monitor().log_download_attempt(
                downloader=downloader_name,
                username=username,
                success=success,
                file_count=count,
                error_message=None
            )

            return result

        return wrapper
    return decorator
|
||||
|
||||
|
||||
def log_download_result(downloader: str, username: str, count: int, error: str = None):
    """
    Report a finished download run to the downloader monitor.

    The attempt is recorded as successful exactly when no error message is
    given, regardless of the file count.

    Args:
        downloader: Downloader name (fastdl, imginn, etc.)
        username: Username
        count: Number of files downloaded
        error: Error message if failed
    """
    succeeded = error is None
    get_monitor().log_download_attempt(
        downloader=downloader,
        username=username,
        success=succeeded,
        file_count=count,
        error_message=error,
    )
|
||||
1714
modules/move_module.py
Executable file
1714
modules/move_module.py
Executable file
File diff suppressed because it is too large
Load Diff
36
modules/paid_content/__init__.py
Normal file
36
modules/paid_content/__init__.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
Paid Content Module
|
||||
|
||||
Downloads and organizes content from subscription-based creator platforms
|
||||
(OnlyFans, Fansly, Patreon, Fanbox, etc.) via the Coomer.party and Kemono.party archival APIs.
|
||||
Also supports YouTube channels and Twitch clips via yt-dlp.
|
||||
"""
|
||||
|
||||
from .scraper import PaidContentScraper
|
||||
from .api_client import PaidContentAPIClient
|
||||
from .db_adapter import PaidContentDBAdapter
|
||||
from .file_host_downloader import FileHostDownloader
|
||||
from .embed_downloader import EmbedDownloader
|
||||
from .youtube_client import YouTubeClient
|
||||
from .twitch_client import TwitchClient, TwitchThumbnailCache
|
||||
from .fansly_direct_client import FanslyDirectClient
|
||||
from .onlyfans_client import OnlyFansClient
|
||||
from .xhamster_client import XHamsterClient
|
||||
from .tiktok_client import TikTokClient
|
||||
from .instagram_adapter import InstagramAdapter
|
||||
|
||||
__all__ = [
|
||||
'PaidContentScraper',
|
||||
'PaidContentAPIClient',
|
||||
'PaidContentDBAdapter',
|
||||
'FileHostDownloader',
|
||||
'EmbedDownloader',
|
||||
'YouTubeClient',
|
||||
'TwitchClient',
|
||||
'TwitchThumbnailCache',
|
||||
'FanslyDirectClient',
|
||||
'OnlyFansClient',
|
||||
'XHamsterClient',
|
||||
'TikTokClient',
|
||||
'InstagramAdapter',
|
||||
]
|
||||
311
modules/paid_content/api_client.py
Normal file
311
modules/paid_content/api_client.py
Normal file
@@ -0,0 +1,311 @@
|
||||
"""
|
||||
Unified API client for Coomer.party and Kemono.party
|
||||
Both services share the same API structure (Kemono fork)
|
||||
"""
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
from modules.base_module import LoggingMixin, RateLimitMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class PaidContentAPIClient(LoggingMixin, RateLimitMixin):
|
||||
"""
|
||||
API client for Coomer and Kemono archival services
|
||||
|
||||
API Endpoints:
|
||||
- GET /creators - List all creators
|
||||
- GET /{service}/user/{creator_id}/profile - Get creator info
|
||||
- GET /{service}/user/{creator_id}/posts - Get creator's posts (paginated with ?o=offset)
|
||||
- GET /{service}/user/{creator_id}/post/{post_id} - Get single post
|
||||
"""
|
||||
|
||||
# Fallback URLs if database doesn't have them configured
|
||||
DEFAULT_SERVICE_URLS = {
|
||||
'coomer': 'https://coomer.party',
|
||||
'kemono': 'https://kemono.party'
|
||||
}
|
||||
|
||||
SUPPORTED_PLATFORMS = {
|
||||
'coomer': ['onlyfans', 'fansly', 'candfans'],
|
||||
'kemono': ['patreon', 'fanbox', 'gumroad', 'subscribestar', 'discord']
|
||||
}
|
||||
|
||||
def __init__(self, service_id: str, session_cookie: str = None, base_url: str = None, log_callback=None):
    """Set up logging, rate limiting and the service URLs.

    Args:
        service_id: Key into DEFAULT_SERVICE_URLS, used when base_url is not given
        session_cookie: Optional value sent as the 'session' cookie
        base_url: Optional site URL; a contained '/api/v1' and trailing '/' are removed
        log_callback: Passed through to the logger initialisation
    """
    self._init_logger('PaidContent', log_callback, default_module='API')
    self._init_rate_limiter(min_delay=0.5, max_delay=2.0, batch_delay_min=1, batch_delay_max=3)

    self.service_id = service_id

    if not base_url:
        # No explicit URL configured: fall back to the built-in default
        self.base_url = self.DEFAULT_SERVICE_URLS.get(service_id)
    else:
        # replace() is a no-op when '/api/v1' is absent, so one expression
        # covers both the "API URL" and the "plain site URL" inputs
        self.base_url = base_url.replace('/api/v1', '').rstrip('/')

    self.api_url = f"{self.base_url}/api/v1"
    self.session_cookie = session_cookie
    self._session: Optional[aiohttp.ClientSession] = None
|
||||
|
||||
async def _get_session(self) -> aiohttp.ClientSession:
    """Return the shared aiohttp session, creating one if none is open."""
    current = self._session
    if current is not None and not current.closed:
        return current

    # Note: Coomer/Kemono require 'Accept: text/css' header as anti-scraping measure
    # Despite this, they still return JSON responses
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/css',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': self.base_url
    }

    # The session cookie is only sent when one was configured
    cookie_jar = {'session': self.session_cookie} if self.session_cookie else {}

    self._session = aiohttp.ClientSession(
        headers=request_headers,
        cookies=cookie_jar,
        timeout=aiohttp.ClientTimeout(total=30),
    )
    return self._session
|
||||
|
||||
async def close(self):
    """Close the aiohttp session if one is currently open."""
    session = self._session
    if not session or session.closed:
        # Nothing open: leave the attribute untouched
        return
    await session.close()
    self._session = None
|
||||
|
||||
async def __aenter__(self):
    """Enter ``async with``: hand back this client unchanged."""
    client = self
    return client
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave ``async with``: close the client; the exception details are ignored."""
    # Returns None, so an exception raised inside the block still propagates
    await self.close()
|
||||
|
||||
async def check_health(self) -> Dict[str, Any]:
    """Check API health status

    Requests the creators listing with a 10 second timeout.

    Returns:
        A dict whose 'status' is one of:
        - 'healthy' (with 'response_time' in seconds) on HTTP 200 with a parsable JSON body
        - 'rate_limited' (with 'response_code') on HTTP 429
        - 'degraded' (with 'response_code') on any other HTTP status
        - 'timeout' (with 'error') when the request times out
        - 'down' (with 'error') on any other exception
    """
    import time
    try:
        session = await self._get_session()
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments the way time.time() can
        start = time.perf_counter()
        async with session.get(f"{self.api_url}/creators", timeout=aiohttp.ClientTimeout(total=10)) as resp:
            elapsed = time.perf_counter() - start
            if resp.status == 200:
                # content_type=None allows parsing JSON regardless of response content-type
                await resp.json(content_type=None)
                return {'status': 'healthy', 'response_time': round(elapsed, 3)}
            elif resp.status == 429:
                return {'status': 'rate_limited', 'response_code': 429}
            else:
                return {'status': 'degraded', 'response_code': resp.status}
    except asyncio.TimeoutError:
        return {'status': 'timeout', 'error': 'Request timed out'}
    except Exception as e:
        return {'status': 'down', 'error': str(e)}
|
||||
|
||||
async def get_all_creators(self) -> List[Dict]:
    """Fetch the full creators listing (used for search).

    Returns:
        The decoded JSON payload on HTTP 200, otherwise an empty list
        (non-200 responses and exceptions are logged).
    """
    self._delay_between_items()
    try:
        http = await self._get_session()
        async with http.get(f"{self.api_url}/creators") as response:
            if response.status != 200:
                self.log(f"Failed to get creators list: HTTP {response.status}", 'warning')
                return []
            return await response.json(content_type=None)
    except Exception as e:
        self.log(f"Error getting creators list: {e}", 'error')
        return []
|
||||
|
||||
async def get_creator(self, platform: str, creator_id: str) -> Optional[Creator]:
    """Look up a creator, preferring the profile endpoint.

    When the profile request does not return HTTP 200, a minimal Creator is
    assembled from the first entry of the creator's post listing instead.

    Returns:
        A Creator, or None when neither lookup succeeds or an error occurs.
    """
    self._delay_between_items()
    try:
        http = await self._get_session()

        # First try to get creator profile
        profile_url = f"{self.api_url}/{platform}/user/{creator_id}/profile"
        async with http.get(profile_url) as profile_resp:
            if profile_resp.status == 200:
                profile = await profile_resp.json(content_type=None)
                return Creator.from_api(profile, self.service_id, platform, self.base_url)

        # Fallback: get first post to extract creator info
        posts_url = f"{self.api_url}/{platform}/user/{creator_id}/posts"
        async with http.get(posts_url) as posts_resp:
            if posts_resp.status == 200:
                posts = await posts_resp.json(content_type=None)
                if posts and len(posts) > 0:
                    newest = posts[0]

                    # Construct image URLs - use .st instead of .party
                    # (coomer.party/kemono.party images are at .st)
                    from urllib.parse import urlparse
                    site_host = urlparse(self.base_url).netloc.replace('.party', '.st')
                    image_host = f"img.{site_host}"

                    return Creator(
                        creator_id=creator_id,
                        service_id=self.service_id,
                        platform=platform,
                        username=newest.get('user', creator_id),
                        display_name=newest.get('user', creator_id),
                        profile_image_url=f"https://{image_host}/icons/{platform}/{creator_id}",
                        banner_image_url=f"https://{image_host}/banners/{platform}/{creator_id}"
                    )

        self.log(f"Creator not found: {platform}/{creator_id}", 'warning')
        return None

    except Exception as e:
        self.log(f"Error getting creator {platform}/{creator_id}: {e}", 'error')
        return None
|
||||
|
||||
async def get_creator_posts(self, platform: str, creator_id: str, offset: int = 0) -> List[Post]:
    """Fetch one page of a creator's posts.

    Args:
        platform: Platform segment of the API path
        creator_id: Creator identifier on that platform
        offset: Sent as the 'o' query parameter when greater than zero

    Returns:
        Posts built from the response on HTTP 200, otherwise an empty list
        (non-200 responses and exceptions are logged).
    """
    self._delay_between_items()
    try:
        http = await self._get_session()

        endpoint = f"{self.api_url}/{platform}/user/{creator_id}/posts"
        # The first page is requested without any query string
        query = {}
        if offset > 0:
            query = {'o': offset}

        async with http.get(endpoint, params=query) as response:
            status = response.status
            if status == 200:
                payload = await response.json(content_type=None)
                return [
                    Post.from_api(item, self.service_id, platform, creator_id, self.base_url)
                    for item in payload
                ]
            if status == 404:
                self.log(f"Creator not found: {platform}/{creator_id}", 'warning')
            else:
                self.log(f"Failed to get posts: HTTP {response.status}", 'warning')
        return []

    except Exception as e:
        self.log(f"Error getting posts for {platform}/{creator_id}: {e}", 'error')
        return []
|
||||
|
||||
async def get_all_creator_posts(self, platform: str, creator_id: str,
                                since_date: str = None, max_posts: int = None,
                                progress_callback=None) -> List[Post]:
    """Collect a creator's posts across pages.

    Pages are requested through ``get_creator_posts`` with the offset
    advancing by 50 per page, until a page comes back empty.

    Args:
        platform: Platform segment of the API path.
        creator_id: Creator identifier on that platform.
        since_date: When set, collection stops at the first post whose
            ``published_at`` compares ``<=`` this value.
        max_posts: When set, collection stops once this many posts are held.
        progress_callback: Optional callable invoked after each completed
            page as ``progress_callback(page_number, posts_so_far)``.

    Returns:
        The posts collected before a stop condition was hit.
    """
    collected = []
    pages_done = 0

    self.log(f"Fetching posts for {platform}/{creator_id}", 'info')

    batch = await self.get_creator_posts(platform, creator_id, 0)
    while batch:
        for post in batch:
            published = post.published_at
            # Stop if we've reached posts we've already seen
            if since_date and published and published <= since_date:
                self.log(f"Reached already-seen post date: {post.published_at}", 'debug')
                return collected

            collected.append(post)

            if max_posts and len(collected) >= max_posts:
                self.log(f"Reached max posts limit: {max_posts}", 'debug')
                return collected

        pages_done += 1

        if progress_callback:
            progress_callback(pages_done, len(collected))

        self._delay_between_batches()

        # Each page advances the offset by 50
        batch = await self.get_creator_posts(platform, creator_id, pages_done * 50)

    self.log(f"Fetched {len(collected)} posts for {platform}/{creator_id}", 'info')
    return collected
|
||||
|
||||
async def get_post(self, platform: str, creator_id: str, post_id: str) -> Optional[Post]:
    """Fetch one post by its ID.

    Args:
        platform: Platform segment of the API path.
        creator_id: Creator identifier on that platform.
        post_id: Identifier of the post to fetch.

    Returns:
        The post parsed via ``Post.from_api``, or ``None`` on a non-200
        response or when the request raises.
    """
    self._delay_between_items()
    try:
        session = await self._get_session()

        endpoint = f"{self.api_url}/{platform}/user/{creator_id}/post/{post_id}"
        async with session.get(endpoint) as resp:
            if resp.status != 200:
                return None

            payload = await resp.json(content_type=None)
            # Single post endpoint wraps response in {"post": {...}}
            is_wrapped = isinstance(payload, dict) and 'post' in payload
            post_data = payload['post'] if is_wrapped else payload
            return Post.from_api(post_data, self.service_id, platform, creator_id, self.base_url)

    except Exception as e:
        self.log(f"Error getting post {post_id}: {e}", 'error')
        return None
|
||||
|
||||
async def search_creators(self, query: str, platform: str = None) -> List[Dict]:
    """Search for creators whose name contains ``query`` (case-insensitive).

    The full creator list is loaded through ``get_all_creators`` and
    filtered locally, because the API has no search endpoint.

    Args:
        query: Substring to look for in the creator name.
        platform: When given, only creators whose ``service`` equals this
            value are considered.

    Returns:
        Up to 50 dicts with the keys ``id``, ``name``, ``service``,
        ``indexed``, ``updated`` and ``favorited``, ordered by ``favorited``
        descending. An empty list when loading or filtering raises.
    """
    self._delay_between_items()
    try:
        # Get all creators and filter locally (API doesn't have search endpoint)
        all_creators = await self.get_all_creators()

        needle = query.lower()
        results = []

        for creator in all_creators:
            if platform and creator.get('service') != platform:
                continue

            name = (creator.get('name') or '').lower()
            if needle in name:
                results.append({
                    'id': creator.get('id'),
                    'name': creator.get('name'),
                    'service': creator.get('service'),
                    'indexed': creator.get('indexed'),
                    'updated': creator.get('updated'),
                    'favorited': creator.get('favorited', 0),
                })

        # Sort by favorited count (popularity). A null 'favorited' value is
        # ranked as 0: comparing None with int would raise TypeError and make
        # the whole search return nothing.
        results.sort(key=lambda item: item.get('favorited') or 0, reverse=True)
        return results[:50]  # Limit results

    except Exception as e:
        self.log(f"Error searching creators: {e}", 'error')
        return []
|
||||
|
||||
def get_attachment_url(self, server_path: str) -> str:
    """Turn a server path into a full download URL.

    Args:
        server_path: Path as reported by the API, or an absolute URL.

    Returns:
        ``''`` for an empty path, the input unchanged when it already
        starts with ``http``, otherwise ``{base_url}/data{server_path}``.
    """
    if server_path and not server_path.startswith('http'):
        return f"{self.base_url}/data{server_path}"
    # Either empty (-> '') or already an absolute URL (-> unchanged)
    return server_path or ''
|
||||
|
||||
def get_thumbnail_url(self, server_path: str) -> str:
    """Build the thumbnail URL for an attachment.

    Args:
        server_path: Path as reported by the API, or an absolute URL.

    Returns:
        ``''`` for an empty path, the input unchanged when it already
        starts with ``http``, otherwise
        ``{base_url}/thumbnail/data{server_path}``.
    """
    if server_path and not server_path.startswith('http'):
        return f"{self.base_url}/thumbnail/data{server_path}"
    # Either empty (-> '') or already an absolute URL (-> unchanged)
    return server_path or ''
|
||||
|
||||
@classmethod
def get_supported_platforms(cls, service_id: str) -> List[str]:
    """Return the platforms listed for ``service_id`` in ``SUPPORTED_PLATFORMS``.

    Returns an empty list when the service has no entry.
    """
    platforms_by_service = cls.SUPPORTED_PLATFORMS
    return platforms_by_service.get(service_id, [])
|
||||
|
||||
@classmethod
def is_valid_service(cls, service_id: str) -> bool:
    """Return ``True`` when ``service_id`` is a key of ``SERVICE_URLS``."""
    known_services = cls.SERVICE_URLS
    return service_id in known_services
|
||||
|
||||
@classmethod
def get_service_ids(cls) -> List[str]:
    """Return every key of ``SERVICE_URLS`` as a new list."""
    return [*cls.SERVICE_URLS.keys()]
|
||||
389
modules/paid_content/bellazon_client.py
Normal file
389
modules/paid_content/bellazon_client.py
Normal file
@@ -0,0 +1,389 @@
|
||||
"""
|
||||
Bellazon Forum Thread Client for Paid Content
|
||||
|
||||
Scrapes Bellazon forum threads (Invision Power Suite) treating each thread
|
||||
as a "creator" and each reply with media as a post.
|
||||
|
||||
Only bellazon-hosted uploads are captured (external image host links are
|
||||
unreliable/ephemeral). Video attachments (attachment.php) are also captured.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class BellazonClient(LoggingMixin):
|
||||
"""Client for scraping Bellazon forum threads."""
|
||||
|
||||
SERVICE_ID = 'bellazon'
|
||||
PLATFORM = 'bellazon'
|
||||
BASE_URL = 'https://www.bellazon.com/main'
|
||||
|
||||
HEADERS = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
|
||||
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
}
|
||||
|
||||
# Extensions considered images
|
||||
IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
|
||||
# Extensions considered videos
|
||||
VIDEO_EXTS = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
|
||||
|
||||
def __init__(self, log_callback=None):
    """Set up logging for the client.

    Args:
        log_callback: Optional callback handed to ``_init_logger``.
    """
    self._init_logger('PaidContent', log_callback, default_module='Bellazon')
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_profile_info(self, topic_id: str) -> Optional[Dict]:
    """Fetch the first page of a thread and summarise it as a profile.

    Args:
        topic_id: Topic id used in the thread URL.

    Returns:
        A dict with ``username`` (URL slug), ``display_name`` (thread title,
        falling back to the slug), ``post_count`` (estimate: comments found
        on the first page, or 20 if none, times the page count),
        ``page_count`` and ``topic_url``. ``None`` on a non-200 response or
        when the request raises.
    """
    # Bellazon requires a slug in the URL but redirects to the correct one
    placeholder_url = f'{self.BASE_URL}/topic/{topic_id}-x/'
    timeout = aiohttp.ClientTimeout(total=30)

    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(placeholder_url, headers=self.HEADERS,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"Bellazon topic {topic_id} returned HTTP {resp.status}", 'warning')
                    return None
                final_url = str(resp.url)
                page_html = await resp.text()
    except Exception as e:
        self.log(f"Failed to fetch Bellazon topic {topic_id}: {e}", 'error')
        return None

    # Slug comes from the post-redirect URL: /topic/{id}-{slug}/
    slug = self._extract_slug(final_url, topic_id)
    # Thread title comes from <h1> (or <title> as fallback)
    title = self._extract_title(page_html)
    # Page count comes from the "Page X of Y" marker
    page_count = self._extract_page_count(page_html)

    # Count comments on this page to estimate the total
    comments_on_page = len(re.findall(r'data-commentid="(\d+)"', page_html))
    if not comments_on_page:
        comments_on_page = 20

    canonical_url = final_url.split('?')[0].rstrip('/')
    return {
        'username': slug,
        'display_name': title or slug,
        'post_count': comments_on_page * page_count,
        'page_count': page_count,
        'topic_url': canonical_url,
    }
|
||||
|
||||
async def get_posts(self, topic_id: str, topic_url: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> List[Post]:
    """Scrape all pages of a thread and return the posts that contain media.

    Args:
        topic_id: Topic id; stored as ``creator_id`` on each returned post.
        topic_url: Thread URL without the ``/page/N/`` suffix. A trailing
            slash is tolerated.
        known_post_ids: Post ids to skip. When a non-empty set is passed it
            is updated in place by ``_parse_page`` with newly seen ids.
        progress_callback: Optional callable invoked after each parsed page
            as ``progress_callback(posts_so_far)``.

    Returns:
        Posts gathered so far. Scraping stops at the first page that cannot
        be fetched; an unexpected exception is logged and the partial result
        is still returned.
    """
    known = known_post_ids or set()
    posts: List[Post] = []
    # Bound before the try block: the summary log below reads it even when
    # the scrape fails before the page count is known.
    page_count = 0

    # Avoid "//page/N/" when the caller passes a URL ending in "/"
    base_url = topic_url.rstrip('/')
    timeout = aiohttp.ClientTimeout(total=30)

    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # Fetch page 1 to get page count
            page_html = await self._fetch_page(session, f'{base_url}/page/1/')
            if page_html is None:
                return posts

            page_count = self._extract_page_count(page_html)
            self.log(f"Thread has {page_count} pages", 'info')

            # Parse page 1
            posts.extend(self._parse_page(page_html, topic_id, known))
            if progress_callback:
                progress_callback(len(posts))

            # Parse remaining pages
            for page_num in range(2, page_count + 1):
                await asyncio.sleep(1)  # Rate limit

                page_html = await self._fetch_page(session, f'{base_url}/page/{page_num}/')
                if page_html is None:
                    self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                    break

                page_posts = self._parse_page(page_html, topic_id, known)
                posts.extend(page_posts)

                if progress_callback:
                    progress_callback(len(posts))

                self.log(f"Page {page_num}/{page_count}: {len(page_posts)} posts with media", 'debug')

    except Exception as e:
        self.log(f"Error scraping Bellazon thread: {e}", 'error')

    self.log(f"Total: {len(posts)} posts with media from {page_count} pages", 'info')
    return posts
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTML parsing helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _parse_page(self, page_html: str, topic_id: str, known: Set[str]) -> List[Post]:
|
||||
"""Parse a single page of HTML and return Post objects for comments with media."""
|
||||
posts: List[Post] = []
|
||||
|
||||
# Split HTML into comment blocks using data-commentid markers
|
||||
# Each comment starts with data-commentid="..." and contains a content block
|
||||
comment_pattern = re.compile(
|
||||
r'data-commentid="(\d+)"\s+data-quotedata="([^"]*)"',
|
||||
re.DOTALL
|
||||
)
|
||||
|
||||
matches = list(comment_pattern.finditer(page_html))
|
||||
if not matches:
|
||||
return posts
|
||||
|
||||
for i, match in enumerate(matches):
|
||||
comment_id = match.group(1)
|
||||
post_id = f"comment_{comment_id}"
|
||||
|
||||
if post_id in known:
|
||||
continue
|
||||
|
||||
quotedata_raw = match.group(2)
|
||||
|
||||
# Parse quote data for username and timestamp
|
||||
username, timestamp = self._parse_quotedata(quotedata_raw)
|
||||
|
||||
# Extract the content block for this comment
|
||||
start = match.end()
|
||||
end = matches[i + 1].start() if i + 1 < len(matches) else len(page_html)
|
||||
content_block = page_html[start:end]
|
||||
|
||||
# Find the actual content within data-role="commentContent"
|
||||
# The closing pattern is </div> followed by blank lines then </div>
|
||||
content_match = re.search(
|
||||
r'data-role="commentContent"[^>]*>(.*?)</div>\s*\n\s*\n\s*</div>',
|
||||
content_block, re.DOTALL
|
||||
)
|
||||
if not content_match:
|
||||
# Fallback: grab everything from commentContent to ipsEntry__foot
|
||||
content_match = re.search(
|
||||
r'data-role="commentContent"[^>]*>(.*?)(?=ipsEntry__foot)',
|
||||
content_block, re.DOTALL
|
||||
)
|
||||
if not content_match:
|
||||
continue
|
||||
|
||||
content_html = content_match.group(1)
|
||||
|
||||
# Extract media from content
|
||||
attachments = self._extract_media(content_html)
|
||||
|
||||
if not attachments:
|
||||
continue # Skip text-only replies
|
||||
|
||||
# Build published_at from timestamp
|
||||
published_at = None
|
||||
if timestamp:
|
||||
try:
|
||||
dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
|
||||
published_at = dt.isoformat()
|
||||
except (ValueError, OSError):
|
||||
pass
|
||||
|
||||
post = Post(
|
||||
post_id=post_id,
|
||||
service_id=self.SERVICE_ID,
|
||||
platform=self.PLATFORM,
|
||||
creator_id=topic_id,
|
||||
title='',
|
||||
content=f"Posted by {username}" if username else '',
|
||||
published_at=published_at,
|
||||
attachments=attachments,
|
||||
)
|
||||
posts.append(post)
|
||||
known.add(post_id)
|
||||
|
||||
return posts
|
||||
|
||||
def _extract_media(self, content_html: str) -> List[Attachment]:
|
||||
"""Extract image and video attachments from a comment's HTML content."""
|
||||
attachments: List[Attachment] = []
|
||||
seen_urls: set = set()
|
||||
|
||||
# 1. Bellazon-hosted images: <a class="ipsAttachLink ipsAttachLink_image" href="...full..."><img src="...thumb...">
|
||||
for m in re.finditer(
|
||||
r'ipsAttachLink_image"\s+href="([^"]+)"[^>]*><img[^>]*src="([^"]+)"',
|
||||
content_html
|
||||
):
|
||||
full_url = self._normalize_url(m.group(1))
|
||||
if full_url in seen_urls:
|
||||
continue
|
||||
# Skip thumbnails as the full URL
|
||||
if '_thumb.' in full_url or '.thumb.' in full_url:
|
||||
continue
|
||||
seen_urls.add(full_url)
|
||||
attachments.append(self._make_attachment(full_url, 'image'))
|
||||
|
||||
# 2. Direct image/video links from bellazon uploads not caught by pattern 1
|
||||
for m in re.finditer(
|
||||
r'href="([^"]*bellazon\.com/main/uploads/[^"]+)"',
|
||||
content_html
|
||||
):
|
||||
url = self._normalize_url(m.group(1))
|
||||
if url in seen_urls:
|
||||
continue
|
||||
if '_thumb.' in url or '.thumb.' in url:
|
||||
continue
|
||||
ext = self._get_extension(url)
|
||||
if ext in self.IMAGE_EXTS or ext in self.VIDEO_EXTS:
|
||||
seen_urls.add(url)
|
||||
file_type = 'image' if ext in self.IMAGE_EXTS else 'video'
|
||||
attachments.append(self._make_attachment(url, file_type))
|
||||
|
||||
# 3. Video <source> tags: <source src="//www.bellazon.com/main/uploads/...MP4" type="video/mp4">
|
||||
for m in re.finditer(
|
||||
r'<source\s+src="([^"]+)"[^>]*type="video/',
|
||||
content_html
|
||||
):
|
||||
url = self._normalize_url(m.group(1))
|
||||
if url in seen_urls:
|
||||
continue
|
||||
seen_urls.add(url)
|
||||
name = self._filename_from_url(url)
|
||||
attachments.append(self._make_attachment(url, 'video', name=name))
|
||||
|
||||
# 4. Video/file attachments: <a href="...attachment.php?id=XXX">filename.MP4</a>
|
||||
# These are protocol-relative URLs like //www.bellazon.com/main/applications/...
|
||||
for m in re.finditer(
|
||||
r'href="([^"]*attachment\.php\?id=\d+[^"]*)"[^>]*>([^<]+)',
|
||||
content_html
|
||||
):
|
||||
att_url = self._normalize_url(m.group(1))
|
||||
filename = m.group(2).strip()
|
||||
if att_url in seen_urls:
|
||||
continue
|
||||
ext = self._get_extension(filename)
|
||||
if ext in self.VIDEO_EXTS or ext in self.IMAGE_EXTS:
|
||||
seen_urls.add(att_url)
|
||||
file_type = 'video' if ext in self.VIDEO_EXTS else 'image'
|
||||
attachments.append(self._make_attachment(att_url, file_type, name=filename))
|
||||
|
||||
return attachments
|
||||
|
||||
def _make_attachment(self, url: str, file_type: str, name: str = None) -> Attachment:
    """Build an ``Attachment`` for ``url``.

    Args:
        url: Absolute URL; used as both ``server_path`` and ``download_url``.
        file_type: Value stored as the attachment's ``file_type``.
        name: Display filename; derived from the URL path when ``None``.

    Returns:
        The attachment, with ``extension`` set to ``None`` when the name has
        no extension.
    """
    filename = self._filename_from_url(url) if name is None else name
    extension = self._get_extension(filename) or None

    return Attachment(
        name=filename,
        file_type=file_type,
        extension=extension,
        server_path=url,  # Used as dedup key
        download_url=url,
    )
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Utility helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """Fetch one page and return its body text.

    Args:
        session: Open session used for the GET request.
        url: Page URL; redirects are followed.

    Returns:
        The response text on HTTP 200; ``None`` on any other status or when
        the request raises (both cases are logged as warnings).
    """
    try:
        async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
            if resp.status == 200:
                return await resp.text()
            self.log(f"HTTP {resp.status} for {url}", 'warning')
            return None
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
        return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_slug(url: str, topic_id: str) -> str:
|
||||
"""Extract slug from URL like /topic/39089-india-reynolds/"""
|
||||
m = re.search(rf'/topic/{re.escape(topic_id)}-([^/?#]+)', url)
|
||||
if m:
|
||||
return m.group(1).strip('/')
|
||||
return topic_id
|
||||
|
||||
@staticmethod
|
||||
def _extract_title(page_html: str) -> Optional[str]:
|
||||
"""Extract thread title from <h1>."""
|
||||
m = re.search(r'<h1[^>]*>([^<]+)</h1>', page_html)
|
||||
if m:
|
||||
return html.unescape(m.group(1).strip())
|
||||
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
title = html.unescape(m.group(1).strip())
|
||||
# Remove site suffix
|
||||
title = re.sub(r'\s*[-–—]\s*Bellazon.*$', '', title, flags=re.IGNORECASE).strip()
|
||||
return title
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_page_count(page_html: str) -> int:
|
||||
"""Extract total page count from 'Page X of Y'."""
|
||||
m = re.search(r'Page\s+\d+\s+of\s+(\d+)', page_html)
|
||||
if m:
|
||||
return int(m.group(1))
|
||||
return 1
|
||||
|
||||
@staticmethod
|
||||
def _parse_quotedata(raw: str) -> tuple:
|
||||
"""Parse HTML-encoded JSON quotedata, return (username, unix_timestamp)."""
|
||||
try:
|
||||
decoded = html.unescape(raw)
|
||||
data = json.loads(decoded)
|
||||
return data.get('username', ''), data.get('timestamp')
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
return '', None
|
||||
|
||||
@staticmethod
|
||||
def _normalize_url(url: str) -> str:
|
||||
"""Normalize a URL: handle protocol-relative, decode HTML entities, make absolute."""
|
||||
url = html.unescape(url) # & → &
|
||||
if url.startswith('//'):
|
||||
url = 'https:' + url
|
||||
elif url.startswith('/'):
|
||||
url = 'https://www.bellazon.com' + url
|
||||
elif not url.startswith('http'):
|
||||
url = 'https://www.bellazon.com/main/' + url
|
||||
return url
|
||||
|
||||
@staticmethod
|
||||
def _get_extension(filename_or_url: str) -> str:
|
||||
"""Get lowercase file extension from a filename or URL."""
|
||||
# Strip query params
|
||||
clean = filename_or_url.split('?')[0].split('#')[0]
|
||||
if '.' in clean.split('/')[-1]:
|
||||
return clean.rsplit('.', 1)[-1].lower()
|
||||
return ''
|
||||
|
||||
@staticmethod
|
||||
def _filename_from_url(url: str) -> str:
|
||||
"""Extract filename from URL path."""
|
||||
path = urlparse(url).path
|
||||
name = path.rstrip('/').split('/')[-1]
|
||||
return name if name else 'unnamed'
|
||||
468
modules/paid_content/besteyecandy_client.py
Normal file
468
modules/paid_content/besteyecandy_client.py
Normal file
@@ -0,0 +1,468 @@
|
||||
"""
|
||||
BestEyeCandy.com Client for Paid Content
|
||||
|
||||
Scrapes celebrity photo galleries from BestEyeCandy.com.
|
||||
Each celeb has a unique CID and paginated photo listings.
|
||||
|
||||
Optimization: Full-res URLs follow a predictable pattern. We visit ONE
|
||||
detail page to determine the pattern (server hostname + name format),
|
||||
then construct all remaining URLs from photo IDs found on listing pages.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class BestEyeCandyClient(LoggingMixin):
|
||||
"""Client for scraping BestEyeCandy.com celebrity photo galleries."""
|
||||
|
||||
SERVICE_ID = 'besteyecandy'
|
||||
PLATFORM = 'besteyecandy'
|
||||
BASE_URL = 'https://besteyecandy.com'
|
||||
|
||||
HEADERS = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
|
||||
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
}
|
||||
|
||||
def __init__(self, unified_db=None, log_callback=None):
    """Set up logging and remember the database handle.

    Args:
        unified_db: Optional database object; stored as ``self.unified_db``
            and used by ``_get_cookies`` to load stored cookies.
        log_callback: Optional callback handed to ``_init_logger``.
    """
    self._init_logger('PaidContent', log_callback, default_module='BestEyeCandy')
    self.unified_db = unified_db
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Cookie support
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _get_cookies(self) -> Optional[list]:
|
||||
"""Load cookies from the scrapers table for besteyecandy."""
|
||||
if not self.unified_db:
|
||||
return None
|
||||
|
||||
try:
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?",
|
||||
(self.SERVICE_ID,))
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
data = json.loads(row[0])
|
||||
if isinstance(data, dict) and 'cookies' in data:
|
||||
return data['cookies']
|
||||
elif isinstance(data, list):
|
||||
return data
|
||||
except Exception as e:
|
||||
self.log(f"Could not load cookies: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
def _build_cookie_jar(self, cookies_list: list) -> aiohttp.CookieJar:
    """Build an aiohttp ``CookieJar`` from a list of cookie dicts.

    Args:
        cookies_list: Dicts with ``name`` and ``value`` and optionally
            ``domain``, ``path`` (default ``/``) and ``secure``.

    Returns:
        A jar created with ``unsafe=True`` holding every cookie that could
        be converted. Cookies without a name, or with a name ``SimpleCookie``
        rejects, are skipped.
    """
    from http.cookies import CookieError, SimpleCookie

    jar = aiohttp.CookieJar(unsafe=True)
    for cookie in cookies_list:
        name = cookie.get('name', '')
        if not name:
            # SimpleCookie raises CookieError for an empty key
            continue

        try:
            morsels = SimpleCookie()
            morsels[name] = cookie.get('value', '')
            morsels[name]['domain'] = cookie.get('domain', '')
            morsels[name]['path'] = cookie.get('path', '/')
            if cookie.get('secure'):
                morsels[name]['secure'] = True
        except CookieError as e:
            self.log(f"Skipping invalid cookie {name!r}: {e}", 'debug')
            continue

        # No response_url is passed: the jar then scopes each cookie by its
        # own 'domain' attribute. A urllib ParseResult must not be passed
        # here -- aiohttp reads response_url.raw_host, which only exists on
        # yarl.URL objects.
        jar.update_cookies(morsels)

    return jar
|
||||
|
||||
def _create_session(self, timeout: aiohttp.ClientTimeout = None) -> aiohttp.ClientSession:
    """Create an aiohttp session, pre-loaded with stored cookies if any.

    Args:
        timeout: Session timeout; a 60 second total timeout is used when
            ``None``.

    Returns:
        A new ``ClientSession``. When no cookies are stored a warning is
        logged and the session is created without a cookie jar.
    """
    effective_timeout = aiohttp.ClientTimeout(total=60) if timeout is None else timeout

    cookies_list = self._get_cookies()
    if not cookies_list:
        self.log("No cookies found for besteyecandy, requests may fail", 'warning')
        return aiohttp.ClientSession(timeout=effective_timeout)

    jar = self._build_cookie_jar(cookies_list)
    self.log(f"Loaded {len(cookies_list)} cookies for session", 'debug')
    return aiohttp.ClientSession(timeout=effective_timeout, cookie_jar=jar)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_profile_info(self, cid: str, celeb_slug: str) -> Optional[Dict]:
    """Fetch page 1 of a celeb's listing and summarise it as a profile.

    Args:
        cid: Celebrity id used in the listing URL.
        celeb_slug: URL slug of the celebrity.

    Returns:
        A dict with ``username`` (the slug), ``display_name`` (name parsed
        from the page, else the slug with dashes replaced by spaces),
        ``post_count``, ``page_count`` and ``celeb_url`` (the page-1 listing
        URL). ``None`` on a non-200 response or when the request raises.
    """
    listing_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                   f'sortedby-age/page-1/{celeb_slug}.html')

    try:
        async with self._create_session() as session:
            async with session.get(listing_url, headers=self.HEADERS,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"BestEyeCandy cid {cid} returned HTTP {resp.status}",
                             'warning')
                    return None
                page_html = await resp.text()
    except Exception as e:
        self.log(f"Failed to fetch BestEyeCandy cid {cid}: {e}", 'error')
        return None

    # Extract celeb name from page title or heading
    display_name = self._extract_celeb_name(page_html)
    if not display_name:
        display_name = celeb_slug.replace('-', ' ')

    # Extract total photos and pages; 48 per page is assumed when the
    # listing yields no photo ids
    total_photos = self._extract_total_photos(page_html)
    photos_per_page = len(self._extract_photo_ids(page_html)) or 48
    page_count = self._extract_page_count(page_html,
                                          photos_per_page=photos_per_page)

    return {
        'username': celeb_slug,
        'display_name': display_name,
        'post_count': total_photos,
        'page_count': page_count,
        'celeb_url': listing_url,
    }
|
||||
|
||||
async def get_posts(self, cid: str, celeb_slug: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> List[Post]:
    """Scrape all listing pages and return posts with full-res image URLs.

    Each listing page becomes one Post with one Attachment per photo on it.
    Post IDs are "page_N" (e.g. "page_1", "page_2", ...).

    Phase 1: Fetch page 1, take its first photo ID and visit that photo's
             detail page to learn the full-res URL pattern.
    Phase 2: Walk the listing pages and build one Post per page.

    Args:
        cid: Celebrity id used in the listing URLs and as ``creator_id``.
        celeb_slug: URL slug of the celebrity.
        known_post_ids: Page-post ids to skip; those pages are still fetched
            and counted, but produce no Post.
        progress_callback: Optional callable invoked once per page with a
            human-readable progress string.

    Returns:
        The new page-posts. An empty list when page 1 cannot be fetched or
        the URL pattern cannot be determined. An unexpected exception is
        logged and the posts built so far are returned.
    """
    known = known_post_ids or set()
    posts: List[Post] = []
    total_photos = 0
    url_pattern = None

    def listing_url(number: int) -> str:
        return (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                f'sortedby-age/page-{number}/{celeb_slug}.html')

    try:
        async with self._create_session() as session:
            # -- Phase 1: Fetch page 1 and determine full-res URL pattern --
            page_html = await self._fetch_page(session, listing_url(1))
            if page_html is None:
                return []

            first_page_ids = self._extract_photo_ids(page_html)

            # Estimate page count for progress display
            photos_per_page = len(first_page_ids) or 48
            estimated_pages = self._extract_page_count(
                page_html, photos_per_page=photos_per_page)
            self.log(f"Estimated {estimated_pages} pages of photos "
                     f"({photos_per_page}/page)", 'info')

            # Discover full-res URL pattern from first photo
            if first_page_ids:
                url_pattern = await self._discover_url_pattern(
                    session, first_page_ids[0], cid, celeb_slug)

            if not url_pattern:
                self.log("Could not determine full-res URL pattern", 'error')
                return []

            self.log(f"URL pattern: server={url_pattern['server']}, "
                     f"name_format={url_pattern['name_format']}, "
                     f"ext={url_pattern['ext']}", 'info')

            # -- Phase 2: Paginate all pages, one Post per page --
            page_num = 1
            while True:
                # Page 1 was already fetched above; later pages are fetched here
                if page_num > 1:
                    await asyncio.sleep(2)  # Rate limit

                    page_html = await self._fetch_page(session, listing_url(page_num))
                    if page_html is None:
                        self.log(f"Failed to fetch page {page_num}, stopping",
                                 'warning')
                        break

                page_ids = self._extract_photo_ids(page_html)
                if not page_ids:
                    self.log(f"Page {page_num}: no photos, stopping", 'info')
                    break

                total_photos += len(page_ids)
                more_pages = self._has_next_page(page_html)

                post_id = f"page_{page_num}"
                if post_id in known:
                    # Known pages still count towards the photo total
                    self.log(f"Page {page_num}: already known, skipping",
                             'debug')
                    if progress_callback:
                        progress_callback(
                            f"Page {page_num}/~{estimated_pages} — "
                            f"{total_photos} photos (skipped known)")
                else:
                    # Build attachments for all photos on this page
                    attachments = []
                    for photo_id in page_ids:
                        dl_url = self._construct_full_res_url(url_pattern, photo_id)
                        attachments.append(Attachment(
                            name=dl_url.rsplit('/', 1)[-1],
                            file_type='image',
                            extension=url_pattern.get('ext', 'jpg'),
                            server_path=dl_url,
                            download_url=dl_url,
                        ))

                    # The listing carries no per-page date, so the scrape
                    # time is recorded as published_at
                    posts.append(Post(
                        post_id=post_id,
                        service_id=self.SERVICE_ID,
                        platform=self.PLATFORM,
                        creator_id=cid,
                        title=f"Page {page_num}",
                        content=f"{len(page_ids)} photos",
                        published_at=datetime.now(tz=timezone.utc).isoformat(),
                        attachments=attachments,
                    ))

                    if progress_callback:
                        progress_callback(
                            f"Page {page_num}/~{estimated_pages} — "
                            f"{total_photos} photos")

                    self.log(f"Page {page_num}/~{estimated_pages}: "
                             f"{len(page_ids)} photos", 'debug')

                if not more_pages:
                    break
                page_num += 1

    except Exception as e:
        self.log(f"Error scraping BestEyeCandy: {e}", 'error')

    self.log(f"Total: {len(posts)} new page-posts with "
             f"{total_photos} photos across all pages", 'info')
    return posts
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# URL pattern discovery
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _discover_url_pattern(self, session: aiohttp.ClientSession,
                                photo_id: str, cid: str,
                                celeb_slug: str) -> Optional[Dict]:
    """Visit one photo's detail page to learn the full-res URL pattern.

    Args:
        session: Open session used for the request.
        photo_id: ID of the photo whose detail page is visited.
        cid: Celebrity id used in the detail URL.
        celeb_slug: URL slug of the celebrity.

    Returns:
        A dict with ``server`` (host of the full-res URL), ``dir_pattern``
        and ``name_format`` (directory and filename stem with the photo ID
        replaced by ``{ID}``), ``ext`` (defaults to ``jpg`` when the
        filename has no dot) and ``example_url``. ``None`` when the page
        cannot be fetched or contains no matching URL.
    """
    detail_url = (f'{self.BASE_URL}/section/celeb-photogallery/'
                  f'cid-{cid}/{celeb_slug}/photo-{photo_id}.html')

    await asyncio.sleep(2)  # Rate limit
    page_html = await self._fetch_page(session, detail_url)
    if page_html is None:
        return None

    # Look for full-res image URL in the detail page
    # Pattern: <img src="https://euX.besteyecandy.com/section/large-photos/area-female/besteyecandy-{ID}/{Name}_{ID}_BestEyeCandyCOM.jpg">
    # or <a href="..."> with similar pattern
    candidate_patterns = (
        r'(https?://[a-z0-9]+\.besteyecandy\.com/section/large-photos/[^"\'>\s]+)',
        r'(https?://[a-z0-9]+\.besteyecandy\.com/[^"\'>\s]*besteyecandy-' + re.escape(photo_id) + r'[^"\'>\s]*)',
    )
    # Patterns are tried in order; the first hit wins
    hits = (re.search(candidate, page_html) for candidate in candidate_patterns)
    full_res_url = next((hit.group(1) for hit in hits if hit), None)

    if not full_res_url:
        self.log(f"Could not find full-res URL on detail page for photo {photo_id}",
                 'error')
        return None

    self.log(f"Found full-res URL: {full_res_url}", 'debug')

    # Parse the URL to extract the pattern components
    parsed = urlparse(full_res_url)

    # e.g. /section/large-photos/area-female/besteyecandy-7727820 and
    #      Myleene_Klass_7727820_BestEyeCandyCOM.jpg
    path_parts = parsed.path.rsplit('/', 1)
    path_dir = path_parts[0]
    filename = path_parts[-1]

    stem, dot, suffix = filename.rpartition('.')
    if dot:
        ext = suffix
    else:
        # No dot in the filename: keep the whole name and assume jpg
        stem, ext = filename, 'jpg'

    return {
        'server': parsed.netloc,  # e.g., eu4.besteyecandy.com
        # e.g., /section/large-photos/area-female/besteyecandy-{ID}
        'dir_pattern': path_dir.replace(photo_id, '{ID}'),
        # e.g., Myleene_Klass_{ID}_BestEyeCandyCOM
        'name_format': stem.replace(photo_id, '{ID}'),
        'ext': ext,
        'example_url': full_res_url,
    }
|
||||
|
||||
def _construct_full_res_url(self, url_pattern: Dict, photo_id: str) -> str:
|
||||
"""Construct the full-res URL for a photo ID using the discovered pattern."""
|
||||
dir_path = url_pattern['dir_pattern'].replace('{ID}', photo_id)
|
||||
filename = url_pattern['name_format'].replace('{ID}', photo_id) + '.' + url_pattern['ext']
|
||||
return f"https://{url_pattern['server']}{dir_path}/{filename}"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTML parsing helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _extract_photo_ids(self, page_html: str) -> List[str]:
|
||||
"""Extract photo IDs from a listing page.
|
||||
|
||||
Photo links look like: href="...photo-12345.html"
|
||||
"""
|
||||
ids = re.findall(r'href="[^"]*photo-(\d+)\.html"', page_html)
|
||||
# Deduplicate while preserving order
|
||||
seen = set()
|
||||
unique_ids = []
|
||||
for pid in ids:
|
||||
if pid not in seen:
|
||||
seen.add(pid)
|
||||
unique_ids.append(pid)
|
||||
return unique_ids
|
||||
|
||||
@staticmethod
|
||||
def _extract_celeb_name(page_html: str) -> Optional[str]:
|
||||
"""Extract celebrity name from the page."""
|
||||
# Try <title> tag: "Myleene Klass Photo Collection @ ...::: BestEyeCandy.com :::..."
|
||||
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
title = html.unescape(m.group(1).strip())
|
||||
# Remove everything from "Photo Collection" or "@" onwards
|
||||
title = re.sub(r'\s*Photo\s+Collection.*$', '', title,
|
||||
flags=re.IGNORECASE).strip()
|
||||
title = re.sub(r'\s*@.*$', '', title).strip()
|
||||
# Fallback: remove BestEyeCandy suffix
|
||||
title = re.sub(r'\s*[-\u2013\u2014|]?\s*\.{0,3}:{0,3}\s*BestEyeCandy.*$', '',
|
||||
title, flags=re.IGNORECASE).strip()
|
||||
if title:
|
||||
return title
|
||||
|
||||
# Try <h1> or <h2>
|
||||
m = re.search(r'<h[12][^>]*>([^<]+)</h[12]>', page_html)
|
||||
if m:
|
||||
return html.unescape(m.group(1).strip())
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_total_photos(page_html: str) -> int:
|
||||
"""Extract total photo count from the page.
|
||||
|
||||
Handles European format (15.660) and US format (15,660).
|
||||
"""
|
||||
# Look for "N.NNN photos" or "N,NNN photos" or "NNN photos"
|
||||
# Require leading digit to avoid matching ", photo" from keywords
|
||||
m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
num_str = m.group(1)
|
||||
# European format uses dots as thousands separators: 15.660
|
||||
# US format uses commas: 15,660
|
||||
# Remove both dots and commas (they're thousands separators)
|
||||
num_str = num_str.replace('.', '').replace(',', '')
|
||||
try:
|
||||
return int(num_str)
|
||||
except ValueError:
|
||||
pass
|
||||
return 0
|
||||
|
||||
@staticmethod
|
||||
def _extract_page_count(page_html: str, photos_per_page: int = 48) -> int:
|
||||
"""Extract total page count from the listing page.
|
||||
|
||||
Uses total photo count divided by photos per page, or falls back
|
||||
to finding the maximum page number in pagination links.
|
||||
"""
|
||||
# Method 1: Calculate from total photos
|
||||
m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
num_str = m.group(1).replace('.', '').replace(',', '')
|
||||
try:
|
||||
total = int(num_str)
|
||||
if total > 0:
|
||||
return (total + photos_per_page - 1) // photos_per_page
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Method 2: Find max page-N in pagination links for same celeb
|
||||
page_nums = [int(x) for x in re.findall(r'/page-(\d+)/', page_html)]
|
||||
if page_nums:
|
||||
return max(page_nums)
|
||||
|
||||
return 1
|
||||
|
||||
@staticmethod
|
||||
def _has_next_page(page_html: str) -> bool:
|
||||
"""Check if there's a 'Next Page' link on the current page."""
|
||||
return 'alt="Next Page"' in page_html
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Utility helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _fetch_page(self, session: aiohttp.ClientSession,
                      url: str) -> Optional[str]:
    """GET ``url`` and return the response body as text.

    Redirects are followed and ``self.HEADERS`` is sent with the request.
    Any non-200 status or any exception is logged at warning level and
    reported to the caller as ``None``.
    """
    try:
        async with session.get(url, headers=self.HEADERS,
                               allow_redirects=True) as response:
            if response.status == 200:
                return await response.text()
            self.log(f"HTTP {response.status} for {url}", 'warning')
            return None
    except Exception as exc:
        self.log(f"Error fetching {url}: {exc}", 'warning')
        return None
|
||||
622
modules/paid_content/coppermine_client.py
Normal file
622
modules/paid_content/coppermine_client.py
Normal file
@@ -0,0 +1,622 @@
|
||||
"""
|
||||
Coppermine Gallery scraper client.
|
||||
|
||||
Coppermine is a PHP photo gallery with a nested structure:
|
||||
categories > sub-categories > albums > photos
|
||||
|
||||
One album maps to one Post with N Attachments.
|
||||
Full-res URLs are derived from thumbnails by stripping the `thumb_` prefix.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urljoin, urlparse, parse_qs
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class CoppermineClient(LoggingMixin):
    """Scraper client for Coppermine photo galleries.

    Crawls a gallery's category tree to find albums and turns each album
    into a ``Post`` carrying one ``Attachment`` per photo.
    """

    # Identifiers stamped onto every Post this client builds.
    SERVICE_ID = 'coppermine'
    PLATFORM = 'coppermine'

    # Browser-like request headers sent with every page fetch.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    # Lower-case extensions classified as file_type 'image'; anything else
    # is recorded as 'unknown'.
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}

    def __init__(self, log_callback=None):
        """Set up logging for the client.

        Args:
            log_callback: Optional callback handed to ``_init_logger``.
        """
        self._init_logger('PaidContent', log_callback, default_module='Coppermine')
|
||||
|
||||
async def get_profile_info(self, gallery_url: str) -> Optional[Dict]:
    """Fetch the gallery root page and derive profile metadata from it.

    Args:
        gallery_url: Base gallery URL (e.g. https://kylie-jenner.org/gallery)

    Returns:
        Dict with ``username`` (the domain without ``www.``),
        ``display_name`` (the decoded ``<title>`` text), ``post_count``
        (album total from the "N files in M albums" line, 0 when absent)
        and ``gallery_url``; ``None`` when the root page cannot be
        fetched or an error occurs.
    """
    # Function-scope import: this file's import block does not bring in
    # the ``html`` module, so only the one helper needed is imported here.
    from html import unescape

    root_url = self._build_url(gallery_url, 'index.php')
    timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            page_html = await self._fetch_page(session, root_url)
            if not page_html:
                return None

            # Site title from the <title> tag, with entities decoded
            # ("&amp;" -> "&", "&#39;" -> "'") rather than deleted.
            title_match = re.search(r'<title[^>]*>(.*?)</title>', page_html,
                                    re.DOTALL | re.IGNORECASE)
            raw_title = title_match.group(1) if title_match else ''
            site_title = unescape(raw_title).replace('\xa0', ' ').strip()
            if not site_title:
                # Missing or empty <title>: fall back to a generic name.
                site_title = 'Coppermine Gallery'

            # Try to extract stats: "N files in M albums"
            total_albums = 0
            stats_match = re.search(
                r'(\d[\d,]*)\s+files?\s+in\s+(\d[\d,]*)\s+albums?',
                page_html, re.IGNORECASE
            )
            if stats_match:
                total_albums = int(stats_match.group(2).replace(',', ''))

            # Use domain as username
            parsed = urlparse(gallery_url)
            domain = parsed.netloc.replace('www.', '')

            return {
                'username': domain,
                'display_name': site_title,
                'post_count': total_albums,
                'gallery_url': gallery_url,
            }
    except Exception as e:
        self.log(f"Error fetching profile info from {gallery_url}: {e}", 'error')
        return None
|
||||
|
||||
async def get_posts(self, gallery_url: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None,
                    post_callback=None):
    """Crawl the gallery and deliver every new album as a Post object.

    This is a regular coroutine, not a generator: posts are either handed
    to ``post_callback`` one by one or collected and returned at the end.

    Phase 1: Fetch root, extract top-level category links
    Phase 2: Recursively crawl categories until album links found
    Phase 3: For each album, fetch thumbnails and call post_callback immediately

    Args:
        gallery_url: Base gallery URL
        known_post_ids: Set of post IDs already in DB (album_NNN); albums
            whose ``album_<id>`` is in this set are skipped
        progress_callback: Called with status message strings
        post_callback: async callable(post) — called for each album as it's fetched.
                      If provided, posts are streamed instead of collected.

    Returns:
        List of Post objects when post_callback is None, otherwise None.
        On failure the result is ``[]`` (no callback) or ``None`` (callback).
    """
    known = known_post_ids or set()
    # No overall deadline (a crawl can take long); only connect/read are bounded.
    timeout = aiohttp.ClientTimeout(total=None, sock_connect=30, sock_read=60)
    posts_collected = [] if post_callback is None else None

    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            # Phase 1: Get all category links from root
            root_url = self._build_url(gallery_url, 'index.php')
            root_html = await self._fetch_page(session, root_url)
            if not root_html:
                self.log("Failed to fetch gallery root", 'error')
                return [] if post_callback is None else None

            category_ids = self._extract_category_ids(root_html)
            self.log(f"Found {len(category_ids)} top-level categories", 'info')

            if progress_callback:
                progress_callback(f'Found {len(category_ids)} categories, crawling...')

            # Phase 2: Recursively crawl categories to find album IDs
            album_ids = set()
            # Shared across all top-level crawls so no category is fetched twice.
            visited_cats = set()
            for cat_id in category_ids:
                new_albums = await self._crawl_category(
                    session, gallery_url, cat_id, visited_cats, known, progress_callback
                )
                album_ids.update(new_albums)

            # Filter out known albums
            new_album_ids = {aid for aid in album_ids
                             if f"album_{aid}" not in known}

            self.log(f"Found {len(new_album_ids)} new albums "
                     f"({len(album_ids)} total, {len(album_ids) - len(new_album_ids)} known)",
                     'info')

            if progress_callback:
                progress_callback(f'Found {len(new_album_ids)} new albums, fetching photos...')

            # Phase 3: Fetch each new album and deliver Post objects
            parsed = urlparse(gallery_url)
            domain = parsed.netloc.replace('www.', '')
            fetched = 0

            # NOTE: album IDs are strings, so sorted() orders them
            # lexicographically ("10" before "9"), not numerically.
            for i, album_id in enumerate(sorted(new_album_ids)):
                # Report progress on every fifth album only.
                if progress_callback and (i + 1) % 5 == 0:
                    progress_callback(
                        f'Fetching album {i + 1}/{len(new_album_ids)}...'
                    )

                post = await self._fetch_album(session, gallery_url, album_id, domain)
                if post and post.attachments:
                    fetched += 1
                    if post_callback:
                        await post_callback(post)
                    else:
                        posts_collected.append(post)

                # Rate limit: pause 2s after each album before the next fetch
                await asyncio.sleep(2)

            self.log(f"Fetched {fetched} albums with attachments", 'info')
            return posts_collected

    except Exception as e:
        # Top-level boundary: any failure aborts the crawl and is logged.
        self.log(f"Error crawling gallery {gallery_url}: {e}", 'error')
        return [] if post_callback is None else None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _build_url(self, gallery_url: str, page: str) -> str:
|
||||
"""Build a full URL from the gallery base and a page name."""
|
||||
base = gallery_url.rstrip('/')
|
||||
return f"{base}/{page}"
|
||||
|
||||
async def _fetch_page(self, session: aiohttp.ClientSession, url: str,
                      max_retries: int = 3) -> Optional[str]:
    """Fetch a page and return its HTML text, or None on failure.

    HTTP 429 responses and connection-level errors (server disconnect,
    OS-level client error, payload error, connection reset) are retried
    up to ``max_retries`` attempts. The delay between attempts grows
    linearly: 5s, 10s, ... after a 429 and 3s, 6s, ... after a connection
    error. No delay is spent after the final attempt. Any other non-200
    status or unexpected exception fails immediately.

    Args:
        session: aiohttp session used for the request
        url: Absolute URL to fetch
        max_retries: Maximum number of attempts

    Returns:
        Response body as text, or None when the page could not be fetched
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            async with session.get(url, headers=self.HEADERS) as resp:
                if resp.status == 200:
                    return await resp.text()
                if resp.status != 429:
                    self.log(f"HTTP {resp.status} fetching {url}", 'warning')
                    return None

            # Rate limited (429). The response is released before sleeping
            # so the connection is not held during the back-off.
            if is_last_attempt:
                self.log(f"Still rate limited after {max_retries} attempts: {url}",
                         'warning')
                return None
            wait = 5 * (attempt + 1)
            self.log(f"Rate limited on {url}, waiting {wait}s", 'warning')
            await asyncio.sleep(wait)
        except (aiohttp.ServerDisconnectedError, aiohttp.ClientOSError,
                aiohttp.ClientPayloadError, ConnectionResetError) as e:
            if is_last_attempt:
                self.log(f"Failed after {max_retries} attempts: {url}: {e}", 'warning')
                return None
            wait = 3 * (attempt + 1)
            self.log(f"Connection error on {url}, retry {attempt + 1} in {wait}s: {e}",
                     'warning')
            await asyncio.sleep(wait)
        except Exception as e:
            # Anything else (e.g. timeouts, invalid URL) is not retried.
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None
    # Only reached when max_retries <= 0, i.e. no attempt was made.
    return None
|
||||
|
||||
def _extract_category_ids(self, html: str) -> List[str]:
|
||||
"""Extract category IDs from index.php page.
|
||||
|
||||
Looks for links like: index.php?cat=N
|
||||
"""
|
||||
cat_ids = []
|
||||
seen = set()
|
||||
for match in re.finditer(r'index\.php\?cat=(\d+)', html):
|
||||
cat_id = match.group(1)
|
||||
if cat_id not in seen:
|
||||
seen.add(cat_id)
|
||||
cat_ids.append(cat_id)
|
||||
return cat_ids
|
||||
|
||||
def _extract_album_ids(self, html: str) -> List[str]:
|
||||
"""Extract album IDs from a category page.
|
||||
|
||||
Looks for links like: thumbnails.php?album=N
|
||||
"""
|
||||
album_ids = []
|
||||
seen = set()
|
||||
for match in re.finditer(r'thumbnails\.php\?album=(\d+)', html):
|
||||
album_id = match.group(1)
|
||||
if album_id not in seen:
|
||||
seen.add(album_id)
|
||||
album_ids.append(album_id)
|
||||
return album_ids
|
||||
|
||||
def _extract_page_count(self, html: str) -> int:
|
||||
"""Extract total page count from Coppermine pagination text.
|
||||
|
||||
Looks for patterns like "53 albums on 2 page(s)" or "N files on M page(s)".
|
||||
"""
|
||||
match = re.search(r'on\s+(\d+)\s+page\(s\)', html, re.IGNORECASE)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
return 1
|
||||
|
||||
async def _crawl_category(self, session: aiohttp.ClientSession,
                          gallery_url: str, cat_id: str,
                          visited: Set[str], known: Set[str],
                          progress_callback=None,
                          depth: int = 0) -> Set[str]:
    """Recursively crawl a category to find all album IDs.

    Categories can contain sub-categories or albums. We recurse
    until we find album links (thumbnails.php?album=N).
    Handles pagination within category pages (index.php?cat=N&page=M).

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        cat_id: Category ID to crawl
        visited: Set of already-visited category IDs (prevents loops);
            mutated in place — ``cat_id`` is added on entry
        known: Set of known post_ids; not read in this method, only
            forwarded to the recursive calls
        progress_callback: Status callback
        depth: Recursion depth; crawling stops once depth exceeds 10

    Returns:
        Set of album ID strings found in this category and every
        sub-category below it (empty if the category was already visited,
        is too deep, or its first page could not be fetched)
    """
    if cat_id in visited or depth > 10:
        return set()
    visited.add(cat_id)

    # Fetch first page
    cat_url = self._build_url(gallery_url, f'index.php?cat={cat_id}')
    html = await self._fetch_page(session, cat_url)
    if not html:
        return set()

    # Politeness delay after every page fetch.
    await asyncio.sleep(2)

    album_ids = set(self._extract_album_ids(html))
    # Every index.php?cat=N link on the page is collected here; the
    # current and already-visited categories are filtered out further down.
    sub_cat_ids = self._extract_category_ids(html)

    # Handle pagination: fetch remaining pages
    total_pages = self._extract_page_count(html)
    if total_pages > 1:
        for page_num in range(2, total_pages + 1):
            page_url = self._build_url(
                gallery_url, f'index.php?cat={cat_id}&page={page_num}'
            )
            page_html = await self._fetch_page(session, page_url)
            if page_html:
                album_ids.update(self._extract_album_ids(page_html))
                # Sub-categories are the same on every page, no need to re-extract
            await asyncio.sleep(2)

    # Filter out the current category from sub-categories
    sub_cat_ids = [c for c in sub_cat_ids if c != cat_id and c not in visited]

    if progress_callback:
        progress_callback(
            f'Category {cat_id}: {len(album_ids)} albums, '
            f'{len(sub_cat_ids)} sub-categories'
            + (f' ({total_pages} pages)' if total_pages > 1 else '')
        )

    # Recurse into sub-categories
    for sub_id in sub_cat_ids:
        # A sub-category visited by an earlier sibling's recursion returns
        # an empty set immediately via the guard at the top.
        sub_albums = await self._crawl_category(
            session, gallery_url, sub_id, visited, known,
            progress_callback, depth + 1
        )
        album_ids.update(sub_albums)

    return album_ids
|
||||
|
||||
async def _fetch_album(self, session: aiohttp.ClientSession,
                       gallery_url: str, album_id: str,
                       domain: str) -> Optional[Post]:
    """Download every page of one album and wrap the photos in a Post.

    Follows album pagination (thumbnails.php?album=N&page=M), pausing 2s
    after each additional page.

    Args:
        session: aiohttp session
        gallery_url: Base gallery URL
        album_id: Album ID to fetch
        domain: Domain name, stored as the post's creator_id

    Returns:
        Post whose ``content`` holds the album title (``title`` is left
        as None), or None when the first page cannot be fetched or no
        attachments are found
    """
    first_page = await self._fetch_page(
        session, self._build_url(gallery_url, f'thumbnails.php?album={album_id}')
    )
    if not first_page:
        return None

    # Title comes from the first page only; fall back to a generic label.
    album_title = self._extract_album_title(first_page) or f"Album {album_id}"

    photos = self._extract_attachments(first_page, gallery_url)

    # range() is empty for single-page albums, so no extra guard is needed.
    for page_no in range(2, self._extract_page_count(first_page) + 1):
        next_page = await self._fetch_page(
            session,
            self._build_url(gallery_url, f'thumbnails.php?album={album_id}&page={page_no}'),
        )
        if next_page:
            photos.extend(self._extract_attachments(next_page, gallery_url))
        await asyncio.sleep(2)

    if not photos:
        return None

    # Date is derived from the first page's HTML plus the album title.
    published = self._extract_album_date(first_page, album_title)

    return Post(
        post_id=f"album_{album_id}",
        service_id=self.SERVICE_ID,
        platform=self.PLATFORM,
        creator_id=domain,
        title=None,
        content=album_title,
        published_at=published,
        attachments=photos,
    )
|
||||
|
||||
def _extract_album_title(self, html: str) -> Optional[str]:
|
||||
"""Extract album title from page HTML.
|
||||
|
||||
Priority: breadcrumb last item > <h1>/<h2> heading > <title> last segment
|
||||
"""
|
||||
# Try breadcrumb: last text segment after the last ">"
|
||||
# Coppermine breadcrumbs: "Home > Category > Sub > Album Title"
|
||||
bc_match = re.search(
|
||||
r'class="[^"]*breadcrumb[^"]*"[^>]*>(.*?)</(?:div|span|td|p)',
|
||||
html, re.DOTALL | re.IGNORECASE
|
||||
)
|
||||
if bc_match:
|
||||
bc_text = bc_match.group(1)
|
||||
# Strip HTML tags, split on ">", take last segment
|
||||
bc_text = re.sub(r'<[^>]+>', ' ', bc_text)
|
||||
parts = [p.strip() for p in bc_text.split('>') if p.strip()]
|
||||
if parts:
|
||||
title = self._clean_text(parts[-1])
|
||||
if title and title.lower() not in ('home', 'index', 'gallery'):
|
||||
return title
|
||||
|
||||
# Try headings
|
||||
for tag in ('h1', 'h2', 'h3'):
|
||||
h_match = re.search(
|
||||
rf'<{tag}[^>]*>(.*?)</{tag}>', html, re.DOTALL | re.IGNORECASE
|
||||
)
|
||||
if h_match:
|
||||
title = self._clean_text(h_match.group(1))
|
||||
if title and len(title) > 2:
|
||||
return title
|
||||
|
||||
# Fallback: <title> tag — take the last segment before the site name
|
||||
title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.DOTALL | re.IGNORECASE)
|
||||
if title_match:
|
||||
title = title_match.group(1).strip()
|
||||
# Usually "Site Name - Album Title" or "Album Title - Site Name"
|
||||
# The album-specific part is typically not the site name;
|
||||
# use the longest segment as a heuristic
|
||||
if ' - ' in title:
|
||||
parts = [p.strip() for p in title.split(' - ')]
|
||||
# Pick the longest part (album names tend to be longer than site names)
|
||||
title = max(parts, key=len)
|
||||
if title:
|
||||
return self._clean_text(title)
|
||||
|
||||
return None
|
||||
|
||||
def _extract_album_date(self, html: str, title: str) -> str:
|
||||
"""Extract album date from breadcrumb year + title month/day.
|
||||
|
||||
Breadcrumb: "Home > Candids > 2026 > January 11 - Leaving..."
|
||||
Title: "January 11 - Leaving Golden Globes afterparty..."
|
||||
|
||||
Returns ISO date string, or current datetime as fallback.
|
||||
"""
|
||||
MONTHS = {
|
||||
'january': 1, 'february': 2, 'march': 3, 'april': 4,
|
||||
'may': 5, 'june': 6, 'july': 7, 'august': 8,
|
||||
'september': 9, 'october': 10, 'november': 11, 'december': 12,
|
||||
}
|
||||
|
||||
# Extract year from breadcrumb path (look for 4-digit year in links)
|
||||
year = None
|
||||
# Breadcrumb links: index.php?cat=155">2026</a>
|
||||
for m in re.finditer(r'>\s*((?:19|20)\d{2})\s*</', html):
|
||||
year = int(m.group(1))
|
||||
|
||||
# Also try path segments in albums/ URLs for year
|
||||
if not year:
|
||||
path_match = re.search(r'albums/[^/]+/(20\d{2})/', html)
|
||||
if path_match:
|
||||
year = int(path_match.group(1))
|
||||
|
||||
# Extract month and day from album title
|
||||
month, day = None, None
|
||||
if title:
|
||||
# "January 11 - ..." or "March 3 - ..."
|
||||
date_match = re.match(
|
||||
r'(\w+)\s+(\d{1,2})\b', title
|
||||
)
|
||||
if date_match:
|
||||
month_name = date_match.group(1).lower()
|
||||
if month_name in MONTHS:
|
||||
month = MONTHS[month_name]
|
||||
day = int(date_match.group(2))
|
||||
|
||||
# Build date from breadcrumb year + title month/day
|
||||
if year and month and day:
|
||||
try:
|
||||
return datetime(year, month, day).isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
if year and month:
|
||||
try:
|
||||
return datetime(year, month, 1).isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
if year:
|
||||
return datetime(year, 1, 1).isoformat()
|
||||
|
||||
# Fallback: parse "Date added=Jan 13, 2026" from thumbnail tooltips
|
||||
MONTH_ABBR = {
|
||||
'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
|
||||
'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
|
||||
'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
|
||||
}
|
||||
added_match = re.search(
|
||||
r'Date added\s*=\s*(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
|
||||
)
|
||||
if added_match:
|
||||
m_abbr = added_match.group(1).lower()
|
||||
if m_abbr in MONTH_ABBR:
|
||||
try:
|
||||
return datetime(
|
||||
int(added_match.group(3)),
|
||||
MONTH_ABBR[m_abbr],
|
||||
int(added_match.group(2))
|
||||
).isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Also try "last one added on Jan 13, 2026" from album_stat
|
||||
stat_match = re.search(
|
||||
r'last one added on\s+(\w{3})\s+(\d{1,2}),?\s+(\d{4})', html
|
||||
)
|
||||
if stat_match:
|
||||
m_abbr = stat_match.group(1).lower()
|
||||
if m_abbr in MONTH_ABBR:
|
||||
try:
|
||||
return datetime(
|
||||
int(stat_match.group(3)),
|
||||
MONTH_ABBR[m_abbr],
|
||||
int(stat_match.group(2))
|
||||
).isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return datetime.now().isoformat()
|
||||
|
||||
def _extract_attachments(self, html: str, gallery_url: str) -> List[Attachment]:
|
||||
"""Extract photo attachments from album page HTML.
|
||||
|
||||
Finds thumbnail images and converts them to full-res URLs by
|
||||
stripping the `thumb_` prefix from the filename.
|
||||
"""
|
||||
attachments = []
|
||||
seen_urls = set()
|
||||
|
||||
# Pattern: thumbnail images in album pages
|
||||
# Common patterns:
|
||||
# <img src="albums/path/thumb_filename.jpg" ...>
|
||||
# <img src="albums/path/normal_filename.jpg" ...>
|
||||
for match in re.finditer(
|
||||
r'<img[^>]+src=["\']([^"\']*?albums/[^"\']*?(?:thumb_|normal_)[^"\']+)["\']',
|
||||
html, re.IGNORECASE
|
||||
):
|
||||
thumb_src = match.group(1)
|
||||
full_url = self._thumb_to_fullres(thumb_src, gallery_url)
|
||||
if full_url and full_url not in seen_urls:
|
||||
seen_urls.add(full_url)
|
||||
filename = full_url.rsplit('/', 1)[-1] if '/' in full_url else full_url
|
||||
ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
|
||||
|
||||
attachments.append(Attachment(
|
||||
name=filename,
|
||||
server_path=full_url, # use as dedup key
|
||||
file_type='image' if ext in self.IMAGE_EXTS else 'unknown',
|
||||
extension=ext or None,
|
||||
download_url=full_url,
|
||||
))
|
||||
|
||||
# Also try: <a href="displayimage.php?..."><img src="albums/...">
|
||||
# Some themes wrap thumbnails in links
|
||||
if not attachments:
|
||||
for match in re.finditer(
|
||||
r'<a[^>]+href=["\'][^"\']*displayimage\.php[^"\']*["\'][^>]*>'
|
||||
r'\s*<img[^>]+src=["\']([^"\']+)["\']',
|
||||
html, re.IGNORECASE | re.DOTALL
|
||||
):
|
||||
thumb_src = match.group(1)
|
||||
full_url = self._thumb_to_fullres(thumb_src, gallery_url)
|
||||
if full_url and full_url not in seen_urls:
|
||||
seen_urls.add(full_url)
|
||||
filename = full_url.rsplit('/', 1)[-1] if '/' in full_url else full_url
|
||||
ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
|
||||
|
||||
attachments.append(Attachment(
|
||||
name=filename,
|
||||
server_path=full_url,
|
||||
file_type='image' if ext in self.IMAGE_EXTS else 'unknown',
|
||||
extension=ext or None,
|
||||
download_url=full_url,
|
||||
))
|
||||
|
||||
return attachments
|
||||
|
||||
def _thumb_to_fullres(self, thumb_src: str, gallery_url: str) -> Optional[str]:
|
||||
"""Convert a thumbnail URL to a full-resolution URL.
|
||||
|
||||
Strips `thumb_` or `normal_` prefix from the filename and
|
||||
prepends the gallery base URL if needed.
|
||||
|
||||
Args:
|
||||
thumb_src: Thumbnail src attribute value
|
||||
gallery_url: Base gallery URL
|
||||
|
||||
Returns:
|
||||
Full-resolution image URL, or None if conversion fails
|
||||
"""
|
||||
if not thumb_src:
|
||||
return None
|
||||
|
||||
# Strip thumb_ or normal_ prefix from filename
|
||||
# e.g. albums/candids/2026/0111/thumb_001.jpg → albums/candids/2026/0111/001.jpg
|
||||
fullres_path = re.sub(r'(/)(?:thumb_|normal_)', r'\1', thumb_src)
|
||||
|
||||
# If the path is already absolute (starts with http), return as-is
|
||||
if fullres_path.startswith(('http://', 'https://')):
|
||||
return fullres_path
|
||||
|
||||
# Otherwise, make it absolute relative to gallery URL
|
||||
base = gallery_url.rstrip('/')
|
||||
fullres_path = fullres_path.lstrip('./')
|
||||
return f"{base}/{fullres_path}"
|
||||
|
||||
def _clean_text(self, text: str) -> str:
|
||||
"""Clean HTML entities and whitespace from text."""
|
||||
text = re.sub(r'&', '&', text)
|
||||
text = re.sub(r'<', '<', text)
|
||||
text = re.sub(r'>', '>', text)
|
||||
text = re.sub(r'"', '"', text)
|
||||
text = re.sub(r'&#\d+;', '', text)
|
||||
text = re.sub(r'&\w+;', '', text)
|
||||
text = re.sub(r'<[^>]+>', '', text)
|
||||
return text.strip()
|
||||
3616
modules/paid_content/db_adapter.py
Normal file
3616
modules/paid_content/db_adapter.py
Normal file
File diff suppressed because it is too large
Load Diff
297
modules/paid_content/embed_downloader.py
Normal file
297
modules/paid_content/embed_downloader.py
Normal file
@@ -0,0 +1,297 @@
|
||||
"""
|
||||
Embed Downloader - Downloads embedded videos from posts using yt-dlp
|
||||
Supports: YouTube, Vimeo, Dailymotion, Twitch, and many other platforms
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
|
||||
|
||||
class EmbedDownloader(LoggingMixin):
    """
    Download embedded videos from posts using yt-dlp

    Wrapper around yt-dlp for downloading videos from various platforms
    embedded in creator posts.
    """

    # Quality presets for yt-dlp: each value is a format selector string
    # passed to yt-dlp's ``-f`` option. Every selector ends in ``/best`` so
    # that some format is still chosen when the preferred ones are missing.
    QUALITY_PRESETS = {
        'best': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        '1080p': 'bestvideo[height<=1080][ext=mp4]+bestaudio[ext=m4a]/best[height<=1080][ext=mp4]/best',
        '720p': 'bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/best[height<=720][ext=mp4]/best',
        '480p': 'bestvideo[height<=480][ext=mp4]+bestaudio[ext=m4a]/best[height<=480][ext=mp4]/best',
        'audio': 'bestaudio[ext=m4a]/bestaudio/best',
    }

    def __init__(self, ytdlp_path: Optional[str] = None, log_callback=None):
        """Set up logging and locate the yt-dlp executable.

        Args:
            ytdlp_path: Explicit path to yt-dlp; when empty or None the
                executable is searched for with ``_find_ytdlp``.
            log_callback: Optional callback handed to ``_init_logger``.
        """
        self._init_logger('PaidContent', log_callback, default_module='Embed')

        # Find yt-dlp executable
        self.ytdlp_path = ytdlp_path or self._find_ytdlp()
        if not self.ytdlp_path:
            # Not fatal: the instance is still created with no yt-dlp path.
            self.log("yt-dlp not found, embed downloading will be disabled", 'warning')
|
||||
|
||||
def _find_ytdlp(self) -> Optional[str]:
|
||||
"""Find yt-dlp executable"""
|
||||
# Check common locations
|
||||
common_paths = [
|
||||
'/usr/local/bin/yt-dlp',
|
||||
'/usr/bin/yt-dlp',
|
||||
'/opt/homebrew/bin/yt-dlp',
|
||||
os.path.expanduser('~/.local/bin/yt-dlp'),
|
||||
]
|
||||
|
||||
for path in common_paths:
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
|
||||
# Try to find via which
|
||||
try:
|
||||
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if yt-dlp is available"""
|
||||
return self.ytdlp_path is not None
|
||||
|
||||
async def download(self, url: str, output_dir: Path, quality: str = 'best',
|
||||
filename_template: str = None) -> Dict:
|
||||
"""
|
||||
Download video from URL
|
||||
|
||||
Args:
|
||||
url: Video URL to download
|
||||
output_dir: Directory to save the video
|
||||
quality: Quality preset ('best', '1080p', '720p', '480p', 'audio')
|
||||
filename_template: Optional custom filename template
|
||||
|
||||
Returns:
|
||||
Dict with success status and file info
|
||||
"""
|
||||
if not self.is_available():
|
||||
return {
|
||||
'success': False,
|
||||
'error': 'yt-dlp not available'
|
||||
}
|
||||
|
||||
try:
|
||||
# Create output directory
|
||||
output_dir = Path(output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build output template
|
||||
if filename_template:
|
||||
output_template = str(output_dir / filename_template)
|
||||
else:
|
||||
output_template = str(output_dir / 'embed_%(title).50s_%(id)s.%(ext)s')
|
||||
|
||||
# Get format string
|
||||
format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])
|
||||
|
||||
# Build command
|
||||
cmd = [
|
||||
self.ytdlp_path,
|
||||
'--no-playlist',
|
||||
'--no-warnings',
|
||||
'-f', format_str,
|
||||
'--merge-output-format', 'mp4',
|
||||
'-o', output_template,
|
||||
'--print-json', # Output JSON with video info
|
||||
url
|
||||
]
|
||||
|
||||
self.log(f"Downloading embed: {url}", 'debug')
|
||||
|
||||
# Run yt-dlp
|
||||
result = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
|
||||
stdout, stderr = await result.communicate()
|
||||
|
||||
if result.returncode != 0:
|
||||
error_msg = stderr.decode('utf-8', errors='replace').strip()
|
||||
# Try to extract useful error message
|
||||
if 'Video unavailable' in error_msg:
|
||||
error_msg = 'Video unavailable or private'
|
||||
elif 'age-restricted' in error_msg.lower():
|
||||
error_msg = 'Video is age-restricted'
|
||||
elif 'members only' in error_msg.lower():
|
||||
error_msg = 'Video is members-only'
|
||||
elif len(error_msg) > 200:
|
||||
error_msg = error_msg[:200] + '...'
|
||||
|
||||
self.log(f"yt-dlp failed: {error_msg}", 'warning')
|
||||
return {
|
||||
'success': False,
|
||||
'error': error_msg or f'yt-dlp exited with code {result.returncode}'
|
||||
}
|
||||
|
||||
# Parse output JSON
|
||||
stdout_text = stdout.decode('utf-8', errors='replace')
|
||||
video_info = None
|
||||
|
||||
for line in stdout_text.strip().split('\n'):
|
||||
try:
|
||||
video_info = json.loads(line)
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if not video_info:
|
||||
# Try to find the downloaded file
|
||||
files = list(output_dir.glob('embed_*'))
|
||||
if files:
|
||||
file_path = files[0]
|
||||
return {
|
||||
'success': True,
|
||||
'file_path': str(file_path),
|
||||
'filename': file_path.name,
|
||||
'file_size': file_path.stat().st_size if file_path.exists() else None
|
||||
}
|
||||
return {
|
||||
'success': False,
|
||||
'error': 'Could not parse yt-dlp output'
|
||||
}
|
||||
|
||||
# Extract file info
|
||||
file_path = video_info.get('_filename') or video_info.get('filename')
|
||||
|
||||
# Handle potential path issues
|
||||
if file_path:
|
||||
file_path = Path(file_path)
|
||||
if not file_path.exists():
|
||||
# Try to find the file
|
||||
possible_files = list(output_dir.glob(f"*{video_info.get('id', '')}*"))
|
||||
if possible_files:
|
||||
file_path = possible_files[0]
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'file_path': str(file_path) if file_path else None,
|
||||
'filename': file_path.name if file_path else None,
|
||||
'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'),
|
||||
'title': video_info.get('title'),
|
||||
'duration': video_info.get('duration'),
|
||||
'uploader': video_info.get('uploader'),
|
||||
'upload_date': video_info.get('upload_date'),
|
||||
'video_id': video_info.get('id'),
|
||||
'platform': video_info.get('extractor_key', video_info.get('extractor', 'unknown')).lower()
|
||||
}
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
return {
|
||||
'success': False,
|
||||
'error': 'Download timed out'
|
||||
}
|
||||
except Exception as e:
|
||||
self.log(f"Error downloading embed: {e}", 'error')
|
||||
return {
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
async def get_video_info(self, url: str) -> Dict:
|
||||
"""
|
||||
Get video information without downloading
|
||||
|
||||
Args:
|
||||
url: Video URL
|
||||
|
||||
Returns:
|
||||
Dict with video metadata
|
||||
"""
|
||||
if not self.is_available():
|
||||
return {'success': False, 'error': 'yt-dlp not available'}
|
||||
|
||||
try:
|
||||
cmd = [
|
||||
self.ytdlp_path,
|
||||
'--no-playlist',
|
||||
'--no-warnings',
|
||||
'-j', # Output JSON
|
||||
'--no-download',
|
||||
url
|
||||
]
|
||||
|
||||
result = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
|
||||
stdout, stderr = await result.communicate()
|
||||
|
||||
if result.returncode != 0:
|
||||
error_msg = stderr.decode('utf-8', errors='replace').strip()
|
||||
return {
|
||||
'success': False,
|
||||
'error': error_msg or f'yt-dlp exited with code {result.returncode}'
|
||||
}
|
||||
|
||||
video_info = json.loads(stdout.decode('utf-8'))
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'title': video_info.get('title'),
|
||||
'duration': video_info.get('duration'),
|
||||
'uploader': video_info.get('uploader'),
|
||||
'upload_date': video_info.get('upload_date'),
|
||||
'view_count': video_info.get('view_count'),
|
||||
'like_count': video_info.get('like_count'),
|
||||
'description': video_info.get('description'),
|
||||
'thumbnail': video_info.get('thumbnail'),
|
||||
'video_id': video_info.get('id'),
|
||||
'platform': video_info.get('extractor_key', video_info.get('extractor', 'unknown')).lower(),
|
||||
'formats': len(video_info.get('formats', []))
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error getting video info: {e}", 'error')
|
||||
return {
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def detect_platform(url: str) -> Optional[str]:
|
||||
"""Detect video platform from URL"""
|
||||
url_lower = url.lower()
|
||||
|
||||
if 'youtube.com' in url_lower or 'youtu.be' in url_lower:
|
||||
return 'youtube'
|
||||
elif 'vimeo.com' in url_lower:
|
||||
return 'vimeo'
|
||||
elif 'dailymotion.com' in url_lower:
|
||||
return 'dailymotion'
|
||||
elif 'twitch.tv' in url_lower:
|
||||
return 'twitch'
|
||||
elif 'twitter.com' in url_lower or 'x.com' in url_lower:
|
||||
return 'twitter'
|
||||
elif 'tiktok.com' in url_lower:
|
||||
return 'tiktok'
|
||||
elif 'instagram.com' in url_lower:
|
||||
return 'instagram'
|
||||
elif 'reddit.com' in url_lower:
|
||||
return 'reddit'
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def is_supported_url(url: str) -> bool:
|
||||
"""Check if URL is from a supported platform"""
|
||||
return EmbedDownloader.detect_platform(url) is not None
|
||||
1158
modules/paid_content/fansly_direct_client.py
Normal file
1158
modules/paid_content/fansly_direct_client.py
Normal file
File diff suppressed because it is too large
Load Diff
529
modules/paid_content/file_host_downloader.py
Normal file
529
modules/paid_content/file_host_downloader.py
Normal file
@@ -0,0 +1,529 @@
|
||||
"""
|
||||
Download files from external file hosting services
|
||||
Supports: Bunkr, Pixeldrain, Gofile, Cyberdrop, FileDitch
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin, RateLimitMixin
|
||||
|
||||
|
||||
class FileHostDownloader(LoggingMixin, RateLimitMixin):
    """
    Download files from various file hosting services
    Used for manual import of PPV content

    Each supported host has a `_download_<host>` coroutine; `download_url`
    picks the handler from the URL's domain.
    """

    # Host key -> domains served by that host (matched after stripping a leading 'www.')
    SUPPORTED_HOSTS = {
        'bunkr': ['bunkr.sk', 'bunkr.si', 'bunkr.la', 'bunkrr.ru', 'bunkr.ph', 'bunkr.is', 'bunkr.ac', 'bunkr.cr'],
        'pixeldrain': ['pixeldrain.com'],
        'gofile': ['gofile.io'],
        'cyberdrop': ['cyberdrop.me', 'cyberdrop.to', 'cyberdrop.cc'],
        'fileditch': ['fileditchfiles.me', 'fileditch.me'],
    }

    # Bunkr CDN servers (food-themed) - try in order
    BUNKR_CDNS = [
        'i-soup.bunkr.ru',
        'i-burger.bunkr.ru',
        'i-pizza.bunkr.ru',
        'i-taco.bunkr.ru',
        'i-fries.bunkr.ru',
        'i-hotdog.bunkr.ru',
        'i-nachos.bunkr.ru',
        'i-sushi.bunkr.ru',
        'i-ramen.bunkr.ru',
        'i-curry.bunkr.ru',
        'i-kebab.bunkr.ru',
        'i-pasta.bunkr.ru',
        'i-steak.bunkr.ru',
        'i-salad.bunkr.ru',
        'i-sandwich.bunkr.ru',
        'i-waffle.bunkr.ru',
        'i-pancake.bunkr.ru',
        'i-donut.bunkr.ru',
        'i-cookie.bunkr.ru',
        'i-cake.bunkr.ru',
        'i-bacon.bunkr.ru',
        'i-cheese.bunkr.ru',
        'i-chicken.bunkr.ru',
        'i-fish.bunkr.ru',
        'i-noodle.bunkr.ru',
        'i-rice.bunkr.ru',
        'i-bread.bunkr.ru',
        'burger.bunkr.ru',
        'pizza.bunkr.ru',
        'milkshake.bunkr.ru',
    ]

    def __init__(self, log_callback=None, progress_callback=None):
        """
        Initialize the downloader

        Args:
            log_callback: Passed through to the logging mixin
            progress_callback: Optional callable invoked as
                (downloaded_bytes, total_bytes, filename) while streaming
        """
        self._init_logger('PaidContent', log_callback, default_module='FileHost')
        self._init_rate_limiter(min_delay=1, max_delay=3)
        self.progress_callback = progress_callback  # Called with (downloaded_bytes, total_bytes, filename)

    def detect_host(self, url: str) -> Optional[str]:
        """
        Detect which file host a URL belongs to

        Returns:
            Key of SUPPORTED_HOSTS, or None for unknown/unparseable URLs
        """
        try:
            # .hostname drops any port/userinfo and is already lower-cased
            domain = urlparse(url).hostname or ''
            if domain.startswith('www.'):
                domain = domain[len('www.'):]
        except (ValueError, TypeError, AttributeError):
            return None

        for host, domains in self.SUPPORTED_HOSTS.items():
            if domain in domains:
                return host
        return None

    def is_supported_url(self, url: str) -> bool:
        """Check if URL is from a supported file host"""
        return self.detect_host(url) is not None

    async def download_url(self, url: str, save_dir: Path) -> Dict:
        """
        Download file(s) from URL

        Returns: {'success': bool, 'files': [paths], 'error': str}
        Handler exceptions are logged and reported in 'error' instead of raised.
        """
        host = self.detect_host(url)
        if not host:
            return {'success': False, 'files': [], 'error': 'Unsupported host'}

        handler = getattr(self, f'_download_{host}', None)
        if not handler:
            return {'success': False, 'files': [], 'error': f'No handler for {host}'}

        try:
            save_dir = Path(save_dir)
            save_dir.mkdir(parents=True, exist_ok=True)
            return await handler(url, save_dir)
        except Exception as e:
            self.log(f"Error downloading from {host}: {e}", 'error')
            return {'success': False, 'files': [], 'error': str(e)}

    async def _download_pixeldrain(self, url: str, save_dir: Path) -> Dict:
        """Download from Pixeldrain (single file `/u/ID` or list `/l/ID`)"""
        # Extract file ID from URL
        # Format: https://pixeldrain.com/u/FILEID or /l/LISTID
        parsed = urlparse(url)
        path_parts = parsed.path.strip('/').split('/')

        if len(path_parts) < 2:
            return {'success': False, 'files': [], 'error': 'Invalid Pixeldrain URL'}

        url_type, file_id = path_parts[0], path_parts[1]
        if url_type not in ('u', 'l'):
            # Previously fell through and reported success with no files
            return {'success': False, 'files': [], 'error': 'Invalid Pixeldrain URL'}

        files = []
        timeout = aiohttp.ClientTimeout(total=300)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            if url_type == 'u':
                # Single file
                api_url = f"https://pixeldrain.com/api/file/{file_id}/info"
                async with session.get(api_url) as resp:
                    if resp.status != 200:
                        return {'success': False, 'files': [], 'error': f'API error: {resp.status}'}
                    info = await resp.json()

                download_url = f"https://pixeldrain.com/api/file/{file_id}"
                filename = info.get('name', f'{file_id}.bin')
                save_path = save_dir / self._sanitize_filename(filename)

                await self._download_file(session, download_url, save_path)
                files.append(str(save_path))

            else:
                # List (album)
                api_url = f"https://pixeldrain.com/api/list/{file_id}"
                async with session.get(api_url) as resp:
                    if resp.status != 200:
                        return {'success': False, 'files': [], 'error': f'API error: {resp.status}'}
                    data = await resp.json()

                for i, item in enumerate(data.get('files', [])):
                    # NOTE(review): _delay_between_items comes from RateLimitMixin;
                    # if it sleeps synchronously it blocks the event loop - confirm.
                    self._delay_between_items()
                    item_id = item['id']
                    filename = item.get('name', f'{i:03d}_{item_id}.bin')
                    download_url = f"https://pixeldrain.com/api/file/{item_id}"
                    save_path = save_dir / self._sanitize_filename(filename)

                    try:
                        await self._download_file(session, download_url, save_path)
                        files.append(str(save_path))
                    except Exception as e:
                        # One bad item must not abort the rest of the list
                        self.log(f"Failed to download {filename}: {e}", 'warning')

        return {'success': True, 'files': files, 'error': None}

    async def _download_gofile(self, url: str, save_dir: Path) -> Dict:
        """Download from Gofile using a freshly created guest token"""
        # Extract content ID from URL
        # Format: https://gofile.io/d/CONTENTID
        parsed = urlparse(url)
        path_parts = parsed.path.strip('/').split('/')

        if len(path_parts) < 2 or path_parts[0] != 'd':
            return {'success': False, 'files': [], 'error': 'Invalid Gofile URL'}

        content_id = path_parts[1]

        files = []
        timeout = aiohttp.ClientTimeout(total=300)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            # Create guest account token (POST request required since API change)
            async with session.post('https://api.gofile.io/accounts') as resp:
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': 'Failed to get Gofile token'}
                account_data = await resp.json()
                if account_data.get('status') != 'ok':
                    return {'success': False, 'files': [], 'error': f"Gofile API error: {account_data.get('status')}"}
                token = account_data.get('data', {}).get('token')

            if not token:
                return {'success': False, 'files': [], 'error': 'No Gofile token received'}

            # Get content info
            # Gofile requires x-website-token header (changed from query param in 2024)
            headers = {
                'Authorization': f'Bearer {token}',
                'x-website-token': '4fd6sg89d7s6',
            }
            api_url = f"https://api.gofile.io/contents/{content_id}"

            async with session.get(api_url, headers=headers) as resp:
                if resp.status == 401:
                    return {'success': False, 'files': [], 'error': 'Gofile authentication failed - websiteToken may have changed'}
                if resp.status != 200:
                    return {'success': False, 'files': [], 'error': f'Failed to get content: {resp.status}'}
                content_data = await resp.json()

            if content_data.get('status') == 'error-notPremium':
                return {'success': False, 'files': [], 'error': 'Gofile requires premium account for API access - try direct download'}
            if content_data.get('status') != 'ok':
                error = content_data.get('data', {}).get('message', content_data.get('status', 'Unknown error'))
                return {'success': False, 'files': [], 'error': error}

            contents = content_data.get('data', {}).get('children', {})

            for item_id, item in contents.items():
                if item.get('type') != 'file':
                    continue

                self._delay_between_items()
                download_url = item.get('link')
                filename = item.get('name', f'{item_id}.bin')
                if not download_url:
                    self.log(f"Failed to download {filename}: no download link", 'warning')
                    continue
                save_path = save_dir / self._sanitize_filename(filename)

                try:
                    await self._download_file(session, download_url, save_path, headers=headers)
                    files.append(str(save_path))
                except Exception as e:
                    self.log(f"Failed to download {filename}: {e}", 'warning')

        return {'success': True, 'files': files, 'error': None}

    async def _download_cyberdrop(self, url: str, save_dir: Path) -> Dict:
        """Download from Cyberdrop (album page or single file)"""
        # Cyberdrop albums: https://cyberdrop.me/a/ALBUMID
        # Single files: https://cyberdrop.me/f/FILEID or direct CDN links
        files = []
        timeout = aiohttp.ClientTimeout(total=300)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            parsed = urlparse(url)
            path_parts = parsed.path.strip('/').split('/')

            if len(path_parts) >= 2 and path_parts[0] == 'a':
                # Album
                async with session.get(url) as resp:
                    if resp.status != 200:
                        return {'success': False, 'files': [], 'error': f'Failed to fetch album: {resp.status}'}
                    html = await resp.text()

                # Parse file links from HTML
                # Pattern: href="https://fs-XXX.cyberdrop.to/FILE"
                cdn_pattern = r'href="(https://[a-z0-9-]+\.cyberdrop\.[a-z]+/[^"]+)"'
                matches = re.findall(cdn_pattern, html)

                for i, file_url in enumerate(matches):
                    self._delay_between_items()
                    filename = file_url.split('/')[-1].split('?')[0]
                    if not filename:
                        filename = f'{i:03d}.bin'
                    save_path = save_dir / self._sanitize_filename(filename)

                    try:
                        await self._download_file(session, file_url, save_path)
                        files.append(str(save_path))
                    except Exception as e:
                        self.log(f"Failed to download {filename}: {e}", 'warning')

            else:
                # Single file or direct CDN link
                filename = parsed.path.split('/')[-1] or 'download.bin'
                save_path = save_dir / self._sanitize_filename(filename)

                await self._download_file(session, url, save_path)
                files.append(str(save_path))

        return {'success': True, 'files': files, 'error': None}

    async def _download_bunkr(self, url: str, save_dir: Path) -> Dict:
        """
        Download from Bunkr with CDN fallback support

        Returns the usual result dict; when some items fail it also carries a
        'failed' list and an 'error' summary. 'success' is True only if at
        least one file was saved.
        """
        # Bunkr albums: https://bunkr.sk/a/ALBUMID
        # Single files: https://bunkr.sk/f/FILEID or https://bunkr.sk/v/VIDEOID
        files = []
        failed = []
        timeout = aiohttp.ClientTimeout(total=600)  # Increased for large files

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            parsed = urlparse(url)
            path_parts = parsed.path.strip('/').split('/')

            if len(path_parts) >= 2 and path_parts[0] == 'a':
                # Album page
                async with session.get(url) as resp:
                    if resp.status != 200:
                        return {'success': False, 'files': [], 'error': f'Failed to fetch album: {resp.status}'}
                    html = await resp.text()

                # Parse file links from HTML - look for /f/ links
                file_pattern = r'href="(/f/[^"]+)"'
                matches = re.findall(file_pattern, html)

                self.log(f"Found {len(matches)} files in Bunkr album", 'info')

                for i, file_path in enumerate(matches):
                    self._delay_between_items()

                    # Make absolute URL
                    file_url = f"https://{parsed.netloc}{file_path}"

                    # Get direct download URL and file UUID
                    direct_url, file_uuid = await self._get_bunkr_direct_url_with_uuid(session, file_url)
                    if not direct_url:
                        self.log(f"Could not get direct URL for {file_url}", 'warning')
                        failed.append(file_url)
                        continue

                    filename = direct_url.split('/')[-1].split('?')[0]
                    if not filename:
                        filename = f'{i:03d}.bin'
                    save_path = save_dir / self._sanitize_filename(filename)

                    try:
                        await self._download_file(session, direct_url, save_path,
                                                  try_cdn_fallback=True, file_uuid=file_uuid)
                        files.append(str(save_path))
                        self.log(f"Downloaded: {filename}", 'info')
                    except Exception as e:
                        self.log(f"Failed to download {filename}: {e}", 'warning')
                        failed.append(filename)

            else:
                # Single file page
                direct_url, file_uuid = await self._get_bunkr_direct_url_with_uuid(session, url)
                if not direct_url:
                    return {'success': False, 'files': [], 'error': 'Could not get direct download URL'}

                filename = direct_url.split('/')[-1].split('?')[0] or 'download.bin'
                save_path = save_dir / self._sanitize_filename(filename)

                await self._download_file(session, direct_url, save_path,
                                          try_cdn_fallback=True, file_uuid=file_uuid)
                files.append(str(save_path))

        result = {'success': len(files) > 0, 'files': files, 'error': None}
        if failed:
            result['failed'] = failed
            result['error'] = f'{len(failed)} files failed to download'
        return result

    async def _get_bunkr_direct_url_with_uuid(self, session: aiohttp.ClientSession, page_url: str) -> tuple:
        """
        Extract direct download URL and file UUID from Bunkr file page

        Returns:
            (direct_url, file_uuid); either element may be None. A UUID can be
            returned without a URL when no CDN answered for it.
        """
        try:
            async with session.get(page_url) as resp:
                if resp.status != 200:
                    return None, None
                html = await resp.text()

            file_uuid = None

            # Extract file UUID first
            uuid_patterns = [
                r'data-v="([a-f0-9-]{36}\.[a-z0-9]+)"',
                r'([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}\.[a-z0-9]+)',
            ]
            for pattern in uuid_patterns:
                match = re.search(pattern, html)
                if match:
                    file_uuid = match.group(1)
                    break

            # Try to find existing CDN URL in page
            cdn_patterns = [
                r'href="(https://[^"]*\.bunkr\.ru/[^"]+)"',
                r'src="(https://[^"]*\.bunkr\.ru/[^"]+)"',
                r'data-src="(https://[^"]*\.bunkr\.ru/[^"]+)"',
            ]

            for pattern in cdn_patterns:
                match = re.search(pattern, html)
                if match:
                    url = match.group(1)
                    if await self._check_url_accessible(session, url):
                        return url, file_uuid

            # If we have UUID, try CDNs
            if file_uuid:
                self.log(f"Found file UUID: {file_uuid}, trying CDNs...", 'debug')
                for cdn in self.BUNKR_CDNS:
                    cdn_url = f"https://{cdn}/{file_uuid}"
                    if await self._check_url_accessible(session, cdn_url):
                        self.log(f"Found working CDN: {cdn}", 'debug')
                        return cdn_url, file_uuid

            return None, file_uuid
        except Exception as e:
            self.log(f"Error getting Bunkr direct URL: {e}", 'warning')
            return None, None

    async def _check_url_accessible(self, session: aiohttp.ClientSession, url: str) -> bool:
        """Check if a URL is accessible (HEAD request returns 200 within 10s)"""
        try:
            async with session.head(url, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10)) as resp:
                return resp.status == 200
        except Exception:
            # Any network failure simply means "not usable"
            return False

    async def _download_fileditch(self, url: str, save_dir: Path) -> Dict:
        """Download from FileDitch (Cloudflare-protected)"""
        from modules.cloudflare_handler import CloudflareHandler

        # Extract filename from URL: file.php?f=/b74/tLyJWGrzvSyRlJvBVDBa.mp4
        parsed = urlparse(url)
        params = parse_qs(parsed.query)
        file_path = params.get('f', [''])[0]
        if not file_path:
            return {'success': False, 'files': [], 'error': 'Invalid FileDitch URL - no file parameter'}

        filename = file_path.rsplit('/', 1)[-1] if '/' in file_path else file_path
        if not filename:
            return {'success': False, 'files': [], 'error': 'Could not extract filename from URL'}

        save_path = save_dir / self._sanitize_filename(filename)

        # Use CloudflareHandler to get cookies via FlareSolverr
        cf_handler = CloudflareHandler(
            module_name='FileDitch',
            flaresolverr_url='http://localhost:8191/v1',
            flaresolverr_enabled=True,
        )

        self.log('Bypassing Cloudflare for FileDitch via FlareSolverr...', 'info')
        # NOTE(review): this call is not awaited, so it presumably blocks the
        # event loop while FlareSolverr works - confirm.
        if not cf_handler.get_cookies_via_flaresolverr(url):
            return {'success': False, 'files': [], 'error': 'Failed to bypass Cloudflare for FileDitch'}

        cookies = cf_handler.get_cookies_dict()
        user_agent = cf_handler.get_user_agent()

        # Download with the obtained cookies
        timeout = aiohttp.ClientTimeout(total=3600)
        headers = {'User-Agent': user_agent or 'Mozilla/5.0'}

        # Hand the cookies to the session directly. CookieJar.update_cookies()
        # expects a yarl.URL for response_url, so passing the URL string there
        # does not work.
        async with aiohttp.ClientSession(timeout=timeout, cookies=cookies or None, headers=headers) as session:
            await self._download_file(session, url, save_path, headers=headers)

        return {'success': True, 'files': [str(save_path)], 'error': None}

    async def _stream_response(self, resp, target: Path, display_name: str) -> int:
        """Write the response body to `target`, reporting progress; returns bytes written"""
        total_size = int(resp.headers.get('content-length', 0))
        downloaded = 0
        last_log_pct = 0

        with open(target, 'wb') as f:
            async for chunk in resp.content.iter_chunked(65536):  # 64KB chunks
                f.write(chunk)
                downloaded += len(chunk)

                # Log and callback progress every 2% (needs a known total size)
                if total_size > 0:
                    pct = int(downloaded * 100 / total_size)
                    if pct >= last_log_pct + 2:
                        self.log(f" {display_name}: {pct}% ({downloaded // (1024*1024)}MB / {total_size // (1024*1024)}MB)", 'info')
                        last_log_pct = pct
                        # Call progress callback if provided
                        if self.progress_callback:
                            try:
                                self.progress_callback(downloaded, total_size, display_name)
                            except Exception:
                                pass  # Don't fail download due to callback error
        return downloaded

    @staticmethod
    def _discard(path: Path) -> None:
        """Remove `path` if it exists (used for abandoned partial downloads)"""
        try:
            path.unlink()
        except FileNotFoundError:
            pass

    async def _download_file(self, session: aiohttp.ClientSession, url: str,
                             save_path: Path, headers: Dict = None,
                             try_cdn_fallback: bool = False, file_uuid: str = None) -> None:
        """
        Download a single file with streaming and optional CDN fallback

        Data is streamed to `<save_path>.part` and renamed onto `save_path`
        only after a complete transfer, so a failed attempt never leaves a
        truncated file under the final name.

        Args:
            session: Open aiohttp session used for the request(s)
            url: Primary download URL
            save_path: Final destination path
            headers: Optional per-request headers
            try_cdn_fallback: Also try every BUNKR_CDNS host for `file_uuid`
            file_uuid: File name on the CDN; required for the fallback

        Raises:
            Exception: Every candidate URL failed (message carries the last error)
        """
        save_path.parent.mkdir(parents=True, exist_ok=True)

        urls_to_try = [url]

        # If CDN fallback enabled and we have a file UUID, add alternate CDNs
        if try_cdn_fallback and file_uuid:
            for cdn in self.BUNKR_CDNS:
                alt_url = f"https://{cdn}/{file_uuid}"
                if alt_url != url:
                    urls_to_try.append(alt_url)

        partial_path = save_path.with_name(save_path.name + '.part')
        last_error = None
        for try_url in urls_to_try:
            try:
                self.log(f"Downloading: {save_path.name} from {try_url[:60]}...", 'info')
                async with session.get(try_url, headers=headers) as resp:
                    if resp.status != 200:
                        last_error = f"HTTP {resp.status}"
                        self.log(f"Download failed: {save_path.name} - {last_error}", 'warning')
                        continue
                    downloaded = await self._stream_response(resp, partial_path, save_path.name)

                partial_path.replace(save_path)
                self.log(f"Downloaded: {save_path.name} ({downloaded // (1024*1024)}MB)", 'info')
                return  # Success
            except Exception as e:
                last_error = str(e)
                self.log(f"Download error: {save_path.name} - {last_error}", 'warning')
                self._discard(partial_path)
                # Try next CDN
                continue

        raise Exception(f"Download failed after trying {len(urls_to_try)} URLs: {last_error}")

    def _sanitize_filename(self, filename: str) -> str:
        """Sanitize filename for filesystem ('download.bin' if nothing usable remains)"""
        if not filename:
            return 'download.bin'
        # Remove/replace invalid characters
        filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)
        filename = filename.strip('. ')
        return filename or 'download.bin'

    @classmethod
    def get_supported_domains(cls) -> List[str]:
        """Get list of all supported domains"""
        return [domain for host_domains in cls.SUPPORTED_HOSTS.values() for domain in host_domains]
|
||||
171
modules/paid_content/filename_parser.py
Normal file
171
modules/paid_content/filename_parser.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
Filename parser for extracting dates and metadata from Fansly/paid content filenames.
|
||||
|
||||
Supports:
|
||||
1. Fansly snowflake IDs: 871257582885416960.mp4
|
||||
2. Embedded date format: 2023-05-11_at_15-51_id_513099759796367360-zRvVUZeP.mp4
|
||||
3. Date-prefixed files: 2022-07-08.mp4 or 2022-07-08_video.mp4
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional, Dict, Tuple
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Fansly epoch calibrated from known files
# Based on: 513099759796367360 = 2023-05-11 15:51 UTC
# NOTE(review): with this constant that ID decodes to about 14:37 UTC on the
# same day, so treat the calibration as approximate - confirm.
FANSLY_EPOCH_MS = 1561483337101


def decode_fansly_snowflake(snowflake_id: str) -> Optional[datetime]:
    """
    Decode a Fansly snowflake ID to a datetime.

    Fansly uses Twitter-style snowflake IDs where the timestamp
    is encoded in the upper bits (shifted right by 22).

    Args:
        snowflake_id: The ID as a decimal string (anything int() accepts).

    Returns:
        Timezone-aware UTC datetime, or None when the value is not a valid
        integer (including None) or the timestamp is out of range.
    """
    try:
        sid = int(snowflake_id)
        # Timestamp is in upper bits: milliseconds since FANSLY_EPOCH_MS
        timestamp_ms = (sid >> 22) + FANSLY_EPOCH_MS
        return datetime.fromtimestamp(timestamp_ms / 1000, tz=timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        # TypeError covers None / non-string input, which int() rejects
        return None
|
||||
|
||||
|
||||
def parse_filename(filename: str) -> Dict:
    """
    Extract a date and Fansly ID from a filename, if it carries one.

    Four shapes are tried in order and the first usable one wins:
      1. full stamp  `YYYY-MM-DD_at_HH-MM_id_<id>`      -> 'embedded', high
      2. leading     `YYYY-MM-DD`                        -> 'prefix', high (noon UTC)
      3. the whole stem is a 15-20 digit ID              -> 'snowflake', high
      4. a 15-20 digit run anywhere in the stem          -> 'snowflake', medium
    Decoded snowflake dates are only accepted for years 2020-2030.

    Returns:
        {
            'original_filename': str,
            'detected_date': datetime or None,
            'fansly_id': str or None,
            'date_source': str or None,  # 'snowflake', 'embedded', 'prefix', None
            'confidence': str,  # 'high', 'medium', 'low'
        }
    """
    info = {
        'original_filename': filename,
        'detected_date': None,
        'fansly_id': None,
        'date_source': None,
        'confidence': 'low',
    }

    def _accept(moment, source, confidence, media_id=None):
        # Fill in the hit and hand the finished record back
        info['detected_date'] = moment
        info['fansly_id'] = media_id
        info['date_source'] = source
        info['confidence'] = confidence
        return info

    def _plausible(moment):
        # Sanity window for decoded snowflake timestamps
        return bool(moment) and 2020 <= moment.year <= 2030

    # Work on the name without its extension
    stem = Path(filename).stem

    # Shape 1: date + time + id, separators may be '-', '_' or space
    stamped = re.search(
        r'(\d{4})[-_ ](\d{2})[-_ ](\d{2})[-_ ]?at[-_ ](\d{2})[-_ ](\d{2})[-_ ]?id[-_ ](\d{15,20})',
        stem,
        re.IGNORECASE,
    )
    if stamped:
        *clock_fields, media_id = stamped.groups()
        try:
            moment = datetime(*map(int, clock_fields), 0, tzinfo=timezone.utc)
        except ValueError:
            pass  # impossible calendar value, keep looking
        else:
            return _accept(moment, 'embedded', 'high', media_id)

    # Shape 2: date at the very start of the name
    dated = re.match(r'^(\d{4})[-_](\d{2})[-_](\d{2})(?:[_\-\s]|$)', stem)
    if dated:
        try:
            # No time of day available: default to noon
            moment = datetime(*map(int, dated.groups()), 12, 0, 0, tzinfo=timezone.utc)
        except ValueError:
            pass
        else:
            return _accept(moment, 'prefix', 'high')

    # Shape 3: the stem is just an ID (optionally followed by _<digits>)
    pure = re.match(r'^(\d{15,20})(?:_\d+)?$', stem)
    if pure:
        media_id = pure.group(1)
        moment = decode_fansly_snowflake(media_id)
        if _plausible(moment):
            return _accept(moment, 'snowflake', 'high', media_id)

    # Shape 4: an ID buried somewhere in the name
    for media_id in re.findall(r'(\d{15,20})', stem):
        moment = decode_fansly_snowflake(media_id)
        if _plausible(moment):
            return _accept(moment, 'snowflake', 'medium', media_id)

    return info
|
||||
|
||||
|
||||
def parse_filenames(filenames: list) -> Dict:
    """
    Run parse_filename over every entry and summarise the detected dates.

    Args:
        filenames: Filenames to analyse.

    Returns:
        {
            'files': [parsed result for each file, in input order],
            'earliest_date': smallest detected datetime or None,
            'latest_date': largest detected datetime or None,
            'suggested_date': same value as 'earliest_date',
            'has_dates': True when at least one file yielded a date,
        }
    """
    parsed = []
    for filename in filenames:
        parsed.append(parse_filename(filename))

    # Only files where a date was actually detected take part in min/max.
    found_dates = []
    for entry in parsed:
        if entry['detected_date']:
            found_dates.append(entry['detected_date'])

    if found_dates:
        earliest = min(found_dates)
        latest = max(found_dates)
    else:
        earliest = None
        latest = None

    return {
        'files': parsed,
        'earliest_date': earliest,
        'latest_date': latest,
        'suggested_date': earliest,  # Earliest date is the default suggestion
        'has_dates': bool(found_dates),
    }
|
||||
|
||||
|
||||
def format_date_for_display(dt: datetime) -> str:
    """
    Format datetime for display: 'May 11, 2023 at 3:51 PM'.

    Args:
        dt: Datetime to render, or None.

    Returns:
        The formatted string, or '' when dt is None. The day of month is
        zero-padded ('%d'); the hour is a 12-hour value without padding.
    """
    if dt is None:
        return ''
    # '%-I' (unpadded hour) is a glibc extension that is not supported on
    # every platform, so the 12-hour value is computed by hand instead.
    hour_12 = dt.hour % 12 or 12
    return f"{dt.strftime('%b %d, %Y')} at {hour_12}:{dt.strftime('%M %p')}"
|
||||
|
||||
|
||||
def format_date_for_input(dt: datetime) -> Tuple[str, str]:
    """
    Split a datetime into the two strings used by HTML date/time inputs.

    Args:
        dt: Datetime to render, or None.

    Returns:
        (date_str, time_str) as ('YYYY-MM-DD', 'HH:MM'), or ('', '') when
        dt is None.
    """
    if dt is not None:
        date_part = dt.strftime('%Y-%m-%d')
        time_part = dt.strftime('%H:%M')
        return (date_part, time_part)
    return ('', '')
|
||||
14
modules/paid_content/hqcelebcorner_client.py
Normal file
14
modules/paid_content/hqcelebcorner_client.py
Normal file
@@ -0,0 +1,14 @@
|
||||
"""Backwards-compatibility shim — use xenforo_forum_client instead."""
|
||||
from .xenforo_forum_client import XenForoForumClient
|
||||
|
||||
|
||||
class HQCelebCornerClient(XenForoForumClient):
    """Legacy alias for XenForoForumClient, pre-configured for HQCelebCorner."""

    def __init__(self, log_callback=None):
        """
        Build a XenForoForumClient with the fixed HQCelebCorner settings.

        Args:
            log_callback: Optional logging callback, forwarded unchanged.
        """
        site_settings = {
            'service_id': 'hqcelebcorner',
            'base_url': 'https://www.hqcelebcorner.net',
            'cookie_path': '/opt/media-downloader/cookies/forum_cookies_HQCelebCorner.json',
            'log_callback': log_callback,
        }
        super().__init__(**site_settings)
|
||||
1285
modules/paid_content/instagram_adapter.py
Normal file
1285
modules/paid_content/instagram_adapter.py
Normal file
File diff suppressed because it is too large
Load Diff
312
modules/paid_content/models.py
Normal file
312
modules/paid_content/models.py
Normal file
@@ -0,0 +1,312 @@
|
||||
"""
|
||||
Dataclass models for Paid Content feature
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
|
||||
@dataclass
class Attachment:
    """Represents a file attachment from a post"""
    name: str  # File name as reported by the source
    server_path: str  # Path of the file on the source server
    file_type: Optional[str] = None  # 'image', 'video', 'archive', 'document' or 'unknown' when built by from_api
    extension: Optional[str] = None  # Lower-cased extension without the dot
    download_url: Optional[str] = None
    file_size: Optional[int] = None
    width: Optional[int] = None
    height: Optional[int] = None
    duration: Optional[int] = None
    needs_quality_recheck: bool = False
    is_preview: bool = False

    @classmethod
    def from_api(cls, data: Dict, base_url: str = '') -> 'Attachment':
        """
        Create Attachment from API response.

        Args:
            data: Attachment dict; the 'name' and 'path' keys are read.
            base_url: When non-empty (and a path is present) the download
                URL is built as '{base_url}/data{path}'.

        Returns:
            Attachment with file_type derived from the name's extension.
        """
        # 'or' also covers keys that are present but explicitly null, which
        # would otherwise break the "'.' in name" test below.
        name = data.get('name') or ''
        path = data.get('path') or ''

        # Detect file type from extension
        ext = name.rsplit('.', 1)[-1].lower() if '.' in name else ''

        type_by_extension = (
            ('image', {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic'}),
            ('video', {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}),
            ('archive', {'zip', 'rar', '7z', 'tar', 'gz'}),
            ('document', {'pdf', 'doc', 'docx', 'txt'}),
        )
        file_type = next(
            (kind for kind, extensions in type_by_extension if ext in extensions),
            'unknown',
        )

        return cls(
            name=name,
            server_path=path,
            file_type=file_type,
            extension=ext if ext else None,
            download_url=f"{base_url}/data{path}" if base_url and path else None
        )

    def to_dict(self) -> Dict:
        """
        Convert to dictionary for database storage.

        'needs_quality_recheck' is only included (as 1) when the flag is set;
        'is_preview' is not part of the output.
        """
        d = {
            'name': self.name,
            'server_path': self.server_path,
            'file_type': self.file_type,
            'extension': self.extension,
            'download_url': self.download_url,
            'file_size': self.file_size,
            'width': self.width,
            'height': self.height,
            'duration': self.duration
        }
        if self.needs_quality_recheck:
            d['needs_quality_recheck'] = 1
        return d
|
||||
|
||||
|
||||
@dataclass
class Post:
    """Represents a post from a creator"""
    post_id: str
    service_id: str
    platform: str
    creator_id: str
    title: Optional[str] = None
    content: Optional[str] = None
    published_at: Optional[str] = None
    added_at: Optional[str] = None
    edited_at: Optional[str] = None
    attachments: List[Attachment] = field(default_factory=list)
    embed_urls: List[str] = field(default_factory=list)
    is_pinned: bool = False
    pinned_at: Optional[str] = None
    auto_tags: List[str] = field(default_factory=list)  # Tag names to auto-apply on sync
    tagged_users: List[str] = field(default_factory=list)  # Instagram users tagged in the post

    @classmethod
    def from_api(cls, data: Dict, service_id: str, platform: str, creator_id: str, base_url: str = '') -> 'Post':
        """
        Create Post from API response.

        Args:
            data: Post dict; reads 'id', 'title', 'content'/'substring',
                'published', 'added', 'edited', 'attachments', 'file', 'embed'.
            service_id: Stored on the post unchanged.
            platform: Stored on the post unchanged.
            creator_id: Stored on the post unchanged.
            base_url: Forwarded to Attachment.from_api for download URLs.

        Returns:
            Post with HTML tags stripped from the content.
        """
        # Parse attachments ('or []' also covers an explicit null value)
        attachments = [
            Attachment.from_api(att_data, base_url)
            for att_data in (data.get('attachments') or [])
        ]

        # Also check file field (some APIs use this instead of attachments)
        file_data = data.get('file')
        if file_data:
            if isinstance(file_data, dict):
                attachments.append(Attachment.from_api(file_data, base_url))
            elif isinstance(file_data, str):
                # rsplit returns the whole string when there is no '/'
                attachments.append(Attachment(
                    name=file_data.rsplit('/', 1)[-1],
                    server_path=file_data
                ))

        # Content: use 'content' if available, fallback to 'substring' (list endpoint returns truncated)
        content = data.get('content') or data.get('substring') or ''

        # Single post endpoint returns HTML content (e.g. <p>text</p>), strip tags
        if content and '<' in content:
            import re
            content = re.sub(r'<br\s*/?>', '\n', content)
            content = re.sub(r'</p>\s*<p>', '\n\n', content)
            content = re.sub(r'<[^>]+>', '', content)
            content = content.strip()

        title = data.get('title')

        # OnlyFans posts on Coomer have the post text in 'title' and empty 'content'.
        # Copy title to content and clear title (OF posts don't have real titles).
        if not content and title:
            content = title
            title = None

        return cls(
            post_id=str(data.get('id', '')),
            service_id=service_id,
            platform=platform,
            creator_id=creator_id,
            title=title,
            content=content,
            published_at=data.get('published'),
            added_at=data.get('added'),
            edited_at=data.get('edited'),
            attachments=attachments,
            embed_urls=data.get('embed', []) or []
        )

    def to_dict(self) -> Dict:
        """
        Convert to dictionary for database storage.

        Attachments and embeds are reduced to counts/flags; service_id,
        platform, creator_id, auto_tags and tagged_users are not included.
        """
        return {
            'post_id': self.post_id,
            'title': self.title,
            'content': self.content,
            'published_at': self.published_at,
            'added_at': self.added_at,
            'edited_at': self.edited_at,
            'has_attachments': 1 if self.attachments else 0,
            'attachment_count': len(self.attachments),
            'embed_count': len(self.embed_urls),
            'is_pinned': 1 if self.is_pinned else 0,
            'pinned_at': self.pinned_at
        }
|
||||
|
||||
|
||||
@dataclass
class Message:
    """Represents a chat message from/to a creator"""
    message_id: str
    platform: str
    service_id: str
    creator_id: str  # Platform-specific creator ID
    text: Optional[str] = None
    sent_at: Optional[str] = None
    is_from_creator: bool = True
    is_tip: bool = False
    tip_amount: Optional[float] = None
    price: Optional[float] = None
    is_free: bool = True
    is_purchased: bool = False
    reply_to_message_id: Optional[str] = None
    attachments: List[Attachment] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """
        Convert to dictionary for database storage.

        Boolean fields are emitted as 1/0 and attachments are reduced to a
        flag plus a count.
        """
        attachment_total = len(self.attachments)
        row = {}
        row['message_id'] = self.message_id
        row['text'] = self.text
        row['sent_at'] = self.sent_at
        row['is_from_creator'] = 1 if self.is_from_creator else 0
        row['is_tip'] = 1 if self.is_tip else 0
        row['tip_amount'] = self.tip_amount
        row['price'] = self.price
        row['is_free'] = 1 if self.is_free else 0
        row['is_purchased'] = 1 if self.is_purchased else 0
        row['has_attachments'] = 1 if self.attachments else 0
        row['attachment_count'] = attachment_total
        row['reply_to_message_id'] = self.reply_to_message_id
        return row
|
||||
|
||||
|
||||
@dataclass
class Creator:
    """Represents a creator from Coomer/Kemono"""
    creator_id: str
    service_id: str
    platform: str
    username: str
    display_name: Optional[str] = None
    profile_image_url: Optional[str] = None
    banner_image_url: Optional[str] = None
    bio: Optional[str] = None  # Not populated by from_api
    post_count: int = 0

    @classmethod
    def from_api(cls, data: Dict, service_id: str, platform: str, base_url: Optional[str] = None) -> 'Creator':
        """
        Create Creator from API response.

        Args:
            data: Creator dict; reads 'id', 'name', 'profile_image',
                'banner_image' and 'post_count'.
            service_id: Stored on the creator unchanged.
            platform: Stored on the creator and used in image URLs.
            base_url: Site base URL. When given (and the creator has an id),
                missing profile/banner images fall back to URLs on
                'img.<host>'.

        Returns:
            Creator; username is '' and post_count is 0 when the API sends
            null or omits them.
        """
        # A missing/null id becomes '' rather than the literal string 'None'.
        raw_id = data.get('id')
        creator_id = '' if raw_id is None else str(raw_id)
        name = data.get('name')

        # Construct image domain - use .st instead of .party (coomer.party redirects to coomer.st)
        img_domain = None
        if base_url and creator_id:
            from urllib.parse import urlparse
            # Convert .party to .st for image URLs (coomer.party/kemono.party images are at .st)
            netloc = urlparse(base_url).netloc.replace('.party', '.st')
            img_domain = f"img.{netloc}"

        # Construct profile image URL from icon endpoint
        profile_image_url = data.get('profile_image')
        if not profile_image_url and img_domain:
            # Icon URLs are at img.{domain}/icons/{platform}/{creator_id}
            profile_image_url = f"https://{img_domain}/icons/{platform}/{creator_id}"

        # Construct banner image URL
        banner_image_url = data.get('banner_image')
        if not banner_image_url and img_domain:
            # Banner URLs are at img.{domain}/banners/{platform}/{creator_id}
            banner_image_url = f"https://{img_domain}/banners/{platform}/{creator_id}"

        return cls(
            creator_id=creator_id,
            service_id=service_id,
            platform=platform,
            username=name or '',
            display_name=name,
            profile_image_url=profile_image_url,
            banner_image_url=banner_image_url,
            post_count=data.get('post_count') or 0
        )

    def to_dict(self) -> Dict:
        """Convert to dictionary for database storage"""
        return {
            'service_id': self.service_id,
            'platform': self.platform,
            'creator_id': self.creator_id,
            'username': self.username,
            'display_name': self.display_name,
            'profile_image_url': self.profile_image_url,
            'banner_image_url': self.banner_image_url,
            'bio': self.bio,
            'post_count': self.post_count
        }
|
||||
|
||||
|
||||
@dataclass
class SyncResult:
    """Result of a creator sync operation"""
    success: bool
    new_posts: int = 0
    new_attachments: int = 0
    downloaded_files: int = 0
    failed_files: int = 0
    skipped_files: int = 0
    error: Optional[str] = None
    downloaded_file_info: Optional[List[Dict]] = None  # List of {file_path, filename, source, content_type}

    def to_dict(self) -> Dict:
        """Return the counters, success flag and error as a plain dict.

        'downloaded_file_info' is not part of the output.
        """
        exported = (
            'success',
            'new_posts',
            'new_attachments',
            'downloaded_files',
            'failed_files',
            'skipped_files',
            'error',
        )
        return {key: getattr(self, key) for key in exported}
|
||||
|
||||
|
||||
@dataclass
class DownloadResult:
    """Result of a download operation"""
    success: bool
    file_path: Optional[str] = None
    file_hash: Optional[str] = None
    file_size: Optional[int] = None
    error: Optional[str] = None
    is_duplicate: bool = False

    def to_dict(self) -> Dict:
        """Return every field of the result as a plain dict."""
        return dict(
            success=self.success,
            file_path=self.file_path,
            file_hash=self.file_hash,
            file_size=self.file_size,
            error=self.error,
            is_duplicate=self.is_duplicate,
        )
|
||||
729
modules/paid_content/onlyfans_client.py
Normal file
729
modules/paid_content/onlyfans_client.py
Normal file
@@ -0,0 +1,729 @@
|
||||
"""
|
||||
OnlyFans Direct API Client
|
||||
|
||||
Downloads content directly from the OnlyFans API using browser-extracted
|
||||
credentials and dynamic request signing.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Dict, Any, Callable
|
||||
from urllib.parse import urlparse, urlencode
|
||||
|
||||
from modules.base_module import LoggingMixin, RateLimitMixin
|
||||
from .models import Post, Attachment, Message
|
||||
from .onlyfans_signing import OnlyFansSigner
|
||||
|
||||
|
||||
class OnlyFansClient(LoggingMixin, RateLimitMixin):
|
||||
"""
|
||||
API client for downloading content directly from OnlyFans.
|
||||
|
||||
API Endpoints:
|
||||
- Base URL: https://onlyfans.com/api2/v2
|
||||
- Auth: Requires browser-extracted credentials (sess, auth_id, x-bc, User-Agent)
|
||||
- Signing: Every request needs dynamic sign/time/app-token headers
|
||||
- GET /users/me - Verify auth
|
||||
- GET /users/{username} - Get user profile
|
||||
- GET /users/{user_id}/posts?limit=50&offset={offset} - Get posts (paginated)
|
||||
"""
|
||||
|
||||
BASE_URL = "https://onlyfans.com/api2/v2"
|
||||
SERVICE_ID = "onlyfans_direct"
|
||||
PLATFORM = "onlyfans"
|
||||
|
||||
def __init__(
    self,
    auth_config: Dict[str, str],
    signing_url: Optional[str] = None,
    log_callback: Optional[Callable] = None,
):
    """
    Set up logging, rate limiting, credentials and the request signer.

    Args:
        auth_config: Dict with keys: sess, auth_id, auth_uid (optional), x_bc, user_agent
        signing_url: Optional custom URL for signing rules
        log_callback: Optional logging callback
    """
    self._init_logger('PaidContent', log_callback, default_module='OnlyFansDirect')

    # More conservative rate limiting than Fansly (OF is stricter)
    rate_limits = {
        'min_delay': 1.5,
        'max_delay': 3.0,
        'batch_delay_min': 3,
        'batch_delay_max': 6,
    }
    self._init_rate_limiter(**rate_limits)

    self.auth_config = auth_config
    # The HTTP session is created on first use by _get_session()
    self._session: Optional[aiohttp.ClientSession] = None
    self._signer = OnlyFansSigner(rules_url=signing_url)
|
||||
|
||||
async def _get_session(self) -> aiohttp.ClientSession:
    """Return the open aiohttp session, creating one with OnlyFans headers if needed."""
    current = self._session
    if current is not None and not current.closed:
        return current

    cfg = self.auth_config

    # Build cookie string
    cookie_parts = [f"sess={cfg['sess']}", f"auth_id={cfg['auth_id']}"]
    auth_uid = cfg.get('auth_uid')
    if auth_uid:
        cookie_parts.append(f"auth_uid_{cfg['auth_id']}={auth_uid}")

    default_headers = {
        'Accept': 'application/json, text/plain, */*',
        'User-Agent': cfg.get('user_agent', ''),
        'x-bc': cfg.get('x_bc', ''),
        'Cookie': "; ".join(cookie_parts),
        'Origin': 'https://onlyfans.com',
        'Referer': 'https://onlyfans.com/',
    }
    self._session = aiohttp.ClientSession(
        headers=default_headers,
        timeout=aiohttp.ClientTimeout(total=60),
    )
    return self._session
|
||||
|
||||
async def _sign_request(self, endpoint: str) -> Dict[str, str]:
|
||||
"""
|
||||
Compute signing headers for an API request.
|
||||
|
||||
Args:
|
||||
endpoint: API path (e.g. "/users/me") - will be prefixed with /api2/v2
|
||||
|
||||
Returns:
|
||||
Dict with sign, time, app-token, user-id headers
|
||||
"""
|
||||
user_id = self.auth_config.get('auth_id', '0')
|
||||
# Sign with full URL path (matching OF-Scraper)
|
||||
full_path = f"/api2/v2{endpoint}"
|
||||
sign_headers = await self._signer.sign(full_path, user_id)
|
||||
sign_headers['user-id'] = user_id
|
||||
return sign_headers
|
||||
|
||||
async def _api_request(self, endpoint: str, params: Optional[Dict] = None) -> Optional[Dict]:
    """
    Make a signed API request to OnlyFans.

    Handles 401 (auth failure), 404, 429 (rate limit), timeouts and general
    errors. Retries on 429 and on timeout, up to 3 attempts in total, and
    re-signs the request before every retry.

    Args:
        endpoint: API path (e.g. "/users/me")
        params: Optional query parameters

    Returns:
        Parsed JSON response or None on failure
    """
    session = await self._get_session()
    # Include query params in the signing path (OF-Scraper does this)
    sign_endpoint = f"{endpoint}?{urlencode(params)}" if params else endpoint
    sign_headers = await self._sign_request(sign_endpoint)

    url = f"{self.BASE_URL}{endpoint}"
    max_retries = 3
    default_retry_after = 30

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            async with session.get(url, params=params, headers=sign_headers) as resp:
                if resp.status == 200:
                    return await resp.json()
                elif resp.status == 401:
                    self.log("OnlyFans auth failed (401) - credentials may be expired", 'error')
                    return None
                elif resp.status == 429:
                    if is_last_attempt:
                        # No retry follows, so waiting here would only delay the failure.
                        self.log(f"Rate limited (429), giving up after {max_retries} attempts", 'warning')
                        return None
                    # Retry-After may be an HTTP-date instead of a number of
                    # seconds; fall back to the default rather than aborting.
                    try:
                        retry_after = int(resp.headers.get('Retry-After', default_retry_after))
                    except (TypeError, ValueError):
                        retry_after = default_retry_after
                    wait = min(retry_after * (attempt + 1), 120)
                    self.log(f"Rate limited (429), waiting {wait}s (attempt {attempt + 1}/{max_retries})", 'warning')
                    await asyncio.sleep(wait)
                    # Refresh signing headers for retry (timestamp changes)
                    sign_headers = await self._sign_request(sign_endpoint)
                    continue
                elif resp.status == 404:
                    self.log(f"Not found (404): {endpoint}", 'debug')
                    return None
                else:
                    text = await resp.text()
                    self.log(f"API error: HTTP {resp.status} for {endpoint}: {text[:200]}", 'warning')
                    return None
        except asyncio.TimeoutError:
            self.log(f"Request timeout for {endpoint} (attempt {attempt + 1})", 'warning')
            if not is_last_attempt:
                await asyncio.sleep(5 * (attempt + 1))
                sign_headers = await self._sign_request(sign_endpoint)
                continue
            return None
        except Exception as e:
            # Best-effort client: any other failure is logged and reported as None.
            self.log(f"Request error for {endpoint}: {e}", 'error')
            return None

    return None
|
||||
|
||||
@staticmethod
|
||||
def _strip_html(text: str) -> str:
|
||||
"""Strip HTML tags and convert common entities to plain text"""
|
||||
if not text:
|
||||
return ''
|
||||
text = re.sub(r'<br\s*/?>', '\n', text)
|
||||
text = re.sub(r'<[^>]+>', '', text)
|
||||
text = text.replace('&', '&').replace('<', '<').replace('>', '>').replace(''', "'").replace('"', '"')
|
||||
return text.strip()
|
||||
|
||||
async def close(self):
|
||||
"""Close the aiohttp session"""
|
||||
if self._session and not self._session.closed:
|
||||
await self._session.close()
|
||||
self._session = None
|
||||
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
await self.close()
|
||||
|
||||
async def check_auth(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Verify credentials by calling /users/me.
|
||||
|
||||
Returns:
|
||||
Dict with 'valid' bool and optionally 'user_id', 'username', 'name'
|
||||
"""
|
||||
self._delay_between_items()
|
||||
try:
|
||||
data = await self._api_request("/users/me")
|
||||
if data and data.get('id'):
|
||||
return {
|
||||
'valid': True,
|
||||
'user_id': str(data['id']),
|
||||
'username': data.get('username', ''),
|
||||
'name': data.get('name', ''),
|
||||
}
|
||||
return {'valid': False, 'error': 'Invalid credentials or unexpected response'}
|
||||
except Exception as e:
|
||||
self.log(f"Error checking auth: {e}", 'error')
|
||||
return {'valid': False, 'error': str(e)}
|
||||
|
||||
async def get_user_info(self, username: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Get user profile info.
|
||||
|
||||
Args:
|
||||
username: The OnlyFans username
|
||||
|
||||
Returns:
|
||||
Normalized user info dict or None
|
||||
"""
|
||||
self._delay_between_items()
|
||||
try:
|
||||
data = await self._api_request(f"/users/{username}")
|
||||
if not data or not data.get('id'):
|
||||
self.log(f"User not found: {username}", 'warning')
|
||||
return None
|
||||
|
||||
return {
|
||||
'user_id': str(data['id']),
|
||||
'username': data.get('username', username),
|
||||
'display_name': data.get('name', ''),
|
||||
'avatar_url': data.get('avatar'),
|
||||
'banner_url': data.get('header'),
|
||||
'bio': self._strip_html(data.get('rawAbout') or data.get('about') or ''),
|
||||
'join_date': (data.get('joinDate') or '')[:10] or None,
|
||||
'posts_count': data.get('postsCount', 0),
|
||||
}
|
||||
except Exception as e:
|
||||
self.log(f"Error getting user info for {username}: {e}", 'error')
|
||||
return None
|
||||
|
||||
async def get_single_post(self, post_id: str) -> Optional[Post]:
    """
    Fetch a single post by its OnlyFans post ID.

    The creator id is taken from the response's 'author.id', falling back
    to 'authorId', and is '' when neither is present.

    Args:
        post_id: The OnlyFans post ID

    Returns:
        Post object or None
    """
    self._delay_between_items()
    data = await self._api_request(f"/posts/{post_id}")
    if not data:
        self.log(f"Post {post_id} not found", 'warning')
        return None

    # 'author' may be missing or explicitly null, so only index into it
    # when it really is a dict.
    author = data.get('author')
    author_id = author.get('id') if isinstance(author, dict) else None
    if author_id is None:
        author_id = data.get('authorId')
    user_id = '' if author_id is None else str(author_id)

    return self._parse_post(data, user_id)
|
||||
|
||||
async def get_posts(
    self,
    user_id: str,
    username: str,
    since_date: Optional[str] = None,
    until_date: Optional[str] = None,
    days_back: Optional[int] = None,
    max_posts: Optional[int] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[Post]:
    """
    Fetch posts from a creator's timeline using offset-based pagination.

    Posts are requested newest first. Fetching stops at the first post older
    than the since-date or once max_posts is reached; in both of those cases
    the result is returned immediately and the pinned-post request is
    skipped. Otherwise pinned posts not already collected are appended.

    Args:
        user_id: The OnlyFans numeric user ID
        username: The username (for logging/reference)
        since_date: Only fetch posts after this date (ISO format)
        until_date: Only fetch posts before this date (ISO format)
        days_back: Fetch posts from the last N days (overrides since_date)
        max_posts: Maximum number of posts to fetch
        progress_callback: Called with (page, total_posts) during fetching

    Returns:
        List of Post objects
    """
    from datetime import timedelta, timezone

    def _to_naive_utc(value: str) -> datetime:
        """Parse an ISO-8601 string; offset-aware values are converted to UTC before tzinfo is dropped."""
        parsed = datetime.fromisoformat(value.replace('Z', '+00:00'))
        if parsed.tzinfo is not None:
            parsed = parsed.astimezone(timezone.utc)
        return parsed.replace(tzinfo=None)

    self.log(f"Fetching posts for {username} (user_id: {user_id})", 'info')

    # Calculate date filters - everything is compared as naive UTC so that
    # offset-aware and naive values never meet in one comparison.
    if days_back:
        # Cutoff is computed in UTC, matching the offset-aware post dates.
        since_date = (datetime.now(timezone.utc) - timedelta(days=days_back)).isoformat()

    since_dt = None
    until_dt = None

    if since_date:
        try:
            since_dt = _to_naive_utc(since_date)
        except (ValueError, TypeError):
            pass

    if until_date:
        try:
            until_dt = _to_naive_utc(until_date)
        except (ValueError, TypeError):
            pass

    if since_dt:
        self.log(f"Date filter: since_date={since_dt.isoformat()}", 'debug')

    all_posts: List[Post] = []
    offset = 0
    page_size = 50
    page = 0

    while True:
        self._delay_between_items()

        params = {
            'limit': str(page_size),
            'offset': str(offset),
            'order': 'publish_date_desc',
        }

        data = await self._api_request(f"/users/{user_id}/posts", params=params)
        if not data:
            break

        # OF returns a list of posts directly
        posts_list = data if isinstance(data, list) else data.get('list', [])
        if not posts_list:
            break

        for post_data in posts_list:
            post = self._parse_post(post_data, user_id)
            if not post:
                continue

            # Parse the post date once; posts without a usable date are
            # kept and bypass both date filters.
            post_dt = None
            if post.published_at and (since_dt or until_dt):
                try:
                    post_dt = _to_naive_utc(post.published_at)
                except (ValueError, TypeError) as e:
                    self.log(f"Date comparison error: {e} (post_date={post.published_at})", 'warning')

            if post_dt is not None:
                if since_dt and post_dt < since_dt:
                    self.log(f"Reached posts older than since_date ({post.published_at}), stopping", 'debug')
                    return all_posts
                if until_dt and post_dt > until_dt:
                    continue

            all_posts.append(post)

            if max_posts and len(all_posts) >= max_posts:
                self.log(f"Reached max_posts limit: {max_posts}", 'debug')
                return all_posts

        page += 1
        if progress_callback:
            progress_callback(page, len(all_posts))

        # If we got fewer results than page_size, we've reached the end
        if len(posts_list) < page_size:
            break

        offset += page_size
        self._delay_between_batches()

    # Also fetch pinned posts (they may not appear in the timeline)
    self._delay_between_items()
    pinned_data = await self._api_request(
        f"/users/{user_id}/posts",
        params={'limit': '50', 'offset': '0', 'order': 'publish_date_desc', 'pinned': '1'},
    )
    if pinned_data:
        pinned_list = pinned_data if isinstance(pinned_data, list) else pinned_data.get('list', [])
        # Snapshot of ids collected from the timeline, used to skip duplicates
        existing_ids = {p.post_id for p in all_posts}
        for post_data in pinned_list:
            post = self._parse_post(post_data, user_id)
            if post and post.post_id not in existing_ids:
                all_posts.append(post)

    self.log(f"Fetched {len(all_posts)} posts for {username}", 'info')
    return all_posts
|
||||
|
||||
def _parse_post(self, post_data: Dict, user_id: str) -> Optional[Post]:
    """
    Parse an OnlyFans post into a Post model.

    Args:
        post_data: Raw post data from API
        user_id: Creator's user ID

    Returns:
        Post object, or None if the post has no id or parsing fails
    """
    try:
        # A missing or null id must not turn into the string 'None'.
        raw_id = post_data.get('id')
        if raw_id is None or raw_id == '':
            return None
        post_id = str(raw_id)

        # Parse timestamp - OF uses ISO format strings
        published_at = None
        raw_date = post_data.get('postedAt') or post_data.get('createdAt')
        if isinstance(raw_date, str):
            published_at = raw_date
        elif isinstance(raw_date, (int, float)):
            from datetime import timezone
            try:
                # Epoch seconds are rendered with an explicit UTC offset so
                # the result does not depend on the host's local timezone.
                published_at = datetime.fromtimestamp(raw_date, tz=timezone.utc).isoformat()
            except (ValueError, OverflowError, OSError):
                # Out-of-range timestamp: keep the post, just without a date
                pass

        # Content text
        content = self._strip_html(post_data.get('rawText') or post_data.get('text') or '')

        # Parse media attachments, dropping items that fail to parse
        attachments = []
        for media_item in post_data.get('media', []) or []:
            attachment = self._parse_attachment(media_item)
            if attachment:
                attachments.append(attachment)

        # Extract embed URLs from content text
        embed_urls = []
        if content:
            url_pattern = r'https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/|vimeo\.com/|dailymotion\.com/video/)\S+'
            embed_urls = re.findall(url_pattern, content)

        return Post(
            post_id=post_id,
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            creator_id=user_id,
            title=None,
            content=content,
            published_at=published_at,
            added_at=datetime.now().isoformat(),
            attachments=attachments,
            embed_urls=embed_urls,
            is_pinned=bool(post_data.get('isPinned')),
            pinned_at=post_data.get('pinnedAt'),
        )

    except Exception as e:
        # Best-effort: one malformed post must not abort the whole sync.
        self.log(f"Error parsing post: {e}", 'error')
        return None
|
||||
|
||||
def _parse_attachment(self, media_item: Dict) -> Optional[Attachment]:
    """
    Parse an OnlyFans media item into an Attachment.

    OF media structure:
    {
        id, type, source: {source: url, width, height, duration},
        full: {source: url, ...}, preview: {source: url, ...}
    }
    The quality variants may also be nested under a 'files' key.

    The download URL is resolved in this order: 'full', 'source', 'preview',
    then a top-level 'src'. An attachment is flagged as preview-only when
    canView is false, or when the only URL found came from 'preview'.

    Args:
        media_item: Raw media dict from API

    Returns:
        Attachment object or None if parsing fails
    """
    try:
        media_id = str(media_item.get('id', ''))
        # 'type' can be present but null; treat that like a missing type
        media_type = (media_item.get('type') or '').lower()

        # Map OF media types to our file types
        type_map = {
            'photo': 'image',
            'video': 'video',
            'audio': 'audio',
            'gif': 'image',
        }
        file_type = type_map.get(media_type, 'unknown')

        # Current OF API nests media under 'files' key
        files = media_item.get('files')
        if not files or not isinstance(files, dict):
            files = media_item

        def variant(key):
            """Return files[key] when it is a non-empty dict, otherwise None."""
            data = files.get(key)
            return data if data and isinstance(data, dict) else None

        def url_of(data):
            """Return the URL of a variant dict ('url', or the older 'source' key)."""
            return (data.get('url') or data.get('source')) if data else None

        full_data = variant('full')
        source_data = variant('source')
        preview_data = variant('preview')
        full_url = url_of(full_data)
        source_url = url_of(source_data)

        # Try 'full' first (higher quality); its metadata is used even without a URL
        download_url = full_url
        width = height = duration = None
        if full_data:
            width = full_data.get('width')
            height = full_data.get('height')
            duration = full_data.get('duration')

        # Fallback to 'source'
        if not download_url and source_data:
            download_url = source_url
            if not width:
                width = source_data.get('width')
            if not height:
                height = source_data.get('height')
            if not duration:
                duration = source_data.get('duration')

        # OF DRM videos use FairPlay SAMPLE-AES encryption — cannot be downloaded.
        # Take the duration from the media item itself, then fall through to the
        # preview frame below.
        if not download_url and media_type == 'video' and not duration:
            duration = media_item.get('duration')

        # Fallback to 'preview' for any content type
        # For DRM videos (canView=true), downloads the preview frame image (shown with lock overlay)
        # For PPV videos (canView=false), there's no preview — marked unavailable
        if not download_url and preview_data:
            download_url = url_of(preview_data)
            if not width:
                width = preview_data.get('width')
            if not height:
                height = preview_data.get('height')

        # Some OF responses have src directly
        if not download_url:
            download_url = media_item.get('src')

        # Determine extension from URL
        ext = ''
        if download_url:
            # Only the last path segment can carry the file extension; a dot in
            # a directory name (e.g. '/v1.2/abc') must not be mistaken for one.
            basename = urlparse(download_url).path.rsplit('/', 1)[-1]
            if '.' in basename:
                ext = basename.rsplit('.', 1)[-1].lower()
                if ext == 'jpeg':
                    ext = 'jpg'
        elif media_type == 'photo':
            ext = 'jpg'
        elif media_type == 'video':
            ext = 'mp4'

        filename = f"{media_id}.{ext}" if ext else str(media_id)

        # Override file_type based on actual extension (OF sometimes misreports type)
        video_exts = {'mp4', 'mov', 'webm', 'avi', 'mkv', 'flv', 'm4v', 'wmv', 'mpg', 'mpeg'}
        if ext in video_exts and file_type != 'video':
            file_type = 'video'

        # Duration may be in seconds (float or int)
        if duration is not None:
            try:
                duration = int(float(duration))
            except (ValueError, TypeError):
                duration = None

        # Check if content is actually locked (canView=false) vs just missing URL
        can_view = media_item.get('canView', True)
        is_preview = not can_view
        if not download_url and not can_view:
            self.log(f"PPV/locked content: {filename}", 'debug')

        # Detect preview-only: no full/source/src URL, so the URL we hold can
        # only have come from the preview fallback
        if not is_preview and download_url:
            if not (full_url or source_url or media_item.get('src')):
                is_preview = True

        return Attachment(
            name=filename,
            server_path=f"/onlyfans/{media_id}",
            file_type=file_type,
            extension=ext if ext else None,
            download_url=download_url,
            file_size=None,
            width=width,
            height=height,
            duration=duration,
            is_preview=is_preview,
        )

    except Exception as e:
        # Best-effort parsing: a malformed media item is skipped, not fatal
        self.log(f"Error parsing attachment: {e}", 'error')
        return None
|
||||
|
||||
# ==================== MESSAGES ====================
|
||||
|
||||
async def get_messages(self, user_id: str, max_messages: int = 500) -> List[Message]:
    """
    Fetch messages from a conversation with a creator.

    Uses GET /chats/{user_id}/messages with cursor-based pagination.
    The 'id' param is used as cursor for older messages.

    Args:
        user_id: OnlyFans numeric user ID of the creator
        max_messages: Maximum number of messages to return

    Returns:
        List of Message objects, newest first, never longer than max_messages
    """
    page_size = 50  # request size; a shorter page marks the end of the chat
    messages = []
    cursor_id = None
    page = 0

    while len(messages) < max_messages:
        page += 1
        params = {'limit': page_size, 'order': 'desc'}
        if cursor_id:
            params['id'] = cursor_id

        data = await self._api_request(f"/chats/{user_id}/messages", params=params)
        if not data:
            break

        # Response is a dict with 'list' key containing messages
        msg_list = data.get('list', []) if isinstance(data, dict) else data
        if not msg_list:
            break

        for msg_data in msg_list:
            msg = self._parse_message(msg_data, user_id)
            if msg:
                messages.append(msg)

        self.log(f"Fetched page {page}: {len(msg_list)} messages (total: {len(messages)})", 'debug')

        if len(msg_list) < page_size:
            break  # Last page

        # Use the last message's id as cursor for next page; stop if the
        # cursor does not advance, which would otherwise loop forever
        last_id = msg_list[-1].get('id')
        if last_id and str(last_id) != str(cursor_id):
            cursor_id = last_id
        else:
            break

    # Pages are fetched whole, so the final page can overshoot the cap
    if len(messages) > max_messages:
        messages = messages[:max(max_messages, 0)]

    self.log(f"Fetched {len(messages)} messages for user {user_id}", 'info')
    return messages
|
||||
|
||||
def _parse_message(self, msg_data: Dict, creator_user_id: str) -> Optional[Message]:
|
||||
"""
|
||||
Parse an OnlyFans message into a Message model.
|
||||
|
||||
Args:
|
||||
msg_data: Raw message dict from API
|
||||
creator_user_id: Numeric user ID of the creator (to determine direction)
|
||||
|
||||
Returns:
|
||||
Message object or None
|
||||
"""
|
||||
try:
|
||||
msg_id = str(msg_data.get('id', ''))
|
||||
if not msg_id:
|
||||
return None
|
||||
|
||||
# Determine if message is from creator
|
||||
from_user = msg_data.get('fromUser', {})
|
||||
from_user_id = str(from_user.get('id', ''))
|
||||
is_from_creator = (from_user_id == str(creator_user_id))
|
||||
|
||||
# Parse text
|
||||
text = self._strip_html(msg_data.get('text') or '')
|
||||
|
||||
# Parse timestamp
|
||||
created_at = msg_data.get('createdAt')
|
||||
sent_at = None
|
||||
if created_at:
|
||||
try:
|
||||
sent_at = datetime.fromisoformat(created_at.replace('Z', '+00:00')).isoformat()
|
||||
except (ValueError, TypeError):
|
||||
sent_at = created_at
|
||||
|
||||
# PPV/price info
|
||||
price = msg_data.get('price')
|
||||
is_free = msg_data.get('isFree', True)
|
||||
is_purchased = msg_data.get('isOpened', False) or msg_data.get('canPurchase') is False
|
||||
is_tip = msg_data.get('isTip', False)
|
||||
tip_amount = msg_data.get('tipAmount')
|
||||
|
||||
# Parse media attachments (same structure as posts)
|
||||
attachments = []
|
||||
media_list = msg_data.get('media', []) or []
|
||||
for media_item in media_list:
|
||||
att = self._parse_attachment(media_item)
|
||||
if att:
|
||||
attachments.append(att)
|
||||
|
||||
return Message(
|
||||
message_id=msg_id,
|
||||
platform=self.PLATFORM,
|
||||
service_id=self.SERVICE_ID,
|
||||
creator_id=str(creator_user_id),
|
||||
text=text if text else None,
|
||||
sent_at=sent_at,
|
||||
is_from_creator=is_from_creator,
|
||||
is_tip=bool(is_tip),
|
||||
tip_amount=float(tip_amount) if tip_amount else None,
|
||||
price=float(price) if price else None,
|
||||
is_free=bool(is_free),
|
||||
is_purchased=bool(is_purchased),
|
||||
attachments=attachments,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error parsing message: {e}", 'error')
|
||||
return None
|
||||
109
modules/paid_content/onlyfans_signing.py
Normal file
109
modules/paid_content/onlyfans_signing.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""
|
||||
OnlyFans Request Signing Module
|
||||
|
||||
Handles the dynamic request signing required by the OnlyFans API.
|
||||
Fetches signing rules from the DATAHOARDERS/dynamic-rules GitHub repo
|
||||
and computes SHA-1 based signatures for each API request.
|
||||
|
||||
Isolated module so it's easy to update when OF changes their signing scheme.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Dict, Optional
|
||||
|
||||
import aiohttp
|
||||
|
||||
# Default location of the signing-rules JSON; OnlyFansSigner falls back to it
# when no rules_url is passed to its constructor.
RULES_URL = "https://raw.githubusercontent.com/DATAHOARDERS/dynamic-rules/main/onlyfans.json"
|
||||
|
||||
|
||||
class OnlyFansSigner:
    """
    Computes request signatures for the OnlyFans API.

    Uses dynamic rules fetched from a public GitHub repo (same source as OF-Scraper).
    Rules are cached in memory and refreshed every 6 hours. If a refresh fails
    and an older copy is cached, the older copy keeps being used.
    """

    RULES_TTL = 6 * 3600  # 6 hours

    # Keys sign() reads unconditionally; a payload lacking any of them is unusable
    _REQUIRED_RULE_KEYS = ("static_param", "format", "checksum_indexes", "app_token")

    def __init__(self, rules_url: Optional[str] = None):
        """
        Args:
            rules_url: Location of the rules JSON; defaults to RULES_URL
        """
        self.rules_url = rules_url or RULES_URL
        self._rules: Optional[Dict] = None
        self._rules_fetched_at: float = 0

    @property
    def rules_stale(self) -> bool:
        """Check if cached rules need refreshing"""
        if self._rules is None:
            return True
        return (time.time() - self._rules_fetched_at) > self.RULES_TTL

    async def get_rules(self) -> Dict:
        """
        Fetch signing rules, using cache if fresh.

        Any refresh failure (non-200 response, network error, timeout, invalid
        JSON, payload missing required keys) falls back to the previously
        cached rules when there are any.

        Returns:
            Dict with keys: static_param, format, checksum_indexes,
            app_token and (optionally) checksum_constant

        Raises:
            RuntimeError: No cached rules and the server answered with a
                non-200 status or an unusable payload
            aiohttp.ClientError, asyncio.TimeoutError, ValueError: No cached
                rules and the request itself or JSON decoding failed
        """
        if not self.rules_stale:
            return self._rules

        import asyncio  # local import: only the timeout exception type is needed

        failure = None
        rules = None
        try:
            timeout = aiohttp.ClientTimeout(total=15)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(self.rules_url) as resp:
                    if resp.status != 200:
                        failure = f"HTTP {resp.status}"
                    else:
                        # content_type=None: skip aiohttp's Content-Type check
                        rules = await resp.json(content_type=None)
        except (aiohttp.ClientError, asyncio.TimeoutError, ValueError):
            if self._rules is not None:
                # Use stale cache rather than failing
                return self._rules
            raise

        if failure is None:
            # Never let an unusable payload replace a working cache
            if not isinstance(rules, dict):
                failure = "payload is not a JSON object"
            else:
                missing = [key for key in self._REQUIRED_RULE_KEYS if key not in rules]
                if missing:
                    failure = "missing keys: " + ", ".join(missing)

        if failure is not None:
            if self._rules is not None:
                # Use stale cache rather than failing
                return self._rules
            raise RuntimeError(f"Failed to fetch OF signing rules: {failure}")

        self._rules = rules
        self._rules_fetched_at = time.time()
        return self._rules

    async def sign(self, endpoint_path: str, user_id: str = "0") -> Dict[str, str]:
        """
        Compute signing headers for an OnlyFans API request.

        Args:
            endpoint_path: The full URL path (e.g. "/api2/v2/users/me")
            user_id: The authenticated user's ID (from auth_id cookie)

        Returns:
            Dict with 'sign', 'time', 'app-token' headers
        """
        rules = await self.get_rules()
        # Timestamp in milliseconds (matching OF-Scraper's implementation)
        timestamp = str(round(time.time() * 1000))

        # 1. Build the message to hash
        msg = "\n".join([
            rules["static_param"],
            timestamp,
            endpoint_path,
            str(user_id),
        ])

        # 2. SHA-1 hash; the checksum below works on the ASCII bytes of the hex digest
        sha1_hash = hashlib.sha1(msg.encode("utf-8")).hexdigest()
        sha1_bytes = sha1_hash.encode("ascii")

        # 3. Checksum from indexed byte positions + single constant
        #    (matching OF-Scraper's implementation)
        checksum_indexes = rules["checksum_indexes"]
        checksum_constant = rules.get("checksum_constant", 0)
        checksum = sum(sha1_bytes[i] for i in checksum_indexes) + checksum_constant

        # 4. Build the sign header using the format template
        #    Typical format: "53760:{}:{:x}:69723085"
        sign_value = rules["format"].format(sha1_hash, abs(checksum))

        return {
            "sign": sign_value,
            "time": timestamp,
            "app-token": rules["app_token"],
        }
|
||||
755
modules/paid_content/pornhub_client.py
Normal file
755
modules/paid_content/pornhub_client.py
Normal file
@@ -0,0 +1,755 @@
|
||||
"""
|
||||
Pornhub Client - Fetches creator info and videos using yt-dlp
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html as html_module
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class PornhubClient(LoggingMixin):
|
||||
"""
|
||||
Client for fetching Pornhub creator information and videos using yt-dlp
|
||||
|
||||
Supports:
|
||||
- Pornstar pages (pornhub.com/pornstar/name)
|
||||
- Channel pages (pornhub.com/channels/name)
|
||||
- User pages (pornhub.com/users/name)
|
||||
- Model pages (pornhub.com/model/name)
|
||||
"""
|
||||
|
||||
SERVICE_ID = 'pornhub'
|
||||
PLATFORM = 'pornhub'
|
||||
|
||||
# Quality presets for yt-dlp
|
||||
# Pornhub serves single combined streams with IDs like '1080p', '720p', etc.
|
||||
# NOT separate video+audio streams like YouTube
|
||||
QUALITY_PRESETS = {
|
||||
'best': 'bestvideo+bestaudio/best',
|
||||
'1080p': 'bestvideo[height<=1080]+bestaudio/best[height<=1080]/best',
|
||||
'720p': 'bestvideo[height<=720]+bestaudio/best[height<=720]/best',
|
||||
'480p': 'bestvideo[height<=480]+bestaudio/best[height<=480]/best',
|
||||
}
|
||||
|
||||
def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None):
|
||||
self._init_logger('PaidContent', log_callback, default_module='Pornhub')
|
||||
|
||||
# Find yt-dlp executable
|
||||
self.ytdlp_path = ytdlp_path or self._find_ytdlp()
|
||||
if not self.ytdlp_path:
|
||||
self.log("yt-dlp not found, Pornhub support will be disabled", 'warning')
|
||||
|
||||
# Store database reference for cookie access
|
||||
self.unified_db = unified_db
|
||||
self._cookies_file = None
|
||||
|
||||
# Cache for profile page HTML (avoid re-fetching for avatar/banner/bio)
|
||||
self._profile_page_cache: Dict[str, Optional[str]] = {}
|
||||
|
||||
def _find_ytdlp(self) -> Optional[str]:
|
||||
"""Find yt-dlp executable"""
|
||||
common_paths = [
|
||||
'/opt/media-downloader/venv/bin/yt-dlp',
|
||||
'/usr/local/bin/yt-dlp',
|
||||
'/usr/bin/yt-dlp',
|
||||
'/opt/homebrew/bin/yt-dlp',
|
||||
os.path.expanduser('~/.local/bin/yt-dlp'),
|
||||
]
|
||||
|
||||
for path in common_paths:
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
|
||||
try:
|
||||
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def is_available(self) -> bool:
    """Report whether a yt-dlp path has been set (i.e. is not None)."""
    ytdlp = self.ytdlp_path
    return not (ytdlp is None)
|
||||
|
||||
def _get_cookies_file(self) -> Optional[str]:
|
||||
"""Get path to cookies file, creating it from database if needed"""
|
||||
if self._cookies_file and os.path.exists(self._cookies_file):
|
||||
return self._cookies_file
|
||||
|
||||
if not self.unified_db:
|
||||
return None
|
||||
|
||||
try:
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
data = json.loads(row[0])
|
||||
# Support both {"cookies": [...]} and [...] formats
|
||||
if isinstance(data, dict) and 'cookies' in data:
|
||||
cookies_list = data['cookies']
|
||||
elif isinstance(data, list):
|
||||
cookies_list = data
|
||||
else:
|
||||
cookies_list = []
|
||||
|
||||
if cookies_list:
|
||||
# Write cookies to temp file in Netscape format
|
||||
fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='pornhub_cookies_')
|
||||
with os.fdopen(fd, 'w') as f:
|
||||
f.write("# Netscape HTTP Cookie File\n")
|
||||
for cookie in cookies_list:
|
||||
domain = cookie.get('domain', '')
|
||||
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
|
||||
path = cookie.get('path', '/')
|
||||
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
|
||||
expiry = str(int(cookie.get('expirationDate', 0)))
|
||||
name = cookie.get('name', '')
|
||||
value = cookie.get('value', '')
|
||||
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
|
||||
self.log(f"Loaded {len(cookies_list)} cookies from pornhub scraper", 'debug')
|
||||
return self._cookies_file
|
||||
except Exception as e:
|
||||
self.log(f"Could not load cookies: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
def _get_cookies_list(self) -> Optional[list]:
|
||||
"""Get cookies as a list of dicts for aiohttp requests"""
|
||||
if not self.unified_db:
|
||||
return None
|
||||
|
||||
try:
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
data = json.loads(row[0])
|
||||
if isinstance(data, dict) and 'cookies' in data:
|
||||
return data['cookies']
|
||||
elif isinstance(data, list):
|
||||
return data
|
||||
except Exception as e:
|
||||
self.log(f"Could not load cookies list: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
def _get_base_cmd(self) -> List[str]:
|
||||
"""Get base yt-dlp command with cookies if available"""
|
||||
cmd = [self.ytdlp_path]
|
||||
cookies_file = self._get_cookies_file()
|
||||
if cookies_file:
|
||||
cmd.extend(['--cookies', cookies_file])
|
||||
return cmd
|
||||
|
||||
def cleanup(self):
    """Delete the temporary cookie file (if any) and empty the profile page cache."""
    cookie_path = self._cookies_file
    if cookie_path and os.path.exists(cookie_path):
        try:
            os.unlink(cookie_path)
        except Exception:
            # Best effort: a leftover temp file is not worth failing over
            pass
        self._cookies_file = None
    self._profile_page_cache.clear()
|
||||
|
||||
@staticmethod
|
||||
def extract_creator_id(url: str) -> Optional[Tuple[str, str]]:
|
||||
"""
|
||||
Extract creator type and identifier from Pornhub URL
|
||||
|
||||
Returns:
|
||||
Tuple of (type, id) where type is 'pornstar', 'channels', 'users', or 'model'
|
||||
or None if not a valid Pornhub creator URL
|
||||
"""
|
||||
patterns = [
|
||||
(r'pornhub\.com/pornstar/([a-zA-Z0-9_-]+)', 'pornstar'),
|
||||
(r'pornhub\.com/channels/([a-zA-Z0-9_-]+)', 'channels'),
|
||||
(r'pornhub\.com/users/([a-zA-Z0-9_-]+)', 'users'),
|
||||
(r'pornhub\.com/model/([a-zA-Z0-9_-]+)', 'model'),
|
||||
]
|
||||
|
||||
for pattern, creator_type in patterns:
|
||||
match = re.search(pattern, url)
|
||||
if match:
|
||||
return (creator_type, match.group(1))
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def normalize_creator_url(creator_id: str, creator_type: str = 'pornstar') -> str:
|
||||
"""Convert creator ID to a consistent URL format
|
||||
|
||||
Args:
|
||||
creator_id: Creator name/identifier (may be 'type/name' format)
|
||||
creator_type: Default type if not embedded in creator_id
|
||||
"""
|
||||
# Already a full URL
|
||||
if creator_id.startswith('http://') or creator_id.startswith('https://'):
|
||||
return creator_id
|
||||
|
||||
# Handle 'type/name' format from URL parser
|
||||
if '/' in creator_id:
|
||||
parts = creator_id.split('/', 1)
|
||||
creator_type = parts[0]
|
||||
creator_id = parts[1]
|
||||
|
||||
return f"https://www.pornhub.com/{creator_type}/{creator_id}"
|
||||
|
||||
def _get_listing_url(self, url: str) -> str:
|
||||
"""Get the URL to use for listing videos from a creator page.
|
||||
|
||||
For pornstars and models, append /videos to get the video listing.
|
||||
For channels and users, the base URL already lists videos.
|
||||
"""
|
||||
# Parse out the type
|
||||
parsed = self.extract_creator_id(url)
|
||||
if parsed:
|
||||
creator_type, _ = parsed
|
||||
if creator_type in ('pornstar', 'model'):
|
||||
# Strip any trailing slash and append /videos
|
||||
url = url.rstrip('/')
|
||||
if not url.endswith('/videos'):
|
||||
url = f"{url}/videos"
|
||||
return url
|
||||
|
||||
async def get_creator_info(self, url: str) -> Optional[Dict]:
    """
    Get creator information using yt-dlp + profile page scraping

    The display name is looked up in three steps, stopping at the first hit:
    the profile page heading, the first yt-dlp playlist entry, and finally
    the URL slug.

    Returns dict with creator metadata or None if not found
    """
    if not self.is_available():
        return None

    id_parts = self.extract_creator_id(url)
    creator_type = id_parts[0] if id_parts else 'pornstar'

    # Step 1: <h1 itemprop="name">Name</h1> inside the nameSubscribe div
    display_name = None
    try:
        profile_html = await self.get_profile_page(url)
        if profile_html:
            heading = re.search(
                r'<div class="nameSubscribe">.*?<h1[^>]*>\s*(.+?)\s*</h1>',
                profile_html,
                re.DOTALL,
            )
            if heading:
                display_name = html_module.unescape(heading.group(1).strip())
                self.log(f"Found creator name from profile page: {display_name}", 'debug')
    except Exception as e:
        self.log(f"Could not scrape creator name: {e}", 'debug')

    # Step 2: ask yt-dlp for the first playlist entry
    if not display_name:
        try:
            listing_url = self._get_listing_url(url)
            probe_cmd = [
                *self._get_base_cmd(),
                '--no-warnings',
                '--flat-playlist',
                '-j',
                '--playlist-items', '1',
                listing_url,
            ]

            proc = await asyncio.create_subprocess_exec(
                *probe_cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            out, _err = await proc.communicate()

            if proc.returncode == 0:
                for raw_line in out.decode('utf-8', errors='replace').strip().split('\n'):
                    if not raw_line:
                        continue
                    try:
                        entry = json.loads(raw_line)
                    except json.JSONDecodeError:
                        continue
                    playlist_title = entry.get('playlist_title') or ''
                    display_name = (
                        entry.get('channel')
                        or entry.get('uploader')
                        or playlist_title.replace(' - Videos', '')
                        or None
                    )
                    if display_name:
                        display_name = html_module.unescape(display_name)
                        break
        except Exception as e:
            self.log(f"yt-dlp creator info failed: {e}", 'debug')

    # Step 3: derive a name from the URL slug
    if not display_name and id_parts:
        display_name = id_parts[1].replace('-', ' ').title()

    if not display_name:
        return None

    return {
        'creator_id': id_parts[1] if id_parts else None,
        'creator_name': display_name,
        'creator_url': url,
        'creator_type': creator_type,
    }
|
||||
|
||||
async def get_creator_videos(self, url: str, since_date: str = None,
                             max_videos: int = None,
                             progress_callback=None) -> List[Dict]:
    """
    Get all videos from a creator page using --flat-playlist for speed.

    Args:
        url: Pornhub creator URL
        since_date: Only keep videos published on or after this date (ISO
            format; only the date part is compared). Entries without a
            parseable upload date are always kept.
        max_videos: Maximum number of videos to fetch
        progress_callback: Callback function(count) for progress updates

    Returns:
        List of video metadata dicts; empty when yt-dlp is unavailable or fails
    """
    if not self.is_available():
        return []

    try:
        listing_url = self._get_listing_url(url)

        # Use --flat-playlist for fast listing (avoids per-video HTTP requests)
        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            '--socket-timeout', '30',
            '--retries', '3',
            listing_url
        ]

        if max_videos:
            cmd.extend(['--playlist-items', f'1:{max_videos}'])

        self.log(f"Fetching videos from: {url}", 'info')

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        stdout, stderr = await result.communicate()

        if result.returncode != 0:
            error = stderr.decode('utf-8', errors='replace')
            self.log(f"Failed to get creator videos: {error}", 'warning')
            return []

        # 'YYYY-MM-DD' prefix of the cut-off; ISO dates compare correctly as strings
        since_day = str(since_date)[:10] if since_date else None

        videos = []
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue

            # Skip anything that is not a video entry object
            if not isinstance(data, dict) or data.get('_type') == 'playlist':
                continue

            video_id = data.get('id')
            if not video_id:
                continue

            # Flat-playlist doesn't provide upload_date for Pornhub, but check anyway
            upload_date = data.get('upload_date')
            upload_day = None
            if upload_date:
                try:
                    upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
                    upload_day = upload_date[:10]
                except (ValueError, TypeError):
                    # Unparseable: keep the raw value, skip date filtering
                    pass

            if since_day and upload_day and upload_day < since_day:
                continue

            # Decode HTML entities in title (flat-playlist returns them encoded);
            # a null title falls back to the placeholder as well
            title = html_module.unescape(data.get('title') or f'Video {video_id}')

            # Build video URL
            video_url = (data.get('webpage_url') or data.get('url')
                         or f"https://www.pornhub.com/view_video.php?viewkey={video_id}")

            videos.append({
                'video_id': video_id,
                'title': title,
                'description': data.get('description', ''),
                'upload_date': upload_date,
                'duration': data.get('duration'),
                'view_count': data.get('view_count'),
                'thumbnail': data.get('thumbnail'),
                'url': video_url,
            })

            if progress_callback:
                progress_callback(len(videos))

            if max_videos and len(videos) >= max_videos:
                break

        self.log(f"Found {len(videos)} videos", 'info')
        return videos

    except Exception as e:
        self.log(f"Error getting creator videos: {e}", 'error')
        return []
|
||||
|
||||
async def download_video(self, video_url: str, output_dir: Path, quality: str = 'best',
                         progress_callback=None) -> Dict:
    """
    Download a video

    Args:
        video_url: Pornhub video URL
        output_dir: Directory to save the video (created if missing)
        quality: Quality preset; unknown values fall back to 'best'
        progress_callback: Accepted for interface compatibility; this method
            does not currently invoke it

    Returns:
        Dict with 'success' and, on success, file info and video metadata;
        on failure an 'error' message
    """
    if not self.is_available():
        return {'success': False, 'error': 'yt-dlp not available'}

    try:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s')

        format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])

        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '-f', format_str,
            '-o', output_template,
            '--print-json',
            '--no-playlist',
            '--user-agent', 'Mozilla/5.0',
            '--referer', 'https://www.pornhub.com/',
            '--merge-output-format', 'mp4',
            '--concurrent-fragments', '4',
            '--no-part',
            '--retries', '20',
            video_url
        ]

        self.log(f"Downloading video: {video_url}", 'debug')

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        stdout, stderr = await result.communicate()

        if result.returncode != 0:
            # Map the common yt-dlp failures to short messages
            error_msg = stderr.decode('utf-8', errors='replace').strip()
            if 'Video unavailable' in error_msg or 'not available' in error_msg:
                error_msg = 'Video unavailable or private'
            elif 'premium' in error_msg.lower():
                error_msg = 'Video requires premium access'
            elif len(error_msg) > 200:
                error_msg = error_msg[:200] + '...'

            return {'success': False, 'error': error_msg}

        # Parse output JSON: the first stdout line that decodes to an object.
        # Lines that happen to be valid JSON scalars/arrays are ignored, since
        # they cannot be the info dict.
        video_info = None
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            try:
                candidate = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(candidate, dict):
                video_info = candidate
                break

        if not video_info:
            # Try to find downloaded file: newest .mp4 in the output directory
            files = list(output_dir.glob('*.mp4'))
            if files:
                file_path = max(files, key=lambda f: f.stat().st_mtime)
                return {
                    'success': True,
                    'file_path': str(file_path),
                    'filename': file_path.name,
                    'file_size': file_path.stat().st_size
                }
            return {'success': False, 'error': 'Could not find downloaded file'}

        file_path = video_info.get('_filename') or video_info.get('filename')
        if file_path:
            file_path = Path(file_path)

        return {
            'success': True,
            'file_path': str(file_path) if file_path else None,
            'filename': file_path.name if file_path else None,
            # Prefer the size on disk; fall back to the size yt-dlp reported
            'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'),
            'title': video_info.get('title'),
            'duration': video_info.get('duration'),
            'video_id': video_info.get('id'),
            'upload_date': video_info.get('upload_date'),
            'timestamp': video_info.get('timestamp'),
            'thumbnail': video_info.get('thumbnail'),
        }

    except Exception as e:
        self.log(f"Error downloading video: {e}", 'error')
        return {'success': False, 'error': str(e)}
|
||||
|
||||
async def get_profile_page(self, url: str) -> Optional[str]:
    """Fetch profile page HTML via aiohttp (with cookies if available).

    A trailing '/videos' is stripped from the URL first. Both successful
    fetches and failures (stored as None) are cached per URL, so a page is
    requested at most once until the cache is cleared.
    """
    profile_url = re.sub(r'/videos/?$', '', url)

    cache = self._profile_page_cache
    if profile_url in cache:
        return cache[profile_url]

    try:
        import aiohttp

        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }

        # Flatten stored cookies to name -> value, ignoring unnamed entries
        stored_cookies = self._get_cookies_list() or []
        cookie_jar = {
            entry.get('name', ''): entry.get('value', '')
            for entry in stored_cookies
            if entry.get('name', '')
        }

        async with aiohttp.ClientSession(cookies=cookie_jar) as session:
            async with session.get(
                profile_url,
                headers=request_headers,
                timeout=aiohttp.ClientTimeout(total=15),
            ) as resp:
                if resp.status == 200:
                    body = await resp.text()
                    cache[profile_url] = body
                    return body

    except Exception as e:
        self.log(f"Could not fetch profile page: {e}", 'debug')

    # Remember the failure so the same page is not requested again
    cache[profile_url] = None
    return None
|
||||
|
||||
async def get_profile_image(self, url: str) -> Optional[str]:
    """Find the avatar URL on a profile page.

    The ``<img id="getAvatar">`` element is preferred; otherwise the
    ``og:image`` meta tag is used (either attribute order).

    Args:
        url: Profile URL, passed on to ``get_profile_page``.

    Returns:
        The image URL, or ``None`` when the page is unavailable, nothing
        matches, or an error occurs (errors are logged at debug level).
    """
    og_image_patterns = (
        r'<meta\s+property="og:image"\s+content="([^"]+)"',
        r'<meta\s+content="([^"]+)"\s+property="og:image"',
    )

    try:
        markup = await self.get_profile_page(url)
        if markup:
            # Look for avatar image: <img id="getAvatar" src="...">
            avatar = re.search(r'<img[^>]*id=["\']getAvatar["\'][^>]*src=["\']([^"\']+)["\']', markup)
            if avatar is not None:
                self.log("Found Pornhub profile avatar", 'debug')
                return avatar.group(1)

            # Fall back to the og:image meta tag
            for pattern in og_image_patterns:
                hit = re.search(pattern, markup)
                if hit is not None:
                    return hit.group(1)

    except Exception as e:
        self.log(f"Could not fetch profile image: {e}", 'debug')

    return None
|
||||
|
||||
async def get_profile_bio(self, url: str) -> Optional[str]:
    """Extract the bio/about text from a profile page.

    Two layouts are tried in order: the ``aboutMeSection`` block
    (``<section class="aboutMeSection ..."><div class="title">About Name</div>
    <div>Bio text</div></section>``) and then the ``biographyAbout`` block.
    Markup inside the captured text is removed and HTML entities are decoded.

    Args:
        url: Profile URL, passed on to ``get_profile_page``.

    Returns:
        The bio as plain text, or ``None`` when the page is unavailable, no
        non-empty bio is found, or an error occurs (logged at debug level).
    """
    layouts = (
        (
            r'<section\s+class="aboutMeSection[^"]*"[^>]*>.*?<div class="title">[^<]*</div>\s*<div>\s*(.*?)\s*</div>',
            "Found Pornhub profile bio",
        ),
        (
            r'class="biographyAbout[^"]*"[^>]*>.*?<div class="content[^"]*">(.*?)</div>',
            "Found Pornhub profile bio (fallback)",
        ),
    )

    try:
        markup = await self.get_profile_page(url)
        if not markup:
            return None

        for pattern, found_message in layouts:
            hit = re.search(pattern, markup, re.DOTALL)
            if not hit:
                continue
            plain = re.sub(r'<[^>]+>', '', hit.group(1)).strip()
            # An empty capture falls through to the next layout.
            if plain:
                self.log(found_message, 'debug')
                return html_module.unescape(plain)

    except Exception as e:
        self.log(f"Could not fetch profile bio: {e}", 'debug')

    return None
|
||||
|
||||
async def get_profile_banner(self, url: str) -> Optional[str]:
    """Find the banner/cover image URL on a profile page.

    The ``<img id="coverPictureDefault">`` element is preferred; otherwise
    the first ``<img>`` directly inside ``<div class="coverImage">`` is used.

    Args:
        url: Profile URL, passed on to ``get_profile_page``.

    Returns:
        The image URL, or ``None`` when the page is unavailable, nothing
        matches, or an error occurs (logged at debug level).
    """
    lookups = (
        (
            r'<img[^>]*id=["\']coverPictureDefault["\'][^>]*src=["\']([^"\']+)["\']',
            0,
            "Found Pornhub profile banner",
        ),
        (
            r'<div class="coverImage">\s*<img[^>]*src=["\']([^"\']+)["\']',
            re.DOTALL,
            "Found Pornhub profile banner (div)",
        ),
    )

    try:
        markup = await self.get_profile_page(url)
        if not markup:
            return None

        for pattern, flags, found_message in lookups:
            hit = re.search(pattern, markup, flags)
            if hit:
                self.log(found_message, 'debug')
                return hit.group(1)

    except Exception as e:
        self.log(f"Could not fetch profile banner: {e}", 'debug')

    return None
|
||||
|
||||
async def get_profile_info(self, url: str) -> Optional[Dict]:
    """Collect the labelled ``infoPiece`` entries of a profile page.

    Each ``<div class="infoPiece"><span>Label:</span> value</div>`` becomes
    one entry (Gender, Birth Place, Height, etc.). Markup is removed from
    both parts; the label loses trailing colons and is lower-cased with
    spaces replaced by underscores to form the key.

    Args:
        url: Profile URL, passed on to ``get_profile_page``.

    Returns:
        Dict of key -> value text, or ``None`` when the page is unavailable
        or no non-empty entry is found.
    """
    markup = await self.get_profile_page(url)
    if not markup:
        return None

    collected = {}
    piece_pattern = r'<div class="infoPiece">\s*<span>\s*(.*?)\s*</span>\s*(.*?)\s*</div>'
    for piece in re.finditer(piece_pattern, markup, re.DOTALL):
        raw_label, raw_value = piece.group(1), piece.group(2)
        heading = re.sub(r'<[^>]+>', '', raw_label).strip().rstrip(':')
        text = re.sub(r'<[^>]+>', '', raw_value).strip()
        if not (heading and text):
            continue
        collected[heading.lower().replace(' ', '_')] = text

    return collected or None
|
||||
|
||||
async def get_joined_date(self, url: str) -> Optional[str]:
    """Derive a joined/career-start year from the profile info.

    Only the ``career_start_and_end`` entry is consulted (pornstar pages show
    e.g. "Career Start and End: 2011 to Present"); its leading four digits
    are returned, so "2011 to Present" yields "2011".

    Args:
        url: Profile URL, passed on to ``get_profile_info``.

    Returns:
        The four-digit year as a string, or ``None`` when there is no profile
        info, no career entry, no leading year, or an error occurs.
    """
    try:
        details = await self.get_profile_info(url)
        if not details:
            return None

        career_span = details.get('career_start_and_end')
        if not career_span:
            # User/model pages might not have career info but could have other dates
            return None

        leading_year = re.match(r'(\d{4})', career_span)
        return leading_year.group(1) if leading_year else None
    except Exception as e:
        self.log(f"Could not get joined date: {e}", 'debug')
        return None
|
||||
|
||||
async def get_creator(self, url: str) -> Optional[Creator]:
    """Build a ``Creator`` object for the profile at ``url``.

    The creator id uses the ``type/name`` form derived from the URL by
    ``extract_creator_id``; when that yields nothing, the ``creator_id``
    reported by ``get_creator_info`` is used (empty string if absent).

    Args:
        url: Creator URL.

    Returns:
        A ``Creator`` for the ``pornhub`` service, or ``None`` when
        ``get_creator_info`` returns nothing.
    """
    details = await self.get_creator_info(url)
    if not details:
        return None

    url_parts = self.extract_creator_id(url)
    if url_parts:
        creator_id = f"{url_parts[0]}/{url_parts[1]}"
    else:
        creator_id = details.get('creator_id', '')

    # The original author notes the profile page is expected to be cached
    # already by get_creator_info, making this lookup cheap.
    avatar_url = await self.get_profile_image(url)

    return Creator(
        creator_id=creator_id,
        service_id='pornhub',
        platform='pornhub',
        username=details.get('creator_name', 'Unknown'),
        display_name=details.get('creator_name'),
        profile_image_url=avatar_url,
    )
|
||||
|
||||
async def get_posts(self, url: str, since_date: str = None,
                    max_videos: int = None, progress_callback=None) -> List[Post]:
    """Return a creator's videos as ``Post`` objects, one attachment each.

    Args:
        url: Creator URL.
        since_date: Passed through to ``get_creator_videos``.
        max_videos: Passed through to ``get_creator_videos``.
        progress_callback: Passed through to ``get_creator_videos``.

    Returns:
        One ``Post`` per video, in the order ``get_creator_videos`` returned
        them. The creator id is ``type/name`` from the URL, or an empty
        string when ``extract_creator_id`` yields nothing.
    """
    videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)

    url_parts = self.extract_creator_id(url)
    creator_id = ''
    if url_parts:
        creator_id = f"{url_parts[0]}/{url_parts[1]}"

    def _as_post(video):
        # The single attachment points at the video page URL for later download.
        clip = Attachment(
            name=f"{video['title']}.mp4",
            file_type='video',
            extension='.mp4',
            server_path=video['url'],
            download_url=video['url'],
            duration=video.get('duration'),
        )
        return Post(
            post_id=video['video_id'],
            service_id='pornhub',
            platform='pornhub',
            creator_id=creator_id,
            title=video['title'],
            content=video.get('description') or video['title'],
            published_at=video.get('upload_date'),
            attachments=[clip],
        )

    return [_as_post(video) for video in videos]
|
||||
678
modules/paid_content/reddit_client.py
Normal file
678
modules/paid_content/reddit_client.py
Normal file
@@ -0,0 +1,678 @@
|
||||
"""
|
||||
Reddit Client for Paid Content - Uses gallery-dl to fetch subreddit posts and download media.
|
||||
|
||||
Adapts the gallery-dl + metadata parsing pattern from reddit_community_monitor.py
|
||||
to produce Post/Attachment objects for the paid content system.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class RedditClient(LoggingMixin):
    """
    Fetches subreddit content for the paid content system through gallery-dl.

    gallery-dl downloads the media while it fetches, so attachments produced
    by this client already point at local files; the sync handler is
    responsible for moving them to their final location.
    """

    SERVICE_ID = 'reddit'
    PLATFORM = 'reddit'

    def __init__(self, unified_db=None, log_callback=None):
        """Set up logging and locate the gallery-dl executable.

        Args:
            unified_db: Database handle used to look up stored Reddit cookies
                (optional).
            log_callback: Forwarded to ``_init_logger``.
        """
        self._init_logger('PaidContent', log_callback, default_module='Reddit')
        self.unified_db = unified_db

        # Prefer whatever is on PATH; otherwise use the bundled virtualenv copy.
        located = shutil.which('gallery-dl')
        if not located:
            located = '/opt/media-downloader/venv/bin/gallery-dl'
        self.gallery_dl_path = located
|
||||
|
||||
def get_subreddit_info(self, subreddit: str) -> Optional[Dict]:
    """Look up a subreddit through Reddit's public ``about.json`` endpoint.

    Args:
        subreddit: Subreddit name (without r/).

    Returns:
        On success, a dict with ``creator_id``, ``creator_name``,
        ``display_name``, ``bio``, ``joined_date``, ``profile_image_url`` and
        ``banner_image_url``. On an HTTP error other than 404 (403 is logged
        as private/quarantined) only ``creator_id`` and ``creator_name`` are
        returned so that the sync can verify later. ``None`` on HTTP 404 or
        any other failure.
    """
    import urllib.request
    import urllib.error

    def _clean_image_url(raw):
        # Drop the query string and decode the HTML-escaped ampersands Reddit
        # puts in image URLs; an empty result is reported as None.
        cleaned = (raw or '').split('?')[0].replace('&amp;', '&')
        return cleaned or None

    try:
        # Quick check via Reddit's public JSON endpoint
        url = f'https://www.reddit.com/r/{subreddit}/about.json'
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            data = json.loads(resp.read().decode())

        sub_data = data.get('data', {})
        display_name = sub_data.get('display_name', subreddit)
        title = sub_data.get('title', '')

        # community_icon is higher res, icon_img is the fallback
        icon_url = _clean_image_url(sub_data.get('community_icon') or sub_data.get('icon_img'))

        # banner_background_image is the main one
        banner_url = _clean_image_url(
            sub_data.get('banner_background_image') or sub_data.get('mobile_banner_image')
        )

        # Build bio from title + public description + subscriber count
        public_desc = sub_data.get('public_description', '')
        bio_parts = []
        if title:
            bio_parts.append(title)
        if public_desc and public_desc != title:
            bio_parts.append(public_desc)
        subscribers = sub_data.get('subscribers')
        if subscribers:
            bio_parts.append(f"{subscribers:,} subscribers")
        bio = ' — '.join(bio_parts) if bio_parts else None

        # Subreddit creation date; a malformed timestamp only costs this field
        created_utc = sub_data.get('created_utc')
        joined_date = None
        if created_utc:
            try:
                joined_date = datetime.fromtimestamp(created_utc, tz=timezone.utc).strftime('%Y-%m-%d')
            except (ValueError, OSError, OverflowError, TypeError):
                pass

        # Use the subreddit title as display name (e.g. "Reddit Pics"),
        # falling back to r/name format if there is no title
        friendly_name = title if title else f'r/{display_name}'

        return {
            'creator_id': display_name.lower(),
            'creator_name': f'r/{display_name}',
            'display_name': friendly_name,
            'bio': bio,
            'joined_date': joined_date,
            'profile_image_url': icon_url,
            'banner_image_url': banner_url,
        }

    except urllib.error.HTTPError as e:
        if e.code == 404:
            self.log(f"Subreddit r/{subreddit} not found (404)", 'warning')
            return None

        if e.code == 403:
            # Private/quarantined — still exists, return basic info
            self.log(f"Subreddit r/{subreddit} is private/quarantined", 'warning')
        else:
            # Return basic info and let sync verify
            self.log(f"HTTP {e.code} checking r/{subreddit}", 'warning')
        return {
            'creator_id': subreddit.lower(),
            'creator_name': f'r/{subreddit}',
        }
    except Exception as e:
        self.log(f"Error getting subreddit info for r/{subreddit}: {e}", 'error')
        return None
|
||||
|
||||
def get_posts(self, subreddit: str, since_date: str = None, max_posts: int = 0,
              progress_callback=None) -> tuple:
    """Fetch posts and download media from a subreddit using gallery-dl.

    Args:
        subreddit: Subreddit name (without r/)
        since_date: ISO date string; skip posts older than this
        max_posts: Maximum posts to fetch (0 = unlimited)
        progress_callback: Optional callable(dl_count, skip_count, total_seen),
            forwarded to ``run_gallery_dl`` for live progress updates

    Returns:
        Tuple of (List[Post], temp_dir_path) — caller must clean up temp_dir
        when done moving files. Returns ([], None) when nothing was
        downloaded, nothing could be grouped, or an error occurred; the
        temp directory is removed in those cases.
    """
    temp_dir = tempfile.mkdtemp(prefix=f'reddit_paid_{subreddit}_')

    try:
        # run_gallery_dl() returns only counters; the downloaded paths are
        # delivered through batch_callback, so collect them here.
        downloaded: List[Path] = []
        self.run_gallery_dl(subreddit, temp_dir, since_date, max_posts,
                            progress_callback=progress_callback,
                            batch_callback=downloaded.extend)

        if not downloaded:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None

        # Group files by post using metadata sidecars
        grouped = self._group_files_by_post(downloaded, temp_dir, subreddit)

        if not grouped:
            shutil.rmtree(temp_dir, ignore_errors=True)
            return [], None

        posts = []
        for post_id, post_data in grouped.items():
            attachments = []
            for file_path in post_data['files']:
                ext = file_path.suffix.lower()
                attachments.append(Attachment(
                    name=file_path.name,
                    file_type=self._detect_file_type(ext),
                    extension=ext,
                    server_path=str(file_path),  # temp path, will be moved
                    download_url=None,  # Already downloaded
                    file_size=file_path.stat().st_size if file_path.exists() else None,
                ))

            if not attachments:
                continue

            posts.append(Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=subreddit.lower(),
                title=post_data.get('title'),
                content=post_data.get('title'),
                published_at=post_data.get('date'),
                attachments=attachments,
            ))

        self.log(f"Parsed {len(posts)} posts with {sum(len(p.attachments) for p in posts)} attachments from r/{subreddit}", 'info')
        return posts, temp_dir

    except Exception as e:
        self.log(f"Error fetching posts from r/{subreddit}: {e}", 'error')
        shutil.rmtree(temp_dir, ignore_errors=True)
        return [], None
|
||||
|
||||
def run_gallery_dl(self, subreddit: str, temp_dir: str,
                   since_date: str = None, max_posts: int = 0,
                   progress_callback=None, batch_callback=None,
                   batch_size: int = 50) -> dict:
    """Run gallery-dl to download media from a subreddit.

    Streams stdout line-by-line. Calls progress_callback for status updates
    and batch_callback with lists of new file paths for incremental processing.

    Args:
        subreddit: Subreddit name (without r/)
        temp_dir: Directory for gallery-dl to download into
        since_date: Date string whose first 10 characters are YYYY-MM-DD;
            used as a lower bound on the post date. Ignored (with a warning)
            when it does not parse.
        max_posts: Maximum number of items to fetch (0 = unlimited)
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time

    # Use a separate download archive for paid content reddit
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
    ]

    # REST API mode to avoid shared OAuth rate limits
    cmd.extend(['-o', 'extractor.reddit.api=rest'])

    # Limit posts (0 = unlimited)
    if max_posts > 0:
        cmd.extend(['--range', f'1-{max_posts}'])

    # Date filtering. The cutoff ends up inside an expression that gallery-dl
    # evaluates, so it is only embedded once it has parsed as a plain date.
    if since_date:
        try:
            cutoff = since_date[:10]  # YYYY-MM-DD
            datetime.strptime(cutoff, '%Y-%m-%d')
        except (ValueError, IndexError):
            self.log(f"Ignoring invalid since_date {since_date!r} for r/{subreddit}", 'warning')
        else:
            cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])

    cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')

    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])

    self.log(f"Running gallery-dl for r/{subreddit}", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')

    dl_count = 0
    skip_count = 0
    pending_files = []

    try:
        # stderr goes to a temp file instead of a pipe: nothing reads it while
        # the download runs, and an undrained pipe would block gallery-dl once
        # the pipe buffer fills up.
        with tempfile.TemporaryFile(mode='w+', encoding='utf-8', errors='replace') as stderr_file:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_file, text=True
            )
            try:
                start_time = time.time()
                timeout_secs = 7200  # 2 hours

                # NOTE: the timeout is only checked between output lines,
                # because readline() blocks until gallery-dl prints something.
                while True:
                    if time.time() - start_time > timeout_secs:
                        proc.kill()
                        self.log(f"gallery-dl timed out for r/{subreddit}", 'error')
                        break

                    line = proc.stdout.readline()
                    if not line:
                        if proc.poll() is not None:
                            break
                        # stdout is at EOF but the process is still exiting:
                        # wait briefly instead of spinning.
                        time.sleep(0.05)
                        continue

                    line = line.strip()
                    if not line:
                        continue

                    if line.startswith('# '):
                        # Skipped file (already in archive)
                        skip_count += 1
                    else:
                        # Downloaded file — gallery-dl prints the full path
                        dl_count += 1
                        file_path = Path(line)
                        if file_path.exists() and not file_path.name.endswith('.json'):
                            pending_files.append(file_path)

                    total = dl_count + skip_count
                    if progress_callback and total % 5 == 0:
                        progress_callback(dl_count, skip_count, total)

                    # Flush batch for processing
                    if batch_callback and len(pending_files) >= batch_size:
                        batch_callback(list(pending_files))
                        pending_files.clear()

                proc.wait()
            finally:
                # Also reached when a callback raises: never leave gallery-dl
                # running or its pipe open.
                if proc.poll() is None:
                    proc.kill()
                    proc.wait()
                proc.stdout.close()

            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()

            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)

            returncode = proc.returncode
            if returncode not in (None, 0, 1, 4, 5):
                stderr_file.seek(0)
                stderr = stderr_file.read(500)
                self.log(f"gallery-dl returned code {returncode} for r/{subreddit}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr}", 'debug')

    except Exception as e:
        self.log(f"gallery-dl failed for r/{subreddit}: {e}", 'error')

    self.log(f"gallery-dl done for r/{subreddit}: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
|
||||
|
||||
def _group_files_by_post(self, files: List[Path], temp_dir: str,
|
||||
subreddit: str) -> Dict[str, Dict]:
|
||||
"""Group downloaded files by Reddit post ID using metadata JSON sidecars.
|
||||
|
||||
Adapted from reddit_community_monitor.py:_group_files_by_post
|
||||
|
||||
Returns:
|
||||
Dict mapping reddit_post_id -> {
|
||||
'files': [Path],
|
||||
'title': str,
|
||||
'date': str,
|
||||
'source_url': str
|
||||
}
|
||||
"""
|
||||
posts: Dict[str, Dict] = {}
|
||||
|
||||
for file_path in files:
|
||||
# Look for matching metadata JSON sidecar
|
||||
json_path = file_path.with_suffix(file_path.suffix + '.json')
|
||||
if not json_path.exists():
|
||||
json_path = file_path.with_suffix('.json')
|
||||
|
||||
metadata = {}
|
||||
if json_path.exists():
|
||||
try:
|
||||
with open(json_path, 'r', encoding='utf-8') as f:
|
||||
metadata = json.load(f)
|
||||
except (json.JSONDecodeError, Exception) as e:
|
||||
self.log(f"Failed to parse metadata for {file_path.name}: {e}", 'debug')
|
||||
|
||||
# Extract Reddit post ID
|
||||
reddit_post_id = None
|
||||
for key in ('id', 'reddit_id', 'parent_id'):
|
||||
if key in metadata:
|
||||
reddit_post_id = str(metadata[key])
|
||||
break
|
||||
|
||||
if not reddit_post_id:
|
||||
# Filename-based fallback: subreddit_postid_num.ext
|
||||
parts = file_path.stem.split('_')
|
||||
if len(parts) >= 2:
|
||||
reddit_post_id = parts[-2] if len(parts) >= 3 else parts[-1]
|
||||
else:
|
||||
reddit_post_id = file_path.stem
|
||||
|
||||
# Extract post date
|
||||
post_date = None
|
||||
if 'date' in metadata:
|
||||
date_val = metadata['date']
|
||||
if isinstance(date_val, str):
|
||||
for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
|
||||
try:
|
||||
utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=timezone.utc)
|
||||
post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
if not post_date:
|
||||
post_date = date_val
|
||||
elif isinstance(date_val, (int, float)):
|
||||
try:
|
||||
post_date = datetime.fromtimestamp(date_val, tz=timezone.utc).isoformat()
|
||||
except (ValueError, OSError):
|
||||
pass
|
||||
|
||||
if not post_date and 'created_utc' in metadata:
|
||||
try:
|
||||
post_date = datetime.fromtimestamp(metadata['created_utc'], tz=timezone.utc).isoformat()
|
||||
except (ValueError, OSError):
|
||||
pass
|
||||
|
||||
if not post_date:
|
||||
post_date = datetime.now().isoformat()
|
||||
|
||||
title = metadata.get('title', metadata.get('description', ''))
|
||||
sub = metadata.get('subreddit', subreddit)
|
||||
source_url = f"https://www.reddit.com/r/{sub}/comments/{reddit_post_id}" if sub else ''
|
||||
|
||||
if reddit_post_id not in posts:
|
||||
posts[reddit_post_id] = {
|
||||
'files': [],
|
||||
'title': title,
|
||||
'date': post_date,
|
||||
'source_url': source_url,
|
||||
}
|
||||
|
||||
posts[reddit_post_id]['files'].append(file_path)
|
||||
|
||||
return posts
|
||||
|
||||
def _get_cookies_file(self) -> Optional[str]:
|
||||
"""Get Reddit cookies JSON from the scrapers table if configured."""
|
||||
if not self.unified_db:
|
||||
return None
|
||||
|
||||
try:
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT cookies FROM scrapers WHERE name = 'reddit' AND cookies IS NOT NULL"
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
return row[0]
|
||||
except Exception as e:
|
||||
self.log(f"Could not load Reddit cookies: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
|
||||
"""Convert JSON cookies array to Netscape cookie file format."""
|
||||
try:
|
||||
cookies = json.loads(cookies_json)
|
||||
if not isinstance(cookies, list):
|
||||
return False
|
||||
|
||||
with open(output_path, 'w') as f:
|
||||
f.write("# Netscape HTTP Cookie File\n")
|
||||
f.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")
|
||||
for cookie in cookies:
|
||||
domain = cookie.get('domain', '')
|
||||
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
|
||||
path = cookie.get('path', '/')
|
||||
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
|
||||
expires = cookie.get('expirationDate', cookie.get('expiry', cookie.get('expires', 0)))
|
||||
if expires is None:
|
||||
expires = 0
|
||||
expires = str(int(float(expires)))
|
||||
name = cookie.get('name', '')
|
||||
value = cookie.get('value', '')
|
||||
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
self.log(f"Failed to write Netscape cookie file: {e}", 'error')
|
||||
return False
|
||||
|
||||
def get_pullpush_post_ids(self, subreddit: str, after_ts: int = 0,
                          before_ts: int = None,
                          progress_callback=None) -> List[Dict]:
    """Fetch all historical post IDs for a subreddit from the Pullpush (Pushshift) API.

    Paginates through the full archive using created_utc ascending order.
    Rate-limited to ~1 request per 2 seconds. An HTTP 429 answer is retried
    after a 5 second pause, at most five times in a row; any other request
    failure ends the pagination and whatever was collected so far is returned.

    Args:
        subreddit: Subreddit name (without r/)
        after_ts: Unix timestamp to start from (0 = beginning of time)
        before_ts: Unix timestamp to stop at (None = no upper limit)
        progress_callback: Optional callable(fetched_count, message)

    Returns:
        List of dicts: [{id, title, created_utc, url, is_gallery, selftext}, ...]
    """
    import time
    import urllib.error
    import urllib.parse
    import urllib.request

    base_url = 'https://api.pullpush.io/reddit/search/submission/'
    max_rate_limit_retries = 5  # consecutive 429 answers tolerated per page
    all_posts = []
    current_after = after_ts
    page = 0
    rate_limit_hits = 0

    while True:
        query = {
            'subreddit': subreddit,
            'size': 100,
            'sort': 'asc',
            'sort_type': 'created_utc',
            'after': current_after,
        }
        if before_ts is not None:
            query['before'] = before_ts

        url = f'{base_url}?{urllib.parse.urlencode(query)}'

        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (compatible; media-downloader/1.0)'
            })
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.loads(resp.read().decode())
        except urllib.error.HTTPError as e:
            if e.code == 429:
                rate_limit_hits += 1
                if rate_limit_hits > max_rate_limit_retries:
                    # Without a cap a persistently throttled API would keep
                    # this loop running forever.
                    self.log(
                        f"Pullpush still rate limited after {max_rate_limit_retries} retries "
                        f"for r/{subreddit}, giving up",
                        'error'
                    )
                    break
                self.log("Pullpush rate limited, waiting 5s...", 'warning')
                time.sleep(5)
                continue
            self.log(f"Pullpush HTTP {e.code} for r/{subreddit}: {e}", 'error')
            break
        except Exception as e:
            self.log(f"Pullpush request failed for r/{subreddit}: {e}", 'error')
            break

        # Only successfully fetched pages are counted.
        rate_limit_hits = 0
        page += 1

        posts = data.get('data', [])
        if not posts:
            break

        for post in posts:
            all_posts.append({
                'id': post.get('id', ''),
                'title': post.get('title', ''),
                'created_utc': post.get('created_utc', 0),
                'url': post.get('url', ''),
                'is_gallery': post.get('is_gallery', False),
                'selftext': post.get('selftext', ''),
            })

        last_ts = posts[-1].get('created_utc', 0)

        if progress_callback:
            progress_callback(len(all_posts),
                              f"Fetched {len(all_posts)} post IDs (page {page})")

        # Handle stuck pagination — same timestamp repeating
        if last_ts <= current_after:
            current_after = last_ts + 1
        else:
            current_after = last_ts

        # If we got fewer than 100, we've reached the end
        if len(posts) < 100:
            break

        # Rate limit: 2s between requests
        time.sleep(2)

    self.log(f"Pullpush: fetched {len(all_posts)} total post IDs for r/{subreddit}", 'info')
    return all_posts
|
||||
|
||||
def run_gallery_dl_urls(self, urls_file: str, temp_dir: str,
                        progress_callback=None, batch_callback=None,
                        batch_size: int = 50) -> dict:
    """Run gallery-dl with --input-file to download specific Reddit post URLs.

    Same streaming/batch pattern as run_gallery_dl() but reads URLs from a file
    instead of scraping a subreddit listing. Unlike run_gallery_dl(),
    progress_callback is invoked for every output line.

    Args:
        urls_file: Path to file containing one URL per line
        temp_dir: Directory for gallery-dl to download into
        progress_callback: Called with (dl_count, skip_count, total_seen)
        batch_callback: Called with (new_files: List[Path]) every batch_size files
        batch_size: How many files to accumulate before calling batch_callback

    Returns:
        Dict with dl_count, skip_count, total.
    """
    import time

    # Same archive as normal Reddit paid content sync
    archive_dir = '/opt/media-downloader/data/cache'
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_paid_gallery_dl_archive.db')

    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
        '-o', 'extractor.reddit.api=rest',
        '--input-file', urls_file,
    ]

    # Check for Reddit cookies file
    cookies_file = self._get_cookies_file()
    if cookies_file:
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_file, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])

    self.log(f"Running gallery-dl with input file ({urls_file})", 'info')
    self.log(f"Command: {' '.join(cmd)}", 'debug')

    dl_count = 0
    skip_count = 0
    pending_files = []

    try:
        # stderr goes to a temp file instead of a pipe: nothing reads it while
        # the download runs, and an undrained pipe would block gallery-dl once
        # the pipe buffer fills up.
        with tempfile.TemporaryFile(mode='w+', encoding='utf-8', errors='replace') as stderr_file:
            proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_file, text=True
            )
            try:
                start_time = time.time()
                timeout_secs = 14400  # 4 hours for backfill (can be large)

                # NOTE: the timeout is only checked between output lines,
                # because readline() blocks until gallery-dl prints something.
                while True:
                    if time.time() - start_time > timeout_secs:
                        proc.kill()
                        self.log("gallery-dl backfill timed out", 'error')
                        break

                    line = proc.stdout.readline()
                    if not line:
                        if proc.poll() is not None:
                            break
                        # stdout is at EOF but the process is still exiting:
                        # wait briefly instead of spinning.
                        time.sleep(0.05)
                        continue

                    line = line.strip()
                    if not line:
                        continue

                    if line.startswith('# '):
                        # Skipped file (already in archive)
                        skip_count += 1
                    else:
                        # Downloaded file — gallery-dl prints the full path
                        dl_count += 1
                        file_path = Path(line)
                        if file_path.exists() and not file_path.name.endswith('.json'):
                            pending_files.append(file_path)

                    total = dl_count + skip_count
                    if progress_callback:
                        progress_callback(dl_count, skip_count, total)

                    if batch_callback and len(pending_files) >= batch_size:
                        batch_callback(list(pending_files))
                        pending_files.clear()

                proc.wait()
            finally:
                # Also reached when a callback raises: never leave gallery-dl
                # running or its pipe open.
                if proc.poll() is None:
                    proc.kill()
                    proc.wait()
                proc.stdout.close()

            # Final batch
            if batch_callback and pending_files:
                batch_callback(list(pending_files))
                pending_files.clear()

            if progress_callback:
                progress_callback(dl_count, skip_count, dl_count + skip_count)

            returncode = proc.returncode
            if returncode not in (None, 0, 1, 4, 5):
                stderr_file.seek(0)
                stderr = stderr_file.read(500)
                self.log(f"gallery-dl backfill returned code {returncode}", 'warning')
                if stderr:
                    self.log(f"gallery-dl stderr: {stderr}", 'debug')

    except Exception as e:
        self.log(f"gallery-dl backfill failed: {e}", 'error')

    self.log(f"gallery-dl backfill done: {dl_count} downloaded, {skip_count} skipped", 'info')
    return {'dl_count': dl_count, 'skip_count': skip_count, 'total': dl_count + skip_count}
|
||||
|
||||
@staticmethod
|
||||
def _detect_file_type(ext: str) -> str:
|
||||
"""Detect file type from extension."""
|
||||
ext = ext.lower().lstrip('.')
|
||||
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
|
||||
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg'}
|
||||
|
||||
if ext in image_exts:
|
||||
return 'image'
|
||||
elif ext in video_exts:
|
||||
return 'video'
|
||||
return 'unknown'
|
||||
9843
modules/paid_content/scraper.py
Normal file
9843
modules/paid_content/scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
259
modules/paid_content/snapchat_client.py
Normal file
259
modules/paid_content/snapchat_client.py
Normal file
@@ -0,0 +1,259 @@
|
||||
"""
|
||||
Snapchat Client for Paid Content - Wraps SnapchatClientDownloader for paid content system.
|
||||
|
||||
Maps spotlights and highlights to the Post/Attachment model used by the paid content scraper.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class SnapchatPaidContentClient(LoggingMixin):
    """
    Client for fetching Snapchat creator content via the existing SnapchatClientDownloader.

    Each story/spotlight/highlight collection maps to one Post with snaps as Attachments.
    """

    SERVICE_ID = 'snapchat'
    PLATFORM = 'snapchat'

    def __init__(self, unified_db=None, log_callback=None):
        """Set up logging and remember the database handle.

        Args:
            unified_db: Database object handed to the underlying downloader.
            log_callback: Optional callback passed to the logging mixin.
        """
        self._init_logger('PaidContent', log_callback, default_module='Snapchat')
        self.unified_db = unified_db
        self._downloader = None  # created on first use by _get_downloader()

    def _get_downloader(self):
        """Lazy-init the underlying SnapchatClientDownloader."""
        if self._downloader is None:
            # Imported here so the module is only loaded when actually needed.
            from modules.snapchat_client_module import SnapchatClientDownloader
            self._downloader = SnapchatClientDownloader(
                show_progress=False,
                use_database=False,
                log_callback=self.log_callback,
                unified_db=self.unified_db,
            )
        return self._downloader

    def get_creator_info(self, username: str) -> Optional[Dict]:
        """Get creator information from the profile page ``__NEXT_DATA__``.

        Args:
            username: Snapchat username (without @).

        Returns:
            Dict with ``creator_id`` and ``creator_name``; when the profile page
            could be fetched it also carries ``profile_image_url`` (may be None).
            The username is used as the name whenever no display name is found.
        """
        downloader = self._get_downloader()

        profile_url = f"https://story.snapchat.com/@{username}"
        html = downloader._fetch_page(profile_url)
        if not html:
            return {'creator_id': username, 'creator_name': username}

        data = downloader._extract_next_data(html)
        display_name = username
        avatar_url = None

        if data:
            # JSON nulls come back as None, so every level is guarded with "or {}".
            props = (data.get('props') or {}).get('pageProps') or {}

            # userProfile uses a $case/userInfo wrapper
            user_info = (props.get('userProfile') or {}).get('userInfo') or {}
            if user_info:
                name = (user_info.get('displayName') or '').strip()
                if name:
                    display_name = name

                # Bitmoji 3D avatar URL (best quality)
                bitmoji = user_info.get('bitmoji3d') or {}
                if isinstance(bitmoji, dict):
                    avatar_url = bitmoji.get('avatarUrl') or bitmoji.get('url')

            # linkPreview OG images as avatar (preview/square.jpeg — good quality)
            if not avatar_url:
                link_preview = props.get('linkPreview') or {}
                for img_key in ('facebookImage', 'twitterImage'):
                    img = link_preview.get(img_key) or {}
                    if isinstance(img, dict) and img.get('url'):
                        avatar_url = img['url']
                        break

            # pageMetadata.pageTitle sometimes has the display name
            if display_name == username:
                page_title = (props.get('pageMetadata') or {}).get('pageTitle') or ''
                # Format: "DisplayName (@username) | Snapchat..."
                if '(@' in page_title:
                    name_part = page_title.split('(@')[0].strip()
                    if name_part:
                        display_name = name_part

        return {
            'creator_id': username,
            'creator_name': display_name,
            'profile_image_url': avatar_url,
        }

    def get_creator(self, username: str) -> Optional[Creator]:
        """Get Creator model for a Snapchat user.

        Returns None only if ``get_creator_info`` yields nothing.
        """
        info = self.get_creator_info(username)
        if not info:
            return None

        return Creator(
            creator_id=username,
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            # NOTE(review): username is filled with the display name here, not
            # the Snapchat handle — kept as-is; confirm this is intentional.
            username=info.get('creator_name', username),
            display_name=info.get('creator_name'),
            profile_image_url=info.get('profile_image_url'),
        )

    @staticmethod
    def _parse_cutoff(since_date: Optional[str]) -> Optional[datetime]:
        """Parse an ISO date/datetime string into a naive cutoff datetime.

        Timezone-aware inputs (``Z`` or any ``+HH:MM`` offset) are converted to
        UTC and stripped of tzinfo so the result is always naive and can be
        compared with the snap timestamps. Unparseable input yields None.
        """
        if not since_date:
            return None

        from datetime import timezone

        try:
            if 'T' in since_date:
                parsed = datetime.fromisoformat(since_date.replace('Z', '+00:00'))
                if parsed.tzinfo is not None:
                    parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
                return parsed
            return datetime.strptime(since_date[:10], '%Y-%m-%d')
        except (ValueError, IndexError):
            return None

    def get_posts(self, username: str, since_date: str = None) -> List[Post]:
        """Fetch stories, highlights and spotlights as Post objects.

        Args:
            username: Snapchat username (without @)
            since_date: ISO date string; collections whose newest snap is older
                than this are skipped

        Returns:
            List of Post objects (one per story/highlight/spotlight collection)
        """
        downloader = self._get_downloader()
        cutoff_dt = self._parse_cutoff(since_date)

        # Discover content from profile (spotlights, highlights, stories)
        profile_content = downloader.get_profile_content(username) or {}
        spotlights = profile_content.get('spotlights') or []
        highlights = profile_content.get('highlight_collections') or []
        story_collection = profile_content.get('story_collection')

        self.log(f"Found {len(spotlights)} spotlights, "
                 f"{len(highlights)} highlights, "
                 f"{'stories' if story_collection else 'no stories'} "
                 f"for @{username}", 'info')

        posts = []

        # Stories and highlights come inline with the profile page — no extra
        # HTTP requests are needed for them.
        inline_collections = []
        if story_collection and story_collection.snaps:
            inline_collections.append(story_collection)
        inline_collections.extend(highlights)

        for collection in inline_collections:
            post = self._collection_to_post(collection, username, cutoff_dt)
            if post and post.attachments:
                posts.append(post)

        # Spotlights still require a per-URL fetch for full metadata
        for url in spotlights:
            collection = downloader.get_spotlight_metadata(url)
            if not collection:
                continue
            post = self._collection_to_post(collection, username, cutoff_dt)
            if post and post.attachments:
                posts.append(post)

        self.log(f"Mapped {len(posts)} posts with attachments for @{username}", 'info')
        return posts

    def _collection_to_post(self, collection, username: str, cutoff_dt=None) -> Optional[Post]:
        """Convert a SnapCollection to a Post with Attachments.

        Returns None when the collection has no snaps, when every snap is older
        than ``cutoff_dt``, or when no snap has a media URL.
        """
        if not collection.snaps:
            return None

        # Use the earliest snap timestamp as the post date
        timestamps = [s.timestamp for s in collection.snaps if s.timestamp]
        published_at = min(timestamps).strftime('%Y-%m-%d') if timestamps else None

        # Skip if all snaps are older than cutoff
        if cutoff_dt and timestamps and max(timestamps) < cutoff_dt:
            return None

        attachments = []
        for snap in collection.snaps:
            if not snap.media_url:
                continue

            # Determine extension from media type
            ext = '.mp4' if snap.media_type == 'video' else '.jpg'
            name = f"{snap.media_id}{ext}" if snap.media_id else f"snap_{snap.index}{ext}"

            attachments.append(Attachment(
                name=name,
                file_type=snap.media_type,
                extension=ext,
                server_path=snap.media_url,
                download_url=snap.media_url,
                width=snap.width if snap.width else None,
                height=snap.height if snap.height else None,
                # duration_ms is in milliseconds; the attachment stores whole seconds
                duration=snap.duration_ms // 1000 if snap.duration_ms else None,
            ))

        if not attachments:
            return None

        # Build content/title from collection metadata
        title = collection.title or None
        content = collection.title if collection.title else None

        # Tag with the collection type in title case, e.g. "Spotlight" or "Highlight"
        tag_name = collection.collection_type.title()

        return Post(
            post_id=collection.collection_id,
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            creator_id=username,
            title=title,
            content=content,
            published_at=published_at,
            attachments=attachments,
            auto_tags=[tag_name],
        )

    def download_snap(self, media_url: str, output_path: str) -> bool:
        """Download a single snap file via the downloader's HTTP session.

        Args:
            media_url: Direct URL to the media file
            output_path: Local path to save the file

        Returns:
            True if download succeeded
        """
        import os

        downloader = self._get_downloader()
        session = downloader._get_session()

        try:
            # URLs lifted out of HTML can carry HTML-escaped ampersands; only
            # that one entity is decoded so query parameters stay untouched.
            url = media_url.replace('&amp;', '&')
            resp = session.get(url, timeout=60)
            if resp.status_code == 200 and len(resp.content) > 0:
                # A bare filename has no directory part; makedirs('') would raise.
                parent_dir = os.path.dirname(output_path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)
                with open(output_path, 'wb') as f:
                    f.write(resp.content)
                return True

            self.log(f"Download failed: HTTP {resp.status_code}, size={len(resp.content)}", 'warning')
            return False
        except Exception as e:
            self.log(f"Download error: {e}", 'error')
            return False
|
||||
508
modules/paid_content/soundgasm_client.py
Normal file
508
modules/paid_content/soundgasm_client.py
Normal file
@@ -0,0 +1,508 @@
|
||||
"""
|
||||
Soundgasm + Liltsome Archive Client for Paid Content
|
||||
|
||||
Handles:
|
||||
- Soundgasm profile scraping (no auth/Cloudflare needed)
|
||||
- Liltsome archive (liltsome.yerf.org) as supplementary source
|
||||
- Bracket tag parsing from audio titles: [F4M] [Whisper] etc.
|
||||
- Direct HTTP audio downloads (.m4a)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from urllib.parse import quote
|
||||
|
||||
import aiohttp
|
||||
import aiofiles
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bracket tag helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def parse_bracket_tags(title: str) -> Tuple[str, List[str]]:
    """Extract [bracket] tags from a title, normalize, return (clean_title, tags).

    Tags are stripped, lower-cased and de-duplicated while keeping their first
    occurrence order. The clean title is the input with every run of bracket
    tags (and the whitespace around it) collapsed to a single space, then
    trimmed.
    """
    tags = re.findall(r'\[([^\]]+)\]', title)

    # Replace a whole run of adjacent tags at once; replacing them one by one
    # would leave a doubled space for "Title [a] [b] more".
    clean_title = re.sub(r'\s*(?:\[[^\]]+\]\s*)+', ' ', title).strip()

    normalized: List[str] = []
    seen: Set[str] = set()
    for tag in tags:
        tag_lower = tag.strip().lower()
        if tag_lower and tag_lower not in seen:
            seen.add(tag_lower)
            normalized.append(tag_lower)
    return clean_title, normalized
|
||||
|
||||
|
||||
def format_tag_display(tag_lower: str) -> str:
    """Format a normalized lowercase tag for display.

    Gender tags (f4m, m4f, f4a …) → uppercase.
    Everything else → title case, with each word capitalized.

    ``str.title()`` is deliberately not used: it treats an apostrophe as a word
    boundary and would turn ``don't`` into ``Don'T``.
    """
    if re.match(r'^[a-z]+\d[a-z]+$', tag_lower):
        return tag_lower.upper()

    # A "word" is a run of letters, optionally followed by an apostrophe and
    # more letters (don't, girlfriend's); digits, hyphens etc. separate words.
    return re.sub(
        r"[^\W\d_]+(?:['’][^\W\d_]+)?",
        lambda match: match.group(0).capitalize(),
        tag_lower,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SoundgasmClient
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class SoundgasmClient(LoggingMixin):
    """Client for fetching audio from Soundgasm and the Liltsome archive."""

    SERVICE_ID = 'soundgasm'
    PLATFORM = 'soundgasm'

    SOUNDGASM_BASE = 'https://soundgasm.net'
    LILTSOME_BASE = 'https://liltsome.yerf.org'
    LILTSOME_LIBRARY_URL = f'{LILTSOME_BASE}/data/library.json'
    LILTSOME_CACHE_PATH = Path('/opt/media-downloader/data/liltsome_library.json')
    LILTSOME_ETAG_PATH = Path('/opt/media-downloader/data/liltsome_library.json.etag')

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    def __init__(self, log_callback=None):
        """Set up logging and the in-memory Liltsome cache slot."""
        self._init_logger('PaidContent', log_callback, default_module='Soundgasm')
        self._liltsome_data: Optional[Dict] = None  # cached in-memory per sync run

    # ------------------------------------------------------------------
    # Small internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _unescape(text: Optional[str]) -> Optional[str]:
        """Decode HTML entities in text scraped from a page (None passes through)."""
        if text is None:
            return None
        # Imported locally: several methods use a local variable named "html".
        from html import unescape
        return unescape(text)

    def _discard_partial(self, path: Path) -> None:
        """Best-effort removal of a temporary download file."""
        try:
            path.unlink(missing_ok=True)
        except OSError as e:
            self.log(f"Could not remove partial file {path}: {e}", 'debug')

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_profile_info(self, username: str) -> Optional[Dict]:
        """Return basic profile info (post count) from Soundgasm and/or Liltsome.

        Returns:
            ``{'username', 'post_count', 'source'}`` or None when neither
            source knows the user. ``post_count`` is the larger of the two
            sources' counts; ``source`` is the first source that answered.
        """
        post_count = 0
        source = None

        # Try Soundgasm profile page first
        try:
            sg_posts = await self._fetch_soundgasm_profile(username)
            if sg_posts is not None:
                post_count = len(sg_posts)
                source = 'soundgasm'
        except Exception as e:
            self.log(f"Soundgasm profile fetch failed for {username}: {e}", 'debug')

        # Also check Liltsome for additional posts
        try:
            lt_entries = await self._get_liltsome_entries(username)
            if lt_entries:
                post_count = max(post_count, len(lt_entries))
                if source is None:
                    source = 'liltsome'
        except Exception as e:
            self.log(f"Liltsome lookup failed for {username}: {e}", 'debug')

        if post_count == 0 and source is None:
            return None

        return {
            'username': username,
            'post_count': post_count,
            'source': source,
        }

    async def get_posts(self, username: str, known_post_ids: Optional[Set[str]] = None,
                        progress_callback=None) -> List[Post]:
        """Fetch posts from both Soundgasm and Liltsome, deduplicating by post_id.

        Args:
            username: Creator name on Soundgasm / artist id in Liltsome.
            known_post_ids: Post ids to leave out of the result.
            progress_callback: Optional callable receiving the running number
                of collected posts after each source has been processed.

        Returns:
            New posts, Soundgasm results first, then Liltsome results.
        """
        posts: List[Post] = []
        seen_ids: Set[str] = set(known_post_ids or ())

        def collect(candidates: List[Post]) -> None:
            # Keep only posts whose id has not been seen in this run.
            for candidate in candidates:
                if candidate.post_id not in seen_ids:
                    seen_ids.add(candidate.post_id)
                    posts.append(candidate)

        # 1. Soundgasm (may fail if account deleted — that's OK)
        try:
            sg_posts = await self._fetch_soundgasm_posts(username, seen_ids)
            collect(sg_posts)
            self.log(f"Soundgasm: {len(sg_posts)} new posts for {username}", 'info')
        except Exception as e:
            self.log(f"Soundgasm fetch failed for {username} (account may be deleted): {e}", 'warning')

        if progress_callback:
            progress_callback(len(posts))

        # 2. Liltsome archive (always)
        try:
            lt_posts = await self._fetch_liltsome_posts(username, seen_ids)
            collect(lt_posts)
            self.log(f"Liltsome: {len(lt_posts)} new posts for {username}", 'info')
        except Exception as e:
            self.log(f"Liltsome fetch failed for {username}: {e}", 'warning')

        if progress_callback:
            progress_callback(len(posts))

        return posts

    async def download_audio(self, download_url: str, output_path: Path) -> Dict:
        """Download an audio file via direct HTTP GET.

        The body is streamed into ``<output_path>.part`` and moved into place
        only once complete, so a failed transfer never leaves a truncated file
        at ``output_path``.

        Returns:
            ``{'success': True, 'file_path', 'file_size'}`` on success,
            ``{'success': False, 'error'}`` otherwise.
        """
        output_path = Path(output_path)
        part_path = output_path.with_name(output_path.name + '.part')

        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)

            total = 0
            timeout = aiohttp.ClientTimeout(total=300)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(download_url, headers=self.HEADERS) as resp:
                    if resp.status != 200:
                        return {'success': False, 'error': f'HTTP {resp.status}'}

                    async with aiofiles.open(str(part_path), 'wb') as f:
                        async for chunk in resp.content.iter_chunked(65536):
                            await f.write(chunk)
                            total += len(chunk)

            os.replace(part_path, output_path)
            return {
                'success': True,
                'file_path': str(output_path),
                'file_size': total,
            }

        except Exception as e:
            self.log(f"Download failed for {download_url}: {e}", 'error')
            return {'success': False, 'error': str(e)}
        finally:
            # No-op after a successful rename; cleans up on error/cancellation.
            self._discard_partial(part_path)

    # ------------------------------------------------------------------
    # Soundgasm scraping
    # ------------------------------------------------------------------

    async def _fetch_soundgasm_profile(self, username: str) -> Optional[List[Dict]]:
        """Scrape the Soundgasm profile page, return list of {slug, title}.

        Returns None when the page does not answer with HTTP 200. Each slug is
        reported once even if the page links to it several times.
        """
        url = f'{self.SOUNDGASM_BASE}/u/{username}'
        timeout = aiohttp.ClientTimeout(total=30)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url, headers=self.HEADERS) as resp:
                if resp.status == 404:
                    return None
                if resp.status != 200:
                    self.log(f"Soundgasm profile returned {resp.status}", 'warning')
                    return None
                html = await resp.text()

        # Pattern: <a href="https://soundgasm.net/u/{username}/{slug}">title</a>
        # (profile page uses absolute URLs)
        link_pattern = re.compile(
            r'<a\s+href="(?:https?://soundgasm\.net)?/u/' + re.escape(username) + r'/([^"]+)"[^>]*>\s*([^<]+)',
            re.IGNORECASE,
        )

        entries: List[Dict] = []
        seen_slugs: Set[str] = set()
        for m in link_pattern.finditer(html):
            slug = m.group(1).strip()
            if slug in seen_slugs:
                continue
            seen_slugs.add(slug)
            entries.append({'slug': slug, 'title': self._unescape(m.group(2).strip())})

        return entries

    async def _fetch_soundgasm_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
        """Fetch full post details from Soundgasm for new posts.

        Slugs already in ``seen_ids`` are skipped without fetching their detail
        page. Entries whose detail page cannot be read or has no audio URL are
        left out; errors for a single entry are logged and do not abort the run.
        """
        profile_entries = await self._fetch_soundgasm_profile(username)
        if not profile_entries:
            return []

        posts: List[Post] = []
        timeout = aiohttp.ClientTimeout(total=30)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            for entry in profile_entries:
                slug = entry['slug']
                if slug in seen_ids:
                    continue

                try:
                    detail = await self._fetch_soundgasm_detail(session, username, slug)
                    if detail is None:
                        continue

                    audio_url = detail.get('audio_url')
                    if not audio_url:
                        continue

                    title_raw = detail.get('title', entry.get('title', slug))
                    clean_title, tags = parse_bracket_tags(title_raw)
                    description = detail.get('description', '')

                    # Determine extension from the last URL path segment
                    last_segment = audio_url.split('?')[0].split('/')[-1]
                    ext = '.m4a'
                    if '.' in last_segment:
                        ext = '.' + last_segment.rsplit('.', 1)[1]

                    attachment = Attachment(
                        name=f"{slug}{ext}",
                        file_type='audio',
                        extension=ext.lstrip('.'),
                        server_path=f'/u/{username}/{slug}',
                        download_url=audio_url,
                    )

                    posts.append(Post(
                        post_id=slug,
                        service_id=self.SERVICE_ID,
                        platform=self.PLATFORM,
                        creator_id=username,
                        title=clean_title or None,
                        content=description or None,
                        published_at=None,  # Soundgasm has no dates
                        attachments=[attachment],
                        auto_tags=tags,
                    ))

                except Exception as e:
                    self.log(f"Error fetching Soundgasm detail for {slug}: {e}", 'debug')

        return posts

    async def _fetch_soundgasm_detail(self, session: aiohttp.ClientSession,
                                      username: str, slug: str) -> Optional[Dict]:
        """Fetch a single Soundgasm audio detail page and extract metadata.

        Returns:
            ``{'title', 'description', 'audio_url'}`` or None when the page
            does not return HTTP 200 or contains no audio URL. ``title`` falls
            back to the slug; ``description`` may be None.
        """
        url = f'{self.SOUNDGASM_BASE}/u/{username}/{slug}'

        async with session.get(url, headers=self.HEADERS) as resp:
            if resp.status != 200:
                return None
            html = await resp.text()

        # Audio URL: m4a: "https://..." — without it the page is of no use
        audio_match = re.search(r'm4a:\s*"([^"]+)"', html)
        audio_url = audio_match.group(1) if audio_match else None
        if not audio_url:
            return None

        # Title: <div aria-label="title"...>Title Text</div>
        # or from the page title tag
        title = None
        title_match = re.search(r'aria-label="title"[^>]*>([^<]+)', html)
        if title_match:
            title = title_match.group(1).strip()
        if not title:
            title_match = re.search(r'<title>([^<]+)</title>', html, re.IGNORECASE)
            if title_match:
                title = title_match.group(1).strip()
                # Remove " - Soundgasm" suffix if present
                title = re.sub(r'\s*[-–—]\s*Soundgasm.*$', '', title, flags=re.IGNORECASE).strip()
        if title:
            title = self._unescape(title)

        # Description: <div class="jp-description">...</div>
        description = None
        desc_match = re.search(r'class="jp-description"[^>]*>(.*?)</div>', html, re.DOTALL)
        if desc_match:
            # <br> becomes a newline, remaining tags are dropped, and only then
            # are entities decoded so escaped "&lt;b&gt;" text is not stripped.
            description = re.sub(r'<br\s*/?>', '\n', desc_match.group(1))
            description = re.sub(r'<[^>]+>', '', description).strip()
            description = self._unescape(description)

        return {
            'title': title or slug,
            'description': description,
            'audio_url': audio_url,
        }

    # ------------------------------------------------------------------
    # Liltsome archive
    # ------------------------------------------------------------------

    async def _ensure_liltsome_cache(self) -> bool:
        """Download/refresh the Liltsome library.json using ETag-based invalidation.

        The new copy is streamed to a ``.part`` file and swapped in with an
        atomic rename, so an interrupted download can never replace a good
        cache with a truncated one that the stored ETag would later report as
        fresh.

        Returns True if cache is available (fresh or existing), False otherwise.
        """
        etag_file = self.LILTSOME_ETAG_PATH
        cache_file = self.LILTSOME_CACHE_PATH
        part_file = cache_file.with_name(cache_file.name + '.part')

        stored_etag = None
        if etag_file.exists():
            try:
                stored_etag = etag_file.read_text().strip()
            except Exception as e:
                # An unreadable ETag only forces a re-download.
                self.log(f"Could not read Liltsome ETag file: {e}", 'debug')

        timeout = aiohttp.ClientTimeout(total=600)  # 131MB can take a while
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                # HEAD request to check ETag
                async with session.head(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                    if resp.status != 200:
                        self.log(f"Liltsome HEAD returned {resp.status}", 'warning')
                        return cache_file.exists()

                    remote_etag = resp.headers.get('ETag', '').strip()

                if stored_etag and remote_etag and stored_etag == remote_etag and cache_file.exists():
                    self.log("Liltsome cache is fresh (ETag match)", 'debug')
                    return True

                # Download the full library
                self.log("Downloading Liltsome library.json (this may take a while)...", 'info')
                async with session.get(self.LILTSOME_LIBRARY_URL, headers=self.HEADERS) as resp:
                    if resp.status != 200:
                        self.log(f"Liltsome GET returned {resp.status}", 'warning')
                        return cache_file.exists()

                    cache_file.parent.mkdir(parents=True, exist_ok=True)
                    async with aiofiles.open(str(part_file), 'wb') as f:
                        async for chunk in resp.content.iter_chunked(262144):
                            await f.write(chunk)

                    new_etag = resp.headers.get('ETag', remote_etag or '').strip()

            # Only a fully written download replaces the existing cache.
            os.replace(part_file, cache_file)

            if new_etag:
                etag_file.write_text(new_etag)

            self.log("Liltsome library.json downloaded successfully", 'info')
            self._liltsome_data = None  # force re-parse
            return True

        except Exception as e:
            self.log(f"Failed to refresh Liltsome cache: {e}", 'warning')
            return cache_file.exists()
        finally:
            # No-op after a successful rename; cleans up on error/cancellation.
            self._discard_partial(part_file)

    async def _load_liltsome_data(self) -> Optional[Dict]:
        """Load and cache the Liltsome library data in memory.

        Returns None when the cache file is missing or cannot be parsed.
        """
        if self._liltsome_data is not None:
            return self._liltsome_data

        cache_file = self.LILTSOME_CACHE_PATH
        if not cache_file.exists():
            return None

        try:
            # Parsing is blocking, so it runs in a worker thread.
            data = await asyncio.to_thread(self._read_liltsome_json, cache_file)
        except Exception as e:
            self.log(f"Failed to parse Liltsome library.json: {e}", 'error')
            return None

        self._liltsome_data = data
        return data

    @staticmethod
    def _read_liltsome_json(path: Path) -> Dict:
        """Read and parse the Liltsome JSON file (blocking, run in thread)."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    async def _get_liltsome_entries(self, username: str) -> Optional[List[Dict]]:
        """Find artist entries in Liltsome data by username (case-insensitive).

        library.json structure: {"artists": [{"id": "name", "files": {"audio": [...]}}]}

        Returns:
            The artist's audio entries, an empty list when the artist exists
            but has no usable ``files`` mapping, or None when the library is
            unavailable or the artist is not listed.
        """
        await self._ensure_liltsome_cache()
        data = await self._load_liltsome_data()
        if not data:
            return None

        username_lower = username.lower()

        # Top-level is {"artists": [...]}
        artists = data.get('artists', []) if isinstance(data, dict) else data

        for artist in artists:
            if not isinstance(artist, dict):
                continue
            artist_id = str(artist.get('id', '')).lower()
            artist_name = str(artist.get('name', '')).lower()
            if username_lower not in (artist_id, artist_name):
                continue

            # Audio entries are in files.audio
            files = artist.get('files', {})
            if isinstance(files, dict):
                return files.get('audio', [])
            return []

        return None

    async def _fetch_liltsome_posts(self, username: str, seen_ids: Set[str]) -> List[Post]:
        """Convert Liltsome archive entries to Post objects.

        Post ids are ``liltsome-<sanitized filename>`` (or the raw path when
        the entry has no filename); entries whose id is in ``seen_ids`` are
        skipped.
        """
        entries = await self._get_liltsome_entries(username)
        if not entries:
            return []

        posts: List[Post] = []
        for entry in entries:
            if not isinstance(entry, dict):
                continue

            filename = entry.get('filename') or ''
            path = entry.get('path') or ''
            title_raw = entry.get('title', filename)
            if not isinstance(title_raw, str):
                title_raw = filename
            entry_tags = entry.get('tags') or []  # already lowercase in Liltsome
            file_size = entry.get('size')

            duration = None
            if isinstance(entry.get('metadata'), dict):
                duration = entry['metadata'].get('duration')

            # Build post_id: prefix with liltsome- to avoid collision
            sanitized_name = re.sub(r'[^a-zA-Z0-9_.-]', '_', filename) if filename else path
            post_id = f'liltsome-{sanitized_name}'

            if post_id in seen_ids:
                continue

            # Parse bracket tags from title for clean_title
            clean_title, title_tags = parse_bracket_tags(title_raw)

            # Merge: use Liltsome's pre-parsed tags + any extra from title
            all_tags_set: Set[str] = set()
            all_tags: List[str] = []
            for t in entry_tags:
                if not isinstance(t, str):
                    continue
                t_lower = t.strip().lower()
                if t_lower and t_lower not in all_tags_set:
                    all_tags_set.add(t_lower)
                    all_tags.append(t_lower)
            for t in title_tags:
                if t not in all_tags_set:
                    all_tags_set.add(t)
                    all_tags.append(t)

            # Build download URL
            download_url = f'{self.LILTSOME_BASE}/audio_files/{quote(path, safe="/")}' if path else None

            # Determine extension
            ext = 'm4a'
            if filename and '.' in filename:
                ext = filename.rsplit('.', 1)[1].lower()
            elif path and '.' in path:
                ext = path.rsplit('.', 1)[1].lower()

            # ext is lower-cased, so compare case-insensitively; otherwise
            # "Foo.M4A" would be renamed to "Foo.M4A.m4a".
            base_name = filename or sanitized_name
            if base_name.lower().endswith(f'.{ext}'):
                attachment_name = base_name
            else:
                attachment_name = f"{sanitized_name}.{ext}"

            attachment = Attachment(
                name=attachment_name,
                file_type='audio',
                extension=ext,
                server_path=path or filename,
                download_url=download_url,
                file_size=file_size,
                duration=duration,
            )

            posts.append(Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=username,
                title=clean_title or None,
                content=None,
                published_at=None,
                attachments=[attachment],
                auto_tags=all_tags,
            ))

        return posts
|
||||
827
modules/paid_content/tiktok_client.py
Normal file
827
modules/paid_content/tiktok_client.py
Normal file
@@ -0,0 +1,827 @@
|
||||
"""
|
||||
TikTok Client for Paid Content - Uses yt-dlp for listing and gallery-dl for downloading
|
||||
|
||||
Adapts the hybrid approach from modules/tiktok_module.py into the paid content client pattern.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html as html_module
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class TikTokClient(LoggingMixin):
|
||||
"""
|
||||
Client for fetching TikTok creator information and videos.
|
||||
|
||||
Uses yt-dlp for listing (fast flat-playlist) and gallery-dl for downloading
|
||||
(handles carousels/slideshows properly).
|
||||
"""
|
||||
|
||||
SERVICE_ID = 'tiktok'
|
||||
PLATFORM = 'tiktok'
|
||||
|
||||
def __init__(self, unified_db=None, log_callback=None):
    """Set up logging, locate the external tools and reset per-instance state.

    Args:
        unified_db: Database object later used to load TikTok cookies.
        log_callback: Optional callback passed to the logging mixin.
    """
    self._init_logger('PaidContent', log_callback, default_module='TikTok')

    # Per-instance state
    self.unified_db = unified_db
    self._cookies_file = None
    self._last_pinned_posts = {}

    # External tools: yt-dlp lists videos, gallery-dl downloads them
    self.ytdlp_path = self._find_executable('yt-dlp')
    self.gallery_dl_path = self._find_executable('gallery-dl')

    missing_tool_warnings = (
        (self.ytdlp_path, "yt-dlp not found, TikTok listing will be disabled"),
        (self.gallery_dl_path, "gallery-dl not found, TikTok downloading will be disabled"),
    )
    for tool_path, warning in missing_tool_warnings:
        if not tool_path:
            self.log(warning, 'warning')
|
||||
|
||||
def _find_executable(self, name: str) -> Optional[str]:
|
||||
"""Find an executable by name"""
|
||||
common_paths = [
|
||||
f'/opt/media-downloader/venv/bin/{name}',
|
||||
f'/usr/local/bin/{name}',
|
||||
f'/usr/bin/{name}',
|
||||
f'/opt/homebrew/bin/{name}',
|
||||
os.path.expanduser(f'~/.local/bin/{name}'),
|
||||
]
|
||||
|
||||
for path in common_paths:
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
|
||||
try:
|
||||
result = subprocess.run(['which', name], capture_output=True, text=True)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def is_available(self) -> bool:
    """Report whether both external tools (yt-dlp and gallery-dl) were located."""
    required_tools = (self.ytdlp_path, self.gallery_dl_path)
    return all(tool is not None for tool in required_tools)
|
||||
|
||||
def cleanup(self):
    """Clean up any temporary files.

    Deletes the temporary cookies file, if one was created, and forgets its
    path so the instance no longer points at a file that is gone. Failure to
    delete is logged at debug level and never raised (best-effort cleanup).
    """
    cookies_path = self._cookies_file
    if not cookies_path:
        return

    try:
        os.unlink(cookies_path)
    except FileNotFoundError:
        pass  # already removed — nothing left to clean up
    except OSError as e:
        # Keep the path so a later cleanup() can retry.
        self.log(f"Could not remove TikTok cookies file {cookies_path}: {e}", 'debug')
        return

    self._cookies_file = None
|
||||
|
||||
def _get_cookies_file(self) -> Optional[str]:
    """Return the path of a Netscape-format cookies file for TikTok.

    A previously generated file is reused while it still exists on disk.
    Otherwise cookies are read from the ``scrapers`` table (ids ``tiktok``
    then ``tiktok_client``) and written to a fresh temporary file. Both
    ``{"cookies": [...]}`` and bare ``[...]`` JSON layouts are accepted.

    Returns:
        Path to the cookies file, or ``None`` when no database is
        configured, no cookies are stored, or loading fails (the failure
        is logged at debug level).
    """
    import tempfile

    if self._cookies_file and os.path.exists(self._cookies_file):
        return self._cookies_file

    if not self.unified_db:
        return None

    tmp_path = None
    try:
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            # Check for tiktok scraper cookies
            for scraper_id in ('tiktok', 'tiktok_client'):
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
                row = cursor.fetchone()
                if not (row and row[0]):
                    continue

                data = json.loads(row[0])
                if isinstance(data, dict) and 'cookies' in data:
                    cookies_list = data['cookies']
                elif isinstance(data, list):
                    cookies_list = data
                else:
                    cookies_list = []
                if not cookies_list:
                    continue

                fd, tmp_path = tempfile.mkstemp(suffix='.txt', prefix='tiktok_cookies_')
                with os.fdopen(fd, 'w') as f:
                    f.write("# Netscape HTTP Cookie File\n")
                    for cookie in cookies_list:
                        domain = cookie.get('domain') or ''
                        include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                        path = cookie.get('path', '/')
                        secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                        # Session cookies may carry an explicit null expiry;
                        # `or 0` keeps int() from raising on None.
                        expiry = str(int(cookie.get('expirationDate') or 0))
                        name = cookie.get('name', '')
                        value = cookie.get('value', '')
                        f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")

                # Publish the path only after a complete write so a failure
                # above can never leave a truncated file cached for reuse.
                self._cookies_file, tmp_path = tmp_path, None
                self.log(f"Loaded {len(cookies_list)} TikTok cookies", 'debug')
                return self._cookies_file
    except Exception as e:
        if tmp_path:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
        self.log(f"Could not load TikTok cookies: {e}", 'debug')

    return None
|
||||
|
||||
def _save_cookies_back(self):
    """Persist cookies refreshed by yt-dlp / gallery-dl back to the database.

    Both tools rewrite the cookies file with refreshed tokens from TikTok
    (e.g. msToken). The file is read back, merged over the cookies stored
    for the ``tiktok`` scraper (matching on name + domain, file values
    win) and saved. Afterwards the temporary file is deleted and the cached
    path cleared so the next use regenerates it from the database.

    Does nothing when there is no cookies file, no database, or the file
    holds no cookies. Failures are logged at debug level and swallowed.
    """
    cookies_path = self._cookies_file
    if not cookies_path or not os.path.exists(cookies_path):
        return
    if not self.unified_db:
        return

    try:
        import http.cookiejar

        jar = http.cookiejar.MozillaCookieJar(cookies_path)
        jar.load(ignore_discard=True, ignore_expires=True)

        updated_cookies = [
            {
                'name': cookie.name,
                'value': cookie.value,
                'domain': cookie.domain,
                'path': cookie.path,
                'secure': cookie.secure,
                'expirationDate': cookie.expires or 0,
            }
            for cookie in jar
        ]
        if not updated_cookies:
            return

        # Read the currently stored cookies; the connection is released
        # before save_scraper_cookies runs.
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('tiktok',))
            row = cursor.fetchone()

        if row and row[0]:
            existing_data = json.loads(row[0])
            existing_cookies = existing_data if isinstance(existing_data, list) else existing_data.get('cookies', [])
            # Merge: updated cookies override existing by name+domain
            cookie_map = {(c.get('name'), c.get('domain')): c for c in existing_cookies}
            for c in updated_cookies:
                cookie_map[(c['name'], c['domain'])] = c
            final_cookies = list(cookie_map.values())
        else:
            final_cookies = updated_cookies

        self.unified_db.save_scraper_cookies('tiktok', final_cookies, merge=False)
        self.log(f"Saved {len(final_cookies)} refreshed cookies back to DB", 'debug')

        # Drop the cached file so the next use gets fresh cookies from the DB.
        # The file itself is removed too: it holds session credentials and
        # nothing references its path once the attribute is cleared.
        self._cookies_file = None
        try:
            os.unlink(cookies_path)
        except OSError:
            pass
    except Exception as e:
        self.log(f"Failed to save cookies back: {e}", 'debug')
|
||||
|
||||
def _get_base_cmd(self) -> List[str]:
    """Build the leading part of a yt-dlp command line.

    Returns:
        ``[ytdlp_path]``, followed by ``--cookies <file>`` when a cookies
        file is available.
    """
    cookies_path = self._get_cookies_file()
    cookie_args = ['--cookies', cookies_path] if cookies_path else []
    return [self.ytdlp_path, *cookie_args]
|
||||
|
||||
@staticmethod
def extract_username(url: str) -> Optional[str]:
    """Pull the ``@handle`` out of a TikTok URL.

    Args:
        url: Any string containing ``tiktok.com/@<handle>``.

    Returns:
        The handle without the leading ``@`` (letters, digits, ``_`` and
        ``.`` only), or ``None`` when the URL has no such segment.
    """
    found = re.search(r'tiktok\.com/@([a-zA-Z0-9_.]+)', url)
    return found.group(1) if found else None
|
||||
|
||||
@staticmethod
def normalize_creator_url(username: str) -> str:
    """Turn a username (or an already complete URL) into a profile URL.

    Args:
        username: A handle with or without leading ``@``, or a full
            ``http(s)://`` URL, which is returned unchanged.

    Returns:
        ``https://www.tiktok.com/@<handle>`` or the input URL.
    """
    if username.startswith(('http://', 'https://')):
        return username
    handle = username.lstrip('@')
    return f"https://www.tiktok.com/@{handle}"
|
||||
|
||||
async def _resolve_channel_id(self, username: str) -> Optional[str]:
    """Resolve a TikTok username to a channel_id (secUid).

    When yt-dlp can't extract the secondary user ID from the profile page,
    a single video URL of the user is located (TikTok oEmbed first, then
    previously stored attachments in the database) and yt-dlp is asked for
    that video's metadata, which carries the channel_id.

    Args:
        username: TikTok handle without the leading ``@``.

    Returns:
        The channel_id, or ``None`` when yt-dlp is unavailable, no video URL
        could be found, or the lookup failed (failures are logged at debug
        level).
    """
    if not self.ytdlp_path:
        return None

    try:
        video_url = None

        # Step 1: Get a video URL from this user via the oembed embed HTML.
        # Isolated in its own try so that a network error or a non-JSON
        # reply still lets the database fallback below run.
        try:
            async with aiohttp.ClientSession() as session:
                oembed_url = f"https://www.tiktok.com/oembed?url=https://www.tiktok.com/@{username}"
                async with session.get(oembed_url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                    if resp.status == 200:
                        data = await resp.json()
                        embed_html = data.get('html') or ''
                        # Prefer the full video URL from the embed markup,
                        # otherwise rebuild it from the bare video id.
                        cite_match = re.search(r'cite="(https://www\.tiktok\.com/@[^"]+/video/\d+)"', embed_html)
                        if cite_match:
                            video_url = cite_match.group(1)
                        else:
                            id_match = re.search(r'data-video-id="(\d+)"', embed_html)
                            if id_match:
                                video_url = f"https://www.tiktok.com/@{username}/video/{id_match.group(1)}"

                        if not video_url:
                            # oembed thumbnail_url sometimes contains the video ID
                            thumb = data.get('thumbnail_url') or ''
                            vid_match = re.search(r'/video/(\d+)', thumb)
                            if vid_match:
                                video_url = f"https://www.tiktok.com/@{username}/video/{vid_match.group(1)}"
        except Exception as e:
            self.log(f"oembed lookup failed for @{username}: {e}", 'debug')

        # Step 1b: Check if we have any existing video URLs in the database
        if not video_url and self.unified_db:
            try:
                with self.unified_db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute("""
                        SELECT a.download_url FROM paid_content_attachments a
                        JOIN paid_content_posts p ON a.post_id = p.id
                        JOIN paid_content_creators c ON p.creator_id = c.id
                        WHERE c.username = ? AND a.download_url LIKE '%tiktok.com%'
                        LIMIT 1
                    """, (username,))
                    row = cursor.fetchone()
                    if row and row[0]:
                        video_url = row[0]
            except Exception:
                # Best-effort lookup; a missing table or DB error just means
                # there is no fallback URL.
                pass

        if not video_url:
            self.log(f"No video URL found for @{username} to resolve channel_id", 'debug')
            return None

        # Step 2: Use yt-dlp to get the channel_id from the single video
        self.log(f"Resolving channel_id from video: {video_url}", 'debug')
        cmd = self._get_base_cmd() + [
            '-j',
            '--no-warnings',
            '--no-download',
            '--socket-timeout', '30',
            video_url
        ]

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await result.communicate()

        if result.returncode == 0:
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                if not line.strip():
                    continue
                try:
                    video_data = json.loads(line)
                    channel_id = video_data.get('channel_id') or video_data.get('playlist_id')
                    if channel_id:
                        self.log(f"Resolved @{username} channel_id: {channel_id[:30]}...", 'info')
                        return channel_id
                except json.JSONDecodeError:
                    continue

    except Exception as e:
        self.log(f"Failed to resolve channel_id for @{username}: {e}", 'debug')

    return None
|
||||
|
||||
async def get_creator_info(self, url: str) -> Optional[Dict]:
    """Collect creator metadata via yt-dlp plus profile page scraping.

    The display name is taken from the first video entry yt-dlp lists for
    the profile (retrying with the ``tiktokuser:<channel_id>`` scheme when
    yt-dlp reports it cannot extract the user ID). Avatar and bio come from
    the scraped profile page, whose display name is used only when yt-dlp
    did not provide one.

    Args:
        url: TikTok profile URL containing ``@<handle>``.

    Returns:
        Dict with ``creator_id``, ``creator_name``, ``creator_url``,
        ``profile_image_url`` and ``bio``; ``None`` when no username can be
        extracted from ``url``.
    """
    username = self.extract_username(url)
    if not username:
        return None

    profile_url = self.normalize_creator_url(username)
    creator_name = username

    # Options shared by the primary and the fallback listing call.
    listing_opts = [
        '--no-warnings',
        '--flat-playlist',
        '-j',
        '--playlist-items', '1',
        '--socket-timeout', '30',
    ]

    def first_display_name(raw_stdout, current):
        """Name from the first parseable JSON line, else ``current``."""
        for entry in raw_stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not entry:
                continue
            try:
                meta = json.loads(entry)
            except json.JSONDecodeError:
                continue
            return (meta.get('channel') or meta.get('uploader')
                    or meta.get('playlist_title') or username)
        return current

    # Try yt-dlp for display name from video metadata
    if self.ytdlp_path:
        try:
            proc = await asyncio.create_subprocess_exec(
                *self._get_base_cmd(), *listing_opts, profile_url,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            out, err = await proc.communicate()

            if proc.returncode == 0:
                creator_name = first_display_name(out, creator_name)
            else:
                # Fallback: try tiktokuser: scheme if secondary user ID extraction fails
                err_text = err.decode('utf-8', errors='replace')
                channel_id = None
                if 'secondary user ID' in err_text or 'Unable to extract' in err_text:
                    channel_id = await self._resolve_channel_id(username)
                if channel_id:
                    fb_proc = await asyncio.create_subprocess_exec(
                        *self._get_base_cmd(), *listing_opts, f"tiktokuser:{channel_id}",
                        stdout=asyncio.subprocess.PIPE,
                        stderr=asyncio.subprocess.PIPE
                    )
                    fb_out, _ = await fb_proc.communicate()
                    if fb_proc.returncode == 0:
                        creator_name = first_display_name(fb_out, creator_name)
        except Exception as e:
            self.log(f"Failed to get creator info via yt-dlp: {e}", 'debug')

    # Scrape profile page for avatar and bio
    profile_image = None
    bio = None
    try:
        profile_image, bio, page_name = await self._scrape_profile_page(profile_url)
        # Only adopt the page's name when yt-dlp gave nothing better.
        if page_name and creator_name == username:
            creator_name = page_name
    except Exception as e:
        self.log(f"Failed to scrape profile page: {e}", 'debug')

    return {
        'creator_id': username,
        'creator_name': creator_name,
        'creator_url': profile_url,
        'profile_image_url': profile_image,
        'bio': bio,
    }
|
||||
|
||||
async def _fetch_profile_with_cookies(self, url: str) -> Optional[str]:
    """Fetch a TikTok profile page using curl_cffi with the stored cookies.

    The request is made with a browser-impersonating curl_cffi session. The
    session API is synchronous, so it runs in the default executor instead
    of blocking the event loop, and the session is always closed.

    Args:
        url: Profile page URL.

    Returns:
        The page HTML when the response is HTTP 200 and contains
        ``avatarLarger``; otherwise ``None`` (no cookies available, captcha
        page, other response, or any error, which is logged at debug level).
    """
    cookies_file = self._get_cookies_file()
    if not cookies_file:
        return None

    def _blocking_fetch():
        """Perform the synchronous request; returns (status_code, body)."""
        from curl_cffi import requests as cf_requests
        import http.cookiejar

        # Load cookies from the Netscape file
        jar = http.cookiejar.MozillaCookieJar(cookies_file)
        jar.load(ignore_discard=True, ignore_expires=True)

        # Try multiple browser versions for curl_cffi compatibility
        session = None
        for _browser in ("chrome136", "chrome131", "chrome"):
            try:
                session = cf_requests.Session(impersonate=_browser)
                break
            except Exception:
                continue
        if session is None:
            session = cf_requests.Session()

        try:
            for cookie in jar:
                session.cookies.set(cookie.name, cookie.value, domain=cookie.domain)
            resp = session.get(url, timeout=15)
            return resp.status_code, resp.text
        finally:
            # Previously skipped on the success return and on errors.
            session.close()

    try:
        loop = asyncio.get_running_loop()
        status_code, body = await loop.run_in_executor(None, _blocking_fetch)
        if status_code == 200 and 'avatarLarger' in body:
            self.log("Fetched TikTok profile with cookies (curl_cffi)", 'debug')
            return body
        elif 'captcha' in body.lower():
            self.log("TikTok profile still returned captcha with cookies", 'debug')
    except Exception as e:
        self.log(f"curl_cffi profile fetch failed: {e}", 'debug')

    return None
|
||||
|
||||
async def _scrape_profile_page(self, url: str) -> tuple:
    """Scrape a TikTok profile page for avatar, bio and display name.

    TikTok embeds user data in the ``__UNIVERSAL_DATA_FOR_REHYDRATION__``
    script tag; that JSON is tried first and a regex scan of the raw HTML is
    the fallback for anything still missing. When the plain request fails,
    is not HTTP 200, or returns a captcha page, the page is re-fetched
    through ``_fetch_profile_with_cookies``.

    Side effect: when the rehydration data lists pinned posts,
    ``self._last_pinned_posts`` is replaced with ``{video_id: {'pinned_at':
    None}}``.

    Args:
        url: Profile page URL.

    Returns:
        ``(profile_image_url, bio, display_name)``; each element is ``None``
        when it could not be determined.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    profile_image = None
    bio = None
    display_name = None

    def _raw_json_field(field, html, allow_empty=False):
        """Find the first ``"field":"..."`` in raw HTML and decode its value.

        The pattern steps over backslash escapes, so a value containing an
        escaped quote (``\\"``) is captured whole instead of being cut at
        the first quote character.
        """
        quantifier = '*' if allow_empty else '+'
        found = re.search(r'"' + field + r'":"((?:[^"\\]|\\.)' + quantifier + r')"', html)
        if not found:
            return None
        # json.loads decodes \uXXXX, surrogate pairs and raw UTF-8 alike.
        try:
            return json.loads(f'"{found.group(1)}"')
        except (json.JSONDecodeError, ValueError):
            return found.group(1)

    try:
        page_html = None
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                    if resp.status == 200:
                        page_html = await resp.text()
        except asyncio.TimeoutError:
            self.log("TikTok profile page request timed out", 'debug')
        except Exception as e:
            # A failed plain request still gets the cookie-backed retry below.
            self.log(f"Error scraping TikTok profile: {e}", 'debug')

        # If we got nothing or a captcha page, try curl_cffi with cookies
        if not page_html or ('captcha' in page_html.lower() and 'avatarLarger' not in page_html):
            page_html = await self._fetch_profile_with_cookies(url)
            if not page_html:
                return (None, None, None)

        # Try structured JSON first (__UNIVERSAL_DATA_FOR_REHYDRATION__)
        rehydration_match = re.search(
            r'<script[^>]*id="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>(.*?)</script>',
            page_html, re.DOTALL
        )
        if rehydration_match:
            try:
                rdata = json.loads(rehydration_match.group(1))
                user_detail = (rdata.get('__DEFAULT_SCOPE__', {})
                               .get('webapp.user-detail', {}))
                user = user_detail.get('userInfo', {}).get('user', {})
                if user:
                    avatar_val = user.get('avatarLarger') or user.get('avatarMedium')
                    if avatar_val and not avatar_val.endswith('.mp4'):
                        profile_image = avatar_val
                        self.log("Found TikTok profile avatar (rehydration)", 'debug')
                    sig_val = user.get('signature', '')
                    if sig_val and sig_val.strip():
                        bio = sig_val.strip()
                        self.log("Found TikTok bio (rehydration)", 'debug')
                    nick_val = user.get('nickname')
                    if nick_val:
                        display_name = nick_val
                        self.log(f"Found TikTok display name (rehydration): {display_name}", 'debug')

                # Extract pinned post IDs
                pinned_list = user_detail.get('pinnedList', [])
                if pinned_list:
                    self._last_pinned_posts = {}
                    for item in pinned_list:
                        vid = str(item.get('id', ''))
                        if vid:
                            self._last_pinned_posts[vid] = {'pinned_at': None}
                    if self._last_pinned_posts:
                        self.log(f"Found {len(self._last_pinned_posts)} pinned TikTok posts", 'debug')
            except (json.JSONDecodeError, KeyError, AttributeError, TypeError):
                # Unexpected JSON shape (e.g. a list or null where an object
                # was expected): fall through to the regex scan rather than
                # abandoning the whole scrape.
                pass

        # Fallback: regex extraction from raw HTML
        if not profile_image:
            avatar_url = (_raw_json_field('avatarLarger', page_html)
                          or _raw_json_field('avatarMedium', page_html))
            if avatar_url and not avatar_url.endswith('.mp4'):
                profile_image = avatar_url
                self.log("Found TikTok profile avatar", 'debug')

        if not bio:
            raw_bio = _raw_json_field('signature', page_html, allow_empty=True)
            if raw_bio and raw_bio.strip():
                bio = raw_bio.strip()
                self.log("Found TikTok bio", 'debug')

        if not display_name:
            nick = _raw_json_field('nickname', page_html)
            if nick is not None:
                display_name = nick
                self.log(f"Found TikTok display name: {display_name}", 'debug')

        # Extract banner/cover from "coverLarger" field
        # (stored separately, not returned here but could be used later)

    except asyncio.TimeoutError:
        self.log("TikTok profile page request timed out", 'debug')
    except Exception as e:
        self.log(f"Error scraping TikTok profile: {e}", 'debug')

    return (profile_image, bio, display_name)
|
||||
|
||||
async def get_creator_videos(self, url: str, since_date: str = None,
                             max_videos: int = None,
                             progress_callback=None) -> List[Dict]:
    """List a creator's videos via ``yt-dlp --flat-playlist -j``.

    JSON output is used so multi-line descriptions/titles are handled
    properly. When yt-dlp cannot extract the user ID from the profile page,
    the listing is retried with the ``tiktokuser:<channel_id>`` scheme.
    Refreshed cookies are written back to the database on every exit path
    once yt-dlp has been started.

    Args:
        url: TikTok profile URL containing ``@<handle>``.
        since_date: Optional ISO date or datetime; videos whose upload date
            is earlier are dropped. An unparseable value disables filtering.
        max_videos: Optional cap on the number of returned videos.
        progress_callback: Optional callable invoked with the running count
            after each accepted video.

    Returns:
        List of dicts with ``video_id``, ``title``, ``description``,
        ``upload_date`` (``YYYY-MM-DD`` or ``None``), ``url`` and
        ``username``. Empty on any failure.
    """
    if not self.ytdlp_path:
        return []

    username = self.extract_username(url)
    if not username:
        return []

    profile_url = self.normalize_creator_url(username)
    # Options shared by the primary and the fallback listing call.
    listing_opts = ['--flat-playlist', '-j', '--no-warnings', '--socket-timeout', '30']

    try:
        # Use yt-dlp flat-playlist with JSON output for full metadata
        cmd = self._get_base_cmd() + listing_opts + [profile_url]

        self.log(f"Fetching TikTok videos for @{username}", 'info')

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await result.communicate()

        if result.returncode != 0:
            error = stderr.decode('utf-8', errors='replace')
            if 'secondary user ID' not in error and 'Unable to extract' not in error:
                self.log(f"Failed to list TikTok videos: {error}", 'warning')
                return []

            # Fallback: if yt-dlp can't extract secondary user ID, try tiktokuser: scheme
            self.log(f"yt-dlp can't extract user ID for @{username}, trying channel_id fallback", 'info')
            channel_id = await self._resolve_channel_id(username)
            if not channel_id:
                self.log(f"Could not resolve channel_id for @{username}", 'warning')
                return []

            fallback_cmd = self._get_base_cmd() + listing_opts + [f"tiktokuser:{channel_id}"]
            fb_result = await asyncio.create_subprocess_exec(
                *fallback_cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            stdout, stderr = await fb_result.communicate()
            if fb_result.returncode != 0:
                fb_error = stderr.decode('utf-8', errors='replace')
                self.log(f"Fallback also failed for @{username}: {fb_error}", 'warning')
                return []
            self.log(f"Fallback tiktokuser: succeeded for @{username}", 'info')

        # Parse since_date into yt-dlp's YYYYMMDD form for string comparison
        cutoff_str = None
        if since_date:
            try:
                if 'T' in since_date:
                    cutoff_dt = datetime.fromisoformat(since_date.replace('Z', '+00:00').replace('+00:00', ''))
                else:
                    cutoff_dt = datetime.strptime(since_date[:10], '%Y-%m-%d')
                cutoff_str = cutoff_dt.strftime('%Y%m%d')
            except (ValueError, IndexError):
                pass  # unparseable cutoff: return everything

        videos = []
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(data, dict):
                # Valid JSON but not a metadata object; skip it instead of
                # letting .get() abort the whole listing.
                continue

            video_id = str(data.get('id', ''))
            if not video_id:
                continue

            upload_date = data.get('upload_date', '')
            title = data.get('title', '')
            description = data.get('description', '')

            # Skip posts where yt-dlp returned no metadata at all.
            # When cookies are expired, yt-dlp returns no date, no title,
            # and no description. Real posts with empty captions still have
            # upload_date, so we use that as the key signal.
            if not upload_date and not title and not description:
                self.log(f"Skipping TikTok {video_id}: no metadata (cookies may be expired)", 'debug')
                continue

            title = title or description or f"TikTok video #{video_id}"
            description = description or title

            # Filter by date if cutoff specified
            if cutoff_str and upload_date and upload_date < cutoff_str:
                continue

            # Format upload_date to ISO
            formatted_date = None
            if upload_date and len(upload_date) == 8 and upload_date.isdigit():
                formatted_date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"

            video_url = data.get('url') or f"https://www.tiktok.com/@{username}/video/{video_id}"

            videos.append({
                'video_id': video_id,
                'title': title,
                'description': description,
                'upload_date': formatted_date,
                'url': video_url,
                'username': username,
            })

            if progress_callback:
                progress_callback(len(videos))

            if max_videos and len(videos) >= max_videos:
                break

        self.log(f"Found {len(videos)} TikTok videos for @{username}", 'info')
        return videos

    except Exception as e:
        self.log(f"Error getting TikTok videos: {e}", 'error')
        return []
    finally:
        # yt-dlp may have refreshed tokens even when listing failed, so
        # persist cookies on every path, not just success and exceptions.
        self._save_cookies_back()
|
||||
|
||||
async def download_video(self, video_url: str, output_dir: Path, username: str = '') -> Dict:
    """Download a TikTok video/carousel using gallery-dl.

    gallery-dl handles both regular videos and carousel/slideshow posts.
    New files are detected by comparing the directory listing before and
    after the run; ``.json`` metadata and audio-only files are ignored.
    Refreshed cookies are written back to the database on every exit path
    once the download has been attempted.

    Args:
        video_url: URL of the TikTok post.
        output_dir: Directory to download into (created if missing).
        username: Unused here; kept for interface compatibility.

    Returns:
        On success a dict with ``success=True``, ``file_path``, ``filename``,
        ``file_size`` (all of the first file), ``all_files``, ``file_count``
        and ``is_carousel``. On failure ``{'success': False, 'error': ...}``.
    """
    if not self.gallery_dl_path:
        return {'success': False, 'error': 'gallery-dl not available'}

    audio_exts = ('.mp3', '.m4a', '.aac', '.wav', '.ogg')
    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}

    def natural_key(path):
        """Sort key comparing digit runs numerically (``_2`` before ``_10``)."""
        # re.split with a capture group alternates text / digits, so odd
        # positions are always the digit runs.
        return [int(tok) if idx % 2 else tok
                for idx, tok in enumerate(re.split(r'(\d+)', path.name))]

    try:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        cmd = [
            self.gallery_dl_path,
            '--write-metadata',
            '-D', str(output_dir),
            '-f', '{id}_{num}.{extension}',
        ]

        # Add cookies for age-restricted / login-required content
        cookies_file = self._get_cookies_file()
        if cookies_file:
            cmd.extend(['--cookies', cookies_file])

        cmd.append(video_url)

        self.log(f"Downloading TikTok: {video_url}", 'debug')

        # Snapshot existing files before download so we only pick up new ones
        existing_files = {f.name for f in output_dir.iterdir() if f.is_file()}

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await result.communicate()

        # Find newly downloaded files (exclude .json metadata and audio-only files)
        downloaded_files = [
            f for f in output_dir.iterdir()
            if f.is_file()
            and f.name not in existing_files
            and f.suffix.lower() != '.json'
            and f.suffix.lower() not in audio_exts
        ]

        if result.returncode != 0:
            # gallery-dl exit code 4 = partial failure (e.g. slideshow images OK but audio failed)
            # If we got media files, treat as success
            if downloaded_files:
                self.log(f"gallery-dl partial failure (code {result.returncode}) but {len(downloaded_files)} files downloaded", 'debug')
            else:
                error_msg = stderr.decode('utf-8', errors='replace').strip()
                if 'not available' in error_msg.lower() or '404' in error_msg:
                    error_msg = 'Video not available (deleted or private)'
                elif len(error_msg) > 200:
                    error_msg = error_msg[:200] + '...'
                return {'success': False, 'error': error_msg}

        if not downloaded_files:
            return {'success': False, 'error': 'No files downloaded'}

        # Natural sort keeps carousel order (id_1, id_2, ..., id_10); a plain
        # name sort would put id_10 before id_2.
        downloaded_files.sort(key=natural_key)
        primary_file = downloaded_files[0]

        # Determine if this is a photo carousel (multiple images)
        is_carousel = len(downloaded_files) > 1 and all(
            f.suffix.lower() in image_exts for f in downloaded_files
        )

        return {
            'success': True,
            'file_path': str(primary_file),
            'filename': primary_file.name,
            'file_size': primary_file.stat().st_size,
            'all_files': [str(f) for f in downloaded_files],
            'file_count': len(downloaded_files),
            'is_carousel': is_carousel,
        }

    except Exception as e:
        self.log(f"Error downloading TikTok video: {e}", 'error')
        return {'success': False, 'error': str(e)}
    finally:
        # Persist refreshed cookies on failure returns as well.
        self._save_cookies_back()
|
||||
|
||||
async def get_creator(self, url: str) -> Optional[Creator]:
    """Build a ``Creator`` object for a TikTok profile URL.

    Args:
        url: TikTok profile URL.

    Returns:
        A ``Creator`` populated from ``get_creator_info``, or ``None`` when
        no creator info could be obtained.
    """
    info = await self.get_creator_info(url)
    if not info:
        return None

    handle = info.get('creator_id', '')
    creator_fields = {
        'creator_id': handle,
        'service_id': 'tiktok',
        'platform': 'tiktok',
        # Display name when known, otherwise the handle.
        'username': info.get('creator_name', handle),
        'display_name': info.get('creator_name'),
        'profile_image_url': info.get('profile_image_url'),
        'bio': info.get('bio'),
    }
    return Creator(**creator_fields)
|
||||
|
||||
async def get_posts(self, url: str, since_date: str = None,
                    max_videos: int = None, progress_callback=None) -> List[Post]:
    """Return a creator's TikTok videos as ``Post`` objects.

    Arguments are forwarded unchanged to ``get_creator_videos``. Every post
    gets a single placeholder ``.mp4`` attachment pointing at the post URL;
    whether the post is really a video or a carousel is only known once it
    is downloaded.

    Returns:
        One ``Post`` per listed video, in listing order.
    """
    videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)
    creator_handle = self.extract_username(url) or ''

    def to_post(video):
        """Convert one video metadata dict into a Post."""
        media = Attachment(
            name=f"{video['video_id']}.mp4",
            file_type='video',
            extension='.mp4',
            server_path=video['url'],
            download_url=video['url'],
        )
        return Post(
            post_id=video['video_id'],
            service_id='tiktok',
            platform='tiktok',
            creator_id=creator_handle,
            title=None,
            content=video.get('description') or video.get('title', ''),
            published_at=video.get('upload_date'),
            attachments=[media],
        )

    return [to_post(video) for video in videos]
|
||||
751
modules/paid_content/twitch_client.py
Normal file
751
modules/paid_content/twitch_client.py
Normal file
@@ -0,0 +1,751 @@
|
||||
"""
|
||||
Twitch Clips Client - Fetches channel clips using yt-dlp
|
||||
"""
|
||||
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class TwitchThumbnailCache:
    """Disk cache for Twitch clip thumbnails.

    Files are named after the MD5 hex digest of the thumbnail URL, so the
    same URL always maps to the same local file.
    """

    def __init__(self, cache_dir: str = None):
        """Create the cache directory if needed.

        Args:
            cache_dir: Directory for cached files; defaults to
                ``/opt/media-downloader/data/cache/twitch_thumbnails``.
        """
        self.cache_dir = Path(cache_dir or '/opt/media-downloader/data/cache/twitch_thumbnails')
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_path(self, thumbnail_url: str) -> Path:
        """Return the local cache path for a thumbnail URL."""
        # Hash of the URL is the filename (identifier only, not security).
        url_hash = hashlib.md5(thumbnail_url.encode()).hexdigest()
        # Extension from the URL, defaulting to jpg
        lowered = thumbnail_url.lower()
        ext = '.jpg'
        if '.png' in lowered:
            ext = '.png'
        elif '.webp' in lowered:
            ext = '.webp'
        return self.cache_dir / f"{url_hash}{ext}"

    def get_cached(self, thumbnail_url: str) -> Optional[str]:
        """Return the cached thumbnail path if it exists, else ``None``."""
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)
        return None

    @staticmethod
    def _write_atomic(cache_path: Path, content: bytes) -> None:
        """Write ``content`` to ``cache_path`` via a temp file and rename.

        A partially written file never appears under the final name, so an
        interrupted write cannot be mistaken for a valid cache entry later.
        """
        part_path = cache_path.with_name(f"{cache_path.name}.{os.getpid()}.part")
        try:
            with open(part_path, 'wb') as f:
                f.write(content)
            os.replace(part_path, cache_path)
        except BaseException:
            try:
                os.unlink(part_path)
            except OSError:
                pass
            raise

    async def cache_thumbnail(self, thumbnail_url: str, session: "aiohttp.ClientSession" = None) -> Optional[str]:
        """Download and cache a thumbnail.

        Args:
            thumbnail_url: Remote thumbnail URL; empty/None returns ``None``.
            session: Optional aiohttp session to reuse. When omitted, a
                session is created for this call and closed afterwards.

        Returns:
            Local file path, or ``None`` when the download fails or the
            response is not HTTP 200 (errors are swallowed; best-effort).
        """
        if not thumbnail_url:
            return None

        # Check if already cached
        cache_path = self._get_cache_path(thumbnail_url)
        if cache_path.exists():
            return str(cache_path)

        # Download thumbnail
        owns_session = session is None
        try:
            if owns_session:
                session = aiohttp.ClientSession()
            try:
                async with session.get(thumbnail_url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
                    if resp.status == 200:
                        content = await resp.read()
                        self._write_atomic(cache_path, content)
                        return str(cache_path)
            finally:
                if owns_session:
                    await session.close()
        except Exception:
            # Thumbnails are optional; a failed download is simply a miss.
            pass

        return None

    async def cache_thumbnails_batch(self, thumbnail_urls: List[str], max_concurrent: int = 5) -> Dict[str, str]:
        """Cache several thumbnails concurrently.

        Args:
            thumbnail_urls: URLs to cache; falsy entries are skipped.
            max_concurrent: Maximum number of simultaneous downloads.

        Returns:
            Mapping of URL to local path for every thumbnail that is cached
            (already present or downloaded successfully).
        """
        result = {}

        # Filter out already cached
        to_download = []
        for url in thumbnail_urls:
            if not url:
                continue
            cached = self.get_cached(url)
            if cached:
                result[url] = cached
            else:
                to_download.append(url)

        if not to_download:
            return result

        # Download with one shared session, bounded by the semaphore
        async with aiohttp.ClientSession() as session:
            semaphore = asyncio.Semaphore(max_concurrent)

            async def download_one(url: str):
                async with semaphore:
                    path = await self.cache_thumbnail(url, session)
                    if path:
                        result[url] = path

            await asyncio.gather(*[download_one(url) for url in to_download])

        return result
|
||||
|
||||
|
||||
class TwitchClient(LoggingMixin):
|
||||
"""
|
||||
Client for fetching Twitch channel clips using yt-dlp
|
||||
|
||||
Supports:
|
||||
- Channel clips URLs (twitch.tv/username/clips)
|
||||
- Fetching channel metadata
|
||||
- Listing all clips from a channel
|
||||
- Downloading clips
|
||||
"""
|
||||
|
||||
# Quality presets for yt-dlp
|
||||
QUALITY_PRESETS = {
|
||||
'best': 'best',
|
||||
'1080p': 'best[height<=1080]',
|
||||
'720p': 'best[height<=720]',
|
||||
'480p': 'best[height<=480]',
|
||||
}
|
||||
|
||||
def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None, cache_dir: str = None):
    """Set up logging, locate yt-dlp and create the thumbnail cache.

    Args:
        ytdlp_path: Explicit path to yt-dlp; auto-detected when omitted.
        unified_db: Database handle used later to load cookies.
        log_callback: Optional callback handed to the logger setup.
        cache_dir: Directory for the thumbnail cache (cache default if None).
    """
    self._init_logger('PaidContent', log_callback, default_module='Twitch')

    # Find yt-dlp executable
    resolved_ytdlp = ytdlp_path if ytdlp_path else self._find_ytdlp()
    self.ytdlp_path = resolved_ytdlp
    if not resolved_ytdlp:
        self.log("yt-dlp not found, Twitch support will be disabled", 'warning')

    # Store database reference for cookie access; the cookies file itself
    # is only created on demand.
    self.unified_db = unified_db
    self._cookies_file = None

    # Initialize thumbnail cache
    self.thumbnail_cache = TwitchThumbnailCache(cache_dir)
|
||||
|
||||
def _find_ytdlp(self) -> Optional[str]:
    """Locate the yt-dlp executable.

    Well-known install locations are probed first, then the directories on
    ``PATH`` are searched.

    Returns:
        Path to yt-dlp, or ``None`` when it cannot be found.
    """
    import shutil

    candidates = (
        '/opt/media-downloader/venv/bin/yt-dlp',  # Prefer venv version (kept up to date)
        '/usr/local/bin/yt-dlp',
        '/usr/bin/yt-dlp',
        '/opt/homebrew/bin/yt-dlp',
        os.path.expanduser('~/.local/bin/yt-dlp'),
    )
    for candidate in candidates:
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate

    # shutil.which does the PATH lookup in-process: no dependency on an
    # external `which` binary and never an empty-string result.
    return shutil.which('yt-dlp')
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if yt-dlp is available"""
|
||||
return self.ytdlp_path is not None
|
||||
|
||||
def _get_cookies_file(self) -> Optional[str]:
|
||||
"""Get path to cookies file, creating it from database if needed"""
|
||||
if self._cookies_file and os.path.exists(self._cookies_file):
|
||||
return self._cookies_file
|
||||
|
||||
if not self.unified_db:
|
||||
return None
|
||||
|
||||
try:
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
# Try twitch-specific cookies first, then fall back to ytdlp
|
||||
for scraper_id in ['twitch', 'ytdlp']:
|
||||
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", (scraper_id,))
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
data = json.loads(row[0])
|
||||
# Support both {"cookies": [...]} and [...] formats
|
||||
if isinstance(data, dict) and 'cookies' in data:
|
||||
cookies_list = data['cookies']
|
||||
elif isinstance(data, list):
|
||||
cookies_list = data
|
||||
else:
|
||||
cookies_list = []
|
||||
|
||||
if cookies_list:
|
||||
# Write cookies to temp file in Netscape format
|
||||
fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='twitch_cookies_')
|
||||
with os.fdopen(fd, 'w') as f:
|
||||
f.write("# Netscape HTTP Cookie File\n")
|
||||
for cookie in cookies_list:
|
||||
domain = cookie.get('domain', '')
|
||||
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
|
||||
path = cookie.get('path', '/')
|
||||
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
|
||||
expiry = str(int(cookie.get('expirationDate', 0)))
|
||||
name = cookie.get('name', '')
|
||||
value = cookie.get('value', '')
|
||||
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
|
||||
self.log(f"Loaded {len(cookies_list)} cookies from {scraper_id} scraper", 'debug')
|
||||
return self._cookies_file
|
||||
except Exception as e:
|
||||
self.log(f"Could not load cookies: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
def _get_base_cmd(self) -> List[str]:
|
||||
"""Get base yt-dlp command with cookies if available"""
|
||||
cmd = [self.ytdlp_path]
|
||||
cookies_file = self._get_cookies_file()
|
||||
if cookies_file:
|
||||
cmd.extend(['--cookies', cookies_file])
|
||||
return cmd
|
||||
|
||||
def cleanup(self):
|
||||
"""Clean up temporary files"""
|
||||
if self._cookies_file and os.path.exists(self._cookies_file):
|
||||
try:
|
||||
os.unlink(self._cookies_file)
|
||||
except Exception:
|
||||
pass
|
||||
self._cookies_file = None
|
||||
|
||||
@staticmethod
|
||||
def extract_channel_name(url: str) -> Optional[str]:
|
||||
"""
|
||||
Extract channel name from Twitch URL
|
||||
|
||||
Supports:
|
||||
- twitch.tv/username
|
||||
- twitch.tv/username/clips
|
||||
- m.twitch.tv/username/clips
|
||||
"""
|
||||
patterns = [
|
||||
r'twitch\.tv/([a-zA-Z0-9_]+)(?:/clips)?',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, url)
|
||||
if match:
|
||||
return match.group(1).lower()
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def normalize_clips_url(channel_name: str) -> str:
|
||||
"""Convert channel name to clips URL with all-time filter"""
|
||||
return f"https://www.twitch.tv/{channel_name}/clips?filter=clips&range=all"
|
||||
|
||||
async def get_channel_info(self, channel_url: str, count_clips: bool = True) -> Optional[Dict]:
|
||||
"""
|
||||
Get channel information and optionally count all clips
|
||||
"""
|
||||
if not self.is_available():
|
||||
return None
|
||||
|
||||
channel_name = self.extract_channel_name(channel_url)
|
||||
if not channel_name:
|
||||
return None
|
||||
|
||||
try:
|
||||
clips_url = self.normalize_clips_url(channel_name)
|
||||
|
||||
# First get basic info from first clip
|
||||
cmd = self._get_base_cmd() + [
|
||||
'--no-warnings',
|
||||
'--flat-playlist',
|
||||
'-j',
|
||||
'--playlist-items', '1',
|
||||
clips_url
|
||||
]
|
||||
|
||||
result = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
|
||||
stdout, stderr = await result.communicate()
|
||||
|
||||
if result.returncode != 0:
|
||||
self.log(f"Failed to get channel info: {stderr.decode()}", 'warning')
|
||||
return None
|
||||
|
||||
first_clip_data = None
|
||||
for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
first_clip_data = json.loads(line)
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if not first_clip_data:
|
||||
return None
|
||||
|
||||
# Count all clips if requested (this can take a while for channels with many clips)
|
||||
clip_count = 0
|
||||
if count_clips:
|
||||
self.log(f"Counting clips for {channel_name}...", 'debug')
|
||||
count_cmd = self._get_base_cmd() + [
|
||||
'--no-warnings',
|
||||
'--flat-playlist',
|
||||
'--print', 'id',
|
||||
clips_url
|
||||
]
|
||||
|
||||
count_result = await asyncio.create_subprocess_exec(
|
||||
*count_cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
|
||||
count_stdout, _ = await count_result.communicate()
|
||||
if count_result.returncode == 0:
|
||||
clip_count = len([l for l in count_stdout.decode('utf-8', errors='replace').strip().split('\n') if l])
|
||||
self.log(f"Found {clip_count} clips for {channel_name}", 'info')
|
||||
|
||||
return {
|
||||
'channel_id': channel_name,
|
||||
'channel_name': channel_name,
|
||||
'channel_url': f"https://www.twitch.tv/{channel_name}",
|
||||
'clips_url': clips_url,
|
||||
'thumbnail': first_clip_data.get('thumbnail'),
|
||||
'clip_count': clip_count,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error getting channel info: {e}", 'error')
|
||||
return None
|
||||
|
||||
async def get_channel_clips(self, channel_url: str, since_date: str = None,
|
||||
max_clips: int = None, progress_callback=None,
|
||||
cache_thumbnails: bool = True) -> List[Dict]:
|
||||
"""
|
||||
Get all clips from a channel
|
||||
|
||||
Args:
|
||||
channel_url: Twitch channel URL
|
||||
since_date: Only fetch clips created after this date (ISO format)
|
||||
max_clips: Maximum number of clips to fetch
|
||||
progress_callback: Callback function(count) for progress updates
|
||||
cache_thumbnails: Whether to download and cache thumbnails locally
|
||||
|
||||
Returns:
|
||||
List of clip metadata dicts with cached thumbnail paths
|
||||
"""
|
||||
if not self.is_available():
|
||||
return []
|
||||
|
||||
channel_name = self.extract_channel_name(channel_url)
|
||||
if not channel_name:
|
||||
self.log(f"Could not extract channel name from URL: {channel_url}", 'error')
|
||||
return []
|
||||
|
||||
try:
|
||||
clips_url = self.normalize_clips_url(channel_name)
|
||||
|
||||
# Use flat-playlist for faster extraction (full metadata available in flat mode for Twitch clips)
|
||||
cmd = self._get_base_cmd() + [
|
||||
'--no-warnings',
|
||||
'--flat-playlist',
|
||||
'-j',
|
||||
clips_url
|
||||
]
|
||||
|
||||
# Add date filter at yt-dlp level for efficiency
|
||||
if since_date:
|
||||
try:
|
||||
from datetime import datetime
|
||||
# Convert ISO date to YYYYMMDD format for yt-dlp
|
||||
date_obj = datetime.fromisoformat(since_date.replace('Z', '+00:00'))
|
||||
dateafter = date_obj.strftime('%Y%m%d')
|
||||
cmd.extend(['--dateafter', dateafter])
|
||||
self.log(f"Filtering clips after {dateafter}", 'debug')
|
||||
except (ValueError, AttributeError):
|
||||
pass
|
||||
|
||||
if max_clips:
|
||||
cmd.extend(['--playlist-items', f'1:{max_clips}'])
|
||||
|
||||
self.log(f"Fetching clips from channel: {channel_name}", 'info')
|
||||
|
||||
result = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
|
||||
stdout, stderr = await result.communicate()
|
||||
|
||||
if result.returncode != 0:
|
||||
error = stderr.decode('utf-8', errors='replace')
|
||||
self.log(f"Failed to get channel clips: {error}", 'warning')
|
||||
return []
|
||||
|
||||
clips = []
|
||||
for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
|
||||
clip_id = data.get('id')
|
||||
if not clip_id:
|
||||
continue
|
||||
|
||||
# Parse timestamp to ISO format
|
||||
timestamp = data.get('timestamp')
|
||||
upload_date = data.get('upload_date')
|
||||
if timestamp:
|
||||
try:
|
||||
upload_date = datetime.fromtimestamp(timestamp).isoformat()
|
||||
except (ValueError, OSError):
|
||||
pass
|
||||
elif upload_date:
|
||||
# Convert YYYYMMDD to ISO format
|
||||
try:
|
||||
upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Check if clip is newer than since_date
|
||||
if since_date and upload_date and upload_date <= since_date:
|
||||
self.log(f"Reached clip from {upload_date}, stopping", 'debug')
|
||||
break
|
||||
|
||||
# Extract clip slug from URL
|
||||
clip_url = data.get('url') or data.get('webpage_url', '')
|
||||
clip_slug = clip_url.split('/')[-1] if clip_url else clip_id
|
||||
|
||||
clips.append({
|
||||
'clip_id': clip_id,
|
||||
'clip_slug': clip_slug,
|
||||
'title': data.get('title', f'Clip {clip_id}'),
|
||||
'upload_date': upload_date,
|
||||
'timestamp': timestamp,
|
||||
'duration': data.get('duration'),
|
||||
'view_count': data.get('view_count'),
|
||||
'thumbnail': data.get('thumbnail'),
|
||||
'url': clip_url,
|
||||
'language': data.get('language'),
|
||||
'channel_name': channel_name,
|
||||
})
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(len(clips))
|
||||
|
||||
if max_clips and len(clips) >= max_clips:
|
||||
break
|
||||
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
self.log(f"Found {len(clips)} clips", 'info')
|
||||
|
||||
# Cache thumbnails if requested
|
||||
if cache_thumbnails and clips:
|
||||
thumbnail_urls = [c.get('thumbnail') for c in clips if c.get('thumbnail')]
|
||||
if thumbnail_urls:
|
||||
self.log(f"Caching {len(thumbnail_urls)} thumbnails...", 'debug')
|
||||
cached_paths = await self.thumbnail_cache.cache_thumbnails_batch(thumbnail_urls)
|
||||
|
||||
# Update clips with cached thumbnail paths
|
||||
for clip in clips:
|
||||
thumb_url = clip.get('thumbnail')
|
||||
if thumb_url and thumb_url in cached_paths:
|
||||
clip['thumbnail_cached'] = cached_paths[thumb_url]
|
||||
|
||||
self.log(f"Cached {len(cached_paths)} thumbnails", 'debug')
|
||||
|
||||
return clips
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error getting channel clips: {e}", 'error')
|
||||
return []
|
||||
|
||||
async def download_clip(self, clip_url: str, output_dir: Path, quality: str = 'best',
|
||||
progress_callback=None) -> Dict:
|
||||
"""
|
||||
Download a clip
|
||||
|
||||
Args:
|
||||
clip_url: Twitch clip URL
|
||||
output_dir: Directory to save the clip
|
||||
quality: Quality preset
|
||||
progress_callback: Callback for download progress
|
||||
|
||||
Returns:
|
||||
Dict with success status and file info
|
||||
"""
|
||||
if not self.is_available():
|
||||
return {'success': False, 'error': 'yt-dlp not available'}
|
||||
|
||||
try:
|
||||
output_dir = Path(output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Output template preserves title and ID
|
||||
output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s')
|
||||
|
||||
format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])
|
||||
|
||||
cmd = self._get_base_cmd() + [
|
||||
'--no-warnings',
|
||||
'-f', format_str,
|
||||
'-o', output_template,
|
||||
'--print-json',
|
||||
clip_url
|
||||
]
|
||||
|
||||
self.log(f"Downloading clip: {clip_url}", 'debug')
|
||||
|
||||
result = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
|
||||
stdout, stderr = await result.communicate()
|
||||
|
||||
if result.returncode != 0:
|
||||
error_msg = stderr.decode('utf-8', errors='replace').strip()
|
||||
if len(error_msg) > 200:
|
||||
error_msg = error_msg[:200] + '...'
|
||||
return {'success': False, 'error': error_msg}
|
||||
|
||||
# Parse output JSON
|
||||
clip_info = None
|
||||
for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
|
||||
try:
|
||||
clip_info = json.loads(line)
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if not clip_info:
|
||||
# Try to find downloaded file
|
||||
files = list(output_dir.glob('*.mp4'))
|
||||
if files:
|
||||
file_path = max(files, key=lambda f: f.stat().st_mtime)
|
||||
return {
|
||||
'success': True,
|
||||
'file_path': str(file_path),
|
||||
'filename': file_path.name,
|
||||
'file_size': file_path.stat().st_size
|
||||
}
|
||||
return {'success': False, 'error': 'Could not find downloaded file'}
|
||||
|
||||
file_path = clip_info.get('_filename') or clip_info.get('filename')
|
||||
if file_path:
|
||||
file_path = Path(file_path)
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'file_path': str(file_path) if file_path else None,
|
||||
'filename': file_path.name if file_path else None,
|
||||
'file_size': file_path.stat().st_size if file_path and file_path.exists() else clip_info.get('filesize'),
|
||||
'title': clip_info.get('title'),
|
||||
'duration': clip_info.get('duration'),
|
||||
'clip_id': clip_info.get('id'),
|
||||
'upload_date': clip_info.get('upload_date'),
|
||||
'thumbnail': clip_info.get('thumbnail'),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error downloading clip: {e}", 'error')
|
||||
return {'success': False, 'error': str(e)}
|
||||
|
||||
async def get_channel_avatar(self, channel_name: str) -> Optional[str]:
|
||||
"""
|
||||
Try to fetch channel avatar from Twitch
|
||||
|
||||
Note: This requires either Twitch API credentials or scraping.
|
||||
Returns None if avatar cannot be fetched.
|
||||
"""
|
||||
profile = await self.get_channel_profile(channel_name)
|
||||
return profile.get('avatar') if profile else None
|
||||
|
||||
async def get_channel_profile(self, channel_name: str) -> Optional[Dict]:
|
||||
"""
|
||||
Fetch channel profile info using Twitch's GQL API.
|
||||
|
||||
Returns dict with avatar, banner, display_name, bio, joined_date, external_links
|
||||
"""
|
||||
try:
|
||||
import aiohttp
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
headers = {
|
||||
'Client-Id': 'kimne78kx3ncx6brgo4mv6wki5h1ko', # Public Twitch web client ID
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
||||
}
|
||||
|
||||
# GQL query for comprehensive user info
|
||||
query = '''
|
||||
query {
|
||||
user(login: "%s") {
|
||||
id
|
||||
login
|
||||
displayName
|
||||
description
|
||||
createdAt
|
||||
profileImageURL(width: 300)
|
||||
bannerImageURL
|
||||
offlineImageURL
|
||||
channel {
|
||||
socialMedias {
|
||||
name
|
||||
url
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
''' % channel_name
|
||||
|
||||
async with session.post(
|
||||
'https://gql.twitch.tv/gql',
|
||||
headers=headers,
|
||||
json={'query': query},
|
||||
timeout=aiohttp.ClientTimeout(total=15)
|
||||
) as resp:
|
||||
if resp.status == 200:
|
||||
data = await resp.json()
|
||||
user = data.get('data', {}).get('user')
|
||||
|
||||
if not user:
|
||||
self.log(f"Twitch user not found: {channel_name}", 'warning')
|
||||
return None
|
||||
|
||||
result = {}
|
||||
|
||||
# Avatar
|
||||
if user.get('profileImageURL'):
|
||||
result['avatar'] = user['profileImageURL']
|
||||
|
||||
# Banner - prefer offlineImageURL (larger), fall back to bannerImageURL
|
||||
if user.get('offlineImageURL'):
|
||||
result['banner'] = user['offlineImageURL']
|
||||
elif user.get('bannerImageURL'):
|
||||
result['banner'] = user['bannerImageURL']
|
||||
|
||||
# Display name
|
||||
if user.get('displayName'):
|
||||
result['display_name'] = user['displayName']
|
||||
|
||||
# Bio/description
|
||||
if user.get('description'):
|
||||
result['bio'] = user['description']
|
||||
|
||||
# Joined date (format: "Jun 10, 2016")
|
||||
if user.get('createdAt'):
|
||||
try:
|
||||
created_dt = datetime.fromisoformat(user['createdAt'].replace('Z', '+00:00'))
|
||||
result['joined_date'] = created_dt.strftime('%b %d, %Y')
|
||||
self.log(f"Found Twitch joined date: {result['joined_date']}", 'debug')
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# Social links
|
||||
social_medias = user.get('channel', {}).get('socialMedias', [])
|
||||
if social_medias:
|
||||
links = []
|
||||
for social in social_medias:
|
||||
name = social.get('name', 'Link')
|
||||
url = social.get('url', '')
|
||||
if url:
|
||||
# Capitalize first letter of name
|
||||
title = name.capitalize() if name else 'Link'
|
||||
links.append({'title': title, 'url': url})
|
||||
if links:
|
||||
result['external_links'] = json.dumps(links)
|
||||
self.log(f"Found {len(links)} Twitch external links", 'debug')
|
||||
|
||||
if result:
|
||||
self.log(f"Fetched Twitch profile via GQL for {channel_name}: {list(result.keys())}", 'debug')
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Could not fetch Twitch profile: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
async def get_creator(self, channel_url: str) -> Optional[Creator]:
|
||||
"""
|
||||
Get Creator object from channel URL
|
||||
"""
|
||||
info = await self.get_channel_info(channel_url)
|
||||
if not info:
|
||||
return None
|
||||
|
||||
channel_name = info.get('channel_name') or self.extract_channel_name(channel_url)
|
||||
|
||||
# Try to get the actual channel avatar (not clip thumbnail)
|
||||
avatar_url = await self.get_channel_avatar(channel_name)
|
||||
|
||||
return Creator(
|
||||
creator_id=info.get('channel_id') or channel_name,
|
||||
service_id='twitch',
|
||||
platform='twitch',
|
||||
username=channel_name or 'Unknown',
|
||||
display_name=channel_name,
|
||||
profile_image_url=avatar_url, # Use actual avatar, not clip thumbnail
|
||||
post_count=info.get('clip_count', 0)
|
||||
)
|
||||
|
||||
async def get_posts(self, channel_url: str, since_date: str = None,
|
||||
max_clips: int = None, progress_callback=None) -> List[Post]:
|
||||
"""
|
||||
Get clips as Post objects
|
||||
"""
|
||||
clips = await self.get_channel_clips(channel_url, since_date, max_clips, progress_callback)
|
||||
|
||||
posts = []
|
||||
for clip in clips:
|
||||
# Create attachment for the clip
|
||||
attachment = Attachment(
|
||||
name=f"{clip['title']}.mp4",
|
||||
file_type='video',
|
||||
extension='.mp4',
|
||||
server_path=clip['url'], # Use URL as server_path
|
||||
download_url=clip['url'],
|
||||
duration=clip.get('duration'),
|
||||
)
|
||||
|
||||
post = Post(
|
||||
post_id=clip['clip_id'],
|
||||
service_id='twitch',
|
||||
platform='twitch',
|
||||
creator_id=clip.get('channel_name', ''),
|
||||
title=clip['title'],
|
||||
content='', # Clips don't have descriptions
|
||||
published_at=clip.get('upload_date'),
|
||||
attachments=[attachment],
|
||||
)
|
||||
posts.append(post)
|
||||
|
||||
return posts
|
||||
484
modules/paid_content/utils.py
Normal file
484
modules/paid_content/utils.py
Normal file
@@ -0,0 +1,484 @@
|
||||
"""
|
||||
Utility functions for Paid Content feature
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
def _extract_xenforo_search_query(parsed) -> Optional[str]:
    """Return the 'q' search term of a parsed XenForo search URL, or None if absent."""
    from urllib.parse import parse_qs, unquote_plus

    raw_query = parsed.query

    # Normal case: 'q' is a regular query-string parameter.
    term = parse_qs(raw_query).get('q', [''])[0]
    if term:
        return term

    # Fallback: look for a literal "?q=" / "&q=" inside the raw query text.
    fallback = re.search(r'[&?]q=([^&]+)', raw_query)
    if fallback is None:
        return None
    return unquote_plus(fallback.group(1)) or None
|
||||
|
||||
|
||||
def parse_creator_url(url: str) -> Optional[Tuple[str, str, str]]:
    """
    Parse a creator URL into (service_id, platform, creator_id)

    The host (and for Coppermine galleries, the path) decides which branch
    handles the URL. Branches are tried in the order written below; every
    host-specific branch except Coppermine returns None when the URL does not
    contain a usable identifier. Coomer/Kemono are handled last.

    Args:
        url: URL like https://coomer.party/onlyfans/user/creatorid
             or https://www.youtube.com/@channelhandle
             or https://www.youtube.com/channel/UCxxxxx
             or https://www.twitch.tv/username/clips
             or https://fansly.com/username

    Returns:
        Tuple of (service_id, platform, creator_id) or None if invalid
        (any exception raised while parsing also yields None)
    """
    try:
        parsed = urlparse(url)
        host = parsed.netloc.lower()

        # Handle YouTube URLs
        if 'youtube.com' in host or 'youtu.be' in host:
            channel_id = _extract_youtube_channel_id(url)
            if channel_id:
                return ('youtube', 'youtube', channel_id)
            return None

        # Handle Twitch URLs
        if 'twitch.tv' in host:
            channel_name = _extract_twitch_channel_name(url)
            if channel_name:
                return ('twitch', 'twitch', channel_name)
            return None

        # Handle Fansly URLs (direct API)
        if 'fansly.com' in host:
            username = _extract_fansly_username(url)
            if username:
                return ('fansly_direct', 'fansly', username)
            return None

        # Handle OnlyFans URLs (direct API)
        if 'onlyfans.com' in host:
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if path_parts:
                username = path_parts[0]
                # First path segment is the username unless it is one of the site's own pages
                if username.lower() not in ('my', 'api2', 'settings', 'search', 'notifications', 'chats', 'vault', 'lists', 'bookmarks', 'statements', 'help', 'terms', 'privacy', 'dmca', 'contact'):
                    return ('onlyfans_direct', 'onlyfans', username)
            return None

        # Handle Pornhub URLs
        if 'pornhub.com' in host:
            creator_id = _extract_pornhub_creator_id(url)
            if creator_id:
                return ('pornhub', 'pornhub', creator_id)
            return None

        # Handle XHamster URLs
        if 'xhamster' in host:
            creator_id = _extract_xhamster_creator_id(url)
            if creator_id:
                return ('xhamster', 'xhamster', creator_id)
            return None

        # Handle TikTok URLs
        if 'tiktok.com' in host:
            username = _extract_tiktok_username(url)
            if username:
                return ('tiktok', 'tiktok', username)
            return None

        # Handle Instagram URLs
        if 'instagram.com' in host:
            username = _extract_instagram_username(url)
            if username:
                return ('instagram', 'instagram', username)
            return None

        # Handle BestEyeCandy URLs
        if 'besteyecandy.com' in host:
            cid_match = re.search(r'cid-(\d+)', parsed.path)
            slug_match = re.search(r'/([^/]+)\.html$', parsed.path)
            if cid_match and slug_match:
                # creator_id is "<cid>/<slug>" when the page slug is present
                slug = slug_match.group(1)
                return ('besteyecandy', 'besteyecandy', f"{cid_match.group(1)}/{slug}")
            elif cid_match:
                return ('besteyecandy', 'besteyecandy', cid_match.group(1))
            return None

        # Handle Coppermine gallery URLs
        # Match: domain.com/gallery/, domain.com/cpg/, domain.com/coppermine/
        # Also match direct index.php/thumbnails.php/displayimage.php pages
        # NOTE(review): this test looks only at the path, so it also fires for
        # hosts handled further down (Bellazon, Reddit, ...) whose path happens
        # to contain one of these segments - confirm that is intended.
        if any(p in parsed.path.lower() for p in ['/gallery/', '/cpg/', '/coppermine/']) or \
                re.search(r'(?:index|thumbnails|displayimage)\.php', parsed.path):
            # Normalize to gallery root
            base_path = re.sub(
                r'(?:index|thumbnails|displayimage)\.php.*$', '', parsed.path
            )
            base_path = base_path.rstrip('/')
            if base_path:
                # Use domain + path as creator_id (e.g. kylie-jenner.org/gallery)
                creator_id = host.replace('www.', '') + base_path
                return ('coppermine', 'coppermine', creator_id)
            # An empty base path falls through to the remaining checks

        # Handle Bellazon URLs (forum threads as creators)
        if 'bellazon' in host:
            match = re.search(r'/topic/(\d+)-([^/]+)', parsed.path)
            if match:
                # Only the numeric topic id is used as creator_id
                topic_id = match.group(1)
                return ('bellazon', 'bellazon', topic_id)
            return None

        # Handle Reddit URLs
        if 'reddit.com' in host:
            # Handle reddit.com/r/subreddit, old.reddit.com/r/subreddit, etc.
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if len(path_parts) >= 2 and path_parts[0] == 'r':
                subreddit = path_parts[1].lower()
                return ('reddit', 'reddit', subreddit)
            return None

        # Handle Snapchat URLs
        if 'snapchat.com' in host:
            # Handle snapchat.com/@username and story.snapchat.com/@username
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if path_parts:
                username = path_parts[0].lstrip('@')
                if username:
                    return ('snapchat', 'snapchat', username)
            return None

        # Handle HQCelebCorner URLs (the search query is the creator_id)
        if 'hqcelebcorner' in host:
            query = _extract_xenforo_search_query(parsed)
            if query:
                return ('hqcelebcorner', 'hqcelebcorner', query)
            return None

        # Handle PicturePub URLs (the search query is the creator_id)
        if 'picturepub' in host:
            query = _extract_xenforo_search_query(parsed)
            if query:
                return ('picturepub', 'picturepub', query)
            return None

        # Handle Soundgasm URLs
        if 'soundgasm.net' in host:
            path_parts = [p for p in parsed.path.strip('/').split('/') if p]
            if len(path_parts) >= 2 and path_parts[0] in ('u', 'user'):
                return ('soundgasm', 'soundgasm', path_parts[1])
            return None

        # Handle Liltsome URLs (archive, maps to soundgasm platform)
        if 'liltsome.yerf.org' in host:
            # Hash-based routing: /#/artist/{name}
            fragment = parsed.fragment  # e.g. "/artist/kinkyshibby"
            if fragment:
                parts = [p for p in fragment.strip('/').split('/') if p]
                if len(parts) >= 2 and parts[0] == 'artist':
                    return ('soundgasm', 'soundgasm', parts[1])
            return None

        # Determine service (Coomer/Kemono)
        if 'coomer' in host:
            service_id = 'coomer'
        elif 'kemono' in host:
            service_id = 'kemono'
        else:
            return None

        # Parse path: /platform/user/creatorid
        path_parts = [p for p in parsed.path.strip('/').split('/') if p]

        if len(path_parts) >= 3 and path_parts[1] == 'user':
            platform = path_parts[0]
            creator_id = path_parts[2]
            return (service_id, platform, creator_id)

        return None

    except Exception:
        # Any parsing failure is reported as "not a creator URL"
        return None
|
||||
|
||||
|
||||
def _extract_youtube_channel_id(url: str) -> Optional[str]:
    """
    Pull the channel identifier out of a YouTube channel URL.

    Recognized forms, tried in this order:
    - youtube.com/channel/UC...
    - youtube.com/@handle
    - youtube.com/c/channelname
    - youtube.com/user/username

    Returns the captured identifier, or None when no form matches.
    """
    candidates = (
        r'youtube\.com/channel/([a-zA-Z0-9_-]+)',
        r'youtube\.com/@([a-zA-Z0-9_.-]+)',
        r'youtube\.com/c/([a-zA-Z0-9_-]+)',
        r'youtube\.com/user/([a-zA-Z0-9_-]+)',
    )

    # Lazily evaluate the searches so the first matching form wins.
    hits = (re.search(expr, url) for expr in candidates)
    return next((hit.group(1) for hit in hits if hit), None)
|
||||
|
||||
|
||||
def _extract_twitch_channel_name(url: str) -> Optional[str]:
    """
    Pull the channel name out of a Twitch URL.

    Recognized forms:
    - twitch.tv/username
    - twitch.tv/username/clips
    - m.twitch.tv/username/clips

    Returns the lower-cased channel name, or None when the URL does not match.
    """
    found = re.search(r'twitch\.tv/([a-zA-Z0-9_]+)(?:/clips)?', url)
    return found.group(1).lower() if found else None
|
||||
|
||||
|
||||
def _extract_fansly_username(url: str) -> Optional[str]:
    """
    Pull the username out of a Fansly URL.

    Recognized forms:
    - fansly.com/username
    - fansly.com/username/posts
    - fansly.com/username/media

    Returns the username as written in the URL, or None when the URL does not
    match or its first path segment is one of the site's own pages.
    """
    # Path segments that are site pages rather than accounts
    site_pages = ('explore', 'search', 'settings', 'notifications', 'messages', 'live')

    found = re.search(r'fansly\.com/([a-zA-Z0-9_.-]+)(?:/(?:posts|media))?', url)
    if found is None:
        return None

    candidate = found.group(1)
    return None if candidate.lower() in site_pages else candidate
|
||||
|
||||
|
||||
def _extract_pornhub_creator_id(url: str) -> Optional[str]:
    """Return the creator identifier of a Pornhub URL as 'type/name', or None if absent."""
    name_patterns = (
        r'pornhub\.com/pornstar/([a-zA-Z0-9_-]+)',
        r'pornhub\.com/channels/([a-zA-Z0-9_-]+)',
        r'pornhub\.com/users/([a-zA-Z0-9_-]+)',
        r'pornhub\.com/model/([a-zA-Z0-9_-]+)',
    )
    for expr in name_patterns:
        name_hit = re.search(expr, url)
        if name_hit is None:
            continue
        # Keep the URL's page type in front of the name so it can be rebuilt later
        kind_hit = re.search(r'pornhub\.com/(pornstar|channels|users|model)/', url)
        if kind_hit is None:
            return name_hit.group(1)
        return f"{kind_hit.group(1)}/{name_hit.group(1)}"
    return None
|
||||
|
||||
|
||||
def _extract_xhamster_creator_id(url: str) -> Optional[str]:
    """Return the creator identifier of an XHamster URL as 'type/name', or None if absent."""
    name_patterns = (
        r'xhamster\d*\.com/creators/([a-zA-Z0-9_-]+)',
        r'xhamster\d*\.com/channels/([a-zA-Z0-9_-]+)',
    )
    for expr in name_patterns:
        name_hit = re.search(expr, url)
        if name_hit is None:
            continue
        # Keep the URL's page type in front of the name
        kind_hit = re.search(r'xhamster\d*\.com/(creators|channels)/', url)
        if kind_hit is None:
            return name_hit.group(1)
        return f"{kind_hit.group(1)}/{name_hit.group(1)}"
    return None
|
||||
|
||||
|
||||
def _extract_tiktok_username(url: str) -> Optional[str]:
    """Return the @handle (without the '@') from a TikTok URL, or None if absent."""
    found = re.search(r'tiktok\.com/@([a-zA-Z0-9_.]+)', url)
    return found.group(1) if found else None
|
||||
|
||||
|
||||
def _extract_instagram_username(url: str) -> Optional[str]:
    """Return the lower-cased username from an Instagram URL, or None if absent.

    First path segments that are Instagram's own pages (explore, reels, p, ...)
    are not treated as usernames.
    """
    found = re.search(r'instagram\.com/([a-zA-Z0-9_.]+)/?', url)
    if not found:
        return None

    handle = found.group(1).lower()
    site_pages = {
        'explore', 'reels', 'stories', 'p', 'tv', 'accounts',
        'direct', 'about', 'legal', 'developer', 'privacy',
        'terms', 'help', 'api', 'reel', 'tags'
    }
    return None if handle in site_pages else handle
|
||||
|
||||
|
||||
def parse_post_url(url: str) -> Optional[Tuple[str, str, str, str]]:
    """
    Split a Coomer/Kemono post URL into its components.

    Args:
        url: URL like https://coomer.party/onlyfans/user/creatorid/post/postid

    Returns:
        Tuple of (service_id, platform, creator_id, post_id), or None when the
        host is neither service, the path has a different shape, or parsing
        raises.
    """
    try:
        parts = urlparse(url)
        hostname = parts.netloc.lower()

        # Coomer is checked before Kemono
        service_id = next((svc for svc in ('coomer', 'kemono') if svc in hostname), None)
        if service_id is None:
            return None

        # Expected path: /platform/user/creatorid/post/postid
        segments = [seg for seg in parts.path.strip('/').split('/') if seg]
        if len(segments) < 5 or segments[1] != 'user' or segments[3] != 'post':
            return None

        return (service_id, segments[0], segments[2], segments[4])
    except Exception:
        return None
|
||||
|
||||
|
||||
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string such as '1.5 MB'.

    None yields 'Unknown'. Values are scaled by 1024 per unit, up to PB.
    """
    if size_bytes is None:
        return 'Unknown'

    units = ('B', 'KB', 'MB', 'GB', 'TB')
    magnitude = size_bytes
    step = 0
    # Scale down until the value fits the current unit or the units run out
    while step < len(units) and not abs(magnitude) < 1024.0:
        magnitude /= 1024.0
        step += 1

    label = units[step] if step < len(units) else 'PB'
    return f"{magnitude:.1f} {label}"
|
||||
|
||||
|
||||
def sanitize_filename(name: str, max_length: int = 200) -> str:
    """
    Sanitize a string for use in a filename

    Characters that are invalid in filenames (``<>:"/\\|?*`` and ASCII control
    characters) are removed, whitespace runs become a single ``-``, and
    leading/trailing ``.`` and ``-`` are stripped.

    Args:
        name: String to sanitize
        max_length: Maximum length of result

    Returns:
        Sanitized filename, or 'unnamed' when nothing usable remains
    """
    if not name:
        return 'unnamed'

    # Remove/replace invalid characters
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', name)
    name = re.sub(r'\s+', '-', name.strip())
    name = name.strip('.-')

    if len(name) > max_length:
        # Cutting mid-name can expose a new trailing '.' or '-', which the
        # strip above was meant to prevent, so strip the tail again.
        name = name[:max_length].rstrip('.-')

    return name or 'unnamed'
|
||||
|
||||
|
||||
def extract_platform_from_domain(domain: str) -> Optional[str]:
    """Map a creator-platform domain to its platform name.

    The domain is lowercased and any ``www.`` is removed before an exact
    lookup; unknown domains return ``None``.
    """
    domains_by_platform = {
        'onlyfans': ('onlyfans.com',),
        'fansly': ('fansly.com',),
        'patreon': ('patreon.com',),
        'fanbox': ('fanbox.cc',),
        'gumroad': ('gumroad.com',),
        'subscribestar': ('subscribestar.com', 'subscribestar.adult'),
        'discord': ('discord.com', 'discord.gg'),
        'candfans': ('candfans.jp',),
    }

    key = domain.lower().replace('www.', '')
    for platform, domains in domains_by_platform.items():
        if key in domains:
            return platform
    return None
|
||||
|
||||
|
||||
def detect_content_type(filename: str) -> str:
    """Classify a filename by its extension.

    Returns one of 'image', 'video', 'audio', 'archive', 'document', or
    'unknown' (empty name, no extension, or unrecognised extension).
    """
    if not filename:
        return 'unknown'

    # Text after the last dot, or '' when the name has no dot at all.
    _, dot, tail = filename.rpartition('.')
    ext = tail.lower() if dot else ''

    categories = (
        ('image', {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}),
        ('video', {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv', 'mpeg', 'mpg', '3gp'}),
        ('audio', {'mp3', 'wav', 'flac', 'aac', 'm4a', 'ogg', 'wma'}),
        ('archive', {'zip', 'rar', '7z', 'tar', 'gz', 'bz2'}),
        ('document', {'pdf', 'doc', 'docx', 'txt', 'rtf', 'odt'}),
    )
    for label, extensions in categories:
        if ext in extensions:
            return label
    return 'unknown'
|
||||
|
||||
|
||||
def get_service_platforms(service_id: str) -> list:
    """List the platform names served by *service_id*.

    Aggregator/direct services map to explicit platform lists; every other
    known service maps to a single platform with the same name. Unknown
    services return an empty list.
    """
    multi_or_renamed = {
        'coomer': ['onlyfans', 'fansly', 'candfans'],
        'kemono': ['patreon', 'fanbox', 'gumroad', 'subscribestar', 'discord'],
        'fansly_direct': ['fansly'],
        'onlyfans_direct': ['onlyfans'],
    }
    self_named = {
        'youtube', 'twitch', 'pornhub', 'xhamster', 'tiktok', 'instagram',
        'soundgasm', 'bellazon', 'besteyecandy', 'snapchat', 'reddit',
        'coppermine', 'hqcelebcorner', 'picturepub',
    }

    if service_id in multi_or_renamed:
        return multi_or_renamed[service_id]
    if service_id in self_named:
        return [service_id]
    return []
|
||||
|
||||
|
||||
def get_service_base_url(service_id: str) -> Optional[str]:
    """
    Look up the fallback base URL for a service.

    Note: For dynamic URLs, use the database (paid_content_services table).
    The values returned here are fallback defaults only; a service without
    a default yields None.
    """
    # Imported lazily to avoid a circular dependency with api_client.
    from .api_client import PaidContentAPIClient

    fallback_urls = PaidContentAPIClient.DEFAULT_SERVICE_URLS
    return fallback_urls.get(service_id)
|
||||
744
modules/paid_content/xenforo_forum_client.py
Normal file
744
modules/paid_content/xenforo_forum_client.py
Normal file
@@ -0,0 +1,744 @@
|
||||
"""
|
||||
Generic XenForo Forum Client for Paid Content
|
||||
|
||||
Scrapes XenForo-based celebrity image forums (HQCelebCorner, PicturePub, etc.)
|
||||
treating each celebrity name as a "creator" and each matching thread as a post.
|
||||
|
||||
Images are hosted on external hosts (imagebam, pixhost, imagetwist, etc.)
|
||||
and resolved via ImageHostHandler from forum_downloader.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse, unquote_plus
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class XenForoForumClient(LoggingMixin):
|
||||
"""Generic client for scraping XenForo-based forum threads."""
|
||||
|
||||
FLARESOLVERR_URL = 'http://localhost:8191/v1'
|
||||
|
||||
HEADERS = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
|
||||
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
}
|
||||
|
||||
IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
|
||||
|
||||
# External image host domains to look for in post links
|
||||
IMAGE_HOST_DOMAINS = [
|
||||
'imagebam.com', 'pixhost.to', 'imagetwist.com', 'imgur.com',
|
||||
'imgbox.com', 'postimg.cc', 'postimages.org', 'catbox.moe',
|
||||
'turboimagehost.com', 'imageban.ru', 'img.yt', 'acidimg.cc',
|
||||
'pixxxels.cc', 'imx.to', 'imgbb.com', 'ibb.co',
|
||||
]
|
||||
|
||||
def __init__(self, service_id: str, base_url: str, cookie_path: str, log_callback=None):
|
||||
self.SERVICE_ID = service_id
|
||||
self.BASE_URL = base_url.rstrip('/')
|
||||
self.COOKIE_PATH = cookie_path
|
||||
self._init_logger('PaidContent', log_callback, default_module=service_id)
|
||||
self._cookies: Optional[Dict[str, str]] = None
|
||||
self._image_host_handler = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Cookie handling
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _load_cookies(self) -> Dict[str, str]:
|
||||
"""Load Playwright-format cookies and convert to {name: value} dict."""
|
||||
if self._cookies is not None:
|
||||
return self._cookies
|
||||
|
||||
try:
|
||||
cookie_path = Path(self.COOKIE_PATH)
|
||||
if cookie_path.exists():
|
||||
with open(cookie_path, 'r') as f:
|
||||
raw_cookies = json.load(f)
|
||||
self._cookies = {c['name']: c['value'] for c in raw_cookies}
|
||||
self.log(f"Loaded {len(self._cookies)} cookies from {self.COOKIE_PATH}", 'debug')
|
||||
else:
|
||||
self.log(f"Cookie file not found: {self.COOKIE_PATH}", 'warning')
|
||||
self._cookies = {}
|
||||
except Exception as e:
|
||||
self.log(f"Error loading cookies: {e}", 'warning')
|
||||
self._cookies = {}
|
||||
|
||||
return self._cookies
|
||||
|
||||
def _get_cookie_header(self) -> str:
|
||||
"""Build Cookie header string from loaded cookies."""
|
||||
cookies = self._load_cookies()
|
||||
return '; '.join(f'{k}={v}' for k, v in cookies.items())
|
||||
|
||||
def _get_request_headers(self) -> Dict[str, str]:
|
||||
"""Get headers with cookies for authenticated requests."""
|
||||
headers = dict(self.HEADERS)
|
||||
cookie_str = self._get_cookie_header()
|
||||
if cookie_str:
|
||||
headers['Cookie'] = cookie_str
|
||||
return headers
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Image host handling
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _get_image_host_handler(self):
|
||||
"""Get or create ImageHostHandler instance."""
|
||||
if self._image_host_handler is None:
|
||||
try:
|
||||
from modules.forum_downloader import ImageHostHandler
|
||||
self._image_host_handler = ImageHostHandler
|
||||
self.log("Loaded ImageHostHandler from forum_downloader", 'debug')
|
||||
except ImportError:
|
||||
self.log("ImageHostHandler not available", 'warning')
|
||||
self._image_host_handler = False # sentinel to avoid retrying
|
||||
return self._image_host_handler if self._image_host_handler is not False else None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTTP helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """GET *url* with the client's headers and cookies and return the body.

    A 200 response returns its text. A 403 response, or any exception
    raised while fetching, is retried through FlareSolverr. Any other
    status is logged and returns ``None``.
    """
    request_headers = self._get_request_headers()
    try:
        async with session.get(url, headers=request_headers, allow_redirects=True) as response:
            if response.status == 200:
                return await response.text()
            if response.status != 403:
                self.log(f"HTTP {response.status} for {url}", 'warning')
                return None
            self.log(f"Got 403 for {url}, trying FlareSolverr", 'debug')
            return await self._fetch_via_flaresolverr(url)
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
        return await self._fetch_via_flaresolverr(url)
|
||||
|
||||
async def _fetch_via_flaresolverr(self, url: str) -> Optional[str]:
    """Fetch a page using FlareSolverr to bypass Cloudflare.

    Creates a FlareSolverr session, requests *url* with the loaded cookies,
    and always tries to destroy the session afterwards.

    The HTTP round-trips use the synchronous ``requests`` library with
    timeouts of up to 70 seconds, so they run in the default executor
    rather than on the event loop thread. Log messages are emitted from the
    coroutine, not the worker thread.

    Returns:
        The solved page HTML, or ``None`` if ``requests`` is unavailable,
        FlareSolverr reports a failure, or any error occurs.
    """
    try:
        import requests as std_requests
    except ImportError:
        self.log("requests library not available for FlareSolverr", 'warning')
        return None

    endpoint = self.FLARESOLVERR_URL
    cookie_payload = [{'name': k, 'value': v} for k, v in self._load_cookies().items()]

    def _solve_blocking():
        """Run the blocking FlareSolverr exchange; return (html, warning)."""
        fs_session_id = None
        try:
            # Create session
            resp = std_requests.post(endpoint, json={
                'cmd': 'sessions.create'
            }, timeout=30)
            data = resp.json()
            if data.get('status') != 'ok':
                return None, "Failed to create FlareSolverr session"
            fs_session_id = data.get('session')

            # Fetch page
            resp = std_requests.post(endpoint, json={
                'cmd': 'request.get',
                'url': url,
                'session': fs_session_id,
                'cookies': cookie_payload,
                'maxTimeout': 60000,
            }, timeout=70)
            page_data = resp.json()
            if page_data.get('status') == 'ok':
                return page_data.get('solution', {}).get('response', ''), None
            return None, f"FlareSolverr failed for {url}: {page_data.get('message', 'unknown')}"
        except Exception as e:
            return None, f"FlareSolverr error for {url}: {e}"
        finally:
            if fs_session_id:
                try:
                    std_requests.post(endpoint, json={
                        'cmd': 'sessions.destroy',
                        'session': fs_session_id,
                    }, timeout=10)
                except Exception:
                    # Best-effort cleanup: a failed destroy must not mask
                    # the page result.
                    pass

    loop = asyncio.get_running_loop()
    page, warning = await loop.run_in_executor(None, _solve_blocking)
    if warning:
        self.log(warning, 'warning')
    return page
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def search_threads(self, query: str) -> List[Dict]:
    """Search the forum for threads whose title matches *query*.

    Fetches the search page for the ``_xfToken`` CSRF value, posts a
    title-only search ordered by date, then follows "Next" links until a
    page is missing or yields no results.

    Returns list of {thread_id, title, url, reply_count, published_at};
    empty when the search page or the search request fails.
    """
    found: List[Dict] = []
    client_timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession(timeout=client_timeout) as session:
        # XenForo search: POST form to /search/search
        post_headers = self._get_request_headers()
        post_headers['Content-Type'] = 'application/x-www-form-urlencoded'

        # Need CSRF token - fetch search page first
        landing_html = await self._fetch_page(session, f'{self.BASE_URL}/search/')
        if not landing_html:
            self.log("Failed to fetch search page", 'warning')
            return found

        token_match = re.search(r'name="_xfToken"\s+value="([^"]+)"', landing_html)
        payload = {
            'keywords': query,
            'search_type': 'post',
            'c[title_only]': '1',
            'order': 'date',
            '_xfToken': token_match.group(1) if token_match else '',
        }

        try:
            async with session.post(f'{self.BASE_URL}/search/search', headers=post_headers,
                                    data=payload, allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"Search returned HTTP {resp.status}", 'warning')
                    return found
                result_html = await resp.text()
                result_url = str(resp.url)
        except Exception as e:
            self.log(f"Search failed: {e}", 'error')
            return found

        found = self._parse_search_results(result_html)

        # Handle search result pagination
        page = 2
        next_url = self._find_next_search_page(result_html, result_url, page)
        while next_url:
            await asyncio.sleep(0.3)
            result_html = await self._fetch_page(session, next_url)
            if not result_html:
                break
            batch = self._parse_search_results(result_html)
            if not batch:
                break
            found.extend(batch)
            page += 1
            next_url = self._find_next_search_page(result_html, result_url, page)

    self.log(f"Search for '{query}' found {len(found)} threads", 'info')
    return found
|
||||
|
||||
async def get_thread_info(self, thread_url: str) -> Optional[Dict]:
    """Fetch *thread_url* and extract thread metadata from its HTML.

    Returns {thread_id, title, reply_count, page_count, url}, where ``url``
    has its fragment and trailing slash removed and ``title`` falls back to
    'Untitled'. Returns ``None`` if the page cannot be fetched or an error
    occurs (errors are logged).
    """
    client_timeout = aiohttp.ClientTimeout(total=30)
    try:
        async with aiohttp.ClientSession(timeout=client_timeout) as session:
            page_html = await self._fetch_page(session, thread_url)
            if not page_html:
                return None

            canonical_url = thread_url.split('#')[0].rstrip('/')
            return {
                'thread_id': self._extract_thread_id(thread_url),
                'title': self._extract_title(page_html) or 'Untitled',
                'reply_count': self._extract_reply_count(page_html),
                'page_count': self._extract_page_count(page_html),
                'url': canonical_url,
            }
    except Exception as e:
        self.log(f"Error getting thread info for {thread_url}: {e}", 'error')
        return None
|
||||
|
||||
async def get_thread_images(self, thread_url: str, page_count: int = None,
                            start_page: int = 1) -> List[Dict]:
    """Walk a thread's pages and collect image links, deduplicated by URL.

    When *page_count* is ``None`` the thread URL is fetched first to get
    the page count; that page's links are collected and scanning resumes
    at page 2, overriding *start_page*. Scanning stops at the first page
    that cannot be fetched.

    Returns the dicts produced by ``_extract_image_links`` in first-seen
    order.
    """
    collected: List[Dict] = []
    known_urls: Set[str] = set()

    def _absorb(markup: str) -> int:
        """Append unseen links from *markup*; return how many were new."""
        added = 0
        for entry in self._extract_image_links(markup):
            if entry['url'] in known_urls:
                continue
            known_urls.add(entry['url'])
            collected.append(entry)
            added += 1
        return added

    client_timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=client_timeout) as session:
        # If page_count not provided, fetch page 1 to determine it
        if page_count is None:
            first_html = await self._fetch_page(session, thread_url)
            if not first_html:
                return collected
            page_count = self._extract_page_count(first_html)
            _absorb(first_html)
            start_page = 2

        for page_num in range(start_page, page_count + 1):
            page_url = self._build_page_url(thread_url, page_num)
            await asyncio.sleep(0.5)  # Rate limit

            page_html = await self._fetch_page(session, page_url)
            if not page_html:
                self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                break

            new_count = _absorb(page_html)
            self.log(f"Page {page_num}/{page_count}: {new_count} new image links", 'debug')

    self.log(f"Total: {len(collected)} unique image links from {page_count} pages", 'info')
    return collected
|
||||
|
||||
async def resolve_image_url(self, host_page_url: str, session: aiohttp.ClientSession = None) -> Optional[str]:
    """Resolve an image host page URL to a direct image URL.

    Resolution order:
      1. ``ImageHostHandler.extract_direct_url`` on the URL alone;
      2. imgbox thumbnail URL rewritten to the full-size URL;
      3. fetch the host page, then the handler with page content;
      4. manual HTML extraction.

    A session created here (when *session* is ``None``) is always closed.
    Returns ``None`` on a non-200 response or a fetch error.
    """
    handler = self._get_image_host_handler()

    # Try direct extraction without fetching the page
    if handler:
        direct = handler.extract_direct_url(host_page_url)
        if direct:
            return direct

    # imgbox thumbnail → full image conversion (thumbs2 → images2)
    thumb = re.match(r'https?://thumbs(\d*)\.imgbox\.com/([a-f0-9]+/[a-f0-9]+/)(\w+)_t\.\w+', host_page_url)
    if thumb:
        server, folder, image_id = thumb.groups()
        return f"https://images{server}.imgbox.com/{folder}{image_id}_o.jpg"

    # For hosts that need page content, fetch and parse
    owns_session = session is None
    if owns_session:
        session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30))

    try:
        request_headers = dict(self.HEADERS)
        if 'imagebam' in host_page_url:
            # ImageBam requires sfw_inter=1 cookie to bypass consent page
            request_headers['Cookie'] = 'sfw_inter=1'

        try:
            async with session.get(host_page_url, headers=request_headers,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    return None
                page_content = await resp.text()
                final_url = str(resp.url)
        except Exception as e:
            self.log(f"Failed to fetch image host page {host_page_url}: {e}", 'debug')
            return None

        # Try handler with page content
        if handler:
            direct = handler.extract_direct_url(host_page_url, page_content=page_content)
            if direct:
                return direct

        # Manual extraction fallbacks
        return self._extract_direct_image_from_html(host_page_url, page_content, final_url)
    finally:
        if owns_session:
            await session.close()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTML parsing helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _parse_search_results(self, html_content: str) -> List[Dict]:
|
||||
"""Parse XenForo search results page for thread links."""
|
||||
threads = []
|
||||
|
||||
# Parse each contentRow block to extract title, URL, and date
|
||||
for block_match in re.finditer(
|
||||
r'<div\s+class="contentRow[^"]*"[^>]*>(.*?)</div>\s*</div>\s*</div>',
|
||||
html_content, re.DOTALL
|
||||
):
|
||||
block = block_match.group(1)
|
||||
|
||||
# Extract thread URL and title
|
||||
title_match = re.search(
|
||||
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
||||
block, re.DOTALL
|
||||
)
|
||||
if not title_match:
|
||||
continue
|
||||
|
||||
url = title_match.group(1)
|
||||
title_raw = title_match.group(2)
|
||||
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
||||
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
||||
|
||||
if not title:
|
||||
continue
|
||||
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + url
|
||||
|
||||
thread_id = self._extract_thread_id(url)
|
||||
if not thread_id:
|
||||
continue
|
||||
|
||||
# Extract date from <time datetime="..."> tag
|
||||
published_at = None
|
||||
time_match = re.search(r'<time[^>]+datetime="([^"]+)"', block)
|
||||
if time_match:
|
||||
published_at = time_match.group(1)
|
||||
|
||||
threads.append({
|
||||
'thread_id': thread_id,
|
||||
'title': title,
|
||||
'url': url.split('#')[0].rstrip('/'),
|
||||
'reply_count': 0,
|
||||
'published_at': published_at,
|
||||
})
|
||||
|
||||
# Fallback: if contentRow block parsing found nothing, try simpler title-only parsing
|
||||
if not threads:
|
||||
for m in re.finditer(
|
||||
r'class="contentRow-title">\s*<a\s+href="([^"]*threads/[^"]*)"[^>]*>(.*?)</a>',
|
||||
html_content, re.DOTALL
|
||||
):
|
||||
url = m.group(1)
|
||||
title_raw = m.group(2)
|
||||
title_raw = re.sub(r'<span\s+class="label[^"]*"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<span\s+class="label-append"[^>]*>.*?</span>', '', title_raw)
|
||||
title_raw = re.sub(r'<em\s+class="textHighlight"[^>]*>(.*?)</em>', r'\1', title_raw)
|
||||
title = html.unescape(re.sub(r'<[^>]+>', '', title_raw).strip())
|
||||
if not title:
|
||||
continue
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + url
|
||||
thread_id = self._extract_thread_id(url)
|
||||
if not thread_id:
|
||||
continue
|
||||
threads.append({
|
||||
'thread_id': thread_id,
|
||||
'title': title,
|
||||
'url': url.split('#')[0].rstrip('/'),
|
||||
'reply_count': 0,
|
||||
'published_at': None,
|
||||
})
|
||||
|
||||
# Deduplicate by thread_id
|
||||
seen = set()
|
||||
unique = []
|
||||
for t in threads:
|
||||
if t['thread_id'] not in seen:
|
||||
seen.add(t['thread_id'])
|
||||
unique.append(t)
|
||||
|
||||
return unique
|
||||
|
||||
def _find_next_search_page(self, html_content: str, current_url: str, page_num: int) -> Optional[str]:
|
||||
"""Find URL for the next page of search results."""
|
||||
# XenForo pagination: <a href="...page-{N}..." class="pageNav-page">
|
||||
pattern = rf'<a\s+href="([^"]*)"[^>]*class="pageNav-jump[^"]*"[^>]*>\s*Next'
|
||||
m = re.search(pattern, html_content, re.IGNORECASE)
|
||||
if m:
|
||||
url = m.group(1)
|
||||
if not url.startswith('http'):
|
||||
url = self.BASE_URL + html.unescape(url)
|
||||
return url
|
||||
return None
|
||||
|
||||
# Domains/patterns for non-content images (reaction GIFs, emojis, signatures, etc.)
|
||||
JUNK_URL_PATTERNS = [
|
||||
'giphy.com', 'tenor.com', 'gfycat.com', # reaction GIFs
|
||||
'jsdelivr.net', 'joypixels', 'twemoji', # emoji CDNs
|
||||
'wp-content/', # WordPress media (blog graphics, profile pics)
|
||||
'/unicode/', '/emoji/', # emoji paths
|
||||
'haboodadi.com', # forum signature images
|
||||
]
|
||||
|
||||
# Image hosts that are permanently dead (DNS gone / domain expired)
|
||||
DEAD_HOSTS = [
|
||||
'someimage.com',
|
||||
]
|
||||
|
||||
def _extract_image_links(self, page_html: str) -> List[Dict]:
|
||||
"""Extract image host links from all posts on a page."""
|
||||
images = []
|
||||
|
||||
# Find all message bodies: XenForo uses <article class="message ..."> and
|
||||
# <div class="bbWrapper"> for post content
|
||||
for content_match in re.finditer(
|
||||
r'<div\s+class="bbWrapper">(.*?)</div>\s*(?:</div>|<div\s+class="(?:js-post|message))',
|
||||
page_html, re.DOTALL
|
||||
):
|
||||
content = content_match.group(1)
|
||||
|
||||
# Extract links to known image hosts
|
||||
for link_match in re.finditer(r'<a\s+[^>]*href="([^"]+)"[^>]*>', content):
|
||||
link_url = html.unescape(link_match.group(1))
|
||||
if self._is_image_host_url(link_url) and not self._is_junk_url(link_url):
|
||||
images.append({'url': link_url, 'host': self._identify_host(link_url)})
|
||||
|
||||
# Also catch direct image URLs (full-size, not thumbnails)
|
||||
# NOTE: Skip images hosted on known image host CDNs (imgbox, imgur, etc.)
|
||||
# — legitimate gallery images are posted as <a href> links to host pages
|
||||
# (handled above), while inline <img> from these hosts are signatures.
|
||||
for img_match in re.finditer(r'<img\s+[^>]*src="([^"]+)"[^>]*>', content):
|
||||
img_url = html.unescape(img_match.group(1))
|
||||
# Skip thumbnails, avatars, smilies, and junk
|
||||
if any(skip in img_url.lower() for skip in [
|
||||
'thumb', 'avatar', 'smili', 'emoji', 'icon', 'logo',
|
||||
'data/assets', '/styles/', 'xenforo'
|
||||
]):
|
||||
continue
|
||||
if self._is_junk_url(img_url):
|
||||
continue
|
||||
# Skip inline images from known image hosts — these are signatures,
|
||||
# not gallery content (gallery images come through as <a> links above)
|
||||
if self._is_image_host_url(img_url):
|
||||
continue
|
||||
if self._is_direct_image_url(img_url):
|
||||
images.append({'url': img_url, 'host': 'direct'})
|
||||
|
||||
return images
|
||||
|
||||
def _is_junk_url(self, url: str) -> bool:
|
||||
"""Filter out non-content images: reaction GIFs, emojis, blog graphics, dead hosts, etc."""
|
||||
url_lower = url.lower()
|
||||
if any(pat in url_lower for pat in self.JUNK_URL_PATTERNS):
|
||||
return True
|
||||
if any(host in url_lower for host in self.DEAD_HOSTS):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_image_host_url(self, url: str) -> bool:
|
||||
"""Check if a URL belongs to a known image hosting service."""
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower()
|
||||
return any(host in domain for host in self.IMAGE_HOST_DOMAINS)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _is_direct_image_url(self, url: str) -> bool:
|
||||
"""Check if a URL points directly to an image file."""
|
||||
try:
|
||||
path = urlparse(url).path.lower()
|
||||
return any(path.endswith(f'.{ext}') for ext in self.IMAGE_EXTS)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _identify_host(self, url: str) -> str:
|
||||
"""Identify which image host a URL belongs to."""
|
||||
handler = self._get_image_host_handler()
|
||||
if handler:
|
||||
host = handler.identify_host(url)
|
||||
if host:
|
||||
return host
|
||||
# Fallback
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower()
|
||||
for host_domain in self.IMAGE_HOST_DOMAINS:
|
||||
if host_domain in domain:
|
||||
return host_domain.split('.')[0]
|
||||
except Exception:
|
||||
pass
|
||||
return 'unknown'
|
||||
|
||||
def _extract_direct_image_from_html(self, url: str, page_content: str, final_url: str) -> Optional[str]:
|
||||
"""Manually extract direct image URL from host page HTML."""
|
||||
domain = urlparse(url).netloc.lower()
|
||||
|
||||
# imagebam: <img class="main-image ..." src="..."> (class may have extra classes)
|
||||
if 'imagebam' in domain:
|
||||
m = re.search(r'<img\s+[^>]*src="(https?://images\d*\.imagebam\.com/[^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
m = re.search(r'<img\s+[^>]*class="main-image[^"]*"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
# Alternative: og:image meta tag
|
||||
m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# pixhost: <img id="image" src="..."> or img.pixhost.to URL
|
||||
if 'pixhost' in domain:
|
||||
m = re.search(r'<img\s+[^>]*id="image"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
# Convert thumbnail URL to full: t{N}.pixhost.to/thumbs/ -> img{N}.pixhost.to/images/
|
||||
m = re.search(r'https?://t(\d+)\.pixhost\.to/thumbs/(\d+)/(.+)', url)
|
||||
if m:
|
||||
return f"https://img{m.group(1)}.pixhost.to/images/{m.group(2)}/{m.group(3)}"
|
||||
|
||||
# imagetwist: <img class="pic" src="...">
|
||||
if 'imagetwist' in domain:
|
||||
m = re.search(r'<img\s+[^>]*class="pic"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
m = re.search(r'<p\s+[^>]*style="text-align:center"[^>]*>\s*<img\s+[^>]*src="([^"]+)"',
|
||||
page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# imgbox: <img id="img" src="..."> or src before id
|
||||
if 'imgbox' in domain:
|
||||
m = re.search(r'<img\s+[^>]*id="img"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
m = re.search(r'<img\s+[^>]*src="([^"]+)"[^>]*id="img"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
# Direct image URL pattern
|
||||
m = re.search(r'(https?://images\d*\.imgbox\.com/[^\s"<>]+)', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# turboimagehost: <img class="uImage" src="...">
|
||||
if 'turboimagehost' in domain:
|
||||
m = re.search(r'<img\s+[^>]*class="uImage"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# acidimg: <img class="centred" src="...">
|
||||
if 'acidimg' in domain:
|
||||
m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# pixxxels: same pattern as acidimg
|
||||
if 'pixxxels' in domain:
|
||||
m = re.search(r'<img\s+[^>]*class="centred"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# imx.to: <img class="image-show" src="...">
|
||||
if 'imx.to' in domain:
|
||||
m = re.search(r'<img\s+[^>]*class="image-show"[^>]*src="([^"]+)"', page_content)
|
||||
if m:
|
||||
return html.unescape(m.group(1))
|
||||
|
||||
# Generic: try og:image meta tag
|
||||
m = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_content)
|
||||
if m:
|
||||
img_url = html.unescape(m.group(1))
|
||||
if self._is_direct_image_url(img_url):
|
||||
return img_url
|
||||
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Utility helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def _extract_title(page_html: str) -> Optional[str]:
|
||||
"""Extract thread title from XenForo <h1 class="p-title-value">."""
|
||||
m = re.search(r'<h1\s+class="p-title-value"[^>]*>(.*?)</h1>', page_html, re.DOTALL)
|
||||
if m:
|
||||
# Remove inner tags (like <span> for prefixes/labels, viewer count spans)
|
||||
title = re.sub(r'<[^>]+>', '', m.group(1))
|
||||
# Clean up non-breaking spaces and extra whitespace
|
||||
title = title.replace('\xa0', ' ')
|
||||
title = re.sub(r'\s*\(\d+\s*Viewer[s]?\)', '', title) # Remove "(1 Viewer)"
|
||||
title = re.sub(r'\s+', ' ', title).strip()
|
||||
return html.unescape(title)
|
||||
# Fallback: <title> — strip common XenForo site name suffixes
|
||||
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
title = html.unescape(m.group(1).strip())
|
||||
title = re.sub(r'\s*[-–—|]\s*(?:HQCelebCorner|PicturePub|XenForo).*$', '', title, flags=re.IGNORECASE).strip()
|
||||
return title
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_page_count(page_html: str) -> int:
|
||||
"""Extract total page count from XenForo pagination."""
|
||||
# <li class="pageNav-page"><a href="...">42</a></li>
|
||||
pages = re.findall(r'<li\s+class="pageNav-page[^"]*">\s*<a[^>]*>(\d+)</a>', page_html)
|
||||
if pages:
|
||||
return max(int(p) for p in pages)
|
||||
return 1
|
||||
|
||||
@staticmethod
|
||||
def _extract_reply_count(page_html: str) -> int:
|
||||
"""Extract reply count from XenForo thread info."""
|
||||
# <dl class="pairs pairs--inline"><dt>Replies</dt><dd>123</dd></dl>
|
||||
m = re.search(r'<dt>Replies</dt>\s*<dd>([\d,]+)</dd>', page_html)
|
||||
if m:
|
||||
return int(m.group(1).replace(',', ''))
|
||||
return 0
|
||||
|
||||
@staticmethod
|
||||
def _extract_thread_id(url: str) -> Optional[str]:
|
||||
"""Extract thread ID from XenForo URL.
|
||||
|
||||
Handles both formats:
|
||||
- /threads/title.12345/
|
||||
- /index.php?threads/title.12345/
|
||||
"""
|
||||
m = re.search(r'threads/[^/]*?\.(\d+)', url)
|
||||
if m:
|
||||
return m.group(1)
|
||||
# Fallback: just /threads/{id}/
|
||||
m = re.search(r'threads/(\d+)', url)
|
||||
if m:
|
||||
return m.group(1)
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _build_page_url(thread_url: str, page_num: int) -> str:
|
||||
"""Build paginated thread URL for XenForo.
|
||||
|
||||
Handles: /index.php?threads/slug.12345/page-2
|
||||
"""
|
||||
# Remove existing page- suffix and fragment
|
||||
base = thread_url.split('#')[0].rstrip('/')
|
||||
base = re.sub(r'/page-\d+$', '', base)
|
||||
if page_num == 1:
|
||||
return base + '/'
|
||||
return f'{base}/page-{page_num}'
|
||||
|
||||
@staticmethod
|
||||
def _get_extension(filename_or_url: str) -> str:
    """Return the lowercase file extension of a filename or URL.

    Query string and fragment are ignored, and only the last path segment is
    inspected, so dots in host names or directories do not count.

    Args:
        filename_or_url: Bare filename or full URL.

    Returns:
        The text after the final dot of the last path segment, lowercased
        (without the dot), or an empty string if that segment has no dot.
    """
    stripped = filename_or_url.partition('?')[0].partition('#')[0]
    last_segment = stripped.rpartition('/')[2]
    _, dot, extension = last_segment.rpartition('.')
    return extension.lower() if dot else ''
|
||||
|
||||
@staticmethod
|
||||
def _filename_from_url(url: str) -> str:
    """Return the last path segment of ``url`` as a filename.

    Only the URL path is considered (query and fragment are dropped by
    ``urlparse``); trailing slashes are ignored.

    Args:
        url: URL to derive a filename from.

    Returns:
        The final path segment, or ``'unnamed.jpg'`` when the path is empty.
    """
    trimmed_path = urlparse(url).path.rstrip('/')
    last_segment = trimmed_path.rsplit('/', 1)[-1]
    return last_segment or 'unnamed.jpg'
|
||||
1414
modules/paid_content/xhamster_client.py
Normal file
1414
modules/paid_content/xhamster_client.py
Normal file
File diff suppressed because it is too large
Load Diff
1087
modules/paid_content/youtube_client.py
Normal file
1087
modules/paid_content/youtube_client.py
Normal file
File diff suppressed because it is too large
Load Diff
1025
modules/pg_adapter.py
Normal file
1025
modules/pg_adapter.py
Normal file
File diff suppressed because it is too large
Load Diff
690
modules/plex_client.py
Normal file
690
modules/plex_client.py
Normal file
@@ -0,0 +1,690 @@
|
||||
"""Plex Media Server client for linking appearances to library items"""
|
||||
import asyncio
|
||||
import uuid
|
||||
from typing import Dict, List, Optional, Any
|
||||
from web.backend.core.http_client import http_client
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('Plex')
|
||||
|
||||
# Plex API constants
|
||||
PLEX_TV_API = "https://plex.tv/api/v2"
|
||||
PLEX_AUTH_URL = "https://app.plex.tv/auth"
|
||||
CLIENT_IDENTIFIER = "media-downloader-appearances"
|
||||
PRODUCT_NAME = "Media Downloader"
|
||||
|
||||
|
||||
class PlexOAuth:
    """Handle the Plex PIN-based OAuth authentication flow.

    Typical usage: ``create_pin()`` to obtain a PIN and the URL the user must
    visit, ``wait_for_auth()`` (or repeated ``check_pin()``) to obtain the
    auth token, then ``get_user_info()`` / ``get_user_servers()`` with it.
    All network methods log failures and return an empty result rather than
    raising.
    """

    def __init__(self):
        # Headers sent with every plex.tv request to identify this client.
        self._headers = {
            'Accept': 'application/json',
            'X-Plex-Client-Identifier': CLIENT_IDENTIFIER,
            'X-Plex-Product': PRODUCT_NAME,
            'X-Plex-Version': '1.0.0',
            'X-Plex-Device': 'Web',
            'X-Plex-Platform': 'Web',
        }

    async def create_pin(self) -> Optional[Dict]:
        """
        Create a new PIN for authentication.

        Returns:
            Dict with 'id', 'code', 'auth_url' and 'expires_at', or None on
            failure (request error or a response without id/code).
        """
        from urllib.parse import quote

        try:
            url = f"{PLEX_TV_API}/pins"
            response = await http_client.post(
                url,
                headers=self._headers,
                data={'strong': 'true'}
            )
            data = response.json()

            pin_id = data.get('id')
            pin_code = data.get('code')
            if not (pin_id and pin_code):
                return None

            # Build the auth URL for the user to visit. Values are
            # percent-encoded so unusual characters cannot break the query.
            auth_url = (
                f"{PLEX_AUTH_URL}#?"
                f"clientID={quote(CLIENT_IDENTIFIER, safe='')}&"
                f"code={quote(str(pin_code), safe='')}&"
                f"context%5Bdevice%5D%5Bproduct%5D={quote(PRODUCT_NAME, safe='')}"
            )

            logger.info(f"Created Plex PIN {pin_id}")
            return {
                'id': pin_id,
                'code': pin_code,
                'auth_url': auth_url,
                'expires_at': data.get('expiresAt'),
            }

        except Exception as e:
            logger.error(f"Failed to create Plex PIN: {e}")
            return None

    async def check_pin(self, pin_id: int) -> Optional[str]:
        """
        Check if the user has authenticated with the PIN.

        Args:
            pin_id: The PIN ID returned from create_pin

        Returns:
            The auth token if authenticated, None if still pending, expired,
            or the request failed
        """
        try:
            url = f"{PLEX_TV_API}/pins/{pin_id}"
            response = await http_client.get(url, headers=self._headers)
            data = response.json()

            auth_token = data.get('authToken')
            if auth_token:
                logger.info("Plex authentication successful")
                return auth_token

            return None

        except Exception as e:
            logger.error(f"Failed to check Plex PIN: {e}")
            return None

    async def wait_for_auth(self, pin_id: int, timeout: int = 120, poll_interval: int = 2) -> Optional[str]:
        """
        Poll for authentication completion.

        Args:
            pin_id: The PIN ID to check
            timeout: Maximum seconds to wait
            poll_interval: Seconds between checks

        Returns:
            The auth token if successful, None on timeout/failure
        """
        # Use a monotonic deadline rather than summing sleep intervals: the
        # time spent inside check_pin() counts towards the timeout, and a
        # poll_interval of 0 can no longer loop forever.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout

        while loop.time() < deadline:
            token = await self.check_pin(pin_id)
            if token:
                return token

            # Never sleep past the deadline.
            remaining = deadline - loop.time()
            await asyncio.sleep(max(0, min(poll_interval, remaining)))

        logger.warning(f"Plex authentication timed out after {timeout}s")
        return None

    async def get_user_info(self, token: str) -> Optional[Dict]:
        """
        Get information about the authenticated user.

        Args:
            token: Plex auth token

        Returns:
            Dict with 'username', 'email', 'thumb' and 'title', or None if
            the request failed
        """
        try:
            url = f"{PLEX_TV_API}/user"
            headers = {**self._headers, 'X-Plex-Token': token}
            response = await http_client.get(url, headers=headers)
            data = response.json()

            return {
                'username': data.get('username'),
                'email': data.get('email'),
                'thumb': data.get('thumb'),
                'title': data.get('title'),
            }

        except Exception as e:
            logger.error(f"Failed to get Plex user info: {e}")
            return None

    @staticmethod
    def _provides_server(resource: Dict) -> bool:
        """Return True if the resource's 'provides' list includes 'server'.

        'provides' is treated as a comma-separated list, so values such as
        "server,player" are recognised as servers too.
        """
        provided = (resource.get('provides') or '').split(',')
        return 'server' in (entry.strip() for entry in provided)

    @staticmethod
    def _select_connection(connections: List[Dict]) -> Optional[Dict]:
        """Pick the preferred connection for server-to-server communication.

        Preference order: relay, https remote, any remote, local, first entry.
        Local connections often use internal IPs that aren't reachable
        externally, which is why remote ones are tried first.
        """
        def first(predicate):
            return next((conn for conn in connections if predicate(conn)), None)

        return (
            first(lambda c: not c.get('local') and c.get('relay'))
            # `or ''` guards against an explicit null uri in the payload.
            or first(lambda c: not c.get('local') and 'https' in (c.get('uri') or ''))
            or first(lambda c: not c.get('local'))
            or first(lambda c: c.get('local'))
            or (connections[0] if connections else None)
        )

    async def get_user_servers(self, token: str) -> List[Dict]:
        """
        Get list of Plex servers available to the user.

        Args:
            token: Plex auth token

        Returns:
            List of server dictionaries ('name', 'machineIdentifier',
            'owned', 'url', 'local', 'relay', 'accessToken',
            'all_connections'); empty list on failure
        """
        try:
            url = f"{PLEX_TV_API}/resources"
            headers = {**self._headers, 'X-Plex-Token': token}
            params = {'includeHttps': 1, 'includeRelay': 1}
            response = await http_client.get(url, headers=headers, params=params)
            data = response.json()

            servers = []
            for resource in data:
                if not self._provides_server(resource):
                    continue

                connections = resource.get('connections') or []
                best_conn = self._select_connection(connections)
                if not best_conn:
                    continue

                # Also include all connection URLs for debugging/manual selection
                all_urls = [
                    {
                        'url': c.get('uri'),
                        'local': c.get('local', False),
                        'relay': c.get('relay', False),
                    }
                    for c in connections
                ]

                servers.append({
                    'name': resource.get('name'),
                    'machineIdentifier': resource.get('clientIdentifier'),
                    'owned': resource.get('owned', False),
                    'url': best_conn.get('uri'),
                    'local': best_conn.get('local', False),
                    'relay': best_conn.get('relay', False),
                    'accessToken': resource.get('accessToken'),
                    'all_connections': all_urls,
                })

            return servers

        except Exception as e:
            logger.error(f"Failed to get Plex servers: {e}")
            return []
|
||||
|
||||
|
||||
class PlexClient:
    """Client for interacting with the Plex Media Server API.

    Every network method catches its own exceptions, logs them, and returns
    an empty value (None / [] / {} / False) instead of raising.
    """

    def __init__(self, base_url: str, token: str):
        """
        Initialize Plex client.

        Args:
            base_url: Plex server URL (e.g., 'http://192.168.1.100:32400')
            token: Plex authentication token
        """
        self.base_url = base_url.rstrip('/')
        self.token = token
        self._headers = {
            'X-Plex-Token': token,
            'Accept': 'application/json'
        }

    @staticmethod
    def _media_type_code(media_type: str) -> int:
        """Map 'movie' to Plex type 1; anything else is treated as a show (2)."""
        return 1 if media_type == 'movie' else 2

    @staticmethod
    def _summarize_item(item: Dict) -> Dict:
        """Reduce a Plex metadata entry to the fields callers rely on."""
        return {
            'ratingKey': item.get('ratingKey'),
            'title': item.get('title'),
            'year': item.get('year'),
            'thumb': item.get('thumb'),
            'type': item.get('type'),
            'librarySectionID': item.get('librarySectionID'),
        }

    @staticmethod
    def _find_character_name(roles: List[Dict], actor_name_lower: str) -> Optional[str]:
        """Return the character played by the actor, or None if not listed.

        A role matches when the lowercased actor name and the role's 'tag'
        contain one another. Roles with an empty/missing tag are skipped:
        an empty string is a substring of every name and would otherwise
        match the first such role regardless of the actor.
        """
        for role in roles:
            role_tag = (role.get('tag') or '').lower()
            if not role_tag:
                continue
            if actor_name_lower in role_tag or role_tag in actor_name_lower:
                return role.get('role')
        return None

    async def test_connection(self) -> bool:
        """
        Test connection to Plex server.

        Returns:
            True if connection successful, False otherwise
        """
        try:
            url = f"{self.base_url}/identity"
            response = await http_client.get(url, headers=self._headers)
            data = response.json()
            server_name = data.get('MediaContainer', {}).get('friendlyName', 'Unknown')
            logger.info(f"Connected to Plex server: {server_name}")
            return True
        except Exception as e:
            logger.error(f"Plex connection test failed: {e}")
            return False

    async def get_libraries(self) -> List[Dict]:
        """
        Get list of Plex libraries.

        Returns:
            List of library dictionaries with id, title, type, uuid
        """
        try:
            url = f"{self.base_url}/library/sections"
            response = await http_client.get(url, headers=self._headers)
            data = response.json()

            return [
                {
                    'id': section.get('key'),
                    'title': section.get('title'),
                    'type': section.get('type'),  # 'movie', 'show', 'artist', etc.
                    'uuid': section.get('uuid'),
                }
                for section in data.get('MediaContainer', {}).get('Directory', [])
            ]
        except Exception as e:
            logger.error(f"Failed to get Plex libraries: {e}")
            return []

    async def search_by_tmdb_id(self, tmdb_id: int, media_type: str = 'movie') -> Optional[Dict]:
        """
        Search for an item in Plex library by TMDB ID.

        Args:
            tmdb_id: The Movie Database ID
            media_type: 'movie' or 'show'

        Returns:
            Plex item dict with ratingKey, title, etc. or None if not found
        """
        try:
            # Plex uses guid format like: tmdb://12345
            guid = f"tmdb://{tmdb_id}"

            # Search across all libraries
            url = f"{self.base_url}/library/all"
            params = {
                'guid': guid,
                'type': self._media_type_code(media_type),  # 1=movie, 2=show
            }
            response = await http_client.get(url, headers=self._headers, params=params)
            data = response.json()

            items = data.get('MediaContainer', {}).get('Metadata', [])
            if items:
                return self._summarize_item(items[0])
            return None
        except Exception as e:
            logger.debug(f"TMDB search failed for {tmdb_id}: {e}")
            return None

    async def search_by_title(self, title: str, year: Optional[int] = None,
                              media_type: str = 'movie') -> Optional[Dict]:
        """
        Search for an item in Plex library by title.

        Args:
            title: Movie or show title
            year: Optional release year for more accurate matching
            media_type: 'movie' or 'show'

        Returns:
            Plex item dict or None if not found. When ``year`` is given the
            first result with that year wins; otherwise (or if no year
            matches) the first result is returned.
        """
        try:
            url = f"{self.base_url}/search"
            params = {
                'query': title,
                'type': self._media_type_code(media_type),
            }
            response = await http_client.get(url, headers=self._headers, params=params)
            data = response.json()

            items = data.get('MediaContainer', {}).get('Metadata', [])
            if not items:
                return None

            # If year provided, prefer the result with a matching year
            if year:
                for item in items:
                    if item.get('year') == year:
                        return self._summarize_item(item)

            # Return first result if no exact year match
            return self._summarize_item(items[0])
        except Exception as e:
            logger.debug(f"Title search failed for '{title}': {e}")
            return None

    async def get_episode(self, show_rating_key: str, season: int, episode: int) -> Optional[Dict]:
        """
        Get a specific episode from a TV show.

        Args:
            show_rating_key: Plex ratingKey for the show
            season: Season number
            episode: Episode number

        Returns:
            Episode dict with ratingKey, title, etc. or None if not found
        """
        try:
            # Get all episodes of the show
            url = f"{self.base_url}/library/metadata/{show_rating_key}/allLeaves"
            response = await http_client.get(url, headers=self._headers)
            data = response.json()

            episodes = data.get('MediaContainer', {}).get('Metadata', [])
            for ep in episodes:
                if ep.get('parentIndex') == season and ep.get('index') == episode:
                    return {
                        'ratingKey': ep.get('ratingKey'),
                        'title': ep.get('title'),
                        'season': season,
                        'episode': episode,
                        'show_rating_key': show_rating_key,
                        'type': 'episode',
                    }
            return None
        except Exception as e:
            logger.debug(f"Episode search failed for S{season}E{episode}: {e}")
            return None

    async def get_all_episodes(self, show_rating_key: str) -> Dict[tuple, Dict]:
        """
        Get all episodes for a TV show, indexed by (season, episode) tuple.

        Args:
            show_rating_key: Plex ratingKey for the show

        Returns:
            Dict mapping (season_num, episode_num) to episode info; empty
            dict on failure
        """
        episodes_map = {}
        try:
            url = f"{self.base_url}/library/metadata/{show_rating_key}/allLeaves"
            response = await http_client.get(url, headers=self._headers)
            data = response.json()

            episodes = data.get('MediaContainer', {}).get('Metadata', [])
            for ep in episodes:
                season = ep.get('parentIndex')
                episode = ep.get('index')
                if season is not None and episode is not None:
                    episodes_map[(season, episode)] = {
                        'ratingKey': ep.get('ratingKey'),
                        'title': ep.get('title'),
                        'season': season,
                        'episode': episode,
                        'show_rating_key': show_rating_key,
                        'air_date': ep.get('originallyAvailableAt'),
                    }

            logger.debug(f"Found {len(episodes_map)} episodes for show {show_rating_key}")
            return episodes_map

        except Exception as e:
            logger.debug(f"Failed to get episodes for show {show_rating_key}: {e}")
            return {}

    def get_watch_url(self, rating_key: str) -> str:
        """
        Generate a direct watch URL for a Plex item.

        The server segment is the fixed placeholder '1'; use
        get_full_watch_url() when the machine identifier is known.

        Args:
            rating_key: Plex ratingKey for the item

        Returns:
            URL to open the item in Plex Web
        """
        # Plex Web URL format: /web/index.html#!/server/{machineId}/details?key=/library/metadata/{ratingKey}
        return self.get_full_watch_url(rating_key, '1')

    async def get_server_identity(self) -> Optional[Dict]:
        """
        Get Plex server identity including machine identifier.

        Returns:
            Dict with 'machineIdentifier', 'friendlyName' and 'version', or
            None if the request failed
        """
        try:
            url = f"{self.base_url}/identity"
            response = await http_client.get(url, headers=self._headers)
            data = response.json()
            container = data.get('MediaContainer', {})
            return {
                'machineIdentifier': container.get('machineIdentifier'),
                'friendlyName': container.get('friendlyName'),
                'version': container.get('version'),
            }
        except Exception as e:
            logger.error(f"Failed to get server identity: {e}")
            return None

    def get_full_watch_url(self, rating_key: str, machine_id: str) -> str:
        """
        Generate a complete Plex watch URL with machine identifier.

        Args:
            rating_key: Plex ratingKey for the item
            machine_id: Plex server machine identifier

        Returns:
            Complete Plex Web URL
        """
        # The metadata path is embedded pre-encoded ('/' -> %2F).
        encoded_key = f"%2Flibrary%2Fmetadata%2F{rating_key}"
        return f"{self.base_url}/web/index.html#!/server/{machine_id}/details?key={encoded_key}"

    async def search_by_actor(self, actor_name: str) -> List[Dict]:
        """
        Search Plex library for all movies and TV shows featuring an actor.

        Uses Plex's actor filter to find all content with the actor in cast,
        then fetches each item's metadata to resolve the character name.

        Args:
            actor_name: Name of the actor to search for

        Returns:
            List of appearances with show/movie info and role details. An
            error inside one library is logged and that library is skipped.
        """
        appearances = []
        seen_keys = set()  # Track to avoid duplicates
        actor_name_lower = actor_name.lower()

        try:
            # Get all libraries
            libraries = await self.get_libraries()

            for library in libraries:
                lib_key = library.get('id')
                lib_type = library.get('type')

                # Only search movie and show libraries
                if lib_type not in ('movie', 'show'):
                    continue

                try:
                    # Use actor filter to find all content featuring this actor
                    url = f"{self.base_url}/library/sections/{lib_key}/all"
                    params = {
                        'type': self._media_type_code(lib_type),  # 1=movie, 2=show
                        'actor': actor_name,  # Plex accepts actor name directly
                    }

                    response = await http_client.get(url, headers=self._headers, params=params)
                    data = response.json()
                    items = data.get('MediaContainer', {}).get('Metadata', [])

                    logger.debug(f"Found {len(items)} {lib_type}s for '{actor_name}' in library {library.get('title')}")

                    for item in items:
                        rating_key = item.get('ratingKey')
                        if not rating_key or rating_key in seen_keys:
                            continue

                        seen_keys.add(rating_key)

                        # Get detailed metadata for character name
                        detail_url = f"{self.base_url}/library/metadata/{rating_key}"
                        detail_response = await http_client.get(detail_url, headers=self._headers)
                        detail_data = detail_response.json()
                        detail_items = detail_data.get('MediaContainer', {}).get('Metadata', [])

                        if not detail_items:
                            continue

                        detail = detail_items[0]

                        # Find the actor's role/character name
                        character_name = self._find_character_name(
                            detail.get('Role', []), actor_name_lower
                        )

                        # Build poster URL with auth token.
                        # NOTE(review): this embeds the Plex token in a URL that
                        # is returned to callers - confirm that is acceptable
                        # wherever poster_url gets stored or displayed.
                        thumb = detail.get('thumb')
                        poster_url = None
                        if thumb:
                            poster_url = f"{self.base_url}{thumb}?X-Plex-Token={self.token}"

                        # Build appearance data
                        appearance = {
                            'appearance_type': 'Movie' if lib_type == 'movie' else 'TV',
                            'show_name': detail.get('title'),
                            'episode_title': None,
                            'network': detail.get('studio'),
                            'appearance_date': detail.get('originallyAvailableAt'),
                            'year': detail.get('year'),
                            'status': 'aired',
                            'description': detail.get('summary'),
                            'poster_url': poster_url,
                            'credit_type': 'acting',
                            'character_name': character_name,
                            'plex_rating_key': rating_key,
                            'plex_library_id': lib_key,
                            'source': 'plex',
                        }

                        # For TV shows, get episode count
                        if lib_type == 'show':
                            appearance['episode_count'] = detail.get('leafCount', 1)

                        appearances.append(appearance)
                        logger.info(f"Found Plex appearance: {actor_name} in '{detail.get('title')}'" +
                                    (f" as {character_name}" if character_name else ""))

                        # Small delay between detail requests
                        await asyncio.sleep(0.02)

                except Exception as e:
                    logger.debug(f"Error searching library {lib_key}: {e}")
                    continue

            logger.info(f"Found {len(appearances)} Plex appearances for {actor_name}")
            return appearances

        except Exception as e:
            logger.error(f"Failed to search Plex by actor: {e}")
            return []

    async def batch_match_appearances(self, appearances: List[Dict], on_match=None) -> Dict[int, Dict]:
        """
        Match multiple appearances to Plex library items.

        Args:
            appearances: List of appearance dicts with tmdb_show_id or tmdb_movie_id
            on_match: Optional async callback(appearance_id, match_info) called for each match

        Returns:
            Dict mapping appearance ID to Plex match info with keys
            'plex_rating_key', 'plex_show_rating_key', 'plex_library_id'
            and 'plex_watch_url'
        """
        matches = {}
        server_info = await self.get_server_identity()
        machine_id = server_info.get('machineIdentifier') if server_info else None

        # Dedupe by TMDB ID to avoid redundant searches
        tmdb_cache: Dict[tuple, Optional[Dict]] = {}
        # One (season, episode) -> episode map per show, fetched at most once
        episode_cache: Dict[str, Dict[tuple, Dict]] = {}

        for appearance in appearances:
            appearance_id = appearance.get('id')
            if not appearance_id:
                continue

            # Determine media type and TMDB ID
            tmdb_id = appearance.get('tmdb_movie_id') or appearance.get('tmdb_show_id')
            if not tmdb_id:
                continue

            is_movie = appearance.get('appearance_type') == 'Movie'
            media_type = 'movie' if is_movie else 'show'
            cache_key = (tmdb_id, media_type)

            # Check cache first
            if cache_key in tmdb_cache:
                plex_item = tmdb_cache[cache_key]
            else:
                # Rate limiting
                await asyncio.sleep(0.1)

                # Try TMDB ID first
                plex_item = await self.search_by_tmdb_id(tmdb_id, media_type)

                # Fall back to title search if no TMDB match
                if not plex_item:
                    title = appearance.get('movie_name') or appearance.get('show_name')
                    year = None
                    if appearance.get('release_date'):
                        try:
                            year = int(appearance['release_date'][:4])
                        except (ValueError, TypeError):
                            pass
                    if title:
                        plex_item = await self.search_by_title(title, year, media_type)

                tmdb_cache[cache_key] = plex_item

            if not plex_item:
                continue

            show_rating_key = plex_item.get('ratingKey')  # Always the show/movie key
            rating_key = show_rating_key if is_movie else None  # Movies get the key, TV starts with None
            library_id = plex_item.get('librarySectionID')

            # For TV shows with season/episode data, try to match the specific
            # episode. Compare against None so season 0 (specials) is handled.
            season = appearance.get('season_number')
            episode = appearance.get('episode_number')
            if not is_movie and season is not None and episode is not None:
                show_key = str(show_rating_key)
                if show_key not in episode_cache:
                    # One request per show instead of one per episode.
                    episode_cache[show_key] = await self.get_all_episodes(show_rating_key)

                episode_item = episode_cache[show_key].get((season, episode))
                if episode_item:
                    rating_key = episode_item.get('ratingKey')  # Episode-specific key
                # If episode not found, rating_key stays None - episode not in Plex

            if not rating_key:
                watch_url = None
            elif machine_id:
                watch_url = self.get_full_watch_url(rating_key, machine_id)
            else:
                watch_url = self.get_watch_url(rating_key)

            match_info = {
                'plex_rating_key': rating_key,  # Episode key if found, movie key for movies, None for missing TV episodes
                'plex_show_rating_key': show_rating_key if not is_movie else None,  # Show key for TV (for series-level navigation)
                'plex_library_id': library_id,
                'plex_watch_url': watch_url,
            }
            matches[appearance_id] = match_info

            # Call the on_match callback for real-time updates
            if on_match:
                await on_match(appearance_id, match_info)

        logger.info(f"Matched {len(matches)} of {len(appearances)} appearances to Plex library")
        return matches
|
||||
445
modules/podchaser_client.py
Normal file
445
modules/podchaser_client.py
Normal file
@@ -0,0 +1,445 @@
|
||||
"""Podchaser GraphQL API client for podcast guest appearances tracking"""
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Optional
|
||||
from web.backend.core.http_client import http_client
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('Podchaser')
|
||||
|
||||
class PodchaserClient:
|
||||
"""Client for interacting with the Podchaser GraphQL API"""
|
||||
|
||||
API_URL = "https://api.podchaser.com/graphql"
|
||||
|
||||
def __init__(self, api_key: str):
    """Store the token and prepare the request headers.

    Args:
        api_key: Despite the name, this is the access token (already
            exchanged from client credentials); it is sent as a Bearer token.
    """
    self.api_key = api_key

    request_headers = {}
    request_headers["Authorization"] = f"Bearer {api_key}"
    request_headers["Content-Type"] = "application/json"
    self.headers = request_headers
|
||||
|
||||
@classmethod
|
||||
async def from_client_credentials(cls, client_id: str, client_secret: str):
    """
    Create a PodchaserClient by exchanging client credentials for an access token

    Args:
        client_id: Podchaser client ID
        client_secret: Podchaser client secret

    Returns:
        PodchaserClient instance with access token

    Raises:
        Exception: If the API reports errors, returns no access token, or
            the request itself fails (the error is logged and re-raised).
    """
    from web.backend.core.http_client import http_client

    mutation = """
        mutation GetToken($client_id: String!, $client_secret: String!) {
            requestAccessToken(
                input: {
                    grant_type: CLIENT_CREDENTIALS
                    client_id: $client_id
                    client_secret: $client_secret
                }
            ) {
                access_token
            }
        }
    """

    variables = {
        "client_id": client_id,
        "client_secret": client_secret
    }

    try:
        response = await http_client.post(
            cls.API_URL,
            json={"query": mutation, "variables": variables},
            headers={"Content-Type": "application/json"}
        )

        data = response.json()

        if "errors" in data:
            logger.error(f"Failed to get Podchaser access token: {data['errors']}")
            raise Exception(f"Podchaser authentication failed: {data['errors']}")

        # GraphQL may return explicit nulls for "data" or the mutation
        # result; `or {}` makes those fall through to the clear
        # "No access token" error instead of an AttributeError.
        token_payload = (data.get("data") or {}).get("requestAccessToken") or {}
        access_token = token_payload.get("access_token")

        if not access_token:
            raise Exception("No access token returned from Podchaser")

        logger.info("Successfully obtained Podchaser access token")
        return cls(access_token)

    except Exception as e:
        logger.error(f"Error getting Podchaser access token: {e}")
        raise
|
||||
|
||||
async def _execute_query(self, query: str, variables: Optional[Dict] = None) -> Dict:
    """Execute a GraphQL query against the Podchaser API.

    Args:
        query: GraphQL query or mutation text.
        variables: Optional variables for the query; omitted from the
            payload when empty.

    Returns:
        The ``data`` object of the response. An empty dict is returned when
        the response contains ``errors``, when ``data`` is missing or null,
        or when the request fails (failures are logged, never raised).
    """
    try:
        payload = {"query": query}
        if variables:
            payload["variables"] = variables

        response = await http_client.post(
            self.API_URL,
            json=payload,
            headers=self.headers
        )

        data = response.json()

        if "errors" in data:
            logger.error(f"GraphQL errors: {data['errors']}")
            return {}

        # `or {}` keeps the declared Dict return type when the API sends
        # an explicit `"data": null`.
        return data.get("data") or {}

    except Exception as e:
        logger.error(f"Podchaser API error: {e}")
        return {}
|
||||
|
||||
async def search_creator_by_creators_endpoint(self, name: str) -> Optional[Dict]:
    """
    Search for a creator using the creators endpoint
    This is more direct than searching via credits or podcasts

    Args:
        name: Creator name to search for.

    Returns:
        The creator whose name matches ``name`` exactly (case-insensitive,
        surrounding whitespace ignored) if one is among the first 10
        results, otherwise the first result; None when nothing is found.
    """
    query = """
        query FindCreator($term: String!) {
            creators(searchTerm: $term, first: 10) {
                data {
                    pcid
                    name
                    informalName
                    subtitle
                    imageUrl
                    url
                    episodeAppearanceCount
                }
            }
        }
    """

    variables = {"term": name}
    data = await self._execute_query(query, variables)

    # Tolerate explicit nulls at any level of the response
    # ("creators": null or "data": null) instead of raising TypeError.
    creators = ((data or {}).get("creators") or {}).get("data") or []
    if not creators:
        return None

    # Prefer exact case-insensitive match
    name_lower = name.strip().lower()
    for creator in creators:
        if creator.get("name") and creator["name"].strip().lower() == name_lower:
            logger.info(f"Found exact creator match: {creator['name']} (pcid: {creator['pcid']})")
            return creator

    # Return first result if no exact match
    logger.info(f"Found creator: {creators[0]['name']} (pcid: {creators[0]['pcid']})")
    return creators[0]
|
||||
|
||||
async def search_creator(self, name: str) -> Optional[Dict]:
    """
    Look up a creator by name.

    Thin wrapper that delegates to search_creator_by_creators_endpoint()
    and returns whatever it returns (a creator dict or None).
    """
    creator = await self.search_creator_by_creators_endpoint(name)
    return creator
|
||||
|
||||
async def get_creator_guest_appearances(self, creator_id: str, days_back: int = 30, days_ahead: int = 365) -> List[Dict]:
    """
    Get all guest AND host appearances (episodeCredits) for a creator
    Filters for recent and upcoming episodes

    Fetches up to 10 pages of 20 credits each, newest first, and keeps the
    episodes whose air date falls inside the requested window. Credits
    without an episode or without a parseable air date are skipped.

    Args:
        creator_id: Podchaser creator ID
        days_back: How many days in the past to search
        days_ahead: How many days in the future to search

    Returns:
        List of episode appearances with metadata (both guest and host roles)
    """
    today = datetime.now().date()
    cutoff_past = today - timedelta(days=days_back)
    cutoff_future = today + timedelta(days=days_ahead)

    query = """
        query GetCreatorAppearances($creatorId: String!, $page: Int) {
            creator(identifier: {type: PCID, id: $creatorId}) {
                pcid
                name
                episodeCredits(
                    filters: { role: ["guest", "host"] }
                    first: 20
                    page: $page
                    sort: {sortBy: DATE, direction: DESCENDING}
                ) {
                    data {
                        role {
                            code
                            title
                        }
                        episode {
                            id
                            title
                            description
                            url
                            imageUrl
                            audioUrl
                            airDate
                            podcast {
                                id
                                title
                                imageUrl
                                url
                                categories {
                                    title
                                    slug
                                }
                            }
                        }
                    }
                    paginatorInfo {
                        currentPage
                        hasMorePages
                        lastPage
                    }
                }
            }
        }
    """

    page = 1
    max_pages = 10  # Limit to prevent excessive API calls
    appearances = []

    while page <= max_pages:
        variables = {
            "creatorId": str(creator_id),
            "page": page
        }

        data = await self._execute_query(query, variables)

        if not data or "creator" not in data or not data["creator"]:
            break

        creator_data = data["creator"]
        # GraphQL can return explicit nulls for nested objects; `or`
        # fallbacks keep a null from aborting the whole fetch.
        credits_block = creator_data.get("episodeCredits") or {}
        episode_credits = credits_block.get("data") or []

        logger.info(f"Fetched {len(episode_credits)} episodes from Podchaser (page {page})")

        for credit in episode_credits:
            episode = credit.get("episode")
            if not episode:
                continue

            # Check air date
            air_date_str = episode.get("airDate")
            if not air_date_str:
                continue

            try:
                # Handle both "YYYY-MM-DD" and "YYYY-MM-DD HH:MM:SS" formats:
                # only the date part (first 10 characters) is parsed.
                air_date = datetime.strptime(air_date_str[:10], "%Y-%m-%d").date()
            except ValueError as e:
                logger.debug(f"Date parse error for episode: {e}")
                continue

            # Only include episodes within our time window
            if not (cutoff_past <= air_date <= cutoff_future):
                continue

            podcast = episode.get("podcast") or {}

            role_obj = credit.get("role", {})
            role_name = role_obj.get("title") if isinstance(role_obj, dict) else None

            appearances.append({
                "podchaser_episode_id": episode.get("id"),
                "episode_title": episode.get("title"),
                "podcast_name": podcast.get("title"),
                "description": episode.get("description"),
                "air_date": air_date_str,
                "episode_url": episode.get("url"),
                "audio_url": episode.get("audioUrl"),
                "poster_url": episode.get("imageUrl") or podcast.get("imageUrl"),
                "role": role_name,
                "podchaser_podcast_id": podcast.get("id"),
            })

        # Check if there are more pages
        paginator = credits_block.get("paginatorInfo") or {}
        if not paginator.get("hasMorePages"):
            break

        page += 1
        await asyncio.sleep(0.15)  # Rate limiting

    logger.info(f"Returning {len(appearances)} guest/host appearances for creator {creator_id}")
    return appearances
|
||||
|
||||
async def get_creator_podcast_episodes(self, creator_name: str, days_back: int = 30, days_ahead: int = 365) -> List[Dict]:
    """
    Get podcast episodes where the creator is a host

    Searches for podcasts by the creator's name, keeps only podcasts that
    credit a matching creator with the "host" role, and returns the episodes
    of those podcasts whose air date falls inside the requested window.

    Args:
        creator_name: Creator's name to search for
        days_back: How many days in the past to search
        days_ahead: How many days in the future to search

    Returns:
        List of podcast episodes with metadata (empty if the name is blank,
        the query returns nothing, or no podcast lists the creator as host)
    """
    # A blank name would match every host below ("" is a substring of any
    # string), so there is nothing meaningful to search for.
    if not creator_name or not creator_name.strip():
        return []

    today = datetime.now().date()
    cutoff_past = today - timedelta(days=days_back)
    cutoff_future = today + timedelta(days=days_ahead)

    # Search for podcasts by creator name
    query = """
    query SearchPodcastByHost($searchTerm: String!) {
        podcasts(searchTerm: $searchTerm, first: 5) {
            data {
                id
                title
                imageUrl
                url
                credits(first: 20) {
                    data {
                        role {
                            code
                            title
                        }
                        creator {
                            pcid
                            name
                        }
                    }
                }
                episodes(first: 50, sort: {sortBy: AIR_DATE, direction: DESCENDING}) {
                    data {
                        id
                        title
                        description
                        url
                        imageUrl
                        audioUrl
                        airDate
                    }
                }
            }
        }
    }
    """

    variables = {"searchTerm": creator_name}
    data = await self._execute_query(query, variables)

    # The API may return explicit nulls (key present, value None), so
    # `.get(key, {})` is not enough; every level is normalised with `or`.
    podcasts = ((data or {}).get("podcasts") or {}).get("data") or []
    wanted_name = creator_name.lower()

    appearances = []

    for podcast in podcasts:
        if not podcast:
            continue

        # Check if the creator is a host of this podcast
        is_host = False
        host_role = None
        for credit in (podcast.get("credits") or {}).get("data") or []:
            credit = credit or {}
            role = credit.get("role") or {}
            credited_name = ((credit.get("creator") or {}).get("name") or "").lower()

            # Loose match in both directions so "Jane" matches "Jane Doe"
            # and vice versa.
            if (role.get("code") == "host" and credited_name and
                    (wanted_name in credited_name or credited_name in wanted_name)):
                is_host = True
                host_role = role.get("title")
                break

        if not is_host:
            continue

        # Get episodes from this podcast
        for episode in (podcast.get("episodes") or {}).get("data") or []:
            if not episode:
                continue
            air_date_str = episode.get("airDate")
            if not air_date_str:
                continue

            try:
                # Handle both "YYYY-MM-DD" and "YYYY-MM-DD HH:MM:SS" formats:
                # only the first 10 characters (the date part) are parsed.
                air_date = datetime.strptime(air_date_str[:10], "%Y-%m-%d").date()
            except (ValueError, TypeError) as e:
                logger.debug(f"Date parse error for episode: {e}")
                continue

            # Only include episodes within our time window
            if cutoff_past <= air_date <= cutoff_future:
                appearances.append({
                    "podchaser_episode_id": episode.get("id"),
                    "episode_title": episode.get("title"),
                    "podcast_name": podcast.get("title"),
                    "description": episode.get("description"),
                    "air_date": air_date_str,
                    "episode_url": episode.get("url"),
                    "audio_url": episode.get("audioUrl"),
                    "poster_url": episode.get("imageUrl") or podcast.get("imageUrl"),
                    "role": host_role,
                    "podchaser_podcast_id": podcast.get("id"),
                })

    return appearances
|
||||
|
||||
async def find_upcoming_podcast_appearances(self, creator_id: str, creator_name: str = None) -> List[Dict]:
    """
    Find upcoming podcast appearances for a creator

    Includes both guest appearances (episodeCredits) and hosted podcast episodes.
    Both lookups use a window of 365 days back and 365 days ahead.

    Args:
        creator_id: Podchaser creator ID (pcid)
        creator_name: Creator's name (required for the hosted-podcast search;
            when omitted only guest appearances are returned)

    Returns:
        Appearances de-duplicated by Podchaser episode ID, newest air date first
    """
    # Get both guest appearances and hosted episodes
    guest_appearances = await self.get_creator_guest_appearances(
        creator_id,
        days_back=365,  # Look back 1 year for recent episodes
        days_ahead=365
    )

    # For hosted episodes, we need the creator name
    hosted_episodes = []
    if creator_name:
        hosted_episodes = await self.get_creator_podcast_episodes(
            creator_name,
            days_back=365,  # Look back 1 year for recent episodes
            days_ahead=365
        )
    else:
        logger.warning(f"No creator name provided for {creator_id}, skipping podcast host search")

    def _description_length(item: Dict) -> int:
        # A missing/None description counts as empty; str(None) would
        # otherwise count as the 4 characters of "None".
        description = item.get("description")
        return len(str(description)) if description else 0

    # Combine and deduplicate by episode ID
    all_appearances = {}
    for appearance in guest_appearances + hosted_episodes:
        episode_id = appearance.get("podchaser_episode_id")
        if not episode_id:
            continue
        existing = all_appearances.get(episode_id)
        # If duplicate, prefer the one with the longer description
        if existing is None or _description_length(appearance) > _description_length(existing):
            all_appearances[episode_id] = appearance

    # Sort by air date, newest first. The dates are "YYYY-MM-DD[ HH:MM:SS]"
    # strings, so lexicographic order is chronological; None sorts as "".
    return sorted(
        all_appearances.values(),
        key=lambda x: x.get("air_date") or "",
        reverse=True
    )
|
||||
873
modules/private_gallery_crypto.py
Normal file
873
modules/private_gallery_crypto.py
Normal file
@@ -0,0 +1,873 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Private Gallery Encryption Module
|
||||
|
||||
Provides security features for the Private Gallery:
|
||||
- Password hashing with bcrypt
|
||||
- Key derivation with Argon2id
|
||||
- File encryption/decryption with AES-256-GCM
|
||||
- Field encryption with Fernet
|
||||
- Session token management
|
||||
"""
|
||||
|
||||
import os
|
||||
import secrets
|
||||
import hashlib
|
||||
import base64
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional, Dict, Tuple
|
||||
from pathlib import Path
|
||||
from threading import Lock
|
||||
|
||||
try:
|
||||
import bcrypt
|
||||
except ImportError:
|
||||
bcrypt = None
|
||||
|
||||
try:
|
||||
from argon2 import PasswordHasher
|
||||
from argon2.low_level import hash_secret_raw, Type
|
||||
ARGON2_AVAILABLE = True
|
||||
except ImportError:
|
||||
ARGON2_AVAILABLE = False
|
||||
|
||||
try:
|
||||
from cryptography.fernet import Fernet
|
||||
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
|
||||
from cryptography.hazmat.primitives import hashes
|
||||
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
|
||||
CRYPTO_AVAILABLE = True
|
||||
except ImportError:
|
||||
CRYPTO_AVAILABLE = False
|
||||
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('PrivateGalleryCrypto')
|
||||
|
||||
|
||||
class PrivateGalleryCrypto:
    """
    Handles all encryption operations for the Private Gallery.

    Security features:
    - Passwords hashed with bcrypt (cost factor 12), PBKDF2-SHA256 fallback
    - Encryption key derived from password using Argon2id (PBKDF2 fallback)
    - Files encrypted with AES-256-GCM
    - Database fields encrypted with Fernet (AES-128-CBC + HMAC)
    - In-memory session tokens with configurable timeout

    Encrypted file layouts:
    - Single-shot: [12-byte nonce][ciphertext + 16-byte tag]
    - Chunked:     [5-byte magic][4-byte chunk_size BE], then for every chunk
                   [12-byte nonce][4-byte encrypted length BE][ciphertext + 16-byte tag]
    """

    # Argon2id parameters (OWASP recommended)
    ARGON2_TIME_COST = 3
    ARGON2_MEMORY_COST = 65536  # 64 MiB
    ARGON2_PARALLELISM = 4
    ARGON2_HASH_LENGTH = 32  # 256 bits for AES-256

    # PBKDF2 fallback parameters. The stored "pbkdf2$salt$key" hash does not
    # record the iteration count, so changing this invalidates existing hashes.
    PBKDF2_ITERATIONS = 600000  # OWASP recommended minimum

    # AES-GCM parameters
    AES_KEY_SIZE = 32  # 256 bits
    AES_NONCE_SIZE = 12  # 96 bits (GCM recommended)
    AES_TAG_SIZE = 16  # 128 bits

    # Encryption chunk size for streaming large files
    CHUNK_SIZE = 8 * 1024 * 1024  # 8 MB chunks
    CHUNKED_THRESHOLD = 50 * 1024 * 1024  # Use chunked encryption for files > 50 MB
    CHUNKED_MAGIC = b'\x01PGCE'  # Magic bytes: version 1, Private Gallery Chunked Encryption

    def __init__(self):
        self._sessions: Dict[str, Dict] = {}  # token -> {expiry, username, created_at}
        self._session_lock = Lock()
        self._derived_key: Optional[bytes] = None
        self._fernet: Optional[Fernet] = None
        self._aesgcm: Optional[AESGCM] = None

        # Check dependencies
        if not bcrypt:
            logger.warning("bcrypt not available - password hashing will use fallback")
        if not ARGON2_AVAILABLE:
            logger.warning("argon2-cffi not available - key derivation will use PBKDF2")
        if not CRYPTO_AVAILABLE:
            raise ImportError("cryptography library required for Private Gallery")

    # =========================================================================
    # PASSWORD HASHING (bcrypt)
    # =========================================================================

    def _new_pbkdf2(self, salt: bytes):
        """Build a single-use PBKDF2-HMAC-SHA256 KDF producing a 32-byte key."""
        return PBKDF2HMAC(
            algorithm=hashes.SHA256(),
            length=self.AES_KEY_SIZE,
            salt=salt,
            iterations=self.PBKDF2_ITERATIONS,
        )

    def hash_password(self, password: str) -> str:
        """
        Hash a password using bcrypt with cost factor 12.

        Falls back to PBKDF2-SHA256 when bcrypt is not installed; the fallback
        hash is formatted as "pbkdf2$<b64 salt>$<b64 key>".

        Args:
            password: Plain text password

        Returns:
            Hash string (includes salt)
        """
        secret = password.encode('utf-8')
        if bcrypt:
            return bcrypt.hashpw(secret, bcrypt.gensalt(rounds=12)).decode('utf-8')

        # Fallback to PBKDF2 if bcrypt not available
        salt = secrets.token_bytes(16)
        key = self._new_pbkdf2(salt).derive(secret)
        return f"pbkdf2${base64.b64encode(salt).decode()}${base64.b64encode(key).decode()}"

    def verify_password(self, password: str, password_hash: str) -> bool:
        """
        Verify a password against its hash.

        Args:
            password: Plain text password to check
            password_hash: Stored hash to verify against

        Returns:
            True if password matches; False on mismatch, on a malformed hash,
            or when the hash is a bcrypt hash but bcrypt is not installed
        """
        try:
            if password_hash.startswith('pbkdf2$'):
                # PBKDF2 fallback hash
                parts = password_hash.split('$')
                if len(parts) != 3:
                    return False
                salt = base64.b64decode(parts[1])
                stored_key = base64.b64decode(parts[2])
                try:
                    # verify() raises when the derived key does not match
                    self._new_pbkdf2(salt).verify(password.encode('utf-8'), stored_key)
                    return True
                except Exception:
                    return False
            if bcrypt:
                return bcrypt.checkpw(
                    password.encode('utf-8'),
                    password_hash.encode('utf-8')
                )
            return False
        except Exception as e:
            logger.error(f"Password verification failed: {e}")
            return False

    # =========================================================================
    # KEY DERIVATION (Argon2id or PBKDF2)
    # =========================================================================

    def derive_key(self, password: str, salt: bytes) -> bytes:
        """
        Derive an encryption key from password using Argon2id.

        Falls back to PBKDF2-SHA256 when argon2-cffi is not installed. The two
        KDFs give different keys for the same input.

        Args:
            password: User's password
            salt: Random salt (should be stored)

        Returns:
            32-byte derived key for AES-256
        """
        secret = password.encode('utf-8')
        if ARGON2_AVAILABLE:
            return hash_secret_raw(
                secret=secret,
                salt=salt,
                time_cost=self.ARGON2_TIME_COST,
                memory_cost=self.ARGON2_MEMORY_COST,
                parallelism=self.ARGON2_PARALLELISM,
                hash_len=self.ARGON2_HASH_LENGTH,
                type=Type.ID  # Argon2id
            )
        # Fallback to PBKDF2 with high iterations
        return self._new_pbkdf2(salt).derive(secret)

    def generate_salt(self) -> bytes:
        """Generate a cryptographically secure random 16-byte salt."""
        return secrets.token_bytes(16)

    def initialize_encryption(self, password: str, salt: bytes) -> None:
        """
        Initialize encryption with derived key.
        Must be called after successful unlock.

        Args:
            password: User's password
            salt: Stored salt for key derivation
        """
        self._derived_key = self.derive_key(password, salt)

        # Initialize Fernet for field encryption
        # Fernet requires a 32-byte key, base64-encoded
        self._fernet = Fernet(base64.urlsafe_b64encode(self._derived_key))

        # Initialize AES-GCM for file encryption
        self._aesgcm = AESGCM(self._derived_key)

        logger.info("Encryption initialized successfully")

    def clear_encryption(self) -> None:
        """Drop the references to the key material (on lock)."""
        self._derived_key = None
        self._fernet = None
        self._aesgcm = None
        logger.info("Encryption keys cleared")

    def is_initialized(self) -> bool:
        """Check if encryption is initialized (unlocked)."""
        return self._derived_key is not None

    # =========================================================================
    # FIELD ENCRYPTION (Fernet - for database fields)
    # =========================================================================

    def encrypt_field(self, plaintext: str) -> str:
        """
        Encrypt a database field value.

        Args:
            plaintext: Plain text to encrypt

        Returns:
            Base64-encoded encrypted string ("" for empty input)

        Raises:
            RuntimeError: If encryption has not been initialized
        """
        if not self._fernet:
            raise RuntimeError("Encryption not initialized - call initialize_encryption first")

        if not plaintext:
            return ""

        # The Fernet token is wrapped in a second base64 layer; decrypt_field
        # expects exactly this format, so it must not be "simplified".
        token = self._fernet.encrypt(plaintext.encode('utf-8'))
        return base64.urlsafe_b64encode(token).decode('utf-8')

    def decrypt_field(self, ciphertext: str) -> str:
        """
        Decrypt a database field value.

        Args:
            ciphertext: Base64-encoded encrypted string

        Returns:
            Decrypted plain text, "" for empty input, or the literal
            "[Decryption Error]" when the value cannot be decrypted

        Raises:
            RuntimeError: If encryption has not been initialized
        """
        if not self._fernet:
            raise RuntimeError("Encryption not initialized - call initialize_encryption first")

        if not ciphertext:
            return ""

        try:
            token = base64.urlsafe_b64decode(ciphertext.encode('utf-8'))
            return self._fernet.decrypt(token).decode('utf-8')
        except Exception as e:
            logger.error(f"Field decryption failed: {e}")
            return "[Decryption Error]"

    # =========================================================================
    # FILE ENCRYPTION (AES-256-GCM)
    # =========================================================================

    def _write_chunked_header(self, fout) -> None:
        """Write the chunked-format header: magic bytes + big-endian chunk size."""
        import struct
        fout.write(self.CHUNKED_MAGIC)
        fout.write(struct.pack('>I', self.CHUNK_SIZE))

    def _write_chunk(self, fout, chunk: bytes) -> None:
        """Encrypt one plaintext chunk under a fresh random nonce and append it to fout."""
        import struct
        nonce = secrets.token_bytes(self.AES_NONCE_SIZE)
        sealed = self._aesgcm.encrypt(nonce, chunk, None)
        # Chunk record: nonce + length of encrypted data + encrypted data (includes GCM tag)
        fout.write(nonce)
        fout.write(struct.pack('>I', len(sealed)))
        fout.write(sealed)

    def _read_chunked_header(self, fin) -> int:
        """Validate the chunked-format header and return the chunk size stored in it."""
        import struct
        if fin.read(len(self.CHUNKED_MAGIC)) != self.CHUNKED_MAGIC:
            raise ValueError("Invalid chunked file header")
        raw_size = fin.read(4)
        if len(raw_size) != 4:
            raise ValueError("Truncated chunked file header")
        return struct.unpack('>I', raw_size)[0]

    def _read_chunk(self, fin) -> Optional[bytes]:
        """Read and decrypt the chunk record at the current position; None at clean EOF."""
        import struct
        nonce = fin.read(self.AES_NONCE_SIZE)
        if len(nonce) == 0:
            return None  # EOF
        if len(nonce) != self.AES_NONCE_SIZE:
            raise ValueError("Truncated chunk nonce")

        raw_len = fin.read(4)
        if len(raw_len) != 4:
            raise ValueError("Truncated chunk length")
        enc_len = struct.unpack('>I', raw_len)[0]

        sealed = fin.read(enc_len)
        if len(sealed) != enc_len:
            raise ValueError("Truncated chunk data")

        return self._aesgcm.decrypt(nonce, sealed, None)

    def _decrypt_single_shot(self, input_path: Path) -> bytes:
        """Decrypt a whole single-shot file ([nonce][ciphertext+tag]) into memory."""
        with open(input_path, 'rb') as f:
            nonce = f.read(self.AES_NONCE_SIZE)
            if len(nonce) != self.AES_NONCE_SIZE:
                raise ValueError("Invalid encrypted file: missing nonce")
            ciphertext = f.read()
        return self._aesgcm.decrypt(nonce, ciphertext, None)

    def encrypt_file(self, input_path: Path, output_path: Path) -> bool:
        """
        Encrypt a file using AES-256-GCM.

        Small files (<=50MB): single-shot format
            [12-byte nonce][encrypted data + 16-byte tag]

        Large files (>50MB): chunked format for memory efficiency
            [5-byte magic 0x01PGCE][4-byte chunk_size BE]
            [12-byte nonce][4-byte encrypted length BE][encrypted chunk + 16-byte tag] (repeated)

        The ciphertext is written to a temporary sibling file and moved into
        place only once complete, so a failed run never deletes or truncates
        a file that already existed at output_path.

        Args:
            input_path: Path to plaintext file
            output_path: Path for encrypted output

        Returns:
            True if successful, False on any error (the error is logged)

        Raises:
            RuntimeError: If encryption has not been initialized
        """
        if not self._aesgcm:
            raise RuntimeError("Encryption not initialized")

        temp_path = None
        try:
            file_size = input_path.stat().st_size
            output_path.parent.mkdir(parents=True, exist_ok=True)
            temp_path = output_path.with_name(f"{output_path.name}.{secrets.token_hex(4)}.tmp")

            if file_size <= self.CHUNKED_THRESHOLD:
                # Small file: single-shot encryption (backward compatible)
                nonce = secrets.token_bytes(self.AES_NONCE_SIZE)
                with open(input_path, 'rb') as fin:
                    plaintext = fin.read()
                sealed = self._aesgcm.encrypt(nonce, plaintext, None)
                with open(temp_path, 'wb') as fout:
                    fout.write(nonce)
                    fout.write(sealed)
            else:
                # Large file: chunked encryption
                with open(input_path, 'rb') as fin, open(temp_path, 'wb') as fout:
                    self._write_chunked_header(fout)
                    while True:
                        chunk = fin.read(self.CHUNK_SIZE)
                        if not chunk:
                            break
                        self._write_chunk(fout, chunk)

            # Atomic replace
            temp_path.replace(output_path)
            return True

        except Exception as e:
            logger.error(f"File encryption failed: {e}")
            # Clean up partial output (only the temp file we created)
            if temp_path is not None and temp_path.exists():
                try:
                    temp_path.unlink()
                except Exception:
                    pass
            return False

    def _is_chunked_format(self, input_path: Path) -> bool:
        """Check if an encrypted file uses the chunked format (False if unreadable)."""
        try:
            with open(input_path, 'rb') as f:
                return f.read(len(self.CHUNKED_MAGIC)) == self.CHUNKED_MAGIC
        except Exception:
            return False

    def decrypt_file(self, input_path: Path, output_path: Optional[Path] = None) -> Optional[bytes]:
        """
        Decrypt a file encrypted with AES-256-GCM.
        Handles both single-shot and chunked formats.

        Args:
            input_path: Path to encrypted file
            output_path: Optional path to write decrypted file

        Returns:
            Decrypted bytes if output_path is None, else None on success.
            None is also returned on failure (the error is logged), so callers
            passing output_path cannot tell success from failure by the return
            value alone.

        Raises:
            RuntimeError: If encryption has not been initialized
        """
        if not self._aesgcm:
            raise RuntimeError("Encryption not initialized")

        try:
            if self._is_chunked_format(input_path):
                return self._decrypt_file_chunked(input_path, output_path)

            # Single-shot format: [nonce][ciphertext+tag]
            plaintext = self._decrypt_single_shot(input_path)

            if output_path:
                output_path.parent.mkdir(parents=True, exist_ok=True)
                with open(output_path, 'wb') as f:
                    f.write(plaintext)
                return None

            return plaintext

        except Exception as e:
            logger.error(f"File decryption failed: {e}")
            return None

    def _decrypt_file_chunked(self, input_path: Path, output_path: Optional[Path] = None) -> Optional[bytes]:
        """
        Decrypt a chunked-format encrypted file.

        Returns the plaintext when output_path is None, otherwise writes it to
        output_path and returns None. On failure the error is logged, None is
        returned, and a partially written output file is removed so truncated
        plaintext is never left behind.
        """
        output_opened = False
        try:
            with open(input_path, 'rb') as fin:
                # chunk_size from header is informational here; actual sizes are per-chunk
                self._read_chunked_header(fin)

                if output_path:
                    output_path.parent.mkdir(parents=True, exist_ok=True)
                    with open(output_path, 'wb') as fout:
                        output_opened = True
                        # iter(callable, sentinel) stops when _read_chunk returns None (EOF)
                        for chunk in iter(lambda: self._read_chunk(fin), None):
                            fout.write(chunk)
                    return None

                return b''.join(iter(lambda: self._read_chunk(fin), None))

        except Exception as e:
            logger.error(f"Chunked file decryption failed for {input_path}: {type(e).__name__}: {e}")
            if output_opened:
                try:
                    output_path.unlink()
                except Exception:
                    pass
            return None

    def re_encrypt_to_chunked(self, file_path: Path) -> bool:
        """
        Re-encrypt a single-shot encrypted file to chunked format in-place.

        The single-shot file has to be decrypted fully in memory (AES-GCM can
        only authenticate the whole message), so peak memory is roughly the
        file size. The chunked result is written to a temp file and atomically
        moved over the original.

        Args:
            file_path: Path to the single-shot encrypted file

        Returns:
            True if successful, False if already chunked or on error

        Raises:
            RuntimeError: If encryption has not been initialized
        """
        if not self._aesgcm:
            raise RuntimeError("Encryption not initialized")

        if self._is_chunked_format(file_path):
            return False  # Already chunked

        temp_path = file_path.with_suffix(f'.enc.{secrets.token_hex(4)}.tmp')

        try:
            plaintext = self._decrypt_single_shot(file_path)

            # Write chunked format to temp file
            with open(temp_path, 'wb') as fout:
                self._write_chunked_header(fout)
                for offset in range(0, len(plaintext), self.CHUNK_SIZE):
                    self._write_chunk(fout, plaintext[offset:offset + self.CHUNK_SIZE])

            del plaintext  # Free memory

            # Atomic replace
            temp_path.replace(file_path)
            return True

        except Exception as e:
            logger.error(f"Re-encryption to chunked failed for {file_path}: {e}")
            if temp_path.exists():
                try:
                    temp_path.unlink()
                except Exception:
                    pass
            return False

    def decrypt_file_streaming(self, input_path: Path) -> Optional[bytes]:
        """
        Decrypt a file and return bytes for streaming.
        Only suitable for small files (single-shot format, ≤50MB).
        For large chunked files, use decrypt_file_generator() instead.

        Args:
            input_path: Path to encrypted file

        Returns:
            Decrypted bytes or None on error
        """
        return self.decrypt_file(input_path, output_path=None)

    def decrypt_file_generator(self, input_path: Path):
        """
        Generator that yields decrypted chunks for streaming large files.
        For chunked files, yields one decrypted chunk at a time (~8MB each).
        For single-shot files, yields the entire content at once.

        Errors are raised lazily, on iteration, not when the generator is created.

        Args:
            input_path: Path to encrypted file

        Yields:
            bytes: Decrypted data chunks

        Raises:
            RuntimeError: If encryption has not been initialized
            ValueError: If the file is truncated or malformed
        """
        if not self._aesgcm:
            raise RuntimeError("Encryption not initialized")

        if self._is_chunked_format(input_path):
            with open(input_path, 'rb') as fin:
                self._read_chunked_header(fin)
                for chunk in iter(lambda: self._read_chunk(fin), None):
                    yield chunk
        else:
            # Single-shot: yield everything at once (≤50MB)
            yield self._decrypt_single_shot(input_path)

    def decrypt_file_range_generator(self, input_path: Path, start: int, end: int):
        """
        Generator that yields only the decrypted bytes for a specific byte range.
        For chunked files, only decrypts the necessary chunks and slices them.
        For single-shot files, decrypts all and slices.

        Chunk positions are computed from the chunk size recorded in the file
        header, so files stay readable even if CHUNK_SIZE is changed later.

        Args:
            input_path: Path to encrypted file
            start: Start byte offset (inclusive)
            end: End byte offset (inclusive)

        Yields:
            bytes: Decrypted data for the requested range

        Raises:
            RuntimeError: If encryption has not been initialized
            ValueError: If the file is truncated or malformed
        """
        if not self._aesgcm:
            raise RuntimeError("Encryption not initialized")

        if not self._is_chunked_format(input_path):
            # Single-shot: decrypt all and slice (file is ≤50MB)
            yield self._decrypt_single_shot(input_path)[start:end + 1]
            return

        with open(input_path, 'rb') as fin:
            chunk_size = self._read_chunked_header(fin)
            if chunk_size <= 0:
                raise ValueError("Invalid chunk size in chunked file header")

            # Header: 5 magic + 4 chunk_size = 9 bytes
            header_size = len(self.CHUNKED_MAGIC) + 4
            # Each full encrypted chunk: 12 nonce + 4 length + (chunk_size + 16 tag).
            # Every chunk except the last is full, so chunk N starts at a fixed stride.
            enc_chunk_stride = self.AES_NONCE_SIZE + 4 + chunk_size + self.AES_TAG_SIZE

            for chunk_idx in range(start // chunk_size, end // chunk_size + 1):
                # Seek to this chunk's position in the encrypted file
                fin.seek(header_size + chunk_idx * enc_chunk_stride)
                decrypted_chunk = self._read_chunk(fin)
                if decrypted_chunk is None:
                    break  # Requested range runs past the end of the file

                # Calculate which part of this chunk we need
                chunk_start_byte = chunk_idx * chunk_size
                slice_start = max(start - chunk_start_byte, 0)
                slice_end = min(end - chunk_start_byte + 1, len(decrypted_chunk))

                yield decrypted_chunk[slice_start:slice_end]

    # =========================================================================
    # SESSION MANAGEMENT
    # =========================================================================

    def _purge_expired_locked(self, now: datetime) -> int:
        """Remove sessions expired as of `now`. Caller must hold _session_lock."""
        expired = [t for t, s in self._sessions.items() if now > s['expiry']]
        for token in expired:
            del self._sessions[token]
        return len(expired)

    def create_session(self, username: str = "user", timeout_minutes: int = 30) -> str:
        """
        Create a new session token.

        Args:
            username: Username for the session
            timeout_minutes: Session timeout in minutes

        Returns:
            Session token string
        """
        token = secrets.token_urlsafe(32)
        now = datetime.now()

        with self._session_lock:
            self._sessions[token] = {
                'expiry': now + timedelta(minutes=timeout_minutes),
                'username': username,
                'created_at': now
            }

        logger.info(f"Created session for {username}, expires in {timeout_minutes} minutes")
        return token

    def verify_session(self, token: str) -> Optional[Dict]:
        """
        Verify a session token is valid and not expired.

        Args:
            token: Session token to verify

        Returns:
            Session info dict if valid, None otherwise (an expired session is
            removed as a side effect)
        """
        with self._session_lock:
            session = self._sessions.get(token)

            if not session:
                return None

            if datetime.now() > session['expiry']:
                # Expired - remove it
                del self._sessions[token]
                return None

            return session

    def refresh_session(self, token: str, timeout_minutes: int = 30) -> bool:
        """
        Refresh a session's expiry time.

        Args:
            token: Session token to refresh
            timeout_minutes: New timeout in minutes

        Returns:
            True if refreshed, False if token invalid or already expired
        """
        with self._session_lock:
            session = self._sessions.get(token)

            if not session:
                return False

            now = datetime.now()
            if now > session['expiry']:
                del self._sessions[token]
                return False

            session['expiry'] = now + timedelta(minutes=timeout_minutes)
            return True

    def invalidate_session(self, token: str) -> bool:
        """
        Invalidate a session token (logout/lock).

        Args:
            token: Session token to invalidate

        Returns:
            True if invalidated, False if not found
        """
        with self._session_lock:
            return self._sessions.pop(token, None) is not None

    def invalidate_all_sessions(self) -> int:
        """
        Invalidate all sessions (master lock).

        Returns:
            Number of sessions invalidated
        """
        with self._session_lock:
            count = len(self._sessions)
            self._sessions.clear()
            return count

    def cleanup_expired_sessions(self) -> int:
        """
        Remove all expired sessions.

        Returns:
            Number of sessions removed
        """
        with self._session_lock:
            return self._purge_expired_locked(datetime.now())

    def get_active_session_count(self) -> int:
        """Get count of active (non-expired) sessions."""
        # Purge and count under one lock acquisition so the count cannot race
        # with concurrent session creation/removal.
        with self._session_lock:
            self._purge_expired_locked(datetime.now())
            return len(self._sessions)
|
||||
|
||||
|
||||
# Process-wide singleton and the lock guarding its lazy creation
_crypto_instance: Optional[PrivateGalleryCrypto] = None
_crypto_lock = Lock()


def get_private_gallery_crypto() -> PrivateGalleryCrypto:
    """Return the shared PrivateGalleryCrypto, creating it on first use."""
    global _crypto_instance

    with _crypto_lock:
        shared = _crypto_instance
        if shared is None:
            # First caller builds the instance; the lock keeps it unique.
            shared = PrivateGalleryCrypto()
            _crypto_instance = shared
        return shared
|
||||
|
||||
|
||||
def export_key_to_file(path: str) -> bool:
    """
    Save the current derived key from the global crypto instance to a file.

    The key is written as JSON ({"derived_key": <base64>}) to a uniquely named
    temporary file that is created with mode 0600 from the start, then moved
    atomically over the target path, so the key material is never on disk
    with wider permissions.

    Args:
        path: File path to write the key material to

    Returns:
        True if successful, False if encryption is not initialized or the
        write fails (the error is logged)
    """
    import json as _json

    crypto = get_private_gallery_crypto()
    if not crypto.is_initialized() or crypto._derived_key is None:
        logger.warning("Cannot export key: encryption not initialized")
        return False

    tmp_path = None
    try:
        key_data = {
            'derived_key': base64.b64encode(crypto._derived_key).decode('utf-8')
        }
        key_path = Path(path)
        key_path.parent.mkdir(parents=True, exist_ok=True)

        # Write atomically via temp file. O_EXCL plus a random name ensures we
        # create the file ourselves, so the 0600 creation mode applies.
        tmp_path = key_path.with_name(f"{key_path.name}.{secrets.token_hex(4)}.tmp")
        fd = os.open(str(tmp_path), os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
        with os.fdopen(fd, 'w') as f:
            _json.dump(key_data, f)
        # The umask can only remove bits from the creation mode; pin it to exactly 0600.
        os.chmod(str(tmp_path), 0o600)
        tmp_path.replace(key_path)

        logger.info(f"Exported encryption key to {path}")
        return True
    except Exception as e:
        logger.error(f"Failed to export key to {path}: {e}")
        # Do not leave key material behind in a stray temp file
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass
        return False
|
||||
|
||||
|
||||
def load_key_from_file(path: str) -> Optional[PrivateGalleryCrypto]:
    """
    Build a ready-to-use crypto instance from key material stored on disk.

    Args:
        path: Location of the JSON file holding the base64 'derived_key'

    Returns:
        A PrivateGalleryCrypto whose derived key, Fernet and AES-GCM members
        are populated, or None when the file is missing or cannot be loaded
    """
    import json as _json

    source = Path(path)
    if not source.exists():
        return None

    try:
        with open(source, 'r') as handle:
            payload = _json.load(handle)
        raw_key = base64.b64decode(payload['derived_key'])

        instance = PrivateGalleryCrypto()
        instance._derived_key = raw_key
        # Field encryption: Fernet is given the key in url-safe base64 form
        instance._fernet = Fernet(base64.urlsafe_b64encode(raw_key))
        # File encryption: AES-GCM is given the raw key bytes
        instance._aesgcm = AESGCM(raw_key)
        return instance
    except Exception as e:
        logger.error(f"Failed to load key from {path}: {e}")
        return None
|
||||
|
||||
|
||||
def delete_key_file(path: str) -> bool:
    """Remove the key file at *path*; a file that is already absent counts as success."""
    try:
        target = Path(path)
        if not target.exists():
            return True
        target.unlink()
        logger.info(f"Deleted key file {path}")
        return True
    except Exception as e:
        logger.error(f"Failed to delete key file {path}: {e}")
        return False
|
||||
961
modules/pushover_notifier.py
Normal file
961
modules/pushover_notifier.py
Normal file
@@ -0,0 +1,961 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Pushover Notification Module
|
||||
Sends professional push notifications when new media is downloaded
|
||||
"""
|
||||
|
||||
import os
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from typing import Dict, Optional, Any
|
||||
from pathlib import Path
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('Notifier')
|
||||
|
||||
|
||||
class PushoverNotifier:
    """Handles Pushover push notifications for media downloads"""

    # Pushover API endpoint
    API_URL = "https://api.pushover.net/1/messages.json"

    # Plural forms for proper grammar (lower-case singular -> lower-case plural)
    PLURALS = {
        'story': 'stories',
        'video': 'videos',
        'photo': 'photos',
        'image': 'images',
        'reel': 'reels',
        'post': 'posts',
        'thread': 'threads',
        'item': 'items',
        'media': 'media',  # Already plural (singular: medium)
        'tagged': 'tagged',  # "Tagged" doesn't change in plural (7 Tagged Photos)
        'audio': 'audio',  # Uncountable (3 Audio Downloaded)
    }

    # Priority levels (sent as the 'priority' field of the API request)
    # NOTE(review): Pushover's own docs call -2 "lowest" and -1 "low";
    # the names below differ from that wording - confirm before renaming.
    PRIORITY_LOW = -2
    PRIORITY_NORMAL = -1
    PRIORITY_DEFAULT = 0
    PRIORITY_HIGH = 1
    PRIORITY_EMERGENCY = 2

    # Platform emoji/icons for better visual appeal (keyed by lower-case service name)
    PLATFORM_ICONS = {
        'instagram': '📸',
        'fastdl': '📸',
        'imginn': '📸',
        'toolzu': '📸',
        'tiktok': '🎵',
        'forums': '💬',
        'snapchat': '👻',
        'youtube': '▶️',
        'twitter': '🐦',
        'easynews': '📰',
    }

    # Platform name mapping (service name -> user-friendly platform name).
    # Services missing here fall back to str.title() of the service name,
    # so brands with unusual capitalization need an explicit entry.
    PLATFORM_NAMES = {
        'fastdl': 'Instagram',
        'imginn': 'Instagram',
        'toolzu': 'Instagram',
        'instagram': 'Instagram',
        'tiktok': 'TikTok',
        'snapchat': 'Snapchat',
        'youtube': 'YouTube',  # title() would render this as "Youtube"
        'forums': 'Forum',
        'easynews': 'Easynews',
    }

    # Content type icons (keyed by lower-case content type)
    CONTENT_ICONS = {
        'post': '🖼️',
        'story': '⭐',
        'reel': '🎬',
        'video': '🎥',
        'image': '🖼️',
        'thread': '🧵',
        'photo': '📷',
        'audio': '🎵',
    }
|
||||
|
||||
def __init__(self, user_key: str, api_token: str, enabled: bool = True,
|
||||
default_priority: int = 0, device: str = None, include_image: bool = True,
|
||||
unified_db=None, enable_review_queue_notifications: bool = True):
|
||||
"""
|
||||
Initialize Pushover notifier
|
||||
|
||||
Args:
|
||||
user_key: Your Pushover user key
|
||||
api_token: Your Pushover application API token
|
||||
enabled: Whether notifications are enabled
|
||||
default_priority: Default notification priority (-2 to 2)
|
||||
device: Specific device name to send to (optional)
|
||||
include_image: Whether to include image thumbnails in notifications (default: True)
|
||||
unified_db: UnifiedDatabase instance for recording notifications (optional)
|
||||
enable_review_queue_notifications: Whether to send push notifications for review queue items (default: True)
|
||||
"""
|
||||
self.user_key = user_key
|
||||
self.api_token = api_token
|
||||
self.enabled = enabled
|
||||
self.default_priority = default_priority
|
||||
self.device = device
|
||||
self.include_image = include_image
|
||||
self.unified_db = unified_db
|
||||
self.enable_review_queue_notifications = enable_review_queue_notifications
|
||||
self.stats = {
|
||||
'sent': 0,
|
||||
'failed': 0,
|
||||
'skipped': 0
|
||||
}
|
||||
# Tracking for database recording
|
||||
self._current_notification_context = None
|
||||
|
||||
def _record_notification(self, title: str, message: str, priority: int, status: str, response_data: dict, image_path: str = None):
    """Record notification to database and broadcast it to the web frontend

    The platform/source/content details come from
    self._current_notification_context. Once a record attempt has been made
    the context is reset to None - whether or not the insert succeeded - so
    it can never be attached to a later, unrelated notification.

    Args:
        title: Notification title
        message: Notification message
        priority: Priority level
        status: Status ('sent' or 'failed')
        response_data: Response from Pushover API
        image_path: Optional path to thumbnail image
    """
    if not self.unified_db:
        logger.debug("[Pushover] No database connection available for recording notification")
        return

    context = self._current_notification_context
    if not context:
        logger.debug("[Pushover] No notification context available for recording")
        return

    try:
        import json

        # Work on a copy: the context may hold the very dict a caller passed
        # in, and adding image_path must not modify the caller's object.
        metadata = dict(context.get('metadata') or {})
        if image_path:
            metadata['image_path'] = str(image_path)

        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()

            cursor.execute("""
                INSERT INTO notifications (
                    platform, source, content_type, message, title,
                    priority, download_count, sent_at, status, response_data, metadata
                ) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), ?, ?, ?)
            """, (
                context.get('platform'),
                context.get('source'),
                context.get('content_type'),
                message,
                title,
                priority,
                context.get('download_count', 1),
                status,
                json.dumps(response_data) if response_data else None,
                json.dumps(metadata) if metadata else None
            ))

            conn.commit()
            logger.info(f"[Pushover] Recorded notification to database: {title} (status: {status})")

        # Broadcast to frontend for real-time toast notification
        try:
            from web.backend.api import manager
            if manager and manager.active_connections:
                manager.broadcast_sync({
                    'type': 'notification_created',
                    'notification': {
                        'title': title,
                        'message': message,
                        'platform': context.get('platform'),
                        'source': context.get('source'),
                        'content_type': context.get('content_type'),
                        'download_count': context.get('download_count', 1),
                        'status': status,
                    }
                })
        except Exception as e:
            # Best effort only - API may not be running or manager not available
            logger.debug(f"[Pushover] Frontend broadcast skipped: {e}")

    except Exception as e:
        logger.warning(f"[Pushover] Failed to record notification to database: {e}")
        import traceback
        logger.warning(f"[Pushover] Traceback: {traceback.format_exc()}")
    finally:
        # Clear context after the attempt (success or failure) to prevent
        # stale data on subsequent notifications
        self._current_notification_context = None
|
||||
|
||||
def _get_platform_display_name(self, platform: str, source: str = None) -> str:
|
||||
"""
|
||||
Convert service name to user-friendly platform name
|
||||
|
||||
Args:
|
||||
platform: Service/platform name (fastdl, imginn, toolzu, etc.)
|
||||
source: Source/username (for forums, this is the forum name)
|
||||
|
||||
Returns:
|
||||
User-friendly platform name (Instagram, TikTok, etc.)
|
||||
"""
|
||||
platform_lower = platform.lower()
|
||||
|
||||
# For forums, use the forum name (source) as the platform display name
|
||||
if platform_lower == 'forums' and source:
|
||||
return source.title()
|
||||
|
||||
return self.PLATFORM_NAMES.get(platform_lower, platform.title())
|
||||
|
||||
def _pluralize(self, word: str, count: int) -> str:
|
||||
"""
|
||||
Get the correct plural form of a word
|
||||
|
||||
Args:
|
||||
word: Singular word
|
||||
count: Count to determine if plural needed
|
||||
|
||||
Returns:
|
||||
Singular or plural form
|
||||
"""
|
||||
# Handle None or empty word
|
||||
if not word:
|
||||
return "items" if count != 1 else "item"
|
||||
|
||||
if count == 1:
|
||||
return word
|
||||
|
||||
# Check if we have a custom plural
|
||||
word_lower = word.lower()
|
||||
if word_lower in self.PLURALS:
|
||||
return self.PLURALS[word_lower].title() if word[0].isupper() else self.PLURALS[word_lower]
|
||||
|
||||
# Check if word is already a plural form (value in PLURALS)
|
||||
if word_lower in self.PLURALS.values():
|
||||
return word # Already plural, return as-is
|
||||
|
||||
# Default: just add 's' (but not if already ends with 's')
|
||||
if word_lower.endswith('s'):
|
||||
return word
|
||||
return f"{word}s"
|
||||
|
||||
def _extract_random_video_frame(self, video_path: str) -> str:
    """
    Grab one still image from a random point in a video

    ffprobe is run to read the duration, a timestamp is chosen, and ffmpeg
    is run to write a single JPEG frame into a freshly created temp file.

    Args:
        video_path: Path to the video file

    Returns:
        Path to the temp JPEG holding the frame, or None if any step failed
    """
    import subprocess
    import random
    import tempfile

    try:
        # Ask ffprobe for the duration only, printed as a bare number
        probe = subprocess.run(
            [
                'ffprobe',
                '-v', 'error',
                '-show_entries', 'format=duration',
                '-of', 'default=noprint_wrappers=1:nokey=1',
                video_path,
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if probe.returncode != 0:
            logger.warning(f"[Pushover] ffprobe failed to get video duration: {probe.stderr[:200]}")
            return None

        duration = float(probe.stdout.strip())

        # Stay inside the middle 80% to avoid black frames at either end;
        # when that window is empty, fall back to the midpoint.
        window_start, window_end = duration * 0.1, duration * 0.9
        timestamp = (
            duration / 2
            if window_end <= window_start
            else random.uniform(window_start, window_end)
        )

        logger.debug(f"[Pushover] Video duration: {duration:.2f}s, extracting frame at {timestamp:.2f}s")

        # Reserve a temp file name; ffmpeg writes the frame to it
        handle, frame_path = tempfile.mkstemp(suffix='.jpg', prefix='pushover_frame_')
        os.close(handle)
        keep_frame = False

        try:
            grab = subprocess.run(
                [
                    'ffmpeg',
                    '-ss', str(timestamp),  # Seek to timestamp
                    '-i', video_path,       # Input file
                    '-vframes', '1',        # Extract 1 frame
                    '-q:v', '2',            # High quality
                    '-y',                   # Overwrite output
                    frame_path,
                ],
                capture_output=True,
                text=True,
                timeout=30,
            )
            if grab.returncode != 0:
                logger.debug(f"[Pushover] ffmpeg failed: {grab.stderr}")
                return None

            # Only a non-empty output file counts as a usable frame
            frame_file = Path(frame_path)
            if not (frame_file.exists() and frame_file.stat().st_size > 0):
                logger.debug("[Pushover] Frame extraction produced empty file")
                return None

            keep_frame = True
            return frame_path

        except subprocess.TimeoutExpired:
            logger.debug("[Pushover] Video frame extraction timed out")
            return None
        finally:
            # Remove the temp file unless it is being handed to the caller
            if not keep_frame:
                try:
                    Path(frame_path).unlink(missing_ok=True)
                except OSError:
                    pass

    except Exception as e:
        logger.debug(f"[Pushover] Error extracting video frame: {e}")
        return None
|
||||
|
||||
def send_notification(self,
                      title: str,
                      message: str,
                      priority: int = None,
                      url: str = None,
                      url_title: str = None,
                      sound: str = None,
                      device: str = None,
                      html: bool = False,
                      image_path: str = None,
                      max_retries: int = 3,
                      retry_delay: int = 5) -> bool:
    """
    Send a Pushover notification with automatic retry on transient failures

    HTTP 5xx responses and connection/timeout errors are retried with
    exponential backoff. Any other HTTP status, an API-level error, or an
    unexpected exception ends the call immediately. Every final outcome is
    counted in self.stats and passed to _record_notification.

    Args:
        title: Notification title
        message: Notification message
        priority: Priority level (-2 to 2); None uses self.default_priority
        url: Supplementary URL
        url_title: Title for the URL
        sound: Notification sound name
        device: Specific device to send to (falls back to self.device)
        html: Enable HTML formatting
        image_path: Path to image file to attach as thumbnail
        max_retries: Maximum number of attempts (default 3); values below 1
            still result in one attempt
        retry_delay: Initial retry delay in seconds, doubles each retry (default 5)

    Returns:
        True if notification sent successfully
    """
    import time

    if not self.enabled:
        logger.debug("[Pushover] Notifications disabled, skipping")
        self.stats['skipped'] += 1
        return False

    if not self.user_key or not self.api_token:
        logger.warning("[Pushover] Missing user_key or api_token")
        self.stats['failed'] += 1
        return False

    # Normalize priority
    actual_priority = priority if priority is not None else self.default_priority

    # Prepare payload
    payload = {
        'token': self.api_token,
        'user': self.user_key,
        'title': title,
        'message': message,
        'priority': actual_priority,
    }

    # Add optional parameters
    if url:
        payload['url'] = url
    if url_title:
        payload['url_title'] = url_title
    if sound:
        payload['sound'] = sound
    if device or self.device:
        payload['device'] = device or self.device
    if html:
        payload['html'] = 1

    # Attachable image extensions and the MIME type sent for each
    mime_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.webp': 'image/webp',
    }

    # Always make at least one attempt, even for max_retries <= 0
    total_attempts = max(1, max_retries)

    # Retry loop with exponential backoff
    for attempt in range(total_attempts):
        attachment = None
        try:
            # Check if we have an image to attach
            files = None
            if image_path:
                img_path = Path(image_path)
                mime_type = mime_types.get(img_path.suffix.lower())

                # Only attach if file exists and is an image
                if mime_type and img_path.exists():
                    try:
                        attachment = open(img_path, 'rb')
                        files = {'attachment': (img_path.name, attachment, mime_type)}
                        logger.debug(f"[Pushover] Attaching image: {img_path.name}")
                    except Exception as e:
                        logger.warning(f"[Pushover] Failed to attach image {image_path}: {e}")

            try:
                response = requests.post(self.API_URL, data=payload, files=files, timeout=30)
            finally:
                # Close the image even when the request raises, so failed
                # attempts do not leak one file handle each.
                if attachment is not None:
                    attachment.close()

            if response.status_code == 200:
                result = response.json()
                if result.get('status') == 1:
                    request_id = result.get('request', 'unknown')
                    if attempt > 0:
                        logger.info(f"[Pushover] Notification sent after {attempt + 1} attempt(s): {title} (request: {request_id})")
                    else:
                        logger.info(f"[Pushover] Notification sent: {title} (request: {request_id})")
                    self.stats['sent'] += 1

                    # Record to database if available and we have context
                    self._record_notification(title, message, actual_priority, 'sent', result, image_path)

                    return True
                else:
                    # API returned error status - don't retry client errors
                    logger.error(f"[Pushover] API error: {result}")
                    self.stats['failed'] += 1

                    # Record failure to database
                    self._record_notification(title, message, actual_priority, 'failed', result, image_path)

                    return False

            # Handle HTTP errors with retry logic
            elif response.status_code >= 500:
                # Server error (5xx) - retry with backoff
                if attempt < total_attempts - 1:
                    wait_time = retry_delay * (2 ** attempt)
                    logger.warning(f"[Pushover] HTTP {response.status_code}: {response.text[:100]}, retrying in {wait_time}s (attempt {attempt + 1}/{total_attempts})")
                    time.sleep(wait_time)
                    continue
                else:
                    # Max retries exceeded
                    logger.error(f"[Pushover] HTTP {response.status_code} after {total_attempts} attempts: {response.text}")
                    self.stats['failed'] += 1
                    self._record_notification(title, message, actual_priority, 'failed', {'error': f"HTTP {response.status_code} after {total_attempts} retries"}, image_path)
                    return False
            else:
                # Any other status (e.g. 4xx client error) - don't retry
                logger.error(f"[Pushover] HTTP {response.status_code}: {response.text}")
                self.stats['failed'] += 1
                self._record_notification(title, message, actual_priority, 'failed', {'error': response.text}, image_path)
                return False

        except (requests.ConnectionError, requests.Timeout) as e:
            # Network errors - retry with backoff
            if attempt < total_attempts - 1:
                wait_time = retry_delay * (2 ** attempt)
                logger.warning(f"[Pushover] Network error: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{total_attempts})")
                time.sleep(wait_time)
                continue
            else:
                # Max retries exceeded
                logger.error(f"[Pushover] Network error after {total_attempts} attempts: {e}")
                self.stats['failed'] += 1
                self._record_notification(title, message, actual_priority, 'failed', {'error': f"Network error after {total_attempts} retries: {str(e)}"}, image_path)
                return False

        except Exception as e:
            # Other exceptions - don't retry
            logger.error(f"[Pushover] Failed to send notification: {e}")
            self.stats['failed'] += 1
            self._record_notification(title, message, actual_priority, 'failed', {'error': str(e)}, image_path)
            return False

    # Should never reach here, but just in case
    return False
|
||||
|
||||
def notify_download(self,
|
||||
platform: str,
|
||||
source: str,
|
||||
content_type: str,
|
||||
filename: str = None,
|
||||
search_term: str = None,
|
||||
count: int = 1,
|
||||
metadata: Dict[str, Any] = None,
|
||||
priority: int = None) -> bool:
|
||||
"""
|
||||
Send a professional notification for a new download
|
||||
|
||||
Args:
|
||||
platform: Platform name (instagram, tiktok, forum, etc.)
|
||||
source: Username or source identifier
|
||||
content_type: Type of content (post, story, reel, thread, etc.)
|
||||
filename: Optional filename
|
||||
search_term: Optional search term (for forum searches)
|
||||
count: Number of items downloaded (default 1)
|
||||
metadata: Additional metadata dictionary
|
||||
priority: Notification priority
|
||||
|
||||
Returns:
|
||||
True if notification sent successfully
|
||||
"""
|
||||
metadata = metadata or {}
|
||||
|
||||
# Handle None content_type
|
||||
content_type = content_type or 'item'
|
||||
|
||||
# Get appropriate icons
|
||||
platform_icon = self.PLATFORM_ICONS.get(platform.lower(), '📥')
|
||||
content_icon = self.CONTENT_ICONS.get(content_type.lower(), '📄')
|
||||
|
||||
# Build title with proper grammar
|
||||
if count > 1:
|
||||
plural_type = self._pluralize(content_type, count)
|
||||
title = f"{platform_icon} {count} {plural_type.title()} Downloaded"
|
||||
else:
|
||||
title = f"{platform_icon} New {content_type.title()} Downloaded"
|
||||
|
||||
# Build message
|
||||
message_parts = []
|
||||
|
||||
# Add platform (convert service name to user-friendly platform name)
|
||||
# For forums, use forum name; for Instagram services, use "Instagram"
|
||||
platform_display = self._get_platform_display_name(platform, source)
|
||||
message_parts.append(f"📱 <b>Platform:</b> {platform_display}")
|
||||
|
||||
# Add source/username (skip for forums since source becomes the platform name)
|
||||
if source and platform.lower() != 'forums':
|
||||
message_parts.append(f"{content_icon} <b>Source:</b> {source}")
|
||||
|
||||
# Add search term if available
|
||||
if search_term:
|
||||
message_parts.append(f"🔍 <b>Search:</b> {search_term}")
|
||||
|
||||
# Add post date if available
|
||||
if metadata.get('post_date'):
|
||||
try:
|
||||
if isinstance(metadata['post_date'], str):
|
||||
post_date = datetime.fromisoformat(metadata['post_date'])
|
||||
else:
|
||||
post_date = metadata['post_date']
|
||||
date_str = post_date.strftime("%Y-%m-%d %H:%M")
|
||||
message_parts.append(f"📅 <b>Posted:</b> {date_str}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Add timestamp
|
||||
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
message_parts.append(f"⏰ <b>Downloaded:</b> {now}")
|
||||
|
||||
message = "\n".join(message_parts)
|
||||
|
||||
# Set context for database recording
|
||||
self._current_notification_context = {
|
||||
'platform': platform,
|
||||
'source': source,
|
||||
'content_type': content_type,
|
||||
'download_count': count,
|
||||
'metadata': {'search_term': search_term} if search_term else metadata
|
||||
}
|
||||
|
||||
# Determine sound based on platform or priority
|
||||
sound = None
|
||||
if priority and priority >= self.PRIORITY_HIGH:
|
||||
sound = "pushover" # Default urgent sound
|
||||
|
||||
return self.send_notification(
|
||||
title=title,
|
||||
message=message,
|
||||
priority=priority,
|
||||
sound=sound,
|
||||
html=True
|
||||
)
|
||||
|
||||
def notify_batch_download(self,
|
||||
platform: str,
|
||||
downloads: list,
|
||||
search_term: str = None,
|
||||
is_review_queue: bool = False) -> bool:
|
||||
"""
|
||||
Send notification for batch downloads
|
||||
|
||||
Args:
|
||||
platform: Platform name
|
||||
downloads: List of download dicts with keys: source, content_type, filename, file_path
|
||||
search_term: Optional search term
|
||||
is_review_queue: True if these are review queue items (no face match)
|
||||
|
||||
Returns:
|
||||
True if notification sent successfully
|
||||
"""
|
||||
if not downloads:
|
||||
return False
|
||||
|
||||
# Check if review queue notifications are disabled
|
||||
# Always check current database value for review queue notifications
|
||||
if is_review_queue:
|
||||
if self.unified_db:
|
||||
try:
|
||||
from modules.settings_manager import SettingsManager
|
||||
settings_manager = SettingsManager(str(self.unified_db.db_path))
|
||||
pushover_settings = settings_manager.get('pushover', {})
|
||||
enable_review_notifications = pushover_settings.get('enable_review_queue_notifications', True)
|
||||
if not enable_review_notifications:
|
||||
logger.debug("[Pushover] Skipping review queue notification (disabled in settings)")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning(f"[Pushover] Could not check review queue notification setting, using cached value: {e}")
|
||||
# Fall back to cached value
|
||||
if not self.enable_review_queue_notifications:
|
||||
logger.debug("[Pushover] Skipping review queue notification (disabled in cached settings)")
|
||||
return False
|
||||
else:
|
||||
# No database, use cached value
|
||||
if not self.enable_review_queue_notifications:
|
||||
logger.debug("[Pushover] Skipping review queue notification (disabled in settings)")
|
||||
return False
|
||||
|
||||
# Extract source from first download
|
||||
source = None
|
||||
if downloads and downloads[0].get('source'):
|
||||
source = downloads[0]['source']
|
||||
|
||||
# Extract content type (handle None explicitly)
|
||||
content_type = (downloads[0].get('content_type') or 'item') if downloads else 'item'
|
||||
|
||||
# Collect all media file paths for the notification database record
|
||||
image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.heic', '.heif', '.avif', '.tiff', '.tif'}
|
||||
video_extensions = {'.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v', '.flv'}
|
||||
audio_extensions = {'.mp3', '.wav', '.flac', '.aac', '.m4a', '.ogg', '.wma'}
|
||||
all_media_paths = []
|
||||
|
||||
for dl in downloads:
|
||||
file_path = dl.get('file_path')
|
||||
if file_path and Path(file_path).exists():
|
||||
suffix = Path(file_path).suffix.lower()
|
||||
ct = dl.get('content_type', '').lower()
|
||||
if ct == 'audio' or suffix in audio_extensions:
|
||||
media_type = 'audio'
|
||||
elif ct == 'image' or suffix in image_extensions:
|
||||
media_type = 'image'
|
||||
elif ct == 'video' or suffix in video_extensions:
|
||||
media_type = 'video'
|
||||
else:
|
||||
continue
|
||||
all_media_paths.append({
|
||||
'file_path': file_path,
|
||||
'filename': dl.get('filename', Path(file_path).name),
|
||||
'media_type': media_type
|
||||
})
|
||||
|
||||
# Set context for database recording with all media files
|
||||
metadata = {}
|
||||
if search_term:
|
||||
metadata['search_term'] = search_term
|
||||
if all_media_paths:
|
||||
metadata['media_files'] = all_media_paths # Store all media files for notifications page
|
||||
|
||||
self._current_notification_context = {
|
||||
'platform': platform,
|
||||
'source': source,
|
||||
'content_type': content_type,
|
||||
'download_count': len(downloads),
|
||||
'metadata': metadata if metadata else None
|
||||
}
|
||||
|
||||
# Use different icon for review queue
|
||||
if is_review_queue:
|
||||
platform_icon = "👁️" # Eye icon for review
|
||||
else:
|
||||
platform_icon = self.PLATFORM_ICONS.get(platform.lower(), '📥')
|
||||
|
||||
# Group by content type
|
||||
by_type = {}
|
||||
for dl in downloads:
|
||||
content_type = dl.get('content_type') or 'item' # Handle None explicitly
|
||||
by_type.setdefault(content_type, []).append(dl)
|
||||
|
||||
# Build title with proper grammar
|
||||
total = len(downloads)
|
||||
if is_review_queue:
|
||||
# Review queue notification
|
||||
if len(by_type) == 1:
|
||||
content_type = list(by_type.keys())[0]
|
||||
plural_type = self._pluralize(content_type, total)
|
||||
title = f"{platform_icon} {total} {plural_type.title()} - Review Queue"
|
||||
else:
|
||||
title = f"{platform_icon} {total} Items - Review Queue"
|
||||
else:
|
||||
# Regular download notification
|
||||
if len(by_type) == 1:
|
||||
# Single content type - use specific name
|
||||
content_type = list(by_type.keys())[0]
|
||||
plural_type = self._pluralize(content_type, total)
|
||||
title = f"{platform_icon} {total} {plural_type.title()} Downloaded"
|
||||
else:
|
||||
# Multiple content types - use "Items"
|
||||
title = f"{platform_icon} {total} Items Downloaded"
|
||||
|
||||
# Build message
|
||||
message_parts = []
|
||||
|
||||
# Extract source from first download since they're all from same source
|
||||
source = None
|
||||
if downloads and downloads[0].get('source'):
|
||||
source = downloads[0]['source']
|
||||
|
||||
# Add platform (convert service name to user-friendly platform name)
|
||||
# For forums, use forum name; for Instagram services, use "Instagram"
|
||||
platform_display = self._get_platform_display_name(platform, source)
|
||||
message_parts.append(f"📱 <b>Platform:</b> {platform_display}")
|
||||
|
||||
# Add source/username (skip for forums since source becomes the platform name)
|
||||
if source and platform.lower() != 'forums':
|
||||
# Get content icon for the primary content type
|
||||
primary_content_type = list(by_type.keys())[0] if by_type else 'item'
|
||||
content_icon = self.CONTENT_ICONS.get(primary_content_type.lower(), '📄')
|
||||
message_parts.append(f"{content_icon} <b>Source:</b> {source}")
|
||||
|
||||
if search_term:
|
||||
message_parts.append(f"🔍 <b>Search:</b> {search_term}")
|
||||
|
||||
# Add review queue notice if applicable
|
||||
if is_review_queue:
|
||||
message_parts.append(f"\n⚠️ <b>No face match detected</b> - Items moved to review queue for manual review")
|
||||
|
||||
# Summary by type (only show if multiple types)
|
||||
if len(by_type) > 1:
|
||||
message_parts.append(f"\n<b>Breakdown:</b>")
|
||||
for content_type, items in by_type.items():
|
||||
content_icon = self.CONTENT_ICONS.get(content_type.lower(), '📄')
|
||||
count = len(items)
|
||||
plural_type = self._pluralize(content_type, count)
|
||||
message_parts.append(f"{content_icon} {count} {plural_type}")
|
||||
|
||||
# Add timestamp
|
||||
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
message_parts.append(f"\n⏰ <b>Downloaded:</b> {now}")
|
||||
|
||||
message = "\n".join(message_parts)
|
||||
|
||||
# Select a random file for thumbnail attachment (if enabled)
|
||||
# Can be an image or video (extract random frame from video)
|
||||
import random
|
||||
image_path = None
|
||||
temp_frame_path = None # Track temporary frame extractions
|
||||
|
||||
if self.include_image:
|
||||
image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'}
|
||||
video_extensions = {'.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v'}
|
||||
|
||||
# Collect all valid media file paths (images and videos)
|
||||
media_files = []
|
||||
for dl in downloads:
|
||||
file_path = dl.get('file_path')
|
||||
if file_path:
|
||||
exists = Path(file_path).exists()
|
||||
if exists:
|
||||
suffix = Path(file_path).suffix.lower()
|
||||
if suffix in image_extensions or suffix in video_extensions:
|
||||
media_files.append(file_path)
|
||||
else:
|
||||
logger.debug(f"[Pushover] Skipping file (invalid extension): {Path(file_path).name} ({suffix})")
|
||||
else:
|
||||
logger.warning(f"[Pushover] Skipping file (doesn't exist): {file_path}")
|
||||
else:
|
||||
logger.warning(f"[Pushover] Download entry has no file_path")
|
||||
|
||||
logger.debug(f"[Pushover] Found {len(media_files)} valid media files out of {len(downloads)} downloads")
|
||||
|
||||
# Randomly select one file if available
|
||||
if media_files:
|
||||
selected_file = random.choice(media_files)
|
||||
selected_suffix = Path(selected_file).suffix.lower()
|
||||
|
||||
if selected_suffix in image_extensions:
|
||||
# It's an image, use directly
|
||||
image_path = selected_file
|
||||
logger.debug(f"[Pushover] Selected image thumbnail: {Path(image_path).name}")
|
||||
|
||||
elif selected_suffix in video_extensions:
|
||||
# It's a video, extract a random frame
|
||||
logger.info(f"[Pushover] Selected video for thumbnail, extracting random frame: {Path(selected_file).name}")
|
||||
temp_frame_path = self._extract_random_video_frame(selected_file)
|
||||
if temp_frame_path:
|
||||
image_path = temp_frame_path
|
||||
logger.info(f"[Pushover] Successfully extracted video frame for thumbnail: {Path(temp_frame_path).name}")
|
||||
else:
|
||||
logger.warning("[Pushover] Failed to extract frame from video - notification will be sent without thumbnail")
|
||||
else:
|
||||
logger.debug("[Pushover] No media files available for thumbnail attachment")
|
||||
else:
|
||||
logger.debug("[Pushover] Image thumbnails disabled in settings")
|
||||
|
||||
# Send notification with lower priority for review queue
|
||||
priority = -1 if is_review_queue else None # Low priority for review queue
|
||||
result = self.send_notification(
|
||||
title=title,
|
||||
message=message,
|
||||
html=True,
|
||||
image_path=image_path,
|
||||
priority=priority
|
||||
)
|
||||
|
||||
# Clean up temporary frame file if we created one
|
||||
if temp_frame_path and Path(temp_frame_path).exists():
|
||||
try:
|
||||
Path(temp_frame_path).unlink()
|
||||
logger.debug(f"[Pushover] Cleaned up temp frame: {Path(temp_frame_path).name}")
|
||||
except Exception as e:
|
||||
logger.debug(f"[Pushover] Failed to cleanup temp frame: {e}")
|
||||
|
||||
return result
|
||||
|
||||
def notify_error(self, platform: str, error_message: str, source: str = None) -> bool:
    """
    Send a high-priority error notification with the "siren" sound.

    Args:
        platform: Service/platform name, resolved to a user-facing name
            through ``_get_platform_display_name``
        error_message: Error description placed after the "Error:" label
        source: Optional source/username

    Returns:
        The result of ``send_notification``
    """
    # Resolve the user-facing platform name once; it is used in both title and body
    display_name = self._get_platform_display_name(platform, source)

    # Forums already surface the source as the platform name, so the
    # separate "Source" line is only emitted for other platforms.
    show_source = bool(source) and platform.lower() != 'forums'

    body_lines = [f"<b>Platform:</b> {display_name}"]
    if show_source:
        body_lines.append(f"<b>Source:</b> {source}")
    body_lines.append(f"\n<b>Error:</b> {error_message}")

    return self.send_notification(
        title=f"⚠️ {display_name} Download Error",
        message="\n".join(body_lines),
        priority=self.PRIORITY_HIGH,
        sound="siren",
        html=True
    )
|
||||
|
||||
def get_stats(self) -> Dict[str, int]:
    """Return a shallow copy of the notification counters held in ``self.stats``."""
    snapshot = self.stats.copy()
    return snapshot
|
||||
|
||||
def reset_stats(self):
    """Replace ``self.stats`` with a fresh dict whose counters are all zero."""
    self.stats = dict.fromkeys(('sent', 'failed', 'skipped'), 0)
|
||||
|
||||
|
||||
def create_notifier_from_config(config: Dict, unified_db=None) -> Optional[PushoverNotifier]:
    """
    Build a PushoverNotifier from the ``pushover`` section of a config dict.

    Args:
        config: Configuration dict; only its ``pushover`` key is read
        unified_db: UnifiedDatabase instance for recording notifications (optional)

    Returns:
        PushoverNotifier instance, or None when the section is disabled or
        lacks ``user_key`` / ``api_token``
    """
    settings = config.get('pushover', {})

    is_enabled = settings.get('enabled', False)
    if not is_enabled:
        logger.info("[Pushover] Notifications disabled in config")
        return None

    # Both credentials are mandatory; bail out if either one is empty or absent
    credentials = {
        'user_key': settings.get('user_key'),
        'api_token': settings.get('api_token'),
    }
    if not all(credentials.values()):
        logger.warning("[Pushover] Missing user_key or api_token in config")
        return None

    return PushoverNotifier(
        enabled=True,
        default_priority=settings.get('priority', 0),
        device=settings.get('device'),
        include_image=settings.get('include_image', True),
        unified_db=unified_db,
        enable_review_queue_notifications=settings.get('enable_review_queue_notifications', True),
        **credentials
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test for the notifier
    print("Testing Pushover Notifier...")

    # Placeholder credentials - replace with your actual ones.
    # The notifier is created disabled; set enabled=True to test for real.
    demo_credentials = {
        "user_key": "YOUR_USER_KEY",
        "api_token": "YOUR_API_TOKEN",
    }
    notifier = PushoverNotifier(enabled=False, **demo_credentials)

    # Sample single-download notification
    sample_download = dict(
        platform="instagram",
        source="evalongoria",
        content_type="story",
        filename="evalongoria_story_20251018.mp4",
        metadata={'post_date': datetime.now()},
    )
    notifier.notify_download(**sample_download)

    print(f"Stats: {notifier.get_stats()}")
|
||||
1612
modules/reddit_community_monitor.py
Normal file
1612
modules/reddit_community_monitor.py
Normal file
File diff suppressed because it is too large
Load Diff
3243
modules/scheduler.py
Executable file
3243
modules/scheduler.py
Executable file
File diff suppressed because it is too large
Load Diff
194
modules/scraper_event_emitter.py
Normal file
194
modules/scraper_event_emitter.py
Normal file
@@ -0,0 +1,194 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Thread-safe WebSocket event emitter for scraper monitoring
|
||||
|
||||
Provides real-time events for the scraping monitor page:
|
||||
- Scraper sessions starting/completing
|
||||
- File downloads and movements
|
||||
- Progress updates
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
|
||||
class ScraperEventEmitter:
    """
    Emits WebSocket events for real-time scraper monitoring.

    Every ``emit_*`` method builds a ``{'type': ..., 'data': ...}`` message and
    hands it to ``websocket_manager.broadcast_sync``. When an ``app_state``
    exposing ``active_scraper_sessions`` is attached, the started/progress/
    completed events also keep that mapping in sync.
    """

    def __init__(self, websocket_manager=None, app_state=None):
        """
        Initialize event emitter

        Args:
            websocket_manager: WebSocket connection manager (optional);
                without one, events are built but not sent
            app_state: Application state for tracking active sessions (optional)
        """
        self.websocket_manager = websocket_manager
        self.app_state = app_state

    def _tracks_sessions(self) -> bool:
        """Return True when an app_state with ``active_scraper_sessions`` is attached."""
        state = self.app_state
        return bool(state) and hasattr(state, 'active_scraper_sessions')

    def emit_scraper_started(self, session_id: str, platform: str, account: str,
                             content_type: str, estimated_count: int = 0, accounts_list: list = None):
        """
        Emit when scraper session begins

        Args:
            session_id: Unique session identifier
            platform: Platform name (instagram, snapchat, etc.)
            account: Account/username being scraped (or comma-separated list)
            content_type: Type of content (stories, posts, etc.)
            estimated_count: Estimated number of items to download
            accounts_list: Optional list of all accounts to be processed
        """
        payload = dict(
            session_id=session_id,
            platform=platform,
            account=account,
            content_type=content_type,
            estimated_count=estimated_count,
            timestamp=datetime.now().isoformat(),
        )

        # Only attach the accounts list when one was actually supplied
        if accounts_list:
            payload['accounts_list'] = accounts_list

        # Register the session for API retrieval, mirroring the scheduler's structure
        if self._tracks_sessions():
            session_record = dict(
                session_id=session_id,
                platform=platform,
                account=account,
                content_type=content_type,
                start_time=datetime.now().isoformat(),
                status='Starting...',
                detailed_status='Starting...',
                # An estimate of 0 (or None) falls back to a total of 100
                progress={'current': 0, 'total': estimated_count or 100},
                stats={'media': 0, 'review': 0, 'failed': 0},
            )
            self.app_state.active_scraper_sessions[session_id] = session_record

        self._broadcast({'type': 'scraper_started', 'data': payload})

    def emit_scraper_progress(self, session_id: str, status: str,
                              current: int, total: int, current_account: str = None,
                              completed_accounts: list = None):
        """
        Emit progress update

        Args:
            session_id: Session identifier
            status: Status message (e.g., "Downloading stories...")
            current: Current item count
            total: Total item count
            current_account: Currently active account/forum name (optional)
            completed_accounts: List of completed accounts (optional)
        """
        payload = dict(
            session_id=session_id,
            status=status,
            progress_current=current,
            progress_total=total,
            timestamp=datetime.now().isoformat(),
        )

        # Optional fields are only included when provided
        if current_account:
            payload['current_account'] = current_account
        if completed_accounts:
            payload['completed_accounts'] = completed_accounts

        # Mirror the update into the tracked session, if that session is known
        if self._tracks_sessions():
            sessions = self.app_state.active_scraper_sessions
            if session_id in sessions:
                tracked = sessions[session_id]
                tracked['status'] = status
                tracked['detailed_status'] = status
                if current_account:
                    tracked['account'] = current_account
                # Nested progress structure matches the scheduler's layout
                tracked['progress'] = {'current': current, 'total': total}
                if completed_accounts:
                    tracked['completed_accounts'] = completed_accounts

        self._broadcast({'type': 'scraper_progress', 'data': payload})

    def emit_scraper_completed(self, session_id: str, stats: Dict[str, int]):
        """
        Emit when scraper session completes

        Args:
            session_id: Session identifier
            stats: Statistics dict with keys: total_downloaded, moved, review, duplicates, failed
        """
        # The finished session no longer counts as active
        if self._tracks_sessions():
            self.app_state.active_scraper_sessions.pop(session_id, None)

        payload = {
            'session_id': session_id,
            'stats': stats,
            'timestamp': datetime.now().isoformat()
        }
        self._broadcast({'type': 'scraper_completed', 'data': payload})

    def emit_file_moved(self, session_id: str, platform: str, account: str,
                        filename: str, media_type: str, destination_type: str,
                        destination_path: str, thumbnail_url: str = None,
                        face_match: Dict[str, Any] = None):
        """
        Emit when file is moved to destination

        Args:
            session_id: Session identifier
            platform: Platform name
            account: Account/username
            filename: File name
            media_type: 'image' or 'video'
            destination_type: 'media', 'review', or 'recycle'
            destination_path: Full path to destination file
            thumbnail_url: URL to thumbnail (optional)
            face_match: Face recognition result dict (optional); an empty or
                missing value is reported as ``{'matched': False}``
        """
        payload = {
            'session_id': session_id,
            'platform': platform,
            'account': account,
            'filename': filename,
            'media_type': media_type,
            'destination_type': destination_type,
            'destination_path': destination_path,
            'thumbnail_url': thumbnail_url,
            'face_match': face_match or {'matched': False},
            'timestamp': datetime.now().isoformat()
        }
        self._broadcast({'type': 'file_moved', 'data': payload})

    def _broadcast(self, message: dict):
        """
        Forward a message to WebSocket clients via the manager's ``broadcast_sync``

        Args:
            message: Event message dict
        """
        manager = self.websocket_manager
        if not manager:
            return
        # broadcast_sync is the manager's entry point for emission from background threads
        manager.broadcast_sync(message)
|
||||
652
modules/scraper_gallery_bridge.py
Normal file
652
modules/scraper_gallery_bridge.py
Normal file
@@ -0,0 +1,652 @@
|
||||
"""
|
||||
Scraper Gallery Bridge
|
||||
|
||||
Maps scraper accounts (Instagram, TikTok, Snapchat) to private gallery persons.
|
||||
After each download session, auto-imports new media as gallery posts.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import mimetypes
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import tempfile
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)

# Key file read by get_crypto() to load the gallery crypto for background imports
SCRAPER_BRIDGE_KEY_FILE = '/opt/immich/private/.scraper_bridge_key'

# Map scraper module names → platform
SCRAPER_TO_PLATFORM = dict(
    fastdl='instagram',
    imginn='instagram',
    imginn_api='instagram',
    instagram_client='instagram',
    toolzu='instagram',
    instagram='instagram',
    instagram_unified='instagram',
    tiktok='tiktok',
    snapchat='snapchat',
    snapchat_client='snapchat',
)

# Colour stored with the auto-created gallery tag of each platform
PLATFORM_COLORS = dict(
    instagram='#E1306C',
    tiktok='#00f2ea',
    snapchat='#FFFC00',
)

# Human-readable platform names, used for tag names and post descriptions
PLATFORM_LABELS = dict(
    instagram='Instagram',
    tiktok='TikTok',
    snapchat='Snapchat',
)
|
||||
|
||||
|
||||
def get_crypto():
    """
    Load the crypto object from the bridge key file for background access
    (works when gallery is locked).

    Returns:
        Whatever ``load_key_from_file`` yields for SCRAPER_BRIDGE_KEY_FILE;
        None is logged at debug level and returned as-is.
    """
    from modules.private_gallery_crypto import load_key_from_file

    bridge_crypto = load_key_from_file(SCRAPER_BRIDGE_KEY_FILE)
    if bridge_crypto is not None:
        return bridge_crypto

    logger.debug("Scraper bridge crypto unavailable - key file missing or invalid")
    return None
|
||||
|
||||
|
||||
def get_available_accounts(platform: str, config: dict, db) -> List[Dict[str, Any]]:
    """
    Aggregate usernames from all scraper configs + paid_content_creators for a platform.

    Usernames are stripped and lower-cased before de-duplication. Empty,
    None or otherwise falsy entries are skipped instead of raising.

    Args:
        platform: 'instagram', 'tiktok' or 'snapchat'; any other value only
            consults the paid_content_creators table
        config: Application config dict holding the per-scraper sections
        db: Object exposing ``db_path`` (path handed to ``sqlite3.connect``)

    Returns:
        List sorted by username of dicts with keys ``username``, ``sources``
        (sorted list of config sections / 'paid_content' that reference the
        user) and ``is_mapped`` (True when the user already has a row in
        private_media_scraper_accounts for this platform).
    """
    accounts: Dict[str, set] = {}  # username -> set of sources

    def _add(raw_username, source_name: str) -> None:
        # Config and DB values may be None or non-string (e.g. a numeric
        # YAML handle); coerce instead of crashing on .strip()
        username = str(raw_username or '').strip().lower()
        if username:
            accounts.setdefault(username, set()).add(source_name)

    def _add_account_entries(section: dict, source_name: str) -> None:
        # Sections shaped as accounts[].username, with a legacy fallback to
        # a flat 'usernames' list when no accounts are configured
        if not section.get('enabled', False):
            return
        entries = section.get('accounts', [])
        if not entries and 'usernames' in section:
            entries = [{'username': u} for u in section['usernames']]
        for entry in entries:
            # Tolerate a bare string where a {'username': ...} dict is expected
            _add(entry.get('username', '') if isinstance(entry, dict) else entry, source_name)

    def _add_username_list(section: dict, source_name: str, with_phrase_search: bool = False) -> None:
        # Sections shaped as a flat 'usernames' list
        if not section.get('enabled', False):
            return
        for u in section.get('usernames', []):
            _add(u, source_name)
        if with_phrase_search:
            # phrase_search usernames are also downloadable accounts
            for u in section.get('phrase_search', {}).get('usernames', []):
                _add(u, source_name)

    def _query_usernames(query: str) -> list:
        # Short-lived connection that is closed even when the query raises
        # (previously a failing query leaked the connection)
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            cursor.execute(query, (platform,))
            return [row['username'] for row in cursor.fetchall()]
        finally:
            conn.close()

    if platform == 'instagram':
        _add_account_entries(config.get('instagram', {}), 'instagram')
        for scraper_id in ('fastdl', 'imginn', 'imginn_api', 'instagram_client', 'toolzu'):
            _add_username_list(config.get(scraper_id, {}), scraper_id, with_phrase_search=True)
    elif platform == 'tiktok':
        _add_account_entries(config.get('tiktok', {}), 'tiktok')
    elif platform == 'snapchat':
        _add_username_list(config.get('snapchat', {}), 'snapchat')
        _add_username_list(config.get('snapchat_client', {}), 'snapchat_client')

    # Add from paid_content_creators table (best effort: the table may not exist)
    try:
        for name in _query_usernames(
            'SELECT username FROM paid_content_creators WHERE platform = ? AND enabled = 1'
        ):
            _add(name, 'paid_content')
    except Exception as e:
        logger.debug(f"Could not query paid_content_creators: {e}")

    # Check which are already mapped (best effort as well)
    mapped_usernames = set()
    try:
        mapped_usernames = {
            str(name).lower()
            for name in _query_usernames(
                'SELECT username FROM private_media_scraper_accounts WHERE platform = ?'
            )
            if name
        }
    except Exception as e:
        logger.debug(f"Could not query private_media_scraper_accounts: {e}")

    return [
        {
            'username': username,
            'sources': sorted(sources),
            'is_mapped': username in mapped_usernames,
        }
        for username, sources in sorted(accounts.items())
    ]
|
||||
|
||||
|
||||
def _ensure_platform_tag(platform: str, db, crypto) -> int:
    """
    Find or create a tag for the platform in private_gallery_tags.

    Tag names are stored encrypted, so every row is decrypted and compared
    case-insensitively against the platform label. Rows that fail to decrypt
    are skipped. When nothing matches, a new tag is inserted with the
    platform colour ('#888888' for unknown platforms).

    Returns:
        The id of the existing or newly created tag.
    """
    conn = sqlite3.connect(db.db_path, timeout=10)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT id, encrypted_name FROM private_gallery_tags")
        label = PLATFORM_LABELS.get(platform, platform.title())
        wanted = label.lower()

        for tag_row in cursor.fetchall():
            try:
                is_match = crypto.decrypt_field(tag_row['encrypted_name']).lower() == wanted
            except Exception:
                # Undecryptable tag names simply cannot be this platform's tag
                continue
            if is_match:
                return tag_row['id']

        # No existing tag matched - create the tag
        cursor.execute('''
            INSERT INTO private_gallery_tags (encrypted_name, color)
            VALUES (?, ?)
        ''', (crypto.encrypt_field(label), PLATFORM_COLORS.get(platform, '#888888')))
        conn.commit()
        tag_id = cursor.lastrowid
        logger.info(f"Created '{label}' tag with ID {tag_id}")
        return tag_id
    finally:
        conn.close()
|
||||
|
||||
|
||||
def _get_file_info(file_path: Path) -> Dict[str, Any]:
|
||||
"""Get file type, mime type, and dimensions."""
|
||||
ext = file_path.suffix.lower().lstrip('.')
|
||||
mime_type, _ = mimetypes.guess_type(str(file_path))
|
||||
if not mime_type:
|
||||
mime_type = 'application/octet-stream'
|
||||
|
||||
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
|
||||
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
|
||||
|
||||
if ext in image_exts:
|
||||
file_type = 'image'
|
||||
elif ext in video_exts:
|
||||
file_type = 'video'
|
||||
else:
|
||||
file_type = 'other'
|
||||
|
||||
width, height, duration = 0, 0, 0
|
||||
|
||||
if file_type == 'image':
|
||||
try:
|
||||
from PIL import Image
|
||||
with Image.open(file_path) as img:
|
||||
width, height = img.size
|
||||
except Exception:
|
||||
pass
|
||||
elif file_type == 'video':
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_streams', str(file_path)],
|
||||
capture_output=True, text=True, timeout=15
|
||||
)
|
||||
if result.returncode == 0:
|
||||
import json
|
||||
probe = json.loads(result.stdout)
|
||||
for stream in probe.get('streams', []):
|
||||
if stream.get('codec_type') == 'video':
|
||||
width = int(stream.get('width', 0))
|
||||
height = int(stream.get('height', 0))
|
||||
dur = stream.get('duration')
|
||||
if dur:
|
||||
duration = int(float(dur))
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {
|
||||
'file_type': file_type,
|
||||
'mime_type': mime_type,
|
||||
'width': width,
|
||||
'height': height,
|
||||
'duration': duration,
|
||||
}
|
||||
|
||||
|
||||
def _compute_perceptual_hash(file_path: Path) -> Optional[str]:
|
||||
"""Calculate perceptual hash for an image or video file."""
|
||||
try:
|
||||
import imagehash
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
return None
|
||||
|
||||
ext = file_path.suffix.lower().lstrip('.')
|
||||
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
|
||||
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
|
||||
|
||||
pil_image = None
|
||||
try:
|
||||
if ext in video_exts:
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
return None
|
||||
cap = cv2.VideoCapture(str(file_path))
|
||||
if not cap.isOpened():
|
||||
return None
|
||||
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
cap.set(cv2.CAP_PROP_POS_FRAMES, int(total_frames * 0.5))
|
||||
ret, frame = cap.read()
|
||||
cap.release()
|
||||
if not ret or frame is None:
|
||||
return None
|
||||
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
||||
pil_image = Image.fromarray(frame_rgb)
|
||||
elif ext in image_exts:
|
||||
pil_image = Image.open(file_path)
|
||||
else:
|
||||
return None
|
||||
|
||||
return str(imagehash.dhash(pil_image, hash_size=16))
|
||||
except Exception:
|
||||
return None
|
||||
finally:
|
||||
if pil_image:
|
||||
try:
|
||||
pil_image.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _generate_thumbnail(file_path: Path, output_path: Path, file_type: str) -> bool:
|
||||
"""Generate a thumbnail for an image or video."""
|
||||
try:
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if file_type == 'image':
|
||||
from PIL import Image, ImageOps
|
||||
with Image.open(file_path) as img:
|
||||
img = ImageOps.exif_transpose(img)
|
||||
img.thumbnail((400, 400))
|
||||
if img.mode in ('RGBA', 'P'):
|
||||
img = img.convert('RGB')
|
||||
img.save(output_path, 'JPEG', quality=85)
|
||||
return True
|
||||
|
||||
elif file_type == 'video':
|
||||
result = subprocess.run([
|
||||
'ffmpeg', '-y', '-i', str(file_path),
|
||||
'-ss', '00:00:01', '-vframes', '1',
|
||||
'-vf', 'scale=400:-1:force_original_aspect_ratio=decrease',
|
||||
str(output_path)
|
||||
], capture_output=True, timeout=30)
|
||||
return result.returncode == 0 and output_path.exists()
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
|
||||
|
||||
def _fetch_pending_inventory_rows(platform: str, username: str,
                                  last_imported_at: Optional[str],
                                  last_imported_file_id: int, db) -> list:
    """Return file_inventory rows newer than the account's import marker, ordered by id."""
    # Use id-based filtering (reliable, monotonically increasing with insertion order).
    # Falls back to created_date only for legacy accounts without last_imported_file_id.
    if last_imported_file_id and last_imported_file_id > 0:
        marker_column, marker = 'id', last_imported_file_id
    elif last_imported_at:
        marker_column, marker = 'created_date', last_imported_at
    else:
        # First run: only import files from the last 1 hour
        from datetime import timedelta
        marker_column = 'created_date'
        marker = (datetime.now() - timedelta(hours=1)).isoformat()

    # marker_column is one of the two hard-coded identifiers above, never caller input
    query = f'''
        SELECT id, file_path, filename, created_date FROM file_inventory
        WHERE platform = ? AND source = ? AND {marker_column} > ?
        AND location IN ('final', 'review')
        ORDER BY id ASC
    '''
    conn = sqlite3.connect(db.db_path, timeout=30)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        cursor.execute(query, (platform, username, marker))
        return cursor.fetchall()
    finally:
        conn.close()


def _execute_write(db, sql: str, params: tuple = ()) -> Optional[int]:
    """Run one write statement on a short-lived connection, commit, and return lastrowid."""
    conn = sqlite3.connect(db.db_path, timeout=10)
    try:
        cursor = conn.cursor()
        cursor.execute(sql, params)
        conn.commit()
        return cursor.lastrowid
    finally:
        conn.close()


def import_new_media(platform: str, username: str, person_id: int,
                     last_imported_at: Optional[str], db, crypto,
                     last_imported_file_id: int = 0) -> int:
    """
    Import new media files from file_inventory into the private gallery.

    Rows newer than the account's marker are selected (by id when
    ``last_imported_file_id`` is set, else by ``last_imported_at``, else the
    last hour). Files that still exist and are non-empty are hashed,
    de-duplicated per person, encrypted into ``<storage>/data`` with an
    encrypted thumbnail in ``<storage>/thumbs``, and recorded in
    private_media under one new post per batch. The post is tagged with the
    platform tag, or deleted again when nothing was imported. Finally the
    account's markers are advanced.

    A failure on one file is logged and does not stop the batch.

    Args:
        platform: Platform name as stored in file_inventory.platform
        username: Account name as stored in file_inventory.source
        person_id: Gallery person receiving the media
        last_imported_at: created_date marker of the previous import (legacy)
        db: Object exposing ``db_path``
        crypto: Object providing ``encrypt_field`` and ``encrypt_file``
        last_imported_file_id: file_inventory id marker of the previous import

    Returns:
        Count of imported files.
    """
    files = _fetch_pending_inventory_rows(
        platform, username, last_imported_at, last_imported_file_id, db)
    if not files:
        return 0

    # Filter to existing files, track max id for updating last_imported_file_id
    valid_files = []
    max_file_id = last_imported_file_id or 0
    for inventory_row in files:
        fp = Path(inventory_row['file_path'])
        file_id = inventory_row['id']
        max_file_id = max(max_file_id, file_id)
        if fp.exists() and fp.stat().st_size > 0:
            valid_files.append({'path': fp, 'created_date': inventory_row['created_date'], 'id': file_id})

    if not valid_files:
        return 0

    # Get storage path
    conn = sqlite3.connect(db.db_path, timeout=10)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT value FROM private_media_config WHERE key = 'storage_path'")
        config_row = cursor.fetchone()
        storage_path = Path(config_row['value']) if config_row else Path('/opt/immich/private')
    finally:
        conn.close()

    data_path = storage_path / 'data'
    thumbs_path = storage_path / 'thumbs'
    data_path.mkdir(parents=True, exist_ok=True)
    thumbs_path.mkdir(parents=True, exist_ok=True)

    # Get/create platform tag
    tag_id = _ensure_platform_tag(platform, db, crypto)

    # Create a post for this batch (removed again below if nothing gets imported)
    now_iso = datetime.now().isoformat()
    encrypted_desc = crypto.encrypt_field(f"{PLATFORM_LABELS.get(platform, platform)} - @{username}")
    encrypted_date = crypto.encrypt_field(now_iso)
    post_id = _execute_write(db, '''
        INSERT INTO private_media_posts (person_id, encrypted_description, encrypted_media_date, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?)
    ''', (person_id, encrypted_desc, encrypted_date, now_iso, now_iso))

    media_count = 0
    latest_date = last_imported_at

    for file_info_entry in valid_files:
        file_path = file_info_entry['path']
        created_date = file_info_entry['created_date']
        # Normalize to string for consistent comparison (PostgreSQL returns datetime objects)
        if hasattr(created_date, 'isoformat'):
            created_date = created_date.isoformat()
        try:
            # Calculate file hash
            sha256 = hashlib.sha256()
            with open(file_path, 'rb') as f:
                for chunk in iter(lambda: f.read(65536), b''):
                    sha256.update(chunk)
            file_hash = sha256.hexdigest()

            # Check for duplicates (scoped by person)
            conn = sqlite3.connect(db.db_path, timeout=10)
            conn.row_factory = sqlite3.Row
            try:
                cursor = conn.cursor()
                cursor.execute(
                    'SELECT id FROM private_media WHERE file_hash = ? AND person_id = ?',
                    (file_hash, person_id)
                )
                if cursor.fetchone():
                    logger.debug(f"Duplicate file skipped: {file_path.name}")
                    # A duplicate still advances the date marker
                    if created_date and (not latest_date or created_date > latest_date):
                        latest_date = created_date
                    continue
            finally:
                conn.close()

            # Get file info
            finfo = _get_file_info(file_path)
            file_size = file_path.stat().st_size

            # Compute perceptual hash
            perceptual_hash = _compute_perceptual_hash(file_path)

            # Generate storage ID
            storage_id = str(uuid.uuid4())

            temp_thumb = Path(tempfile.gettempdir()) / f"pg_thumb_{storage_id}.jpg"
            try:
                # Generate thumbnail
                _generate_thumbnail(file_path, temp_thumb, finfo['file_type'])

                # Encrypt the file
                encrypted_file = data_path / f"{storage_id}.enc"
                if not crypto.encrypt_file(file_path, encrypted_file):
                    logger.error(f"Encryption failed for {file_path.name}")
                    continue

                # Encrypt thumbnail
                if temp_thumb.exists():
                    crypto.encrypt_file(temp_thumb, thumbs_path / f"{storage_id}.enc")
            finally:
                # The plaintext thumbnail must never outlive this iteration.
                # Previously it stayed in the temp dir whenever encryption
                # failed or raised.
                try:
                    temp_thumb.unlink()
                except OSError:
                    pass  # never created or already gone

            # Insert media record
            encrypted_filename = crypto.encrypt_field(file_path.name)
            encrypted_source = crypto.encrypt_field(f"@{username}")
            _execute_write(db, '''
                INSERT INTO private_media (
                    post_id, storage_id, encrypted_filename, encrypted_description,
                    file_hash, file_size, file_type, mime_type,
                    width, height, duration, person_id,
                    encrypted_media_date, source_type, encrypted_source_path,
                    perceptual_hash, created_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                post_id,
                storage_id,
                encrypted_filename,
                None,
                file_hash,
                file_size,
                finfo['file_type'],
                finfo['mime_type'],
                finfo['width'],
                finfo['height'],
                finfo['duration'],
                person_id,
                encrypted_date,
                platform,
                encrypted_source,
                perceptual_hash,
                now_iso,
            ))

            media_count += 1
            if created_date and (not latest_date or created_date > latest_date):
                latest_date = created_date

        except Exception as e:
            logger.error(f"Failed to import {file_path.name}: {e}")

    account_key = (platform, username, person_id)

    if media_count > 0:
        # Apply platform tag to the post
        _execute_write(db, '''
            INSERT OR IGNORE INTO private_media_post_tags (post_id, tag_id)
            VALUES (?, ?)
        ''', (post_id, tag_id))

        # Update the mapping row with both timestamp and file id markers
        _execute_write(db, '''
            UPDATE private_media_scraper_accounts
            SET last_imported_at = ?,
                last_imported_file_id = ?,
                total_media_imported = total_media_imported + ?,
                updated_at = ?
            WHERE platform = ? AND username = ? AND person_id = ?
        ''', (latest_date or now_iso, max_file_id, media_count, now_iso) + account_key)

        logger.info(f"Imported {media_count} files from {platform}/@{username} to gallery (last_file_id={max_file_id})")
    else:
        # No media imported - still update the file id marker so we don't re-check these files
        if max_file_id > (last_imported_file_id or 0):
            _execute_write(db, '''
                UPDATE private_media_scraper_accounts
                SET last_imported_file_id = ?
                WHERE platform = ? AND username = ? AND person_id = ?
            ''', (max_file_id,) + account_key)

        # Delete the empty post
        _execute_write(db, "DELETE FROM private_media_posts WHERE id = ?", (post_id,))

    return media_count
|
||||
|
||||
|
||||
def on_download_complete(task_id: str, download_count: int, db, crypto) -> int:
    """
    Scheduler hook: import newly downloaded media once a task has finished.

    The scraper module is the part of ``task_id`` before the first ``:``
    (``fastdl`` in ``fastdl:all``). It is translated to a platform through
    ``SCRAPER_TO_PLATFORM`` and every mapped account of that platform is then
    checked, which covers:
    - Batch tasks (fastdl:all, imginn_api:all)
    - Per-user tasks that also download phrase_search users
    - Simple per-user tasks

    Args:
        task_id: Scheduler task identifier of the form ``<scraper>:<target>``.
        download_count: Accepted for interface compatibility; not used here.
        db: Database object forwarded to the import helper.
        crypto: Crypto helper forwarded to the import helper.

    Returns:
        The value returned by ``_import_all_mapped_accounts``, or 0 when the
        task id is empty, has no ``:`` separator, or names a scraper that has
        no platform mapping.
    """
    if not task_id:
        return 0

    scraper_module, separator, _target = task_id.partition(':')
    if not separator:
        return 0

    platform = SCRAPER_TO_PLATFORM.get(scraper_module)
    if platform:
        # One task can download for many users, so every mapped account of the
        # platform is checked rather than only the one named in the task id.
        return _import_all_mapped_accounts(platform, db, crypto)
    return 0
|
||||
|
||||
|
||||
def _import_all_mapped_accounts(platform: str, db, crypto) -> int:
|
||||
"""
|
||||
After a batch task (e.g. fastdl:all), check ALL mapped accounts
|
||||
for the platform and import any new media.
|
||||
"""
|
||||
conn = sqlite3.connect(db.db_path, timeout=10)
|
||||
conn.row_factory = sqlite3.Row
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
SELECT id, username, person_id, last_imported_at, last_imported_file_id
|
||||
FROM private_media_scraper_accounts
|
||||
WHERE platform = ? AND enabled = 1
|
||||
''', (platform,))
|
||||
rows = cursor.fetchall()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
if not rows:
|
||||
return 0
|
||||
|
||||
total_imported = 0
|
||||
for row in rows:
|
||||
try:
|
||||
count = import_new_media(
|
||||
platform, row['username'], row['person_id'],
|
||||
row['last_imported_at'], db, crypto,
|
||||
last_imported_file_id=row['last_imported_file_id'] or 0
|
||||
)
|
||||
total_imported += count
|
||||
except Exception as e:
|
||||
logger.error(f"Gallery bridge batch import error for {platform}/@{row['username']}: {e}")
|
||||
|
||||
if total_imported > 0:
|
||||
logger.info(f"Batch import for {platform}: {total_imported} files across {len(rows)} accounts")
|
||||
|
||||
return total_imported
|
||||
728
modules/semantic_search.py
Normal file
728
modules/semantic_search.py
Normal file
@@ -0,0 +1,728 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Semantic Search Module using CLIP
|
||||
Provides image/video similarity search and natural language search capabilities
|
||||
"""
|
||||
|
||||
import os
|
||||
import struct
|
||||
import numpy as np
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
import threading
|
||||
import queue
|
||||
from datetime import datetime
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('SemanticSearch')
|
||||
|
||||
# Global model instance (lazy loaded)
|
||||
_clip_model = None
|
||||
_clip_model_name = None
|
||||
_model_lock = threading.Lock()
|
||||
|
||||
|
||||
def get_configured_model_name() -> str:
    """
    Get the configured CLIP model name from settings.

    Reads the ``semantic_search`` settings entry through ``SettingsManager``
    (backed by ``database/media_downloader.db`` two directories above this
    module) and returns its ``model`` value.

    Returns:
        The configured model name, or ``'clip-ViT-B-32'`` when the setting is
        missing, empty, not a dict, or cannot be read.
    """
    default_model = 'clip-ViT-B-32'
    try:
        from modules.settings_manager import SettingsManager

        # Use the correct database path
        db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
        settings_manager = SettingsManager(str(db_path))
        semantic_settings = settings_manager.get('semantic_search', {})
        if isinstance(semantic_settings, dict):
            # A key that is present but empty/None must not reach the model
            # loader, so it is treated the same as a missing key.
            model = semantic_settings.get('model') or default_model
            logger.info(f"Configured CLIP model: {model}")
            return model
        return default_model
    except Exception as e:
        logger.error(f"Failed to get configured model: {e}")
        return default_model
|
||||
|
||||
|
||||
def get_clip_model(model_name: str = None):
    """
    Get or load the CLIP model (thread-safe, process-wide cache).

    Args:
        model_name: Model to load; when None the configured model name is
            looked up via ``get_configured_model_name()``.

    Returns:
        The cached ``SentenceTransformer`` instance for ``model_name``.

    Raises:
        Exception: Whatever importing or constructing ``SentenceTransformer``
            raises; the error is logged and re-raised.
    """
    global _clip_model, _clip_model_name

    if model_name is None:
        model_name = get_configured_model_name()

    # The name check and the (re)load happen under one lock so that two
    # threads asking for different models can never be handed a model that
    # does not match the name they requested.
    with _model_lock:
        if _clip_model is not None and _clip_model_name == model_name:
            return _clip_model

        if _clip_model is not None:
            logger.info(f"Model changed from {_clip_model_name} to {model_name}, reloading...")
            # Drop the old model before loading the new one so both are not
            # held in memory at the same time.
            _clip_model = None
            _clip_model_name = None

        logger.info(f"Loading CLIP model ({model_name})...")
        try:
            from sentence_transformers import SentenceTransformer
            loaded_model = SentenceTransformer(model_name)
        except Exception as e:
            logger.error(f"Failed to load CLIP model: {e}")
            raise

        _clip_model = loaded_model
        _clip_model_name = model_name
        logger.info(f"CLIP model {model_name} loaded successfully")
        return loaded_model
|
||||
|
||||
|
||||
def embedding_to_bytes(embedding: np.ndarray) -> bytes:
    """
    Convert an embedding to raw float32 bytes for database storage.

    Args:
        embedding: Embedding vector; any array-like accepted by
            ``np.asarray`` works (ndarray of any float dtype, list, ...).

    Returns:
        The values as consecutive native-endian float32 numbers.
    """
    # asarray (instead of .astype) accepts plain sequences as well and skips
    # the intermediate copy when the data already is float32.
    return np.asarray(embedding, dtype=np.float32).tobytes()
|
||||
|
||||
|
||||
def bytes_to_embedding(data: bytes) -> np.ndarray:
    """
    Rebuild an embedding vector from its stored byte representation.

    Args:
        data: Raw bytes holding consecutive float32 values.

    Returns:
        One-dimensional float32 array viewing ``data``.
    """
    float32 = np.float32
    vector = np.frombuffer(data, dtype=float32)
    return vector
|
||||
|
||||
|
||||
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Calculate cosine similarity between two embeddings.

    Args:
        a: First embedding vector.
        b: Second embedding vector (same length as ``a``).

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a Python float, or 0.0 when either
        vector has zero length (the ratio is undefined there and would
        otherwise come out as NaN with a runtime warning).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)
|
||||
|
||||
|
||||
class SemanticSearch:
|
||||
"""Semantic search engine using CLIP embeddings"""
|
||||
|
||||
SUPPORTED_IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}
|
||||
SUPPORTED_VIDEO_EXTENSIONS = {'.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v'}
|
||||
|
||||
def __init__(self, unified_db):
    """
    Set up the search engine around an existing database handle.

    Args:
        unified_db: UnifiedDatabase instance
    """
    # Nothing is loaded here; the ``model`` property resolves the CLIP model
    # on first access.
    self._model = None
    self.logger = get_logger('SemanticSearch')
    self.db = unified_db
|
||||
|
||||
@property
def model(self):
    """Return the CLIP model, fetching it via get_clip_model() on first access."""
    cached = self._model
    if cached is not None:
        return cached
    self._model = get_clip_model()
    return self._model
|
||||
|
||||
def get_image_embedding(self, image_path: str) -> Optional[np.ndarray]:
    """
    Encode an image file into a CLIP embedding.

    Args:
        image_path: Path to the image file

    Returns:
        Embedding vector, or None when the image cannot be opened or encoded
        (the failure is logged at debug level)
    """
    try:
        with Image.open(image_path) as source_image:
            # The model is always given an RGB image
            if source_image.mode == 'RGB':
                rgb_image = source_image
            else:
                rgb_image = source_image.convert('RGB')

            return self.model.encode(rgb_image, convert_to_numpy=True)

    except Exception as e:
        self.logger.debug(f"Failed to get embedding for {image_path}: {e}")
        return None
|
||||
|
||||
def get_video_frame_embedding(self, video_path: str, frame_position: float = 0.1) -> Optional[np.ndarray]:
    """
    Encode a single frame of a video into a CLIP embedding.

    Args:
        video_path: Path to the video file
        frame_position: Position in video (0-1) to extract frame from

    Returns:
        Embedding vector, or None when no frame could be extracted or the
        frame could not be encoded
    """
    # cv2 is tried first; ffmpeg covers codecs cv2 can't handle (e.g. AV1)
    frame = self._extract_frame_cv2(video_path, frame_position)
    if frame is None:
        frame = self._extract_frame_ffmpeg(video_path, frame_position)
    if frame is None:
        return None

    try:
        return self.model.encode(frame, convert_to_numpy=True)
    except Exception as e:
        self.logger.debug(f"Failed to encode video frame for {video_path}: {e}")
        return None
    finally:
        # Release the frame whether or not encoding worked; a failing close
        # is ignored on purpose.
        try:
            frame.close()
        except Exception:
            pass
|
||||
|
||||
def _extract_frame_cv2(self, video_path: str, frame_position: float) -> Optional[Image.Image]:
|
||||
"""Extract frame using OpenCV"""
|
||||
try:
|
||||
import cv2
|
||||
|
||||
cap = cv2.VideoCapture(video_path)
|
||||
if not cap.isOpened():
|
||||
return None
|
||||
|
||||
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
if total_frames <= 0:
|
||||
cap.release()
|
||||
return None
|
||||
|
||||
target_frame = int(total_frames * frame_position)
|
||||
cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame)
|
||||
|
||||
ret, frame = cap.read()
|
||||
cap.release()
|
||||
|
||||
if not ret:
|
||||
return None
|
||||
|
||||
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
||||
return Image.fromarray(frame_rgb)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.debug(f"cv2 frame extraction failed for {video_path}: {e}")
|
||||
return None
|
||||
|
||||
def _extract_frame_ffmpeg(self, video_path: str, frame_position: float) -> Optional[Image.Image]:
|
||||
"""Extract frame using ffmpeg (fallback for codecs cv2 can't handle)"""
|
||||
try:
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
# Get video duration
|
||||
probe_cmd = [
|
||||
'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
|
||||
'-of', 'default=noprint_wrappers=1:nokey=1', video_path
|
||||
]
|
||||
result = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=10)
|
||||
if result.returncode != 0:
|
||||
return None
|
||||
|
||||
duration = float(result.stdout.strip())
|
||||
seek_time = duration * frame_position
|
||||
|
||||
# Extract frame to temp file
|
||||
with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
|
||||
tmp_path = tmp.name
|
||||
|
||||
extract_cmd = [
|
||||
'ffmpeg', '-y', '-ss', str(seek_time), '-i', video_path,
|
||||
'-vframes', '1', '-q:v', '2', tmp_path
|
||||
]
|
||||
result = subprocess.run(extract_cmd, capture_output=True, timeout=30)
|
||||
|
||||
if result.returncode != 0 or not os.path.exists(tmp_path):
|
||||
return None
|
||||
|
||||
image = Image.open(tmp_path)
|
||||
image.load() # Load into memory before deleting file
|
||||
|
||||
# Clean up temp file
|
||||
try:
|
||||
os.unlink(tmp_path)
|
||||
except OSError:
|
||||
pass # Best effort cleanup of temp file
|
||||
|
||||
if image.mode != 'RGB':
|
||||
image = image.convert('RGB')
|
||||
|
||||
return image
|
||||
|
||||
except Exception as e:
|
||||
self.logger.debug(f"ffmpeg frame extraction failed for {video_path}: {e}")
|
||||
return None
|
||||
|
||||
def get_text_embedding(self, text: str) -> Optional[np.ndarray]:
    """
    Encode a text query into a CLIP embedding.

    Args:
        text: Text query

    Returns:
        Embedding vector, or None when encoding fails (logged as an error)
    """
    try:
        return self.model.encode(text, convert_to_numpy=True)
    except Exception as e:
        self.logger.error(f"Failed to get text embedding: {e}")
        return None
|
||||
|
||||
def store_embedding(self, file_id: int, embedding: np.ndarray) -> bool:
    """
    Persist an embedding for a file, replacing any existing row.

    Args:
        file_id: File inventory ID
        embedding: Embedding vector

    Returns:
        True when the row was written, False when serialisation or the
        database write failed (logged as an error)
    """
    try:
        payload = embedding_to_bytes(embedding)

        # NOTE(review): embedding_model is written as the literal
        # 'clip-ViT-B-32' even though the model name is configurable elsewhere
        # in this module - confirm whether that is intended.
        with self.db.get_connection(for_write=True) as conn:
            conn.cursor().execute('''
                INSERT OR REPLACE INTO content_embeddings
                (file_id, embedding, embedding_model, embedding_version, created_date)
                VALUES (?, ?, 'clip-ViT-B-32', 1, CURRENT_TIMESTAMP)
            ''', (file_id, payload))

        return True

    except Exception as e:
        self.logger.error(f"Failed to store embedding for file {file_id}: {e}")
        return False
|
||||
|
||||
def get_embedding(self, file_id: int) -> Optional[np.ndarray]:
    """
    Load the stored embedding of a file.

    Args:
        file_id: File inventory ID

    Returns:
        Embedding vector, or None when no row exists, the stored value is
        empty, or the lookup fails (logged as an error)
    """
    try:
        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT embedding FROM content_embeddings WHERE file_id = ?
            ''', (file_id,))
            row = cursor.fetchone()

        if not row or not row['embedding']:
            return None
        return bytes_to_embedding(row['embedding'])

    except Exception as e:
        self.logger.error(f"Failed to get embedding for file {file_id}: {e}")
        return None
|
||||
|
||||
def delete_embedding(self, file_id: int) -> bool:
    """
    Remove the stored embedding of a file.

    Args:
        file_id: File inventory ID

    Returns:
        True if a row was deleted, False if none matched or the delete failed
    """
    try:
        with self.db.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            cursor.execute('DELETE FROM content_embeddings WHERE file_id = ?', (file_id,))
            removed = cursor.rowcount > 0
            if removed:
                self.logger.debug(f"Deleted embedding for file_id {file_id}")
            return removed
    except Exception as e:
        self.logger.error(f"Failed to delete embedding for file {file_id}: {e}")
        return False
|
||||
|
||||
def delete_embedding_by_path(self, file_path: str) -> bool:
    """
    Remove the stored embedding of the file registered under ``file_path``.

    Args:
        file_path: File path as stored in ``file_inventory``

    Returns:
        True if a row was deleted, False if the path is unknown, it has no
        embedding, or the delete failed
    """
    try:
        with self.db.get_connection(for_write=True) as conn:
            cursor = conn.cursor()

            # Resolve the path to its inventory id first
            cursor.execute('SELECT id FROM file_inventory WHERE file_path = ?', (file_path,))
            inventory_row = cursor.fetchone()
            if not inventory_row:
                return False

            cursor.execute('DELETE FROM content_embeddings WHERE file_id = ?', (inventory_row['id'],))
            if cursor.rowcount <= 0:
                return False

            self.logger.debug(f"Deleted embedding for {file_path}")
            return True
    except Exception as e:
        self.logger.error(f"Failed to delete embedding for {file_path}: {e}")
        return False
|
||||
|
||||
def generate_embedding_for_file(self, file_id: int, file_path: str, content_type: str = None) -> bool:
    """
    Generate and store embedding for a single file.

    The media kind is taken from ``content_type`` when it contains "image" or
    "video"; otherwise (no content type, or one that names neither kind) it is
    derived from the file extension.

    Args:
        file_id: File inventory ID
        file_path: Path to the file
        content_type: Optional content type ('image' or 'video')

    Returns:
        True if embedding generated and stored successfully; False when the
        file is missing, is neither a supported image nor video, or embedding
        generation/storage failed
    """
    try:
        if not os.path.exists(file_path):
            self.logger.debug(f"File not found for embedding: {file_path}")
            return False

        ext = Path(file_path).suffix.lower()

        # An explicit image/video content type wins ...
        kind = (content_type or '').lower()
        is_image = 'image' in kind
        is_video = 'video' in kind
        if not (is_image or is_video):
            # ... but a content type that names neither kind must not block a
            # file whose extension is supported, so fall back to the
            # extension.
            is_image = ext in self.SUPPORTED_IMAGE_EXTENSIONS
            is_video = ext in self.SUPPORTED_VIDEO_EXTENSIONS

        embedding = None
        if is_image:
            embedding = self.get_image_embedding(file_path)
        elif is_video:
            embedding = self.get_video_frame_embedding(file_path)

        if embedding is not None and self.store_embedding(file_id, embedding):
            self.logger.debug(f"Generated embedding for file_id {file_id}: {Path(file_path).name}")
            return True

        return False

    except Exception as e:
        self.logger.error(f"Failed to generate embedding for file {file_id}: {e}")
        return False
|
||||
|
||||
def get_embedding_stats(self) -> Dict:
    """
    Summarise embedding coverage for files in the 'final' location.

    Returns:
        Dict with ``total_embeddings``, ``total_files``,
        ``missing_embeddings`` and ``coverage_percent`` (rounded to two
        decimals, 0 when there are no files); an empty dict when the queries
        fail (logged as an error)
    """
    try:
        with self.db.get_connection() as conn:
            cursor = conn.cursor()

            def scalar(sql):
                # Run a single-value COUNT query and return that value
                cursor.execute(sql)
                return cursor.fetchone()[0]

            # Only embeddings whose file is still in 'final' count (files
            # moved to the recycle bin or review are excluded)
            total_embeddings = scalar('''
                SELECT COUNT(*) FROM content_embeddings ce
                JOIN file_inventory fi ON ce.file_id = fi.id
                WHERE fi.location = 'final'
            ''')
            total_files = scalar("SELECT COUNT(*) FROM file_inventory WHERE location = 'final'")
            missing_embeddings = scalar('''
                SELECT COUNT(*) FROM file_inventory fi
                WHERE fi.location = 'final'
                AND NOT EXISTS (SELECT 1 FROM content_embeddings ce WHERE ce.file_id = fi.id)
            ''')

        if total_files > 0:
            coverage = total_embeddings / total_files * 100
        else:
            coverage = 0

        return {
            'total_embeddings': total_embeddings,
            'total_files': total_files,
            'missing_embeddings': missing_embeddings,
            'coverage_percent': round(coverage, 2)
        }

    except Exception as e:
        self.logger.error(f"Failed to get embedding stats: {e}")
        return {}
|
||||
|
||||
def generate_embeddings_batch(self, limit: int = 100, platform: str = None,
                              progress_callback=None) -> Dict:
    """
    Create embeddings for 'final' files that do not have one yet.

    Args:
        limit: Maximum files to process
        platform: Filter by platform
        progress_callback: Optional callback(processed, total, current_file),
            called once per file before it is handled

    Returns:
        Dict with ``processed``, ``success``, ``errors`` and ``skipped``
        counts. Files that are missing on disk or are neither a supported
        image nor video count as skipped. If the run aborts with an
        exception, the counts gathered so far are returned.
    """
    results = {'processed': 0, 'success': 0, 'errors': 0, 'skipped': 0}

    try:
        # Select files without embeddings
        query = '''
            SELECT fi.id, fi.file_path, fi.content_type, fi.filename
            FROM file_inventory fi
            WHERE fi.location = 'final'
            AND NOT EXISTS (SELECT 1 FROM content_embeddings ce WHERE ce.file_id = fi.id)
        '''
        params = []
        if platform:
            query += ' AND fi.platform = ?'
            params.append(platform)
        query += ' LIMIT ?'
        params.append(limit)

        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(query, params)
            files = cursor.fetchall()

        total = len(files)
        self.logger.info(f"Processing {total} files for embedding generation")

        for position, file_row in enumerate(files, start=1):
            file_id = file_row['id']
            file_path = file_row['file_path']
            content_kind = (file_row['content_type'] or '').lower()
            filename = file_row['filename'] or ''

            results['processed'] += 1
            if progress_callback:
                progress_callback(position, total, filename)

            # Files that vanished from disk are skipped, not errors
            if not os.path.exists(file_path):
                results['skipped'] += 1
                continue

            # The extension or the content type may identify the media kind
            ext = Path(file_path).suffix.lower()
            if ext in self.SUPPORTED_IMAGE_EXTENSIONS or 'image' in content_kind:
                embedding = self.get_image_embedding(file_path)
            elif ext in self.SUPPORTED_VIDEO_EXTENSIONS or 'video' in content_kind:
                embedding = self.get_video_frame_embedding(file_path)
            else:
                results['skipped'] += 1
                continue

            stored = embedding is not None and self.store_embedding(file_id, embedding)
            results['success' if stored else 'errors'] += 1

        self.logger.info(f"Embedding generation complete: {results}")
        return results

    except Exception as e:
        self.logger.error(f"Failed to generate embeddings batch: {e}")
        return results
|
||||
|
||||
def search_by_text(self, query: str, limit: int = 50, platform: str = None,
                   source: str = None, threshold: float = 0.2) -> List[Dict]:
    """
    Find images/videos matching a natural language description.

    Args:
        query: Natural language search query
        limit: Maximum results
        platform: Filter by platform
        source: Filter by source
        threshold: Minimum similarity score (0-1)

    Returns:
        List of files with similarity scores; empty when the query cannot be
        embedded or the search fails
    """
    try:
        query_embedding = self.get_text_embedding(query)
        if query_embedding is not None:
            return self._search_by_embedding(query_embedding, limit, platform, source, threshold)
        return []
    except Exception as e:
        self.logger.error(f"Text search failed: {e}")
        return []
|
||||
|
||||
def search_by_image(self, image_path: str, limit: int = 50, platform: str = None,
                    source: str = None, threshold: float = 0.5) -> List[Dict]:
    """
    Find stored files that look similar to a given image file.

    Args:
        image_path: Path to query image
        limit: Maximum results
        platform: Filter by platform
        source: Filter by source
        threshold: Minimum similarity score (0-1)

    Returns:
        List of similar files with scores; empty when the image cannot be
        embedded or the search fails
    """
    try:
        query_embedding = self.get_image_embedding(image_path)
        if query_embedding is not None:
            return self._search_by_embedding(query_embedding, limit, platform, source, threshold)
        return []
    except Exception as e:
        self.logger.error(f"Image search failed: {e}")
        return []
|
||||
|
||||
def search_by_file_id(self, file_id: int, limit: int = 50, platform: str = None,
                      source: str = None, threshold: float = 0.5) -> List[Dict]:
    """
    Find similar files to a file already in the database.

    Uses the stored embedding when there is one. Otherwise an embedding is
    generated on the fly from the file's path in ``file_inventory``: a video
    frame for supported video extensions, the image itself for everything
    else.

    Args:
        file_id: File inventory ID
        limit: Maximum results
        platform: Filter by platform
        source: Filter by source
        threshold: Minimum similarity score (0-1)

    Returns:
        List of similar files with scores, without the query file itself;
        empty when no embedding can be obtained or the search fails
    """
    try:
        query_embedding = self.get_embedding(file_id)

        if query_embedding is None:
            # No stored embedding: try to generate one from the file
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('SELECT file_path FROM file_inventory WHERE id = ?', (file_id,))
                row = cursor.fetchone()

            if row:
                file_path = row['file_path']
                # Videos cannot be opened as images, so they go through
                # frame extraction instead.
                if Path(file_path).suffix.lower() in self.SUPPORTED_VIDEO_EXTENSIONS:
                    query_embedding = self.get_video_frame_embedding(file_path)
                else:
                    query_embedding = self.get_image_embedding(file_path)

        if query_embedding is None:
            return []

        # One extra result is requested because the query file itself may be
        # among the matches and is filtered out below.
        results = self._search_by_embedding(query_embedding, limit + 1, platform, source, threshold)
        return [r for r in results if r['id'] != file_id][:limit]

    except Exception as e:
        self.logger.error(f"Similar file search failed: {e}")
        return []
|
||||
|
||||
def _search_by_embedding(self, query_embedding: np.ndarray, limit: int,
                         platform: str = None, source: str = None,
                         threshold: float = 0.2) -> List[Dict]:
    """
    Internal search using embedding vector.

    Every stored embedding of a 'final' file (optionally filtered) is compared
    with the query via cosine similarity. Stored vectors that are empty,
    unreadable, or of a different dimension than the query are skipped so that
    a single stale row cannot fail the whole search.

    Args:
        query_embedding: Query embedding vector
        limit: Maximum results
        platform: Filter by platform
        source: Filter by source
        threshold: Minimum similarity score

    Returns:
        List of files with similarity scores, sorted by score (best first);
        empty when the search fails
    """
    try:
        # Build query to get all embeddings (with optional filters)
        query = '''
            SELECT ce.file_id, ce.embedding, fi.file_path, fi.filename,
                   fi.platform, fi.source, fi.content_type, fi.file_size
            FROM content_embeddings ce
            JOIN file_inventory fi ON fi.id = ce.file_id
            WHERE fi.location = 'final'
        '''
        params = []

        if platform:
            query += ' AND fi.platform = ?'
            params.append(platform)
        if source:
            query += ' AND fi.source = ?'
            params.append(source)

        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(query, params)
            rows = cursor.fetchall()

        query_shape = np.shape(query_embedding)
        results = []
        skipped = 0

        for row in rows:
            blob = row['embedding']
            if not blob:
                skipped += 1
                continue
            try:
                embedding = bytes_to_embedding(blob)
            except (TypeError, ValueError):
                # Blob is not a whole number of float32 values
                skipped += 1
                continue
            if embedding.shape != query_shape:
                # Different dimensionality than the query; the dot product
                # would raise.
                skipped += 1
                continue

            similarity = cosine_similarity(query_embedding, embedding)

            if similarity >= threshold:
                results.append({
                    'id': row['file_id'],
                    'file_path': row['file_path'],
                    'filename': row['filename'],
                    'platform': row['platform'],
                    'source': row['source'],
                    'content_type': row['content_type'],
                    'file_size': row['file_size'],
                    'similarity': round(similarity, 4)
                })

        if skipped:
            self.logger.debug(f"Skipped {skipped} stored embeddings incompatible with the query vector")

        # Sort by similarity descending
        results.sort(key=lambda x: x['similarity'], reverse=True)

        return results[:limit]

    except Exception as e:
        self.logger.error(f"Embedding search failed: {e}")
        return []
|
||||
|
||||
|
||||
# Global instance (lazy initialization)
|
||||
_semantic_search = None
|
||||
|
||||
|
||||
def reset_clip_model():
    """Drop the cached CLIP model and its name so the next use loads it again."""
    global _clip_model, _clip_model_name
    with _model_lock:
        _clip_model = _clip_model_name = None
        logger.info("CLIP model cache cleared, will reload on next use")
|
||||
|
||||
|
||||
def get_semantic_search(unified_db=None, force_reload=False):
    """Return the shared SemanticSearch instance, creating it when needed.

    Args:
        unified_db: Database instance to use when a new instance is created;
            a UnifiedDatabase is constructed when omitted
        force_reload: If True, recreate the instance and clear the cached CLIP
            model (useful when model config changes)
    """
    global _semantic_search

    if _semantic_search is not None and not force_reload:
        return _semantic_search

    if force_reload:
        # The cached CLIP model is cleared too so it reloads with new config
        reset_clip_model()

    if unified_db is None:
        from modules.unified_database import UnifiedDatabase
        unified_db = UnifiedDatabase()

    _semantic_search = SemanticSearch(unified_db)
    return _semantic_search
|
||||
319
modules/service_health_monitor.py
Normal file
319
modules/service_health_monitor.py
Normal file
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Service Health Monitor - Tracks service failures and sends alerts
|
||||
Only active during scheduler mode for unattended operation monitoring
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Optional
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
|
||||
class ServiceHealthMonitor:
|
||||
"""Monitor service health and send alerts when services get stuck"""
|
||||
|
||||
def __init__(self,
             state_file: str = "/opt/media-downloader/database/service_health.json",
             config: dict = None,
             error_monitoring_config: dict = None,
             pushover_notifier = None,
             scheduler_mode: bool = False):
    """
    Initialize health monitor

    Args:
        state_file: Path to JSON file storing health state (its parent
            directory is created if missing)
        config: Configuration dict from settings.json; its top-level keys
            replace the defaults below
        error_monitoring_config: Error monitoring settings (for push alert delay)
        pushover_notifier: Instance of PushoverNotifier for alerts
        scheduler_mode: Only monitor when True (scheduler mode)
    """
    # The logger is set up first: _load_state() below reports an unreadable
    # state file through self.logger.
    self.logger = get_logger('ServiceHealthMonitor')

    self.state_file = Path(state_file)
    self.state_file.parent.mkdir(parents=True, exist_ok=True)
    self.pushover = pushover_notifier
    self.scheduler_mode = scheduler_mode
    self.error_monitoring_config = error_monitoring_config or {}

    # Default configuration
    self.config = {
        'enabled': True,
        'notification_cooldown_hours': 24,
        'min_consecutive_failures': 2,  # Number of consecutive run failures before alerting
        'services': {
            'fastdl': {'monitor': True, 'notify': True},
            'imginn': {'monitor': True, 'notify': True},
            'snapchat': {'monitor': True, 'notify': True},
            'toolzu': {'monitor': True, 'notify': True},
            'tiktok': {'monitor': True, 'notify': True},
            'forums': {'monitor': True, 'notify': True}
        },
        'pushover': {
            'enabled': True,
            'priority': 0,
            'sound': 'pushover'
        }
    }

    # Merge user config (shallow: a user-supplied top-level key replaces the
    # whole default value for that key)
    if config:
        self.config.update(config)

    # Load or initialize state
    self.state = self._load_state()
|
||||
|
||||
def _load_state(self) -> Dict:
|
||||
"""Load health state from file"""
|
||||
if self.state_file.exists():
|
||||
try:
|
||||
with open(self.state_file, 'r') as f:
|
||||
return json.load(f)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to load health state: {e}")
|
||||
|
||||
# Initialize empty state
|
||||
return {'service_health': {}}
|
||||
|
||||
def _save_state(self):
|
||||
"""Save health state to file"""
|
||||
try:
|
||||
with open(self.state_file, 'w') as f:
|
||||
json.dump(self.state, f, indent=2, default=str)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to save health state: {e}")
|
||||
|
||||
def _get_service_state(self, service: str) -> Dict:
|
||||
"""Get state for a service, initialize if doesn't exist"""
|
||||
if service not in self.state['service_health']:
|
||||
self.state['service_health'][service] = {
|
||||
'status': 'healthy',
|
||||
'consecutive_failures': 0,
|
||||
'last_success': None,
|
||||
'last_failure': None,
|
||||
'last_notification_sent': None,
|
||||
'failure_type': None,
|
||||
'total_failures': 0,
|
||||
'total_successes': 0
|
||||
}
|
||||
return self.state['service_health'][service]
|
||||
|
||||
def record_success(self, service: str):
    """
    Mark ``service`` as healthy after a successful run.

    Does nothing outside scheduler mode or for services that are not
    monitored. Otherwise the failure streak is cleared, the success is
    counted, the state is saved, and a recovery notification is sent if the
    service had been marked stuck and notifications are enabled for it.

    Args:
        service: Service name (fastdl, imginn, snapchat, etc.)
    """
    if not (self.scheduler_mode and self._is_monitored(service)):
        return

    state = self._get_service_state(service)
    now = datetime.now()

    # Remember the previous status before it is overwritten below
    recovered = state['status'] == 'stuck'

    state.update({
        'status': 'healthy',
        'consecutive_failures': 0,
        'last_success': now.isoformat(),
        'failure_type': None,
    })
    state['total_successes'] += 1

    self._save_state()

    if recovered and self._should_notify(service):
        self._send_recovery_notification(service, now)
|
||||
|
||||
def record_failure(self, service: str, reason: str = 'unknown'):
    """
    Register a failed run of ``service``.

    Does nothing outside scheduler mode or for services that are not
    monitored. Otherwise the failure counters are bumped; once the streak
    reaches ``min_consecutive_failures`` the service is marked stuck and an
    alert is sent, provided notifications are enabled for it and the cooldown
    has passed. The state is saved afterwards.

    Args:
        service: Service name (fastdl, imginn, snapchat, etc.)
        reason: Reason for failure (cloudflare, rate_limit, timeout, etc.)
    """
    if not (self.scheduler_mode and self._is_monitored(service)):
        return

    state = self._get_service_state(service)
    now = datetime.now()

    state['consecutive_failures'] += 1
    state['total_failures'] += 1
    state['last_failure'] = now.isoformat()
    state['failure_type'] = reason

    # Only a streak of failed runs turns the service into "stuck"
    failure_limit = self.config.get('min_consecutive_failures', 2)
    if state['consecutive_failures'] >= failure_limit:
        state['status'] = 'stuck'

        may_alert = self._should_notify(service) and self._notification_cooldown_expired(service)
        if may_alert:
            self._send_alert_notification(service, reason, now)
            state['last_notification_sent'] = now.isoformat()

    self._save_state()
|
||||
|
||||
def _is_monitored(self, service: str) -> bool:
|
||||
"""Check if service should be monitored"""
|
||||
if not self.config.get('enabled', True):
|
||||
return False
|
||||
|
||||
service_config = self.config.get('services', {}).get(service, {})
|
||||
return service_config.get('monitor', True)
|
||||
|
||||
def _should_notify(self, service: str) -> bool:
|
||||
"""Check if notifications are enabled for this service"""
|
||||
if not self.pushover:
|
||||
return False
|
||||
|
||||
if not self.config.get('pushover', {}).get('enabled', True):
|
||||
return False
|
||||
|
||||
service_config = self.config.get('services', {}).get(service, {})
|
||||
return service_config.get('notify', True)
|
||||
|
||||
def _notification_cooldown_expired(self, service: str) -> bool:
|
||||
"""Check if notification cooldown period has expired"""
|
||||
state = self._get_service_state(service)
|
||||
last_sent = state.get('last_notification_sent')
|
||||
|
||||
if not last_sent:
|
||||
return True # Never sent, can send now
|
||||
|
||||
try:
|
||||
last_sent_time = datetime.fromisoformat(last_sent)
|
||||
# Use push_alert_delay_hours from error_monitoring config if available,
|
||||
# otherwise fall back to notification_cooldown_hours or default 24
|
||||
cooldown_hours = self.error_monitoring_config.get('push_alert_delay_hours',
|
||||
self.config.get('notification_cooldown_hours', 24))
|
||||
cooldown_period = timedelta(hours=cooldown_hours)
|
||||
|
||||
return datetime.now() - last_sent_time > cooldown_period
|
||||
except (ValueError, TypeError):
|
||||
return True # Error parsing date, allow notification
|
||||
|
||||
def _send_alert_notification(self, service: str, reason: str, now: datetime):
|
||||
"""Send Pushover alert notification"""
|
||||
state = self._get_service_state(service)
|
||||
|
||||
# Calculate time since last success
|
||||
time_stuck = "Unknown"
|
||||
if state['last_success']:
|
||||
try:
|
||||
last_success = datetime.fromisoformat(state['last_success'])
|
||||
delta = now - last_success
|
||||
hours = int(delta.total_seconds() / 3600)
|
||||
if hours < 1:
|
||||
time_stuck = f"{int(delta.total_seconds() / 60)} minutes ago"
|
||||
elif hours < 48:
|
||||
time_stuck = f"{hours} hours ago"
|
||||
else:
|
||||
days = int(hours / 24)
|
||||
time_stuck = f"{days} days ago"
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# Format service name
|
||||
service_name = service.replace('_', ' ').title()
|
||||
|
||||
# Format reason
|
||||
reason_map = {
|
||||
'cloudflare': 'Cloudflare Challenge',
|
||||
'cloudflare_challenge': 'Cloudflare Challenge',
|
||||
'rate_limit': 'Rate Limited (429)',
|
||||
'forbidden': 'Access Forbidden (403)',
|
||||
'timeout': 'Connection Timeout',
|
||||
'authentication': 'Authentication Required',
|
||||
'captcha': 'CAPTCHA Challenge',
|
||||
'blocked': 'IP Blocked',
|
||||
'unknown': 'Unknown Error'
|
||||
}
|
||||
reason_text = reason_map.get(reason.lower(), reason)
|
||||
|
||||
# Build message
|
||||
title = f"⚠️ Service Alert: {service_name}"
|
||||
message = f"""Status: Stuck/Blocked
|
||||
Issue: {reason_text}
|
||||
Failed Since: {now.strftime('%b %d, %I:%M %p')} ({state['consecutive_failures']} consecutive failures)
|
||||
|
||||
Last successful download: {time_stuck if state['last_success'] else 'Never'}
|
||||
|
||||
Action may be required.
|
||||
"""
|
||||
|
||||
# Send notification
|
||||
try:
|
||||
priority = self.config.get('pushover', {}).get('priority', 0)
|
||||
sound = self.config.get('pushover', {}).get('sound', 'pushover')
|
||||
|
||||
self.pushover.send_notification(
|
||||
title=title,
|
||||
message=message,
|
||||
priority=priority,
|
||||
sound=sound
|
||||
)
|
||||
|
||||
self.logger.info(f"Sent alert notification for {service}: {reason}")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to send alert notification: {e}")
|
||||
|
||||
def _send_recovery_notification(self, service: str, now: datetime):
|
||||
"""Send recovery notification (optional)"""
|
||||
# Recovery notifications are optional - can be disabled
|
||||
if not self.config.get('send_recovery_notifications', False):
|
||||
return
|
||||
|
||||
state = self._get_service_state(service)
|
||||
service_name = service.replace('_', ' ').title()
|
||||
|
||||
title = f"✅ Service Recovered: {service_name}"
|
||||
message = f"""Status: Healthy
|
||||
Service is working again.
|
||||
|
||||
Recovered at: {now.strftime('%b %d, %I:%M %p')}
|
||||
"""
|
||||
|
||||
try:
|
||||
self.pushover.send_notification(
|
||||
title=title,
|
||||
message=message,
|
||||
priority=-1, # Low priority for recovery
|
||||
sound='magic'
|
||||
)
|
||||
|
||||
self.logger.info(f"Sent recovery notification for {service}")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to send recovery notification: {e}")
|
||||
|
||||
def get_service_status(self, service: str) -> Dict:
    """Return a shallow copy of the tracked state for one service"""
    tracked = self._get_service_state(service)
    snapshot = tracked.copy()
    return snapshot
|
||||
|
||||
def get_all_status(self) -> Dict:
    """Return a shallow copy of the per-service health mapping"""
    all_services = self.state['service_health']
    snapshot = all_services.copy()
    return snapshot
|
||||
|
||||
def reset_service(self, service: str):
    """Forget the tracked state of a service and persist the change

    Unknown services are ignored and nothing is saved for them.
    """
    tracked = self.state['service_health']
    if service not in tracked:
        return

    del tracked[service]
    self._save_state()
|
||||
257
modules/settings_manager.py
Normal file
257
modules/settings_manager.py
Normal file
@@ -0,0 +1,257 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Settings Manager for Media Downloader
|
||||
Handles settings storage in database with JSON file compatibility
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any, Union, Tuple
|
||||
from contextlib import contextmanager
|
||||
import threading
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('SettingsManager')
|
||||
|
||||
|
||||
class SettingsManager:
    """Manage application settings in database (thread-safe)

    Each setting is one row of the ``settings`` table: the value is stored as
    text together with a ``value_type`` tag ('boolean', 'number', 'string',
    'object' or 'array') that tells how to decode it again. Write operations
    are serialized through a reentrant lock; every operation opens and closes
    its own SQLite connection.
    """

    def __init__(self, db_path: str):
        """
        Initialize settings manager

        Args:
            db_path: Path to SQLite database
        """
        self.db_path = db_path
        self._write_lock = threading.RLock()  # Reentrant lock for write operations
        self._create_tables()

    @contextmanager
    def _get_connection(self, for_write: bool = False):
        """Get database connection (thread-safe)

        Args:
            for_write: When True, the write lock is held for as long as the
                caller keeps the connection.

        Yields:
            An open ``sqlite3.Connection`` using ``sqlite3.Row`` rows; it is
            closed when the ``with`` block exits.
        """
        conn = sqlite3.connect(self.db_path, timeout=30.0, check_same_thread=False)
        conn.row_factory = sqlite3.Row
        try:
            if for_write:
                with self._write_lock:
                    yield conn
            else:
                yield conn
        finally:
            conn.close()

    def _create_tables(self):
        """Create settings table if it doesn't exist"""
        with self._get_connection(for_write=True) as conn:
            cursor = conn.cursor()

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS settings (
                    key TEXT PRIMARY KEY,
                    value TEXT NOT NULL,
                    value_type TEXT NOT NULL,
                    category TEXT,
                    description TEXT,
                    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                    updated_by TEXT DEFAULT 'system'
                )
            ''')

            # Create index for category lookups
            cursor.execute('''
                CREATE INDEX IF NOT EXISTS idx_settings_category
                ON settings(category)
            ''')

            conn.commit()
            logger.info("Settings tables initialized")

    def get(self, key: str, default: Any = None) -> Any:
        """
        Get a setting value

        Args:
            key: Setting key, matched exactly against the stored key
                (dotted keys such as 'instagram.enabled' are plain strings here)
            default: Default value if not found

        Returns:
            Setting value or default
        """
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT value, value_type FROM settings WHERE key = ?', (key,))
            row = cursor.fetchone()

            if not row:
                return default

            value, value_type = row['value'], row['value_type']
            return self._deserialize_value(value, value_type)

    def set(self, key: str, value: Any, category: str = None,
            description: str = None, updated_by: str = 'system'):
        """
        Set a setting value

        The row is written with INSERT OR REPLACE, so an existing row's
        category and description are replaced by the ones given here
        (including ``None`` when they are omitted).

        Args:
            key: Setting key
            value: Setting value (will be serialized to JSON if needed)
            category: Optional category
            description: Optional description
            updated_by: Who updated the setting
        """
        value_str, value_type = self._serialize_value(value)

        with self._get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR REPLACE INTO settings
                (key, value, value_type, category, description, updated_at, updated_by)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', (key, value_str, value_type, category, description,
                  datetime.now().isoformat(), updated_by))
            conn.commit()
            logger.debug(f"Setting updated: {key} = {value_str[:100]}")

    def get_category(self, category: str) -> Dict[str, Any]:
        """
        Get all settings in a category

        Args:
            category: Category name

        Returns:
            Flat dictionary mapping each stored key to its decoded value
        """
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT key, value, value_type
                FROM settings
                WHERE category = ?
            ''', (category,))

            return {
                row['key']: self._deserialize_value(row['value'], row['value_type'])
                for row in cursor.fetchall()
            }

    def get_all(self) -> Dict[str, Any]:
        """
        Get all settings as a nested dictionary

        Dotted keys such as 'instagram.enabled' are expanded into nested
        dictionaries.

        Returns:
            Nested dictionary of all settings
        """
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT key, value, value_type FROM settings')

            result = {}
            for row in cursor.fetchall():
                value = self._deserialize_value(row['value'], row['value_type'])

                # Support nested keys like 'instagram.enabled'
                self._set_nested(result, row['key'], value)

            return result

    def delete(self, key: str):
        """Delete a setting (no error if the key does not exist)"""
        with self._get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            cursor.execute('DELETE FROM settings WHERE key = ?', (key,))
            conn.commit()
            logger.debug(f"Setting deleted: {key}")

    def migrate_from_json(self, json_path: str):
        """
        Migrate settings from JSON file to database

        A missing file is logged as a warning and otherwise ignored.

        Args:
            json_path: Path to settings.json file
        """
        json_file = Path(json_path)
        if not json_file.exists():
            logger.warning(f"JSON file not found: {json_path}")
            return

        # JSON text is UTF-8; do not depend on the platform default encoding
        with open(json_file, 'r', encoding='utf-8') as f:
            settings = json.load(f)

        # Store every top-level entry
        self._migrate_dict(settings, prefix='', category='root')
        logger.info(f"Settings migrated from {json_path}")

    def _migrate_dict(self, data: Dict, prefix: str = '', category: str = None):
        """Store every entry of a dictionary as a setting

        This does not descend into nested dictionaries: a dict value is stored
        whole as a single 'object' setting under its key.

        Args:
            data: Mapping of setting names to values
            prefix: Optional dotted prefix prepended to every key
            category: Category for the stored rows; when empty, dict values
                use their own key and other values use the first segment of
                ``prefix``
        """
        for key, value in data.items():
            full_key = f"{prefix}.{key}" if prefix else key

            if isinstance(value, dict):
                # Store the entire dict as a value
                self.set(full_key, value, category=category or key)
            else:
                # Store primitive value
                self.set(full_key, value, category=category or prefix.split('.')[0])

    def export_to_json(self, json_path: str):
        """
        Export settings to JSON file

        Args:
            json_path: Path to save settings.json
        """
        settings = self.get_all()

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(settings, f, indent=2)

        logger.info(f"Settings exported to {json_path}")

    def _serialize_value(self, value: Any) -> Tuple[str, str]:
        """
        Serialize value to string and determine type

        Strings are stored verbatim; everything else is JSON-encoded.

        Returns:
            Tuple of (value_string, value_type)
        """
        # bool must be tested before int because bool is a subclass of int
        if isinstance(value, bool):
            return (json.dumps(value), 'boolean')
        if isinstance(value, (int, float)):
            return (json.dumps(value), 'number')
        if isinstance(value, str):
            return (value, 'string')
        if isinstance(value, list):
            return (json.dumps(value), 'array')
        # dicts and anything else JSON can encode (e.g. None)
        return (json.dumps(value), 'object')

    def _deserialize_value(self, value_str: str, value_type: str) -> Any:
        """Deserialize value from string (inverse of ``_serialize_value``)"""
        if value_type == 'string':
            return value_str
        return json.loads(value_str)

    def _set_nested(self, data: Dict, key: str, value: Any):
        """Set value in nested dictionary using dot notation

        Intermediate dictionaries are created as needed. If an intermediate
        segment already holds a non-dict value (for example both 'a' and
        'a.b' are stored), that value is replaced by a dictionary instead of
        raising TypeError, so the most recently applied key wins.
        """
        parts = key.split('.')
        current = data

        for part in parts[:-1]:
            child = current.get(part)
            if not isinstance(child, dict):
                child = {}
                current[part] = child
            current = child

        current[parts[-1]] = value
|
||||
871
modules/snapchat_client_module.py
Normal file
871
modules/snapchat_client_module.py
Normal file
@@ -0,0 +1,871 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Snapchat Client Module - Direct HTTP-based Snapchat downloader using curl_cffi.
|
||||
|
||||
Replaces Playwright-based scraping with direct HTTP requests. Snapchat embeds
|
||||
all page data in <script id="__NEXT_DATA__"> JSON tags, so no JavaScript
|
||||
execution is needed. Uses story.snapchat.com which may not require Cloudflare.
|
||||
|
||||
Follows the same pattern as instagram_client_module.py.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
import random
|
||||
import platform
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List, Set
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from modules.snapchat_scraper import SnapMedia, SnapCollection
|
||||
|
||||
|
||||
class SnapchatClientDownloader(LoggingMixin):
|
||||
"""Snapchat downloader using direct HTTP via curl_cffi (no Playwright)"""
|
||||
|
||||
def __init__(self,
             show_progress: bool = True,
             use_database: bool = True,
             log_callback=None,
             unified_db=None):
    """Set up downloader state, the optional database adapter and activity tracking.

    Args:
        show_progress: Whether to show download progress
        use_database: Whether to use database for dedup
        log_callback: Optional logging callback
        unified_db: UnifiedDatabase instance; without it database use is
            switched off regardless of ``use_database``
    """
    self._init_logger('SnapchatClient', log_callback, default_module='Download')

    # Plain bookkeeping state
    self.scraper_id = 'snapchat_client'
    self.show_progress = show_progress
    self.use_database = use_database
    self.download_count = 0
    self.downloaded_files: Set[str] = set()
    self.pending_downloads = []

    # HTTP session is created on first use
    self._session = None

    # The database adapter is only available when both a database handle and
    # the use_database flag are given
    if not (unified_db and use_database):
        self.db = None
        self.unified_db = None
        self.use_database = False
    else:
        from modules.unified_database import SnapchatDatabaseAdapter
        self.db = SnapchatDatabaseAdapter(unified_db)
        self.unified_db = unified_db

    # Activity status reporting is optional; fall back to None when the
    # module cannot be imported
    try:
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)
    except ImportError:
        self.activity_manager = None

    # Cookies and user agent; replaced by stored values once cookies are loaded
    self.cookies = []
    self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
|
||||
|
||||
def _get_session(self):
|
||||
"""Get or create a curl_cffi session with browser TLS fingerprinting."""
|
||||
if self._session is None:
|
||||
from curl_cffi.requests import Session
|
||||
# Try multiple browser versions for curl_cffi compatibility
|
||||
for _browser in ("chrome131", "chrome136", "chrome"):
|
||||
try:
|
||||
self._session = Session(impersonate=_browser)
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
else:
|
||||
self._session = Session()
|
||||
self._session.headers.update({
|
||||
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'accept-language': 'en-US,en;q=0.9',
|
||||
'cache-control': 'no-cache',
|
||||
})
|
||||
# Load cookies from database
|
||||
self._load_cookies()
|
||||
return self._session
|
||||
|
||||
def _load_cookies(self):
|
||||
"""Load cookies from database for authenticated requests."""
|
||||
if not self.unified_db:
|
||||
return
|
||||
|
||||
# Try snapchat_client cookies first, fall back to snapchat
|
||||
for scraper_id in ['snapchat_client', 'snapchat']:
|
||||
try:
|
||||
cookies = self.unified_db.get_scraper_cookies(scraper_id)
|
||||
if cookies:
|
||||
self.log(f"Loaded {len(cookies)} cookies from '{scraper_id}' scraper", "debug")
|
||||
self.cookies = cookies
|
||||
for cookie in cookies:
|
||||
name = cookie.get('name', '')
|
||||
value = cookie.get('value', '')
|
||||
domain = cookie.get('domain', '.snapchat.com')
|
||||
if name and value and self._session:
|
||||
self._session.cookies.set(name, value, domain=domain)
|
||||
|
||||
# Check if we have a stored user-agent (important for cf_clearance match)
|
||||
try:
|
||||
import json as _json
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT user_agent FROM scrapers WHERE id = ?",
|
||||
(scraper_id,)
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
self.user_agent = row[0]
|
||||
if self._session:
|
||||
self._session.headers['User-Agent'] = self.user_agent
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return
|
||||
except Exception as e:
|
||||
self.log(f"Error loading cookies from '{scraper_id}': {e}", "debug")
|
||||
|
||||
def _fetch_page(self, url: str) -> Optional[str]:
|
||||
"""Fetch a page via HTTP and return the HTML content.
|
||||
|
||||
Tries story.snapchat.com first (no Cloudflare), falls back to www.snapchat.com.
|
||||
"""
|
||||
session = self._get_session()
|
||||
|
||||
# If URL uses www.snapchat.com, try story.snapchat.com first
|
||||
story_url = url.replace('www.snapchat.com', 'story.snapchat.com')
|
||||
www_url = url.replace('story.snapchat.com', 'www.snapchat.com')
|
||||
|
||||
# Try story.snapchat.com first (likely no Cloudflare)
|
||||
for attempt_url in [story_url, www_url]:
|
||||
try:
|
||||
resp = session.get(attempt_url, timeout=30)
|
||||
if resp.status_code == 200 and '__NEXT_DATA__' in resp.text:
|
||||
return resp.text
|
||||
elif resp.status_code == 403:
|
||||
self.log(f"403 Forbidden from {attempt_url.split('/@')[0]}", "debug")
|
||||
continue
|
||||
elif resp.status_code != 200:
|
||||
self.log(f"HTTP {resp.status_code} from {attempt_url.split('/@')[0]}", "debug")
|
||||
continue
|
||||
except Exception as e:
|
||||
self.log(f"Error fetching {attempt_url.split('/@')[0]}: {e}", "debug")
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
def _extract_next_data(self, html: str) -> Optional[Dict]:
|
||||
"""Extract __NEXT_DATA__ JSON from HTML page."""
|
||||
match = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
|
||||
if not match:
|
||||
return None
|
||||
try:
|
||||
return json.loads(match.group(1))
|
||||
except json.JSONDecodeError as e:
|
||||
self.log(f"Failed to parse __NEXT_DATA__ JSON: {e}", "error")
|
||||
return None
|
||||
|
||||
def get_profile_content(self, username: str) -> Dict[str, List]:
    """Get all spotlight URLs, highlight URLs, and inline story/highlight data from a profile.

    Parses __NEXT_DATA__ JSON to extract:
    - spotlights: list of spotlight URL strings (unique, in page order)
    - highlights: list of highlight URL strings (only filled by the regex
      fallback used when __NEXT_DATA__ cannot be parsed)
    - story_collection: SnapCollection from story.snapList (recent stories), or None
    - highlight_collections: list of SnapCollection from curatedHighlights (inline data)

    The inline data avoids needing separate HTTP requests for stories and highlights.
    If the profile page cannot be fetched, the empty result is returned.
    """
    result = {'spotlights': [], 'highlights': [], 'story_collection': None, 'highlight_collections': []}

    url = f"https://story.snapchat.com/@{username}"
    self.log(f"Fetching profile for @{username}", "info")

    html = self._fetch_page(url)
    if not html:
        self.log(f"Failed to fetch profile page for @{username}", "warning")
        return result

    escaped_user = re.escape(username)

    # Extract spotlight URLs via regex (still needed — spotlight metadata requires per-URL fetch).
    # dict.fromkeys de-duplicates while keeping page order, so runs are reproducible.
    spotlight_pattern = rf'/@{escaped_user}/spotlight/([A-Za-z0-9_-]+)'
    spotlight_ids = list(dict.fromkeys(re.findall(spotlight_pattern, html)))
    result['spotlights'] = [
        f"https://story.snapchat.com/@{username}/spotlight/{sid}"
        for sid in spotlight_ids
    ]
    self.log(f"Found {len(result['spotlights'])} spotlights", "info")

    # Parse __NEXT_DATA__ for stories and highlights (much more reliable than regex)
    data = self._extract_next_data(html)
    if not data:
        # Fall back to regex for highlights
        highlight_pattern = rf'/@{escaped_user}/highlight/([A-Za-z0-9-]+)'
        highlight_ids = list(dict.fromkeys(re.findall(highlight_pattern, html)))
        result['highlights'] = [
            f"https://story.snapchat.com/@{username}/highlight/{hid}"
            for hid in highlight_ids
        ]
        self.log(f"Found {len(result['highlights'])} highlights (regex fallback)", "info")
        return result

    props = (data.get('props') or {}).get('pageProps') or {}

    # Extract story snapList (recent stories — not available via individual URLs)
    story = props.get('story') or {}
    story_snaps = story.get('snapList') or []
    if story_snaps:
        story_id = story.get('storyId') or {}
        if isinstance(story_id, dict):
            story_id = story_id.get('value', 'story')

        # The title may be a plain string or wrapped as {'value': ...},
        # the same two shapes handled for highlight titles below
        story_title = story.get('storyTitle') or ''
        if isinstance(story_title, dict):
            story_title = story_title.get('value', '')

        story_collection = SnapCollection(
            collection_id=story_id or 'story',
            collection_type='story',
            title=story_title or 'Stories',
            username=username,
            url=url
        )
        for snap_data in story_snaps:
            snap = self._parse_snap_data(snap_data)
            if snap:
                story_collection.snaps.append(snap)
        if story_collection.snaps:
            result['story_collection'] = story_collection
            self.log(f"Found {len(story_collection.snaps)} story snaps", "info")

    # Extract curatedHighlights inline (avoids per-highlight HTTP requests)
    curated_highlights = props.get('curatedHighlights') or []
    for highlight in curated_highlights:
        highlight_id = highlight.get('highlightId') or {}
        if isinstance(highlight_id, dict):
            highlight_id = highlight_id.get('value', '')

        title = highlight.get('storyTitle') or {}
        if isinstance(title, dict):
            title = title.get('value', '')

        collection = SnapCollection(
            collection_id=highlight_id,
            collection_type='highlight',
            title=title or 'Untitled Highlight',
            username=username,
            url=f"https://story.snapchat.com/@{username}/highlight/{highlight_id}"
        )
        for snap_data in highlight.get('snapList') or []:
            snap = self._parse_snap_data(snap_data)
            if snap:
                collection.snaps.append(snap)
        # Highlights without any usable snap are dropped
        if collection.snaps:
            result['highlight_collections'].append(collection)

    self.log(f"Found {len(result['highlight_collections'])} highlights (inline)", "info")

    return result
|
||||
|
||||
def _parse_snap_data(self, snap_data: Dict) -> Optional[SnapMedia]:
|
||||
"""Parse a snap from __NEXT_DATA__ snapList into a SnapMedia object."""
|
||||
snap_urls = snap_data.get('snapUrls') or {}
|
||||
media_url = snap_urls.get('mediaUrl', '')
|
||||
if not media_url:
|
||||
return None
|
||||
|
||||
snap_id = (snap_data.get('snapId') or {}).get('value', '')
|
||||
media_id = ''
|
||||
if '/d/' in media_url:
|
||||
media_id = media_url.split('/d/')[1].split('.')[0]
|
||||
|
||||
ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
|
||||
timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str and ts_str != '0' else datetime.now()
|
||||
|
||||
lat = snap_data.get('lat')
|
||||
lng = snap_data.get('lng')
|
||||
|
||||
return SnapMedia(
|
||||
media_id=media_id or snap_id,
|
||||
media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
|
||||
media_url=media_url,
|
||||
timestamp=timestamp,
|
||||
index=snap_data.get('snapIndex', 0),
|
||||
thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
|
||||
lat=float(lat) if lat else None,
|
||||
lng=float(lng) if lng else None
|
||||
)
|
||||
|
||||
def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
    """Extract full metadata from a spotlight URL via __NEXT_DATA__.

    Only the first entry of ``spotlightFeed.spotlightStories`` is used. Its
    video metadata (description, duration, view count, dimensions) is
    copied onto every snap of the returned collection.

    Returns:
        A 'spotlight' SnapCollection, or None when the page cannot be
        fetched or parsed or contains no spotlight story.
    """
    html = self._fetch_page(url)
    if not html:
        return None

    data = self._extract_next_data(html)
    if not data:
        return None

    props = (data.get('props') or {}).get('pageProps') or {}
    feed = props.get('spotlightFeed') or {}
    stories = feed.get('spotlightStories') or []

    if not stories:
        return None

    story_data = stories[0]
    story = story_data.get('story') or {}
    metadata = (story_data.get('metadata') or {}).get('videoMetadata') or {}

    story_id = (story.get('storyId') or {}).get('value', '')
    creator = (metadata.get('creator') or {}).get('personCreator') or {}
    username = creator.get('username', '')

    collection = SnapCollection(
        collection_id=story_id,
        collection_type='spotlight',
        title=metadata.get('description', ''),
        username=username,
        url=url
    )

    for snap_data in story.get('snapList') or []:
        snap_id = (snap_data.get('snapId') or {}).get('value', '')
        snap_urls = snap_data.get('snapUrls') or {}
        media_url = snap_urls.get('mediaUrl', '')

        media_id = ''
        if '/d/' in media_url:
            media_id = media_url.split('/d/')[1].split('.')[0]

        # '0' is the placeholder for a missing timestamp; use the current
        # time for it (as _parse_snap_data does) rather than the 1970 epoch
        ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
        if ts_str and ts_str != '0':
            timestamp = datetime.fromtimestamp(int(ts_str))
        else:
            timestamp = datetime.now()

        snap = SnapMedia(
            media_id=media_id or snap_id,
            media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
            media_url=media_url,
            timestamp=timestamp,
            index=snap_data.get('snapIndex', 0),
            thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
            duration_ms=int(metadata.get('durationMs', 0)),
            description=metadata.get('description', ''),
            view_count=int(metadata.get('viewCount', 0)),
            width=int(metadata.get('width', 540)),
            height=int(metadata.get('height', 960))
        )
        collection.snaps.append(snap)

    return collection
|
||||
|
||||
def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
    """Extract full metadata from a highlight URL via __NEXT_DATA__.

    Snaps are parsed with ``_parse_snap_data`` so highlights fetched by URL
    are treated exactly like the inline highlights of a profile page:
    entries without a media URL are skipped, the media id falls back to the
    snap id, and a missing timestamp becomes the current time.

    Returns:
        A 'highlight' SnapCollection, or None when the page cannot be
        fetched or parsed or carries no highlight data. The username is
        taken from the '@name' part of ``url`` ('' if absent).
    """
    html = self._fetch_page(url)
    if not html:
        return None

    data = self._extract_next_data(html)
    if not data:
        return None

    props = (data.get('props') or {}).get('pageProps') or {}
    highlight = props.get('highlight') or {}

    if not highlight:
        return None

    # Id and title may be plain strings or wrapped as {'value': ...}
    highlight_id = highlight.get('highlightId') or {}
    if isinstance(highlight_id, dict):
        highlight_id = highlight_id.get('value', '')

    username_match = re.search(r'@([^/]+)', url)
    username = username_match.group(1) if username_match else ''

    title = highlight.get('storyTitle') or {}
    if isinstance(title, dict):
        title = title.get('value', '')

    collection = SnapCollection(
        collection_id=highlight_id,
        collection_type='highlight',
        title=title or 'Untitled Highlight',
        username=username,
        url=url
    )

    for snap_data in highlight.get('snapList') or []:
        snap = self._parse_snap_data(snap_data)
        if snap:
            collection.snaps.append(snap)

    return collection
|
||||
|
||||
def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
|
||||
"""Download a single media file via curl_cffi."""
|
||||
try:
|
||||
url = snap.media_url.replace('&', '&')
|
||||
session = self._get_session()
|
||||
|
||||
resp = session.get(url, timeout=60)
|
||||
if resp.status_code == 200 and len(resp.content) > 0:
|
||||
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
||||
with open(output_path, 'wb') as f:
|
||||
f.write(resp.content)
|
||||
self._set_metadata(output_path, snap)
|
||||
return True
|
||||
|
||||
self.log(f"Download failed: HTTP {resp.status_code}", "debug")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error downloading media: {e}", "error")
|
||||
return False
|
||||
|
||||
def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
|
||||
"""Set EXIF metadata and file timestamp."""
|
||||
try:
|
||||
date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
|
||||
desc = description or snap.description or ""
|
||||
if snap.view_count:
|
||||
desc += f" [Views: {snap.view_count}]"
|
||||
desc = desc.strip()
|
||||
|
||||
ext = os.path.splitext(file_path)[1].lower()
|
||||
is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
|
||||
is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']
|
||||
|
||||
exif_args = [
|
||||
'exiftool', '-overwrite_original', '-ignoreMinorErrors',
|
||||
f'-FileModifyDate={date_str}',
|
||||
]
|
||||
|
||||
if is_image:
|
||||
exif_args.extend([
|
||||
f'-DateTimeOriginal={date_str}',
|
||||
f'-CreateDate={date_str}',
|
||||
f'-ModifyDate={date_str}',
|
||||
f'-MetadataDate={date_str}',
|
||||
])
|
||||
if desc:
|
||||
exif_args.extend([
|
||||
f'-ImageDescription={desc}',
|
||||
f'-XPComment={desc}',
|
||||
f'-UserComment={desc}',
|
||||
])
|
||||
if snap.lat and snap.lng:
|
||||
lat_ref = 'N' if snap.lat >= 0 else 'S'
|
||||
lng_ref = 'E' if snap.lng >= 0 else 'W'
|
||||
exif_args.extend([
|
||||
f'-GPSLatitude={abs(snap.lat)}',
|
||||
f'-GPSLatitudeRef={lat_ref}',
|
||||
f'-GPSLongitude={abs(snap.lng)}',
|
||||
f'-GPSLongitudeRef={lng_ref}',
|
||||
])
|
||||
|
||||
elif is_video:
|
||||
exif_args.extend([
|
||||
f'-CreateDate={date_str}',
|
||||
f'-ModifyDate={date_str}',
|
||||
f'-MediaCreateDate={date_str}',
|
||||
f'-MediaModifyDate={date_str}',
|
||||
f'-TrackCreateDate={date_str}',
|
||||
f'-TrackModifyDate={date_str}',
|
||||
])
|
||||
if desc:
|
||||
exif_args.extend([
|
||||
f'-Description={desc}',
|
||||
f'-Comment={desc}',
|
||||
])
|
||||
|
||||
exif_args.append(file_path)
|
||||
subprocess.run(exif_args, capture_output=True, timeout=30)
|
||||
|
||||
# Set filesystem modification time
|
||||
ts = snap.timestamp.timestamp()
|
||||
os.utime(file_path, (ts, ts))
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")
|
||||
|
||||
def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
    """Build the on-disk file name for a snap.

    The layout is ``<username>_<YYYYmmdd>_<HHMMSS>_<media_id>.<ext>``.

    Args:
        username: Account name, used verbatim as the prefix.
        snap: Media item supplying ``timestamp`` and ``media_id``.
        ext: File extension without the leading dot.

    Returns:
        The file name only (no directory component).
    """
    stamp = format(snap.timestamp, '%Y%m%d_%H%M%S')
    return "{}_{}_{}.{}".format(username, stamp, snap.media_id, ext)
|
||||
|
||||
def _get_processed_posts(self, username: str) -> Set[str]:
    """Collect the media IDs already recorded for ``username``.

    Reads every ``downloads`` row with ``platform = 'snapchat'`` and
    ``source = username`` and derives IDs from two places:

    * the file name, which ``_generate_filename`` writes as
      ``<username>_<date>_<time>_<media_id>.<ext>``;
    * the ``media_id`` key of the JSON ``metadata`` column.

    Args:
        username: Account whose download history is loaded.

    Returns:
        Set of media IDs. Empty when no database is configured or the query
        fails (failures are logged at debug level, never raised).
    """
    processed: Set[str] = set()
    if not self.db:
        return processed

    own_prefix = f"{username}_"

    try:
        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT filename, metadata FROM downloads
                WHERE platform = 'snapchat'
                AND source = ?
            ''', (username,))

            for row in cursor.fetchall():
                filename, metadata_str = row
                if filename:
                    # Legacy positional parse: assumes the username contains
                    # no underscore. Kept so previously matching IDs still match.
                    parts = filename.split('_')
                    if len(parts) >= 4:
                        processed.add('_'.join(parts[3:]).split('.')[0])

                    # Prefix-aware parse: stripping the known username first
                    # keeps an underscore in the username (e.g. "a_b") from
                    # shifting the date/time fields into the media ID.
                    if filename.startswith(own_prefix):
                        tail = filename[len(own_prefix):].split('_', 2)
                        if len(tail) == 3:
                            processed.add(tail[2].split('.')[0])

                if metadata_str:
                    try:
                        metadata = json.loads(metadata_str)
                        if 'media_id' in metadata:
                            processed.add(metadata['media_id'])
                    except (json.JSONDecodeError, TypeError, KeyError):
                        # Malformed or non-dict metadata: the file-name parse
                        # above is the only source for this row.
                        pass

    except Exception as e:
        self.log(f"Error loading processed posts: {e}", "debug")

    return processed
|
||||
|
||||
def _record_download(self, username: str, url: str, filename: str,
                     post_date=None, metadata: dict = None, file_path: str = None,
                     deferred: bool = False):
    """Persist one download record, immediately or by queueing it.

    Args:
        username: Account the media belongs to.
        url: Page URL the media was found on.
        filename: Name of the saved file.
        post_date: Post time; objects exposing ``isoformat`` are serialised
            with it when the record is queued.
        metadata: Extra details stored with the record.
        file_path: Full path of the saved file.
        deferred: When true, append the record to ``self.pending_downloads``
            instead of writing to the database.

    Returns:
        ``True`` when the record was queued; ``None`` otherwise (written,
        write failed and was logged, or no database is configured).
    """
    if not deferred:
        if self.db:
            try:
                self.db.mark_downloaded(
                    username=username,
                    url=url,
                    filename=filename,
                    post_date=post_date,
                    metadata=metadata,
                    file_path=file_path
                )
            except Exception as exc:
                self.log(f"Failed to record download: {exc}", "debug")
        return None

    has_iso = hasattr(post_date, 'isoformat')
    queued_entry = {
        'username': username,
        'url': url,
        'filename': filename,
        'post_date': post_date.isoformat() if has_iso else post_date,
        'file_path': file_path,
        'metadata': metadata
    }
    self.pending_downloads.append(queued_entry)
    return True
|
||||
|
||||
def get_pending_downloads(self) -> list:
    """Return the records queued for deferred database recording.

    The live list object is returned, not a copy.
    """
    queued = self.pending_downloads
    return queued
|
||||
|
||||
def clear_pending_downloads(self):
    """Drop all queued deferred records.

    The attribute is rebound to a new empty list; a list previously obtained
    from ``get_pending_downloads`` is left untouched.
    """
    self.pending_downloads = list()
|
||||
|
||||
def download(self, username: str, content_type: str = "all", days_back: int = 14,
             max_downloads: int = 50, output_dir: str = None,
             spotlight_dir: str = None, stories_dir: str = None,
             stitch_highlights: bool = True, defer_database: bool = False,
             phrase_config: dict = None) -> int:
    """Download content from a user - compatible with media-downloader interface.

    Fetches the profile's spotlight and highlight URLs, then visits each URL,
    skips snaps that are older than the cutoff or already known, downloads the
    rest and records every saved file.

    Args:
        username: Snapchat username
        content_type: "spotlight", "stories", "highlights", or "all".
            "stories" and "highlights" both select the highlight URLs.
        days_back: How many days back to download (filters by post date)
        max_downloads: Maximum number of profile URLs examined per content
            type. The slice is taken before the date/duplicate filters, and a
            highlight URL can hold several snaps, so the number of files saved
            can be lower or higher than this value.
        output_dir: Default output directory (used if specific dirs not set)
        spotlight_dir: Output directory for spotlights
        stories_dir: Output directory for stories/highlights
        stitch_highlights: Ignored (kept for backwards compatibility)
        defer_database: If True, records are queued via ``_record_download``
            instead of being written to the database immediately
        phrase_config: Not used (for interface compatibility)

    Returns:
        Number of files downloaded during this call

    Note:
        Errors raised while scraping or downloading are logged and swallowed.
        Directory creation and the ``TaskCheckpoint`` import/construction run
        before the outer ``try`` and can therefore raise to the caller.
    """
    self.defer_database = defer_database
    # Per-call duplicate guard; IDs are added as files are saved below.
    self.downloaded_files.clear()

    # Resolve output directories: specific dir > shared output_dir > default path
    if spotlight_dir:
        spotlight_output = Path(spotlight_dir)
    elif output_dir:
        spotlight_output = Path(output_dir)
    else:
        spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat_client/spotlight/{username}")

    if stories_dir:
        stories_output = Path(stories_dir)
    elif output_dir:
        stories_output = Path(output_dir)
    else:
        stories_output = Path(f"/opt/media-downloader/downloads/snapchat_client/stories/{username}")

    spotlight_output.mkdir(parents=True, exist_ok=True)
    stories_output.mkdir(parents=True, exist_ok=True)

    # Update activity status
    if self.activity_manager:
        self.activity_manager.update_status("Checking Snapchat")

    # Get processed posts (shared with snapchat module - both use platform='snapchat')
    processed = self._get_processed_posts(username)
    self.log(f"Loaded {len(processed)} processed posts from database", "debug")

    # Naive local-time cutoff; compared directly against snap.timestamp below.
    cutoff_date = datetime.now() - timedelta(days=days_back)
    downloaded_count = 0

    # Crash recovery checkpoint
    from modules.task_checkpoint import TaskCheckpoint
    checkpoint = TaskCheckpoint(f'snapchat_client:{username}', 'scraping')

    try:
        # Get profile content via HTTP
        content = self.get_profile_content(username)

        # Count total items for checkpoint (URLs that will be visited, not files)
        total_items = 0
        if content_type in ['spotlight', 'all'] and content['spotlights']:
            total_items += min(len(content['spotlights']), max_downloads)
        if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
            total_items += min(len(content['highlights']), max_downloads)
        checkpoint.start(total_items=total_items)
        if checkpoint.is_recovering():
            self.log(f"Snapchat Client @{username}: recovering — skipping already-processed URLs", "info")

        # Download spotlights
        if content_type in ['spotlight', 'all'] and content['spotlights']:
            spotlight_items = content['spotlights'][:max_downloads]
            self.log(f"Processing {len(spotlight_items)} spotlights...", "info")

            if self.activity_manager:
                self.activity_manager.update_status(
                    "Downloading spotlights",
                    progress_current=0,
                    progress_total=len(spotlight_items)
                )

            for spot_idx, url in enumerate(spotlight_items):
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=spot_idx + 1,
                        progress_total=len(spotlight_items)
                    )

                # URL already finished in a previous (interrupted) run
                if checkpoint.is_completed(url):
                    continue

                checkpoint.set_current(url)

                try:
                    # Rate limit between page fetches
                    if spot_idx > 0:
                        time.sleep(random.uniform(1.5, 2.5))

                    spotlight = self.get_spotlight_metadata(url)
                    if not spotlight or not spotlight.snaps:
                        continue

                    # Only the first snap of a spotlight is downloaded
                    snap = spotlight.snaps[0]

                    # Check date filter
                    if snap.timestamp < cutoff_date:
                        self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                        continue

                    # Check if already processed
                    if snap.media_id in processed or snap.media_id in self.downloaded_files:
                        self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                        continue

                    # Download
                    ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                    filename = self._generate_filename(username, snap, ext)
                    output_path = str(spotlight_output / filename)

                    # Rate limit between CDN downloads
                    time.sleep(random.uniform(0.3, 0.5))

                    if self._download_media_file(snap, output_path):
                        self.downloaded_files.add(snap.media_id)
                        downloaded_count += 1
                        self.log(f"Downloaded spotlight: {filename}", "info")

                        self._record_download(
                            username=username,
                            url=url,
                            filename=filename,
                            post_date=snap.timestamp,
                            metadata={
                                'media_id': snap.media_id,
                                'description': snap.description,
                                'view_count': snap.view_count,
                                'content_type': 'spotlight'
                            },
                            file_path=output_path,
                            deferred=defer_database
                        )

                except Exception as e:
                    # One bad spotlight must not abort the remaining URLs
                    self.log(f"Error processing spotlight: {e}", "error")

                # NOTE(review): the `continue` statements inside the try above
                # jump past this call, so skipped URLs are not marked completed
                # - confirm that is intended.
                checkpoint.mark_completed(url)

        # Rate limit between content types
        if content_type == 'all' and content['spotlights'] and content['highlights']:
            time.sleep(random.uniform(2, 3))

        # Download highlights (stories)
        if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
            highlight_items = content['highlights'][:max_downloads]
            self.log(f"Processing {len(highlight_items)} highlights...", "info")

            if self.activity_manager:
                self.activity_manager.update_status(
                    "Downloading highlights",
                    progress_current=0,
                    progress_total=len(highlight_items)
                )

            for hi_idx, url in enumerate(highlight_items):
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=hi_idx + 1,
                        progress_total=len(highlight_items)
                    )

                # URL already finished in a previous (interrupted) run
                if checkpoint.is_completed(url):
                    continue

                checkpoint.set_current(url)

                try:
                    # Rate limit between page fetches
                    if hi_idx > 0:
                        time.sleep(random.uniform(1.5, 2.5))

                    highlight = self.get_highlight_metadata(url)
                    if not highlight or not highlight.snaps:
                        continue

                    # Skip the whole highlight when even its newest snap is too old
                    newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                    if newest_snap.timestamp < cutoff_date:
                        self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                        continue

                    # Check if already processed (by collection ID)
                    if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                        self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                        continue

                    # Separate videos and images
                    videos = [s for s in highlight.snaps if s.media_type == 'video']
                    images = [s for s in highlight.snaps if s.media_type == 'image']

                    # Download images individually (each snap is date/duplicate filtered)
                    for snap in images:
                        if snap.timestamp < cutoff_date:
                            continue
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            continue

                        # Rate limit between CDN downloads
                        time.sleep(random.uniform(0.3, 0.5))

                        filename = self._generate_filename(username, snap, 'jpg')
                        output_path = str(stories_output / filename)

                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            self.log(f"Downloaded image: {filename}", "info")

                            self._record_download(
                                username=username,
                                url=highlight.url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'highlight_id': highlight.collection_id,
                                    'content_type': 'highlight_image'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )

                    # Download videos individually (each snap is date/duplicate filtered)
                    for snap in videos:
                        if snap.timestamp < cutoff_date:
                            continue
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            continue

                        # Rate limit between CDN downloads
                        time.sleep(random.uniform(0.3, 0.5))

                        filename = self._generate_filename(username, snap, 'mp4')
                        output_path = str(stories_output / filename)

                        if self._download_media_file(snap, output_path):
                            # Unlike the image and spotlight branches, metadata
                            # is written explicitly here.
                            self._set_metadata(output_path, snap)
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            self.log(f"Downloaded video: {filename}", "info")

                            self._record_download(
                                username=username,
                                url=highlight.url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'highlight_id': highlight.collection_id,
                                    'content_type': 'highlight_video'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )

                except Exception as e:
                    # One bad highlight must not abort the remaining URLs
                    self.log(f"Error processing highlight: {e}", "error")

                # Not reached for URLs skipped via `continue` inside the try above
                checkpoint.mark_completed(url)

    except Exception as e:
        # Covers profile fetch failures and anything not caught per item
        self.log(f"Error during download: {e}", "error")

    # Runs on both the success and the logged-error path
    checkpoint.finish()
    self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
    return downloaded_count
|
||||
985
modules/snapchat_scraper.py
Normal file
985
modules/snapchat_scraper.py
Normal file
@@ -0,0 +1,985 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Snapchat Direct Scraper Module - Scrapes directly from Snapchat.com
|
||||
|
||||
Uses Playwright to scrape profiles and extract:
|
||||
- Spotlight videos (540x960)
|
||||
- Stories/Highlights (480x852, stitched into single videos)
|
||||
|
||||
Full metadata extraction including timestamps, media IDs, descriptions.
|
||||
Follows the same interface as the original snapchat_module.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
import tempfile
|
||||
import subprocess
|
||||
import shutil
|
||||
import platform
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List, Any, Set
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
# Set environment for Playwright
|
||||
os.environ.setdefault('PLAYWRIGHT_BROWSERS_PATH', '/root/.cache/ms-playwright')
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from modules.cloudflare_handler import (
|
||||
get_playwright_context_options,
|
||||
get_playwright_stealth_scripts,
|
||||
get_flaresolverr_user_agent
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class SnapMedia:
    """Represents a single snap media item.

    Instances are built by the scraper's metadata extractors from the page's
    JSON data. Only the first four fields are required; the remaining fields
    default to empty/zero/None when the source does not provide them.
    """
    media_id: str  # Identifier embedded in generated file names
    media_type: str  # 'video' or 'image'
    media_url: str  # URL the media file is downloaded from
    timestamp: datetime  # Post time; used for file names and file/EXIF dates
    index: int = 0  # Position within its collection (from 'snapIndex')
    thumbnail_url: str = ""  # Preview image URL, if any
    duration_ms: int = 0  # Duration in milliseconds (set for spotlights)
    description: str = ""  # Caption text written into file metadata
    view_count: int = 0  # View counter; appended to the description when non-zero
    width: int = 0  # Pixel width (set for spotlights)
    height: int = 0  # Pixel height (set for spotlights)
    lat: Optional[float] = None  # Latitude, when the source provides one
    lng: Optional[float] = None  # Longitude, when the source provides one
|
||||
|
||||
|
||||
@dataclass
class SnapCollection:
    """Represents a spotlight or highlight collection.

    Groups the snaps extracted from one spotlight or highlight page together
    with the page-level identifiers.
    """
    collection_id: str  # Story ID (spotlight) or highlight ID
    collection_type: str  # 'spotlight' or 'highlight'
    title: str = ""  # Spotlight description or highlight title
    username: str = ""  # Owning account name
    snaps: List[SnapMedia] = field(default_factory=list)  # Media items, appended in page order
    url: str = ""  # Page URL the collection was scraped from
|
||||
|
||||
|
||||
class SnapchatDirectScraper(LoggingMixin):
|
||||
"""
|
||||
Scrapes Snapchat profiles directly for media content.
|
||||
|
||||
Follows the same interface as SnapchatDownloader for compatibility
|
||||
with the media-downloader system.
|
||||
"""
|
||||
|
||||
def __init__(self,
             headless: bool = True,
             show_progress: bool = True,
             use_database: bool = True,
             log_callback=None,
             unified_db=None):
    """Initialize scraper compatible with media-downloader system.

    No browser is started here; only configuration, cookies and the optional
    proxy setting are loaded.

    Args:
        headless: Passed to the Chromium launch call when the browser starts.
        show_progress: Stored on the instance; not used during initialization.
        use_database: Record downloads through the database adapter. Forced
            to ``False`` when ``unified_db`` is not supplied.
        log_callback: Forwarded to the logging mixin's ``_init_logger``.
        unified_db: Shared database object used for the download adapter,
            cookies and the 'snapchat' scraper (proxy) configuration.
    """
    self.headless = headless
    self.show_progress = show_progress
    self.use_database = use_database
    self.unified_db = unified_db
    self.scraper_id = 'snapchat_direct'
    self.download_count = 0
    self.downloaded_files: Set[str] = set()
    self.pending_downloads = []

    # Initialize logging via mixin
    self._init_logger('SnapchatDirect', log_callback, default_module='Download')

    # User-Agent to match FlareSolverr (dynamically fetched for consistency)
    self.user_agent = get_flaresolverr_user_agent()

    # Browser state (populated lazily by _start_browser)
    self._playwright = None
    self.browser = None
    self.context = None

    # Database adapter
    if unified_db and use_database:
        from modules.unified_database import SnapchatDatabaseAdapter
        self.db = SnapchatDatabaseAdapter(unified_db)
    else:
        self.db = None
        self.use_database = False

    # Activity status manager (optional component)
    try:
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)
    except ImportError:
        self.activity_manager = None

    # Load cookies from database
    self.cookies = self._load_cookies_from_db()

    # Load proxy configuration from database
    self.proxy_url = None
    if unified_db:
        try:
            scraper_config = unified_db.get_scraper('snapchat')
            if scraper_config and scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                self.proxy_url = scraper_config['proxy_url']
                # Proxy URLs may embed "user:pass@"; mask that part so
                # credentials never reach the log. The greedy match runs to
                # the last '@' before any '/', so a password containing '@'
                # is fully covered.
                safe_proxy = re.sub(r'(?<=://)[^/]*@', '***@', str(self.proxy_url))
                self.log(f"Using proxy: {safe_proxy}", "info")
        except Exception as e:
            self.log(f"Could not load proxy config: {e}", "debug")
|
||||
|
||||
def _load_cookies_from_db(self) -> List[Dict]:
    """Fetch stored cookies, preferring this scraper's own set.

    Lookup order: cookies saved under ``self.scraper_id``, then cookies saved
    under the original 'snapchat' scraper, then the built-in defaults. A
    failing lookup is logged and the next source is tried.

    Returns:
        The first non-empty cookie list found, else the default cookies.
    """
    store = self.unified_db
    if not store:
        return self._get_default_cookies()

    # (scraper id, success message builder, failure template, failure level)
    sources = (
        (self.scraper_id,
         lambda found: f"Loaded {len(found)} cookies from database",
         "Error loading cookies from database: {}", "warning"),
        ('snapchat',
         lambda found: "Using cookies from 'snapchat' scraper",
         "Error loading cookies from snapchat scraper: {}", "debug"),
    )

    for source_id, describe, failure_template, failure_level in sources:
        try:
            stored = store.get_scraper_cookies(source_id)
            if stored:
                self.log(describe(stored), "debug")
                return stored
        except Exception as exc:
            self.log(failure_template.format(exc), failure_level)

    return self._get_default_cookies()
|
||||
|
||||
def _get_default_cookies(self) -> List[Dict]:
    """Return the fallback cookie list used when nothing is stored.

    It contains a single ``sc-cookies-accepted`` cookie for www.snapchat.com.
    """
    consent_cookie = {
        "name": "sc-cookies-accepted",
        "value": "true",
        "domain": "www.snapchat.com",
        "path": "/",
    }
    return [consent_cookie]
|
||||
|
||||
def _save_cookies_to_db(self, cookies: List[Dict], user_agent: str = None):
    """Save cookies to database.

    Cookies are merged into the set stored under ``self.scraper_id``. Errors
    are logged as warnings and never raised. Does nothing when no database is
    configured.

    Args:
        cookies: List of cookie dictionaries
        user_agent: User agent to associate with cookies (important for cf_clearance).
                   If not provided, uses self.user_agent as fallback.
    """
    if not self.unified_db:
        return

    try:
        # Use provided user_agent or fall back to self.user_agent
        ua = user_agent or self.user_agent
        self.unified_db.save_scraper_cookies(
            self.scraper_id,
            cookies,
            user_agent=ua,
            merge=True
        )
        # Build the preview defensively: slicing a missing user agent would
        # raise after a successful save and be misreported below as a
        # save failure.
        ua_preview = f"{str(ua)[:50]}..." if ua else "<none>"
        self.log(f"Saved {len(cookies)} cookies to database (UA: {ua_preview})", "debug")
    except Exception as e:
        self.log(f"Error saving cookies to database: {e}", "warning")
|
||||
|
||||
def _parse_proxy_url(self, proxy_url: str) -> Optional[Dict]:
    """
    Parse proxy URL into Playwright proxy config.
    Supports: protocol://user:pass@host:port or protocol://host:port

    Accepted protocols are http, https, socks, socks4 and socks5.

    Args:
        proxy_url: Proxy URL string to parse.

    Returns:
        Dict with a ``server`` key, plus ``username`` and ``password`` when
        credentials are present; ``None`` when the URL cannot be parsed. Every
        ``None`` result is accompanied by a warning, so a mistyped proxy
        setting does not go unnoticed.
    """
    try:
        # Match: protocol://[user:pass@]host:port
        match = re.match(
            r'^(https?|socks[45]?)://(?:([^:]+):([^@]+)@)?([^:]+):(\d+)$',
            proxy_url
        )
    except Exception as e:
        self.log(f"Failed to parse proxy URL: {e}", "warning")
        return None

    if not match:
        # The URL itself is deliberately not logged: it may hold credentials.
        self.log(
            "Unsupported proxy URL format; expected protocol://[user:pass@]host:port",
            "warning"
        )
        return None

    protocol, username, password, host, port = match.groups()
    config = {'server': f'{protocol}://{host}:{port}'}
    if username and password:
        config['username'] = username
        config['password'] = password
    return config
|
||||
|
||||
def __enter__(self):
    """Enter the ``with`` block; yields the scraper itself without starting the browser."""
    scraper = self
    return scraper
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave the ``with`` block: close the browser and let any exception propagate.

    Returns:
        ``False``, so an exception raised inside the block is not suppressed.
    """
    suppress_exception = False
    self._close_browser()
    return suppress_exception
|
||||
|
||||
def _start_browser(self):
    """Start Playwright browser.

    Launches Chromium, creates a browser context with the fingerprint options,
    optional proxy and stealth init script, and loads ``self.cookies`` into
    it. Returns immediately when a browser is already running.

    If any step fails, everything started so far is torn down through
    ``_close_browser`` before the exception is re-raised. Otherwise
    ``self.browser`` would stay set with ``self.context`` still ``None``, and
    every later call would return early here and then fail on the missing
    context.

    Raises:
        Exception: Whatever Playwright (or a helper) raised during start-up.
    """
    if self.browser is not None:
        return

    os.environ['DISPLAY'] = ':100'

    from playwright.sync_api import sync_playwright

    try:
        self._playwright = sync_playwright().start()
        self.browser = self._playwright.chromium.launch(
            headless=self.headless,
            args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
        )

        # Build context options - use dynamic fingerprinting from FlareSolverr
        context_options = get_playwright_context_options()

        # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent
        # Cloudflare cf_clearance cookies are fingerprinted to the browser that solved the challenge
        try:
            stored_user_agent = None
            if self.unified_db:
                stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)

            if stored_user_agent:
                self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug", module="Browser")
                context_options['user_agent'] = stored_user_agent
            else:
                # No database or no stored user agent: keep the default fingerprint
                self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
        except Exception as e:
            self.log(f"Error getting stored user_agent, using default: {e}", "debug", module="Browser")

        # Add proxy if configured
        if self.proxy_url:
            proxy_config = self._parse_proxy_url(self.proxy_url)
            if proxy_config:
                context_options['proxy'] = proxy_config
                self.log(f"Browser using proxy: {proxy_config.get('server')}", "info", module="Browser")

        self.context = self.browser.new_context(**context_options)

        # Add anti-detection scripts to all pages in this context
        self.context.add_init_script(get_playwright_stealth_scripts())

        # Add cookies
        if self.cookies:
            # Clean cookies for Playwright and convert expiry->expires
            cleaned = []
            for c in self.cookies:
                clean = {k: v for k, v in c.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
                # FlareSolverr uses 'expiry' but Playwright uses 'expires'
                if 'expiry' in clean and 'expires' not in clean:
                    clean['expires'] = clean.pop('expiry')
                cleaned.append(clean)

            # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
            try:
                self.context.clear_cookies()
            except Exception:
                pass

            self.context.add_cookies(cleaned)
    except Exception:
        # Roll back the partial start-up so a later call can retry cleanly
        self._close_browser()
        raise

    self.log("Browser started", "info", module="Browser")
|
||||
|
||||
def _close_browser(self):
    """Tear down the browser context, the browser and Playwright, in that order.

    Each handle that is set is closed/stopped and then reset to ``None``. A
    failure while closing one handle is logged at debug level and does not
    stop the remaining handles from being released.
    """
    # (attribute holding the handle, method that releases it, failure template)
    teardown_plan = (
        ('context', 'close', "Error closing browser context: {}"),
        ('browser', 'close', "Error closing browser: {}"),
        ('_playwright', 'stop', "Error stopping playwright: {}"),
    )

    for attr_name, release_method, failure_template in teardown_plan:
        handle = getattr(self, attr_name)
        if not handle:
            continue
        try:
            getattr(handle, release_method)()
        except Exception as exc:
            self.log(failure_template.format(exc), "debug")
        setattr(self, attr_name, None)
|
||||
|
||||
def _get_next_data(self, page) -> Optional[Dict]:
    """Read and decode the page's ``<script id="__NEXT_DATA__">`` JSON payload.

    Args:
        page: Page object exposing ``locator``.

    Returns:
        The decoded JSON, or ``None`` when the script tag is absent or reading
        or decoding fails (failures are logged at debug level).
    """
    try:
        script_tag = page.locator('script#__NEXT_DATA__').first
        if not (script_tag.count() > 0):
            return None
        raw_json = script_tag.inner_text()
        return json.loads(raw_json)
    except Exception as exc:
        self.log(f"Error extracting __NEXT_DATA__: {exc}", "debug")
        return None
|
||||
|
||||
def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
    """Set EXIF metadata and file timestamp.

    Runs ``exiftool`` to write date tags (and, when available, description
    and GPS tags) chosen by file extension, then sets the file's access and
    modification time to the snap timestamp. Both steps are best-effort:
    failures are logged at debug level and never raised.

    The two steps are independent. A missing, failing or timed-out exiftool
    no longer prevents the filesystem timestamp from being set.

    Args:
        file_path: Path of the downloaded media file.
        snap: Media item supplying timestamp, description, view count and
            coordinates.
        description: Overrides ``snap.description`` when given.
    """
    # Step 1: embedded metadata via exiftool
    try:
        date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
        desc = description or snap.description or ""
        if snap.view_count:
            desc += f" [Views: {snap.view_count}]"
        desc = desc.strip()

        ext = os.path.splitext(file_path)[1].lower()
        is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
        is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']

        exif_args = [
            'exiftool', '-overwrite_original', '-ignoreMinorErrors',
            f'-FileModifyDate={date_str}',
        ]

        if is_image:
            exif_args.extend([
                f'-DateTimeOriginal={date_str}',
                f'-CreateDate={date_str}',
                f'-ModifyDate={date_str}',
                f'-MetadataDate={date_str}',
            ])
            if desc:
                exif_args.extend([
                    f'-ImageDescription={desc}',
                    f'-XPComment={desc}',
                    f'-UserComment={desc}',
                ])
            if snap.lat and snap.lng:
                # The sign goes into the *Ref tags; magnitudes are written unsigned
                lat_ref = 'N' if snap.lat >= 0 else 'S'
                lng_ref = 'E' if snap.lng >= 0 else 'W'
                exif_args.extend([
                    f'-GPSLatitude={abs(snap.lat)}',
                    f'-GPSLatitudeRef={lat_ref}',
                    f'-GPSLongitude={abs(snap.lng)}',
                    f'-GPSLongitudeRef={lng_ref}',
                ])

        elif is_video:
            exif_args.extend([
                f'-CreateDate={date_str}',
                f'-ModifyDate={date_str}',
                f'-MediaCreateDate={date_str}',
                f'-MediaModifyDate={date_str}',
                f'-TrackCreateDate={date_str}',
                f'-TrackModifyDate={date_str}',
            ])
            if desc:
                exif_args.extend([
                    f'-Description={desc}',
                    f'-Comment={desc}',
                ])

        exif_args.append(file_path)
        result = subprocess.run(exif_args, capture_output=True, timeout=30)
        if result.returncode != 0:
            # Surface tool failures that were previously ignored silently
            self.log(f"Warning: exiftool exited with code {result.returncode} for {file_path}", "debug")

    except Exception as e:
        self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")

    # Step 2: filesystem modification time (does not depend on exiftool)
    try:
        ts = snap.timestamp.timestamp()
        os.utime(file_path, (ts, ts))
    except Exception as e:
        self.log(f"Warning: Could not set file timestamp for {file_path}: {e}", "debug")
|
||||
|
||||
def get_profile_content(self, username: str) -> Dict[str, List[str]]:
    """Get all spotlight and highlight URLs from a profile.

    Opens the public profile page, collects spotlight links from its HTML,
    then clicks the "Stories" tab (when present) and collects highlight
    links. The browser is started on demand.

    URLs are de-duplicated while keeping the order in which they first appear
    in the page, so callers that truncate the lists get a stable selection.
    The username is regex-escaped so characters such as '.' match literally.

    Args:
        username: Profile name without the leading '@'.

    Returns:
        Dict with keys ``'spotlights'`` and ``'highlights'``, each a list of
        absolute URLs. Errors are logged and yield whatever was collected so
        far (possibly empty lists).
    """
    import time

    if not self.browser:
        self._start_browser()

    page = self.context.new_page()
    result = {'spotlights': [], 'highlights': []}

    # Literal "/@<username>" prefix for the link patterns below
    profile_path = re.escape(f"/@{username}")

    try:
        url = f"https://www.snapchat.com/@{username}"
        self.log(f"Navigating to profile @{username}", "info")
        page.goto(url, wait_until='networkidle', timeout=30000)
        time.sleep(2)

        content = page.content()

        # Extract spotlight URLs (dict.fromkeys drops repeats, keeps page order)
        spotlight_pattern = rf'{profile_path}/spotlight/([A-Za-z0-9_-]+)'
        spotlight_ids = list(dict.fromkeys(re.findall(spotlight_pattern, content)))
        result['spotlights'] = [
            f"https://www.snapchat.com/@{username}/spotlight/{sid}"
            for sid in spotlight_ids
        ]
        self.log(f"Found {len(result['spotlights'])} spotlights", "info")

        # Click Stories tab to get highlights
        stories_tab = page.locator('[role="tab"]:has-text("Stories")').first
        if stories_tab.count() > 0:
            stories_tab.click()
            time.sleep(2)

            content = page.content()
            highlight_pattern = rf'{profile_path}/highlight/([A-Za-z0-9-]+)'
            highlight_ids = list(dict.fromkeys(re.findall(highlight_pattern, content)))
            result['highlights'] = [
                f"https://www.snapchat.com/@{username}/highlight/{hid}"
                for hid in highlight_ids
            ]
            self.log(f"Found {len(result['highlights'])} highlights", "info")

    except Exception as e:
        self.log(f"Error getting profile content: {e}", "error")
    finally:
        page.close()

    return result
|
||||
|
||||
def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
    """Extract full metadata from a spotlight URL.

    Loads the page, reads its ``__NEXT_DATA__`` JSON and converts the first
    entry of ``spotlightFeed.spotlightStories`` into a ``SnapCollection``.
    The browser is started on demand.

    Explicit JSON nulls are treated like missing values. A null duration,
    view count, size, description or media URL falls back to the default
    instead of raising and discarding the whole collection.

    Args:
        url: Spotlight page URL.

    Returns:
        The populated collection, or ``None`` when the page has no usable
        data or an error occurs (errors are logged).
    """
    import time

    def _as_int(value, default):
        # int() with a fallback for null/empty values
        return default if value in (None, '') else int(value)

    if not self.browser:
        self._start_browser()

    page = self.context.new_page()

    try:
        page.goto(url, wait_until='domcontentloaded', timeout=60000)
        time.sleep(2)

        data = self._get_next_data(page)
        if not data:
            return None

        props = (data.get('props') or {}).get('pageProps') or {}
        feed = props.get('spotlightFeed') or {}
        stories = feed.get('spotlightStories') or []

        if not stories:
            return None

        story_data = stories[0]
        story = story_data.get('story') or {}
        metadata = (story_data.get('metadata') or {}).get('videoMetadata') or {}

        story_id = (story.get('storyId') or {}).get('value', '')
        creator = (metadata.get('creator') or {}).get('personCreator') or {}
        username = creator.get('username', '')
        description = metadata.get('description') or ''

        collection = SnapCollection(
            collection_id=story_id,
            collection_type='spotlight',
            title=description,
            username=username,
            url=url
        )

        for snap_data in story.get('snapList') or []:
            snap_id = (snap_data.get('snapId') or {}).get('value', '')
            snap_urls = snap_data.get('snapUrls') or {}
            media_url = snap_urls.get('mediaUrl') or ''

            # Media ID is the path segment after "/d/", up to the first dot
            media_id = ''
            if '/d/' in media_url:
                media_id = media_url.split('/d/')[1].split('.')[0]

            ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
            timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()

            snap = SnapMedia(
                media_id=media_id or snap_id,
                media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                media_url=media_url,
                timestamp=timestamp,
                index=snap_data.get('snapIndex', 0),
                thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                duration_ms=_as_int(metadata.get('durationMs'), 0),
                description=description,
                view_count=_as_int(metadata.get('viewCount'), 0),
                width=_as_int(metadata.get('width'), 540),
                height=_as_int(metadata.get('height'), 960)
            )
            collection.snaps.append(snap)

        return collection

    except Exception as e:
        self.log(f"Error getting spotlight metadata: {e}", "error")
        return None
    finally:
        page.close()
|
||||
|
||||
def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
    """Extract full metadata from a highlight URL.

    Loads the page, reads its ``__NEXT_DATA__`` JSON and converts
    ``pageProps.highlight`` into a ``SnapCollection``. The browser is started
    on demand. The username is taken from the ``@name`` part of ``url``.

    A null media URL is treated as missing. When no media ID can be derived
    from the media URL, the snap's ``snapId`` value is used instead (matching
    the spotlight extractor), so such snaps do not all share an empty ID.

    Args:
        url: Highlight page URL.

    Returns:
        The populated collection, or ``None`` when the page has no usable
        data or an error occurs (errors are logged).
    """
    import time

    if not self.browser:
        self._start_browser()

    page = self.context.new_page()

    try:
        page.goto(url, wait_until='domcontentloaded', timeout=60000)
        time.sleep(2)

        data = self._get_next_data(page)
        if not data:
            return None

        props = (data.get('props') or {}).get('pageProps') or {}
        highlight = props.get('highlight') or {}

        if not highlight:
            return None

        # The ID and title may arrive either as plain values or wrapped as {'value': ...}
        highlight_id = highlight.get('highlightId') or {}
        if isinstance(highlight_id, dict):
            highlight_id = highlight_id.get('value', '')

        username_match = re.search(r'@([^/]+)', url)
        username = username_match.group(1) if username_match else ''

        title = highlight.get('storyTitle') or {}
        if isinstance(title, dict):
            title = title.get('value', '')

        collection = SnapCollection(
            collection_id=highlight_id,
            collection_type='highlight',
            title=title or 'Untitled Highlight',
            username=username,
            url=url
        )

        for snap_data in highlight.get('snapList') or []:
            snap_id = (snap_data.get('snapId') or {}).get('value', '')
            snap_urls = snap_data.get('snapUrls') or {}
            media_url = snap_urls.get('mediaUrl') or ''

            # Media ID is the path segment after "/d/", up to the first dot
            media_id = ''
            if '/d/' in media_url:
                media_id = media_url.split('/d/')[1].split('.')[0]

            ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
            timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()

            lat = snap_data.get('lat')
            lng = snap_data.get('lng')

            snap = SnapMedia(
                media_id=media_id or snap_id,
                media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                media_url=media_url,
                timestamp=timestamp,
                index=snap_data.get('snapIndex', 0),
                thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                lat=float(lat) if lat else None,
                lng=float(lng) if lng else None
            )
            collection.snaps.append(snap)

        return collection

    except Exception as e:
        self.log(f"Error getting highlight metadata: {e}", "error")
        return None
    finally:
        page.close()
|
||||
|
||||
def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
    """Download a single media file with curl.

    Args:
        snap: Snap whose ``media_url`` is fetched.
        output_path: Destination path handed to ``curl -o``.

    Returns:
        True when curl exited successfully and a non-empty file exists at
        *output_path* (``_set_metadata`` is then applied to it); False on
        any failure, including exceptions, which are logged.
    """
    try:
        # Scraped URLs can carry HTML-escaped ampersands; curl needs the raw '&'.
        url = snap.media_url.replace('&amp;', '&')

        # --fail makes curl exit non-zero on HTTP errors instead of saving
        # the server's error page as if it were the media file.
        result = subprocess.run([
            'curl', '-sL', '--fail', '-o', output_path,
            '-H', 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            url
        ], capture_output=True, timeout=60)

        if result.returncode != 0:
            self.log(f"Error downloading media: curl exited with code {result.returncode}", "error")
            return False

        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            self._set_metadata(output_path, snap)
            return True
        return False

    except Exception as e:
        self.log(f"Error downloading media: {e}", "error")
        return False
|
||||
|
||||
def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
|
||||
"""Generate filename with timestamp and media ID (FastDL format)"""
|
||||
date_str = snap.timestamp.strftime('%Y%m%d_%H%M%S')
|
||||
return f"{username}_{date_str}_{snap.media_id}.{ext}"
|
||||
|
||||
def _record_download(self, username: str, url: str, filename: str,
|
||||
post_date=None, metadata: dict = None, file_path: str = None,
|
||||
deferred: bool = False):
|
||||
"""Record a download in the database"""
|
||||
if deferred:
|
||||
self.pending_downloads.append({
|
||||
'username': username,
|
||||
'url': url,
|
||||
'filename': filename,
|
||||
'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
|
||||
'file_path': file_path,
|
||||
'metadata': metadata
|
||||
})
|
||||
return True
|
||||
|
||||
if not self.db:
|
||||
return
|
||||
|
||||
try:
|
||||
self.db.mark_downloaded(
|
||||
username=username,
|
||||
url=url,
|
||||
filename=filename,
|
||||
post_date=post_date,
|
||||
metadata=metadata,
|
||||
file_path=file_path
|
||||
)
|
||||
except Exception as e:
|
||||
self.log(f"Failed to record download: {e}", "debug")
|
||||
|
||||
def get_pending_downloads(self):
    """Return a shallow copy of the deferred download records."""
    snapshot = list(self.pending_downloads)
    return snapshot
|
||||
|
||||
def clear_pending_downloads(self):
    """Drop all deferred download records.

    The attribute is rebound to a new empty list, so lists handed out
    earlier are left untouched.
    """
    self.pending_downloads = list()
|
||||
|
||||
def _get_processed_posts(self, username: str) -> Set[str]:
|
||||
"""Get set of media IDs that have been processed"""
|
||||
processed = set()
|
||||
if not self.db:
|
||||
return processed
|
||||
|
||||
try:
|
||||
with self.db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
SELECT filename, metadata FROM downloads
|
||||
WHERE platform = 'snapchat'
|
||||
AND source = ?
|
||||
''', (username,))
|
||||
|
||||
for row in cursor.fetchall():
|
||||
filename, metadata_str = row
|
||||
if filename:
|
||||
parts = filename.split('_')
|
||||
if len(parts) >= 4:
|
||||
media_id = '_'.join(parts[3:]).split('.')[0]
|
||||
processed.add(media_id)
|
||||
|
||||
if metadata_str:
|
||||
try:
|
||||
metadata = json.loads(metadata_str)
|
||||
if 'media_id' in metadata:
|
||||
processed.add(metadata['media_id'])
|
||||
except (json.JSONDecodeError, TypeError, KeyError):
|
||||
pass # Invalid metadata, skip
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error loading processed posts: {e}", "debug")
|
||||
|
||||
return processed
|
||||
|
||||
def download(self, username: str, content_type: str = "all", days_back: int = 14,
             max_downloads: int = 50, output_dir: str = None,
             spotlight_dir: str = None, stories_dir: str = None,
             stitch_highlights: bool = True, defer_database: bool = False,
             phrase_config: dict = None):
    """
    Download content from a user - compatible with media-downloader interface

    Args:
        username: Snapchat username
        content_type: "spotlight", "stories", "highlights", or "all"
        days_back: How many days back to download (filters by post date)
        max_downloads: Maximum items to download per content type
        output_dir: Default output directory (used if specific dirs not set)
        spotlight_dir: Output directory for spotlights
        stories_dir: Output directory for stories/highlights
        stitch_highlights: Ignored (kept for backwards compatibility)
        defer_database: If True, defer database recording
        phrase_config: Not used (for interface compatibility)

    Returns:
        Number of files downloaded. Errors are logged, not raised; the
        count reflects whatever was saved before the error.
    """
    self.defer_database = defer_database
    self.downloaded_files.clear()

    # Directory resolution: a content-specific dir wins, then output_dir
    # (used as-is; the caller handles structure), then the built-in default.
    if spotlight_dir:
        spotlight_output = Path(spotlight_dir)
    elif output_dir:
        spotlight_output = Path(output_dir)
    else:
        spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat/spotlight/{username}")

    if stories_dir:
        stories_output = Path(stories_dir)
    elif output_dir:
        stories_output = Path(output_dir)
    else:
        stories_output = Path(f"/opt/media-downloader/downloads/snapchat/stories/{username}")

    spotlight_output.mkdir(parents=True, exist_ok=True)
    stories_output.mkdir(parents=True, exist_ok=True)

    # Update activity status
    if self.activity_manager:
        self.activity_manager.update_status("Checking Snapchat")

    # Media IDs already recorded in the database for this user
    processed = self._get_processed_posts(username)
    self.log(f"Loaded {len(processed)} processed posts from database", "debug")

    cutoff_date = datetime.now() - timedelta(days=days_back)
    downloaded_count = 0

    # Crash recovery checkpoint
    from modules.task_checkpoint import TaskCheckpoint
    checkpoint = TaskCheckpoint(f'snapchat:{username}', 'scraping')

    want_spotlights = content_type in ['spotlight', 'all']
    want_highlights = content_type in ['stories', 'highlights', 'all']

    try:
        # Start browser
        self._start_browser()

        # Get profile content
        content = self.get_profile_content(username)

        # Count total items for checkpoint
        total_items = 0
        if want_spotlights and content['spotlights']:
            total_items += min(len(content['spotlights']), max_downloads)
        if want_highlights and content['highlights']:
            total_items += min(len(content['highlights']), max_downloads)
        checkpoint.start(total_items=total_items)
        if checkpoint.is_recovering():
            self.log(f"Snapchat @{username}: recovering — skipping already-processed URLs", "info")

        # Download spotlights
        if want_spotlights and content['spotlights']:
            spotlight_items = content['spotlights'][:max_downloads]
            self.log(f"Processing {len(spotlight_items)} spotlights...", "info")

            if self.activity_manager:
                self.activity_manager.update_status(
                    "Downloading spotlights",
                    progress_current=0,
                    progress_total=len(spotlight_items)
                )

            for spot_idx, url in enumerate(spotlight_items):
                # Update progress at start of each iteration (fires even on skips)
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=spot_idx + 1,
                        progress_total=len(spotlight_items)
                    )

                if checkpoint.is_completed(url):
                    continue

                checkpoint.set_current(url)

                # NOTE(review): a `continue` inside this try skips the
                # mark_completed call below, so skipped items are re-checked
                # after a crash recovery — confirm that is intended.
                try:
                    spotlight = self.get_spotlight_metadata(url)
                    if not spotlight or not spotlight.snaps:
                        continue

                    snap = spotlight.snaps[0]

                    # Check date filter
                    if snap.timestamp < cutoff_date:
                        self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                        continue

                    # Check if already processed
                    if snap.media_id in processed or snap.media_id in self.downloaded_files:
                        self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                        continue

                    # Download
                    ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                    filename = self._generate_filename(username, snap, ext)
                    output_path = str(spotlight_output / filename)

                    if self._download_media_file(snap, output_path):
                        self.downloaded_files.add(snap.media_id)
                        downloaded_count += 1
                        self.log(f"Downloaded spotlight: {filename}", "info")

                        self._record_download(
                            username=username,
                            url=url,
                            filename=filename,
                            post_date=snap.timestamp,
                            metadata={
                                'media_id': snap.media_id,
                                'description': snap.description,
                                'view_count': snap.view_count,
                                'content_type': 'spotlight'
                            },
                            file_path=output_path,
                            deferred=defer_database
                        )

                except Exception as e:
                    self.log(f"Error processing spotlight: {e}", "error")

                checkpoint.mark_completed(url)

        # Download highlights (stories)
        if want_highlights and content['highlights']:
            highlight_items = content['highlights'][:max_downloads]
            self.log(f"Processing {len(highlight_items)} highlights...", "info")

            if self.activity_manager:
                self.activity_manager.update_status(
                    "Downloading highlights",
                    progress_current=0,
                    progress_total=len(highlight_items)
                )

            for hi_idx, url in enumerate(highlight_items):
                # Update progress at start of each iteration (fires even on skips)
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=hi_idx + 1,
                        progress_total=len(highlight_items)
                    )

                if checkpoint.is_completed(url):
                    continue

                checkpoint.set_current(url)

                try:
                    highlight = self.get_highlight_metadata(url)
                    if not highlight or not highlight.snaps:
                        continue

                    # Skip the whole highlight when even its newest snap is too old
                    newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                    if newest_snap.timestamp < cutoff_date:
                        self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                        continue

                    # Check if already processed
                    if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                        self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                        continue

                    # Every clip is saved individually: all images first,
                    # then all videos.
                    ordered_snaps = (
                        [s for s in highlight.snaps if s.media_type == 'image']
                        + [s for s in highlight.snaps if s.media_type == 'video']
                    )

                    for snap in ordered_snaps:
                        if snap.timestamp < cutoff_date:
                            continue
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            continue

                        kind = snap.media_type  # 'image' or 'video'
                        ext = 'mp4' if kind == 'video' else 'jpg'
                        filename = self._generate_filename(username, snap, ext)
                        output_path = str(stories_output / filename)

                        # _download_media_file applies _set_metadata itself
                        # on success, so it is not repeated here.
                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            self.log(f"Downloaded {kind}: {filename}", "info")

                            self._record_download(
                                username=username,
                                url=highlight.url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'highlight_id': highlight.collection_id,
                                    'content_type': f'highlight_{kind}'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )

                except Exception as e:
                    self.log(f"Error processing highlight: {e}", "error")

                checkpoint.mark_completed(url)

    except Exception as e:
        self.log(f"Error during download: {e}", "error")

    checkpoint.finish()
    self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
    return downloaded_count
|
||||
|
||||
|
||||
def test_scraper():
    """Manual smoke test: download a few items for one account and list the files.

    Writes into /tmp/snap_test and prints each resulting file with its size.
    """
    banner = "=" * 60
    print(banner)
    print("SNAPCHAT DIRECT SCRAPER TEST")
    print(banner)

    with SnapchatDirectScraper(headless=True) as scraper:
        username = "evalongoria"

        # Test download
        options = dict(
            username=username,
            content_type="all",
            days_back=30,
            max_downloads=5,
            spotlight_dir="/tmp/snap_test/spotlight",
            stories_dir="/tmp/snap_test/stories",
            stitch_highlights=True,
        )
        count = scraper.download(**options)

        print(f"\nDownloaded {count} files")

        # Show files
        import os
        for root, dirs, files in os.walk("/tmp/snap_test"):
            for name in files:
                path = os.path.join(root, name)
                size = os.path.getsize(path) / 1024
                print(f" {path}: {size:.1f}KB")

    print(banner)
    print("TEST COMPLETE")
    print(banner)


if __name__ == "__main__":
    test_scraper()
|
||||
391
modules/taddy_client.py
Normal file
391
modules/taddy_client.py
Normal file
@@ -0,0 +1,391 @@
|
||||
"""Taddy Podcast API client for finding podcast appearances"""
|
||||
import asyncio
|
||||
import re
|
||||
from html import unescape
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Optional
|
||||
from web.backend.core.http_client import http_client
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('Taddy')
|
||||
|
||||
|
||||
def strip_html(text: str) -> str:
    """Return *text* as plain text.

    Tags are replaced by spaces, HTML entities are decoded, and runs of
    whitespace are collapsed to single spaces with the ends trimmed.
    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if not text:
        return text

    without_tags = re.sub(r'<[^>]+>', ' ', text)
    decoded = unescape(without_tags)
    return re.sub(r'\s+', ' ', decoded).strip()
|
||||
|
||||
|
||||
class TaddyClient:
    """Client for interacting with the Taddy Podcast API (GraphQL)

    Supports primary and fallback accounts for quota management.
    When the primary account fails (500 error / quota exceeded),
    automatically switches to the fallback account.
    """

    BASE_URL = "https://api.taddy.org"

    # Substrings of a podcast name that reject an episode in the title-only
    # path (i.e. when the persons metadata did not identify the celebrity).
    _GARBAGE_PODCAST_NAMES = (
        'news', 'gossip', 'rumor', 'daily', 'trending', 'tmz', 'variety',
        'march madness', 'cruz show', 'aesthetic arrest', 'devious maids',
    )

    def __init__(self, user_id: str, api_key: str,
                 user_id_2: str = None, api_key_2: str = None):
        """Store credentials and build the request headers.

        Args:
            user_id: Primary account user ID (sent as ``X-USER-ID``).
            api_key: Primary account API key (sent as ``X-API-KEY``).
            user_id_2: Optional fallback account user ID.
            api_key_2: Optional fallback account API key.
        """
        # Primary account
        self.user_id = user_id
        self.api_key = api_key

        # Fallback account (optional); usable only when both parts are set
        self.user_id_2 = user_id_2
        self.api_key_2 = api_key_2
        self.has_fallback = bool(user_id_2 and api_key_2)

        # Track which account is active
        self.using_fallback = False

        self._update_headers()

    def _update_headers(self):
        """Update headers based on current active account"""
        if self.using_fallback and self.has_fallback:
            user_id, api_key = self.user_id_2, self.api_key_2
        else:
            user_id, api_key = self.user_id, self.api_key
        self.headers = {
            "Content-Type": "application/json",
            "X-USER-ID": user_id,
            "X-API-KEY": api_key
        }

    def _switch_to_fallback(self) -> bool:
        """Switch to fallback account if available. Returns True if switched.

        Nothing in this class switches back: once on the fallback account,
        later requests keep using it.
        """
        if self.has_fallback and not self.using_fallback:
            self.using_fallback = True
            self._update_headers()
            logger.info("Switched to fallback Taddy account")
            return True
        return False

    async def _graphql_query(self, query: str, variables: Dict = None, retry_on_fallback: bool = True) -> Optional[Dict]:
        """Execute a GraphQL query against the Taddy API

        If the primary account fails with a 500 error (quota exceeded),
        automatically retries with the fallback account if configured.

        Args:
            query: GraphQL document.
            variables: Optional GraphQL variables.
            retry_on_fallback: Allow one retry on the fallback account.

        Returns:
            The response's ``data`` member, or None when the response
            carries ``errors`` or the request fails (both are logged).
        """
        try:
            payload = {"query": query}
            if variables:
                payload["variables"] = variables

            response = await http_client.post(
                self.BASE_URL,
                json=payload,
                headers=self.headers
            )

            data = response.json()

            if "errors" in data:
                logger.error(f"Taddy API error: {data['errors']}")
                return None

            return data.get("data")

        except Exception as e:
            error_str = str(e).lower()
            # Check for 500 error (quota exceeded) - http_client raises ServiceError
            if "500" in error_str or "server error" in error_str:
                account_type = "fallback" if self.using_fallback else "primary"
                logger.warning(f"Taddy API returned 500 on {account_type} account (likely quota exceeded)")

                # Try fallback if available and we haven't already
                if retry_on_fallback and self._switch_to_fallback():
                    logger.info("Retrying with fallback Taddy account...")
                    return await self._graphql_query(query, variables, retry_on_fallback=False)

            logger.error(f"Taddy API request failed: {e}")
            return None

    @staticmethod
    def _credit_type_from_role(role: str) -> str:
        """Map a lower-cased Taddy person role to a credit type."""
        if "host" in role:
            return "host"
        if "guest" in role:
            return "guest"
        # Use whatever role they have; default to guest if none is given
        return role or "guest"

    @staticmethod
    def _match_person(persons: List[Dict], name_lower: str):
        """Find the first ``persons`` entry that refers to the celebrity.

        An entry matches when one name contains the other, or, for
        multi-word celebrity names whose last word has at least 4
        characters, when the entry's name contains the first or last word.

        Args:
            persons: ``persons`` list of an episode.
            name_lower: Lower-cased celebrity name.

        Returns:
            ``(person, credit_type)`` for the first match, else
            ``(None, None)``.
        """
        name_parts = name_lower.split()
        for person in persons:
            person_name = (person.get("name") or "").lower()
            if not person_name:
                # A blank name is a substring of every name and would
                # otherwise match any celebrity.
                continue

            matched = name_lower in person_name or person_name in name_lower
            if not matched and len(name_parts) >= 2:
                # Also check by last/first name for partial matches
                last_name = name_parts[-1]
                first_name = name_parts[0]
                matched = len(last_name) >= 4 and (
                    last_name in person_name or first_name in person_name
                )

            if matched:
                role = (person.get("role") or "").lower()
                return person, TaddyClient._credit_type_from_role(role)
        return None, None

    @staticmethod
    def _build_interview_patterns(name_lower: str) -> list:
        """Compile the title patterns that indicate an interview/guest spot."""
        name = re.escape(name_lower)
        sources = [
            # Direct interview indicators
            rf'(interview|interviews|interviewing)\s+(with\s+)?{name}',
            rf'{name}\s+(interview|interviewed)',
            # Guest indicators
            rf'(guest|featuring|feat\.?|ft\.?|with guest|special guest)[:\s]+{name}',
            rf'{name}\s+(joins|joined|stops by|sits down|talks|speaks|discusses|shares|reveals|opens up|gets real|gets honest)',
            # "Name on Topic" format (common interview title)
            rf'^{name}\s+on\s+',
            # Episode number + name format ("Ep 123: Name...")
            rf'^(ep\.?|episode|#)\s*\d+[:\s]+{name}',
            # Name at start followed by colon or dash (interview format)
            rf'^{name}\s*[:\-–—]\s*',
            # "Conversation with Name"
            rf'(conversation|chat|talk|talking|speaking)\s+with\s+{name}',
            # "Name Returns" / "Name is Back"
            rf'{name}\s+(returns|is back|comes back)',
            # Q&A format
            rf'(q&a|q\s*&\s*a|ama)\s+(with\s+)?{name}',
            # Podcast-specific patterns
            rf'{name}\s+(live|in studio|in the studio|on the show|on the pod)',
        ]
        return [re.compile(src) for src in sources]

    async def search_podcast_appearances(
        self,
        celebrity_name: str,
        lookback_days: int = 730,  # 2 years
        lookahead_days: int = 30,
        limit: int = 25,
        max_pages: int = 10
    ) -> List[Dict]:
        """
        Search for podcast episodes featuring a celebrity.

        An episode is accepted when the celebrity appears in its persons
        metadata, when the podcast series title names them (treated as
        host), or when the episode title matches an interview pattern.

        Args:
            celebrity_name: Name of the celebrity to search for
            lookback_days: How many days back to search
            lookahead_days: How many days forward to search (for scheduled
                releases). Currently not used by the query.
            limit: Maximum results per page
            max_pages: Maximum number of result pages to request

        Returns:
            List of podcast appearance dicts
        """
        appearances = []

        # Calculate date range
        now = datetime.now()
        start_date = now - timedelta(days=lookback_days)
        # Convert to Unix timestamp (seconds)
        start_timestamp = int(start_date.timestamp())

        query = """
        query SearchPodcastEpisodes($term: String!, $limitPerPage: Int, $page: Int, $filterForPublishedAfter: Int) {
            search(
                term: $term,
                filterForTypes: PODCASTEPISODE,
                matchBy: EXACT_PHRASE,
                limitPerPage: $limitPerPage,
                page: $page,
                filterForPublishedAfter: $filterForPublishedAfter
            ) {
                searchId
                podcastEpisodes {
                    uuid
                    name
                    description
                    datePublished
                    audioUrl
                    persons {
                        uuid
                        name
                        role
                    }
                    podcastSeries {
                        uuid
                        name
                        imageUrl
                    }
                    websiteUrl
                }
            }
        }
        """

        # Paginate through results (max 20 pages API limit, 25 per page = 500 max)
        # max_pages passed as parameter from config
        all_episodes = []

        for page in range(1, max_pages + 1):
            variables = {
                "term": celebrity_name,
                "limitPerPage": limit,
                "page": page,
                "filterForPublishedAfter": start_timestamp
            }

            data = await self._graphql_query(query, variables)

            if not data or not data.get("search"):
                break

            episodes = data["search"].get("podcastEpisodes", [])
            if not episodes:
                break  # No more results

            all_episodes.extend(episodes)

            # If we got fewer than limit, we've reached the end
            if len(episodes) < limit:
                break

            # Small delay between pages
            await asyncio.sleep(0.2)

        # Matching inputs depend only on the celebrity, so build them once
        name_lower = celebrity_name.lower()
        name_parts = name_lower.split()
        interview_patterns = self._build_interview_patterns(name_lower)

        for ep in all_episodes:
            try:
                # Parse the episode data; a null series becomes an empty dict
                podcast_series = ep.get("podcastSeries") or {}
                ep_name = (ep.get("name") or "")
                podcast_name = (podcast_series.get("name") or "")

                # ===== USE PERSONS METADATA FOR ACCURATE FILTERING =====
                # Check if celebrity is listed in the persons array with a role
                persons = ep.get("persons", []) or []
                person_match, credit_type = self._match_person(persons, name_lower)

                # If person is in the persons list, include the episode
                if person_match:
                    logger.debug(f"Accepting '{ep_name}' - {celebrity_name} listed as {credit_type} in persons metadata")
                    is_host = (credit_type == "host")
                else:
                    # Fallback: check if they're the host via podcast series name
                    podcast_name_lower = podcast_name.lower()
                    is_host = name_lower in podcast_name_lower
                    if not is_host and len(name_parts) >= 2:
                        last_name = name_parts[-1]
                        first_name = name_parts[0]
                        if len(last_name) >= 4:
                            is_host = (f"with {last_name}" in podcast_name_lower or
                                       f"with {first_name}" in podcast_name_lower or
                                       f"{first_name} {last_name}" in podcast_name_lower)

                    if is_host:
                        credit_type = "host"
                        logger.debug(f"Accepting '{ep_name}' - host podcast (name in series title)")
                    else:
                        # No persons metadata - use WHITELIST approach
                        # Only accept if title clearly indicates an interview/guest appearance
                        ep_name_lower = ep_name.lower()
                        if name_lower not in ep_name_lower:
                            logger.debug(f"Skipping '{ep_name}' - name not in title")
                            continue

                        # Check podcast name for news/gossip shows first
                        if any(word in podcast_name_lower for word in self._GARBAGE_PODCAST_NAMES):
                            logger.debug(f"Skipping '{ep_name}' - podcast name suggests news/gossip")
                            continue

                        # Reject listicles (multiple comma-separated topics)
                        comma_count = ep_name_lower.count(',')
                        if comma_count >= 3:
                            logger.debug(f"Skipping '{ep_name}' - listicle format ({comma_count} commas)")
                            continue

                        # WHITELIST: Only accept if title matches clear interview patterns
                        if any(p.search(ep_name_lower) for p in interview_patterns):
                            logger.debug(f"Accepting '{ep_name}' - matches interview pattern")
                        else:
                            logger.debug(f"Skipping '{ep_name}' - no interview pattern match (name just mentioned)")
                            continue

                        credit_type = "guest"

                # Get the artwork URL from podcast series
                artwork_url = podcast_series.get("imageUrl")

                # Parse date
                date_published = ep.get("datePublished")
                if date_published:
                    # Taddy returns Unix timestamp in seconds
                    try:
                        pub_date = datetime.fromtimestamp(date_published)
                        appearance_date = pub_date.strftime("%Y-%m-%d")
                        status = "upcoming" if pub_date.date() > now.date() else "aired"
                    except (ValueError, TypeError):
                        appearance_date = None
                        status = "aired"
                else:
                    appearance_date = None
                    status = "aired"

                # Get episode URL
                episode_url = ep.get("websiteUrl")

                appearance = {
                    "appearance_type": "Podcast",
                    "show_name": podcast_series.get("name", "Unknown Podcast"),
                    "episode_title": ep.get("name"),
                    "appearance_date": appearance_date,
                    "status": status,
                    "description": strip_html(ep.get("description")),
                    "poster_url": artwork_url,
                    "audio_url": ep.get("audioUrl"),
                    "url": episode_url,
                    "credit_type": credit_type or ("host" if is_host else "guest"),
                    "character_name": "Self",
                    "taddy_episode_uuid": ep.get("uuid"),
                    "taddy_podcast_uuid": podcast_series.get("uuid"),
                    "duration_seconds": None,  # Duration removed from query to reduce complexity
                }

                appearances.append(appearance)
                logger.info(f"Found podcast appearance: {celebrity_name} on '{podcast_series.get('name')}' - {ep.get('name')}")

            except Exception as e:
                logger.error(f"Error parsing Taddy episode: {e}")
                continue

        return appearances

    async def test_connection(self) -> bool:
        """Test if the API credentials are valid

        Returns:
            True when a minimal search query returns data, else False.
        """
        query = """
        query TestConnection {
            search(term: "test", filterForTypes: PODCASTSERIES, limitPerPage: 1) {
                searchId
            }
        }
        """

        data = await self._graphql_query(query)
        return data is not None
|
||||
295
modules/task_checkpoint.py
Normal file
295
modules/task_checkpoint.py
Normal file
@@ -0,0 +1,295 @@
|
||||
"""
|
||||
Task Checkpoint Module for Crash Recovery
|
||||
|
||||
Tracks progress of long-running scheduler tasks so that if the scheduler
|
||||
crashes mid-task, it can resume from where it left off instead of
|
||||
re-processing everything from scratch.
|
||||
|
||||
Uses the scheduler_state database (PostgreSQL via pgadapter).
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
from contextlib import closing
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Callable, List, Optional, Set
|
||||
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('TaskCheckpoint')
|
||||
|
||||
# Path to the scheduler state database
|
||||
_SCHEDULER_DB_PATH = Path(__file__).parent.parent / 'database' / 'scheduler_state.db'
|
||||
|
||||
# How many items to buffer before flushing to DB
|
||||
_FLUSH_INTERVAL = 5
|
||||
|
||||
# Stale checkpoint threshold (hours) — abandon checkpoints older than this
|
||||
STALE_THRESHOLD_HOURS = 48
|
||||
|
||||
|
||||
class TaskCheckpoint:
|
||||
"""Track progress of a scheduler task for crash recovery.
|
||||
|
||||
Usage::
|
||||
|
||||
checkpoint = TaskCheckpoint('instagram_unified:all')
|
||||
checkpoint.start(total_items=len(accounts))
|
||||
for account in accounts:
|
||||
if checkpoint.is_completed(account['username']):
|
||||
continue
|
||||
checkpoint.set_current(account['username'])
|
||||
process(account)
|
||||
checkpoint.mark_completed(account['username'])
|
||||
checkpoint.finish()
|
||||
"""
|
||||
|
||||
def __init__(self, task_id: str, task_type: str = 'scraping'):
|
||||
self.task_id = task_id
|
||||
self.task_type = task_type
|
||||
self._started = False
|
||||
self._recovering = False
|
||||
self._completed_items: Set[str] = set()
|
||||
self._pending_flush: List[str] = [] # items not yet flushed to DB
|
||||
self._current_item: Optional[str] = None
|
||||
self._total_items: int = 0
|
||||
self._lock = threading.Lock()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def start(self, total_items: int = 0):
    """Begin tracking, resuming a prior checkpoint when one exists.

    When ``_load_existing()`` returns a set of completed items, that set
    is adopted and recovery mode is switched on. Otherwise the state is
    reset and a new record is created via ``_create_record``.

    Args:
        total_items: Number of items this run expects to process.
    """
    self._started = True
    self._total_items = total_items

    previous = self._load_existing()
    if previous is None:
        # Fresh run
        self._recovering = False
        self._completed_items = set()
        self._create_record(total_items)
        return

    # Resuming from a crash
    self._completed_items = previous
    self._recovering = True
    logger.info(
        f"Resuming checkpoint for {self.task_id}: "
        f"{len(self._completed_items)}/{total_items} items already completed",
        module='Checkpoint',
    )
|
||||
|
||||
def is_recovering(self) -> bool:
    """Report whether ``start()`` picked up a checkpoint from an earlier run."""
    recovering = self._recovering
    return recovering
|
||||
|
||||
def is_completed(self, item_id: str) -> bool:
    """Return True when *item_id* is already in the completed set.

    The ID is compared by its string form.
    """
    key = str(item_id)
    return key in self._completed_items
|
||||
|
||||
def get_remaining(self, items: list, key_fn: Callable) -> list:
    """Filter *items* down to those not yet completed, keeping their order.

    Args:
        items: Full list of items.
        key_fn: Function that extracts the item key from each element;
            the key is compared by its string form.

    Returns:
        New list with the items whose key is not in the completed set.
    """
    done = self._completed_items
    remaining = []
    for element in items:
        if str(key_fn(element)) in done:
            continue
        remaining.append(element)
    return remaining
|
||||
|
||||
def set_current(self, item_id: str):
    """Remember the item now being processed (for crash diagnostics).

    Stores the string form of *item_id* and then calls
    ``_update_current_item()``.
    """
    current = str(item_id)
    self._current_item = current
    self._update_current_item()
|
||||
|
||||
def mark_completed(self, item_id: str):
    """Record *item_id* as done; write to the DB once _FLUSH_INTERVAL items are pending."""
    key = str(item_id)
    with self._lock:
        self._completed_items.add(key)
        self._pending_flush.append(key)
        batch_full = len(self._pending_flush) >= _FLUSH_INTERVAL
    # _flush() takes the lock itself, so it must run after the lock is released.
    if not batch_full:
        return
    self._flush()
|
||||
|
||||
def finish(self):
    """Close out a successful task: flush pending items, then delete the checkpoint row.

    Does nothing when start() has not been called (or finish() already ran).
    """
    if self._started:
        self._flush()  # push any still-buffered items first
        self._delete_record()
        self._started = False
|
||||
|
||||
def finish_if_started(self):
    """Call finish() only when start() has been called; otherwise do nothing."""
    if not self._started:
        return
    self.finish()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Class methods for discovery
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@classmethod
def get_interrupted(cls) -> list:
    """List checkpoint rows that are still marked 'running'.

    Returns:
        A list of dicts with keys ``task_id``, ``task_type``, ``started_at``,
        ``completed_count``, ``total_items`` and ``current_item``. An empty
        list is returned on any error; a missing table is not logged.
    """
    query = (
        "SELECT task_id, task_type, started_at, completed_items, "
        "total_items, current_item FROM scheduler_task_checkpoints "
        "WHERE status = 'running'"
    )
    try:
        with closing(sqlite3.connect(str(_SCHEDULER_DB_PATH), timeout=10)) as db:
            cur = db.cursor()
            cur.execute(query)
            leftover = cur.fetchall()

        interrupted = []
        for task_id, task_type, started_at, completed_json, total_items, current_item in leftover:
            done = cls._parse_completed_json(completed_json)
            interrupted.append({
                'task_id': task_id,
                'task_type': task_type,
                'started_at': started_at,
                'completed_count': len(done),
                'total_items': total_items or 0,
                'current_item': current_item,
            })
        return interrupted
    except Exception as err:
        # A missing table is treated as "nothing interrupted" without a warning.
        if 'no such table' not in str(err).lower():
            logger.warning(f"Error reading interrupted checkpoints: {err}", module='Checkpoint')
        return []
|
||||
|
||||
@classmethod
def abandon(cls, task_id: str):
    """Set the checkpoint row for *task_id* to status 'abandoned'.

    Failures are logged as warnings and never raised.
    """
    statement = (
        "UPDATE scheduler_task_checkpoints SET status = 'abandoned', "
        "updated_at = ? WHERE task_id = ?"
    )
    try:
        with closing(sqlite3.connect(str(_SCHEDULER_DB_PATH), timeout=10)) as db:
            db.execute(statement, (datetime.now().isoformat(), task_id))
            db.commit()
    except Exception as err:
        logger.warning(f"Error abandoning checkpoint {task_id}: {err}", module='Checkpoint')
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _load_existing(self) -> Optional[Set[str]]:
    """Fetch the completed-item set of this task's 'running' checkpoint.

    Returns:
        The parsed set of completed item IDs, or None when no running row
        exists or the lookup fails (a missing table is not logged).
    """
    query = (
        "SELECT completed_items FROM scheduler_task_checkpoints "
        "WHERE task_id = ? AND status = 'running'"
    )
    try:
        with closing(sqlite3.connect(str(_SCHEDULER_DB_PATH), timeout=10)) as db:
            cur = db.cursor()
            cur.execute(query, (self.task_id,))
            found = cur.fetchone()
        if found is None:
            return None
        return self._parse_completed_json(found[0])
    except Exception as err:
        if 'no such table' not in str(err).lower():
            logger.warning(f"Error loading checkpoint for {self.task_id}: {err}", module='Checkpoint')
        return None
|
||||
|
||||
def _create_record(self, total_items: int):
    """Write a new 'running' checkpoint row, replacing any row with the same task_id.

    The row starts with an empty completed list and no current item.
    Failures are logged as warnings and never raised.
    """
    statement = (
        "INSERT OR REPLACE INTO scheduler_task_checkpoints "
        "(task_id, task_type, started_at, completed_items, current_item, "
        "total_items, status, updated_at) "
        "VALUES (?, ?, ?, '[]', NULL, ?, 'running', ?)"
    )
    try:
        with closing(sqlite3.connect(str(_SCHEDULER_DB_PATH), timeout=10)) as db:
            values = (
                self.task_id,
                self.task_type,
                datetime.now().isoformat(),  # started_at
                total_items,
                datetime.now().isoformat(),  # updated_at
            )
            db.execute(statement, values)
            db.commit()
    except Exception as err:
        logger.warning(f"Error creating checkpoint for {self.task_id}: {err}", module='Checkpoint')
|
||||
|
||||
def _flush(self):
    """Persist the full completed-item set when there are unflushed items.

    The snapshot is taken and the pending list cleared while holding the
    lock; the database write happens after the lock is released. Write
    failures are logged as warnings and never raised.
    """
    with self._lock:
        if not self._pending_flush:
            return
        snapshot = list(self._completed_items)
        self._pending_flush.clear()

    statement = (
        "UPDATE scheduler_task_checkpoints "
        "SET completed_items = ?, total_items = ?, updated_at = ? "
        "WHERE task_id = ?"
    )
    try:
        payload = json.dumps(snapshot)
        with closing(sqlite3.connect(str(_SCHEDULER_DB_PATH), timeout=10)) as db:
            values = (payload, self._total_items, datetime.now().isoformat(), self.task_id)
            db.execute(statement, values)
            db.commit()
    except Exception as err:
        logger.warning(f"Error flushing checkpoint for {self.task_id}: {err}", module='Checkpoint')
|
||||
|
||||
def _update_current_item(self):
    """Store ``_current_item`` in the checkpoint row; errors are ignored."""
    statement = (
        "UPDATE scheduler_task_checkpoints "
        "SET current_item = ?, updated_at = ? WHERE task_id = ?"
    )
    try:
        with closing(sqlite3.connect(str(_SCHEDULER_DB_PATH), timeout=10)) as db:
            db.execute(statement, (self._current_item, datetime.now().isoformat(), self.task_id))
            db.commit()
    except Exception:
        # Non-critical: this column exists only for diagnostics.
        pass
|
||||
|
||||
def _delete_record(self):
    """Delete this task's checkpoint row; failures are logged, not raised."""
    statement = "DELETE FROM scheduler_task_checkpoints WHERE task_id = ?"
    try:
        with closing(sqlite3.connect(str(_SCHEDULER_DB_PATH), timeout=10)) as db:
            db.execute(statement, (self.task_id,))
            db.commit()
    except Exception as err:
        logger.warning(f"Error deleting checkpoint for {self.task_id}: {err}", module='Checkpoint')
|
||||
|
||||
@staticmethod
|
||||
def _parse_completed_json(raw: str) -> Set[str]:
|
||||
"""Parse JSON array of completed item IDs, tolerating corruption."""
|
||||
if not raw:
|
||||
return set()
|
||||
try:
|
||||
items = json.loads(raw)
|
||||
if isinstance(items, list):
|
||||
return set(str(i) for i in items)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
logger.warning("Corrupted checkpoint data — starting fresh (scrapers deduplicate)", module='Checkpoint')
|
||||
return set()
|
||||
639
modules/thumbnail_cache_builder.py
Executable file
639
modules/thumbnail_cache_builder.py
Executable file
@@ -0,0 +1,639 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Background worker to pre-generate thumbnails and cache metadata for all media files.
|
||||
This improves performance by generating thumbnails in advance rather than on-demand.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from PIL import Image
|
||||
import io
|
||||
|
||||
# Add parent directory to path so we can import modules
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
# Bootstrap database backend (must be before any database imports)
|
||||
import modules.db_bootstrap # noqa: E402,F401
|
||||
|
||||
import sqlite3
|
||||
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
logger = get_logger('ThumbnailCacheBuilder')
|
||||
|
||||
|
||||
class ThumbnailCacheBuilder:
|
||||
"""Build and maintain thumbnail and metadata cache for media files"""
|
||||
|
||||
def __init__(self):
|
||||
self.scan_dirs = [
|
||||
Path('/opt/immich/md'),
|
||||
Path('/opt/immich/review'),
|
||||
Path('/opt/immich/recycle')
|
||||
]
|
||||
self.db_path = Path(__file__).parent.parent / 'database' / 'thumbnails.db'
|
||||
self.metadata_db_path = Path(__file__).parent.parent / 'database' / 'media_metadata.db'
|
||||
self.unified_db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
|
||||
self.max_thumb_size = (300, 300)
|
||||
|
||||
# Image and video extensions
|
||||
self.image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp'}
|
||||
self.video_extensions = {'.mp4', '.mov', '.webm', '.avi', '.mkv', '.flv', '.m4v'}
|
||||
|
||||
self.stats = {
|
||||
'processed': 0,
|
||||
'thumbnails_created': 0,
|
||||
'thumbnails_cached': 0,
|
||||
'metadata_cached': 0,
|
||||
'errors': 0,
|
||||
'skipped': 0
|
||||
}
|
||||
|
||||
self._init_metadata_db()
|
||||
|
||||
def _init_metadata_db(self):
    """Create the metadata cache database, table and index if they are missing.

    The parent directory is created on demand. The connection is closed in
    a ``finally`` block so that a failing statement does not leak it.

    Raises:
        Whatever the database driver raises when the schema cannot be created.
    """
    self.metadata_db_path.parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
    try:
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute("""
            CREATE TABLE IF NOT EXISTS media_metadata (
                file_hash TEXT PRIMARY KEY,
                file_path TEXT NOT NULL,
                width INTEGER,
                height INTEGER,
                file_size INTEGER,
                duration REAL,
                format TEXT,
                created_at TEXT,
                file_mtime DOUBLE PRECISION
            )
        """)
        conn.execute("CREATE INDEX IF NOT EXISTS idx_meta_file_path ON media_metadata(file_path)")
        conn.commit()
    finally:
        conn.close()

    logger.info(f"Metadata database initialized at {self.metadata_db_path}", module="Database")
|
||||
|
||||
def _get_file_hash(self, file_path: Path, content_hash: str = None) -> str:
|
||||
"""Generate hash for file path or use content hash
|
||||
|
||||
Args:
|
||||
file_path: Path to the file
|
||||
content_hash: Optional SHA256 content hash from database (preferred for recycle bin)
|
||||
"""
|
||||
if content_hash:
|
||||
# Use first 64 chars of content hash (full SHA256 for cache key)
|
||||
return content_hash[:64]
|
||||
# Fall back to path-based hash
|
||||
return hashlib.sha256(str(file_path).encode()).hexdigest()
|
||||
|
||||
def _generate_image_thumbnail(self, file_path: Path) -> tuple:
    """Build a JPEG thumbnail for an image and report its original properties.

    Returns:
        ``(thumbnail_data, width, height, format)`` where width/height are
        the original dimensions, or four ``None`` values when anything fails.
    """
    try:
        with Image.open(file_path) as source:
            # Capture the original properties before any conversion.
            width, height = source.size
            img_format = source.format

            if source.mode == 'RGBA':
                # Flatten transparency onto white using the alpha band as mask.
                canvas = Image.new('RGB', source.size, (255, 255, 255))
                canvas.paste(source, mask=source.split()[3])
                thumb = canvas
            elif source.mode != 'RGB':
                thumb = source.convert('RGB')
            else:
                thumb = source

            # Shrink in place, bounded by max_thumb_size.
            thumb.thumbnail(self.max_thumb_size, Image.Resampling.LANCZOS)

            encoded = io.BytesIO()
            thumb.save(encoded, format='JPEG', quality=85, optimize=True)
            return encoded.getvalue(), width, height, img_format
    except Exception as e:
        logger.error(f"Error generating image thumbnail for {file_path}: {e}", module="Error")
        return None, None, None, None
|
||||
|
||||
def _generate_video_thumbnail(self, file_path: Path) -> tuple:
    """Probe a video with ffprobe and grab one frame with ffmpeg as a thumbnail.

    The frame is written to a unique temporary file that is always removed
    afterwards. The previous pid-based name in /tmp was predictable and
    could leave a stale frame behind that a later call would return for a
    different video.

    Args:
        file_path: Path to the video file.

    Returns:
        ``(thumbnail_data, width, height, duration)``. ``thumbnail_data``
        is None when no frame could be extracted; the other values are
        still returned whenever the probe succeeded. All four are None
        when the probe fails or no video stream is found.
    """
    import json
    import subprocess
    import tempfile

    width = height = duration = None
    temp_output = None
    try:
        # Get video metadata using ffprobe
        probe_cmd = [
            'ffprobe',
            '-v', 'quiet',
            '-print_format', 'json',
            '-show_format',
            '-show_streams',
            str(file_path)
        ]

        result = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=30)
        if result.returncode != 0:
            logger.error(f"ffprobe failed for {file_path}", module="Error")
            return None, None, None, None

        metadata = json.loads(result.stdout)

        # Extract video stream info
        video_stream = next((s for s in metadata.get('streams', []) if s.get('codec_type') == 'video'), None)
        if not video_stream:
            return None, None, None, None

        width = video_stream.get('width')
        height = video_stream.get('height')
        # A non-numeric duration (e.g. "N/A") must not discard width/height.
        try:
            duration = float(metadata.get('format', {}).get('duration', 0))
        except (TypeError, ValueError):
            duration = 0.0

        # Unique temp file; mkstemp creates it empty and ffmpeg overwrites it (-y).
        fd, temp_name = tempfile.mkstemp(prefix='thumb_', suffix='.jpg')
        os.close(fd)
        temp_output = Path(temp_name)

        # Seek to 1s, or to 0s for very short videos
        seek_time = '00:00:01' if duration > 1.5 else '00:00:00'

        thumb_cmd = [
            'ffmpeg',
            '-ss', seek_time,
            '-i', str(file_path),
            '-vframes', '1',
            '-vf', f'scale={self.max_thumb_size[0]}:{self.max_thumb_size[1]}:force_original_aspect_ratio=decrease',
            '-y',
            str(temp_output)
        ]

        result = subprocess.run(thumb_cmd, capture_output=True, timeout=30)
        # An empty file means ffmpeg wrote no frame.
        if result.returncode != 0 or not temp_output.exists() or temp_output.stat().st_size == 0:
            logger.error(f"ffmpeg thumbnail generation failed for {file_path}", module="Error")
            return None, width, height, duration

        return temp_output.read_bytes(), width, height, duration

    except Exception as e:
        logger.error(f"Error generating video thumbnail for {file_path}: {e}", module="Error")
        # Metadata probed before the failure is kept (all None if the probe itself failed).
        return None, width, height, duration
    finally:
        # Remove the temp file on every path, including failures.
        if temp_output is not None:
            temp_output.unlink(missing_ok=True)
|
||||
|
||||
def _cache_thumbnail(self, file_path: Path, thumbnail_data: bytes, content_hash: str = None):
|
||||
"""Store thumbnail in cache database
|
||||
|
||||
Args:
|
||||
file_path: Path to the file
|
||||
thumbnail_data: JPEG thumbnail data
|
||||
content_hash: Optional SHA256 content hash from database
|
||||
"""
|
||||
try:
|
||||
file_hash = self._get_file_hash(file_path, content_hash)
|
||||
file_mtime = file_path.stat().st_mtime
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path), timeout=30.0)
|
||||
conn.execute('PRAGMA journal_mode=WAL')
|
||||
conn.execute("""
|
||||
INSERT OR REPLACE INTO thumbnails
|
||||
(file_hash, file_path, thumbnail_data, created_at, file_mtime)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
""", (file_hash, str(file_path), thumbnail_data, datetime.now().isoformat(), file_mtime))
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Error caching thumbnail for {file_path}: {e}", module="Error")
|
||||
return False
|
||||
|
||||
def _cache_metadata(self, file_path: Path, width: int, height: int, duration: float = None, format_type: str = None, content_hash: str = None):
|
||||
"""Store metadata in cache database
|
||||
|
||||
Args:
|
||||
file_path: Path to the file
|
||||
width: Image/video width
|
||||
height: Image/video height
|
||||
duration: Video duration (seconds)
|
||||
format_type: Media format
|
||||
content_hash: Optional SHA256 content hash from database
|
||||
"""
|
||||
try:
|
||||
file_hash = self._get_file_hash(file_path, content_hash)
|
||||
file_mtime = file_path.stat().st_mtime
|
||||
file_size = file_path.stat().st_size
|
||||
|
||||
conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
|
||||
conn.execute('PRAGMA journal_mode=WAL')
|
||||
conn.execute("""
|
||||
INSERT OR REPLACE INTO media_metadata
|
||||
(file_hash, file_path, width, height, file_size, duration, format, created_at, file_mtime)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (file_hash, str(file_path), width, height, file_size, duration, format_type,
|
||||
datetime.now().isoformat(), file_mtime))
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Error caching metadata for {file_path}: {e}", module="Error")
|
||||
return False
|
||||
|
||||
def _is_cached_valid(self, file_path: Path, content_hash: str = None) -> bool:
|
||||
"""Check if file already has valid cached thumbnail and metadata
|
||||
|
||||
Args:
|
||||
file_path: Path to the file
|
||||
content_hash: Optional SHA256 content hash from database
|
||||
"""
|
||||
try:
|
||||
file_hash = self._get_file_hash(file_path, content_hash)
|
||||
file_mtime = file_path.stat().st_mtime
|
||||
|
||||
# Check thumbnail cache
|
||||
conn = sqlite3.connect(str(self.db_path), timeout=30.0)
|
||||
conn.execute('PRAGMA journal_mode=WAL')
|
||||
cursor = conn.execute(
|
||||
"SELECT file_mtime FROM thumbnails WHERE file_hash = ?",
|
||||
(file_hash,)
|
||||
)
|
||||
thumb_result = cursor.fetchone()
|
||||
conn.close()
|
||||
|
||||
if not thumb_result or abs(thumb_result[0] - file_mtime) > 1:
|
||||
return False
|
||||
|
||||
# Check metadata cache
|
||||
conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
|
||||
conn.execute('PRAGMA journal_mode=WAL')
|
||||
cursor = conn.execute(
|
||||
"SELECT file_mtime FROM media_metadata WHERE file_hash = ?",
|
||||
(file_hash,)
|
||||
)
|
||||
meta_result = cursor.fetchone()
|
||||
conn.close()
|
||||
|
||||
if not meta_result or abs(meta_result[0] - file_mtime) > 1:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking cache for {file_path}: {e}", module="Error")
|
||||
return False
|
||||
|
||||
def process_file(self, file_path: Path, content_hash: str = None) -> bool:
    """Generate and cache the thumbnail and metadata for one file.

    Missing files and files with a valid cache entry are counted as
    skipped. Files whose extension is neither image nor video are ignored.

    Args:
        file_path: Path to the file.
        content_hash: Optional SHA256 content hash used as the cache key.

    Returns:
        True when the file was skipped, ignored or processed; False when
        generation failed or an exception occurred (``errors`` is incremented).
    """
    try:
        # Nothing to do for vanished files or up-to-date cache entries.
        if not file_path.exists() or self._is_cached_valid(file_path, content_hash):
            self.stats['skipped'] += 1
            return True

        suffix = file_path.suffix.lower()

        if suffix in self.image_extensions:
            thumb, width, height, format_type = self._generate_image_thumbnail(file_path)

            # Images need thumbnail and dimensions together to count as success.
            if not (thumb and width and height):
                self.stats['errors'] += 1
                return False

            if self._cache_thumbnail(file_path, thumb, content_hash):
                self.stats['thumbnails_created'] += 1
            if self._cache_metadata(file_path, width, height, format_type=format_type, content_hash=content_hash):
                self.stats['metadata_cached'] += 1
            return True

        if suffix in self.video_extensions:
            thumb, width, height, duration = self._generate_video_thumbnail(file_path)

            # Store the thumbnail whenever one was produced.
            if thumb and self._cache_thumbnail(file_path, thumb, content_hash):
                self.stats['thumbnails_created'] += 1

            # Videos succeed on dimensions alone, even without a thumbnail.
            if not (width and height):
                self.stats['errors'] += 1
                return False

            if self._cache_metadata(file_path, width, height, duration=duration, format_type='video', content_hash=content_hash):
                self.stats['metadata_cached'] += 1
            return True

        # Not a recognised media extension.
        return True

    except Exception as e:
        logger.error(f"Error processing file {file_path}: {e}", module="Error")
        self.stats['errors'] += 1
        return False
|
||||
|
||||
def _get_files_from_inventory(self) -> list:
|
||||
"""Query file_inventory table for all media files (database-first)
|
||||
Returns: List of tuples (file_path, content_hash or None)
|
||||
"""
|
||||
try:
|
||||
conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
|
||||
conn.row_factory = sqlite3.Row
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Query all files from file_inventory (any location: final, review, recycle)
|
||||
# Include file_hash from recycle_bin if file is in recycle location
|
||||
cursor.execute("""
|
||||
SELECT
|
||||
fi.file_path,
|
||||
fi.content_type,
|
||||
fi.location,
|
||||
rb.file_hash as content_hash
|
||||
FROM file_inventory fi
|
||||
LEFT JOIN recycle_bin rb ON fi.file_path = rb.recycle_path
|
||||
ORDER BY fi.created_date DESC
|
||||
""")
|
||||
|
||||
rows = cursor.fetchall()
|
||||
conn.close()
|
||||
|
||||
# Convert to Path objects and filter by extension
|
||||
all_extensions = list(self.image_extensions) + list(self.video_extensions)
|
||||
files = []
|
||||
|
||||
for row in rows:
|
||||
file_path = Path(row['file_path'])
|
||||
if file_path.suffix.lower() in all_extensions and file_path.exists():
|
||||
# Return tuple: (file_path, content_hash or None)
|
||||
content_hash = row['content_hash'] if row['content_hash'] else None
|
||||
files.append((file_path, content_hash))
|
||||
|
||||
return files
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error querying file_inventory: {e}", module="Error")
|
||||
# Fallback to filesystem scan if database query fails
|
||||
logger.warning("Falling back to filesystem scan...", module="Warning")
|
||||
return self._fallback_filesystem_scan()
|
||||
|
||||
def _fallback_filesystem_scan(self) -> list:
|
||||
"""Fallback: Scan filesystem if database query fails
|
||||
Returns: List of tuples (file_path, None) - no content_hash available from filesystem
|
||||
"""
|
||||
all_files = []
|
||||
for scan_dir in self.scan_dirs:
|
||||
if not scan_dir.exists():
|
||||
continue
|
||||
for ext in list(self.image_extensions) + list(self.video_extensions):
|
||||
# Return tuples: (file_path, None) - no content hash from filesystem scan
|
||||
all_files.extend([(f, None) for f in scan_dir.rglob(f"*{ext}")])
|
||||
return all_files
|
||||
|
||||
def scan_and_process(self):
    """Process every media file from the inventory and log progress and totals.

    Files come from ``_get_files_from_inventory()``; each is passed to
    ``process_file()`` and counted in ``stats['processed']``. Progress is
    logged every 100 files and at the end. The final average-rate line
    guards against a zero elapsed time, as the in-loop rate already does.
    """
    logger.info("Starting thumbnail and metadata cache build...", module="Core")
    logger.info("Querying file_inventory table (database-first architecture)...", module="Core")

    start_time = time.time()

    # List of tuples: (file_path, content_hash or None)
    all_files = self._get_files_from_inventory()

    total_files = len(all_files)
    logger.info(f"Found {total_files} media files to process from file_inventory", module="Core")

    # Count how many have content hashes (from recycle bin)
    files_with_hash = sum(1 for _, content_hash in all_files if content_hash)
    if files_with_hash > 0:
        logger.info(f" - {files_with_hash} files have content hash (from recycle bin - cache survives moves)", module="Core")

    for i, (file_path, content_hash) in enumerate(all_files, 1):
        self.process_file(file_path, content_hash)
        self.stats['processed'] += 1

        # Progress update every 100 files and on the last file
        if i % 100 == 0 or i == total_files:
            elapsed = time.time() - start_time
            rate = i / elapsed if elapsed > 0 else 0
            eta = (total_files - i) / rate if rate > 0 else 0

            logger.info(f"Progress: {i}/{total_files} ({i/total_files*100:.1f}%) - "
                        f"Rate: {rate:.1f} files/sec - ETA: {eta/60:.1f} min", module="Core")

    # Final statistics
    elapsed = time.time() - start_time
    # elapsed can be 0 on coarse clocks or an empty run; avoid ZeroDivisionError.
    average_rate = self.stats['processed'] / elapsed if elapsed > 0 else 0
    logger.info("=" * 60, module="Core")
    logger.info("Thumbnail and Metadata Cache Build Complete", module="Core")
    logger.info("=" * 60, module="Core")
    logger.info(f"Total files processed: {self.stats['processed']}", module="Core")
    logger.info(f"Thumbnails created: {self.stats['thumbnails_created']}", module="Core")
    logger.info(f"Metadata cached: {self.stats['metadata_cached']}", module="Core")
    logger.info(f"Files skipped (already cached): {self.stats['skipped']}", module="Core")
    logger.info(f"Errors: {self.stats['errors']}", module="Core")
    logger.info(f"Total time: {elapsed/60:.1f} minutes", module="Core")
    logger.info(f"Average rate: {average_rate:.1f} files/sec", module="Core")
    logger.info("=" * 60, module="Core")
|
||||
|
||||
def cleanup_orphaned_records(self):
    """Delete database rows that refer to files missing from ``file_inventory``.

    Cleans, in order: ``face_recognition_scans`` and ``downloads`` in the
    unified database, then the ``media_metadata`` and ``thumbnails`` caches.
    Cache cleanup is best effort: a failure there is logged as a warning
    and does not stop the run.

    Changes from the earlier version:
      * cache deletes run in bounded chunks, because one
        ``IN (?, ?, ...)`` with thousands of parameters exceeds SQLite's
        bound-variable limit and the error used to be swallowed silently;
      * the valid-path set is read from ``file_inventory`` once;
      * the thumbnail database is opened through ``self.db_path``, the
        same path the caching methods use;
      * only the thumbnail SELECT tolerates a missing ``file_path``
        column; delete failures are no longer masked by that handler.

    Returns:
        Dict mapping each cleaned table to the number of rows removed.
    """
    logger.info("Starting database cleanup for orphaned records...", module="Cleanup")
    cleanup_stats = {
        'face_recognition_scans': 0,
        'downloads': 0,
        'media_metadata': 0,
        'thumbnail_cache': 0
    }

    # Well below SQLite's historical default of 999 bound parameters per statement.
    chunk_size = 500

    def _delete_paths(cursor, table, paths):
        """Delete rows of *table* whose file_path is in *paths*, in bounded chunks."""
        for offset in range(0, len(paths), chunk_size):
            chunk = paths[offset:offset + chunk_size]
            placeholders = ','.join('?' for _ in chunk)
            cursor.execute(f"DELETE FROM {table} WHERE file_path IN ({placeholders})", chunk)

    conn = None
    meta_conn = None
    thumb_conn = None

    try:
        conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
        cursor = conn.cursor()

        # Find orphaned face_recognition_scans (files not in file_inventory)
        cursor.execute("""
            SELECT COUNT(*) FROM face_recognition_scans frs
            WHERE NOT EXISTS (
                SELECT 1 FROM file_inventory fi WHERE fi.file_path = frs.file_path
            )
        """)
        orphaned_count = cursor.fetchone()[0]

        if orphaned_count > 0:
            cursor.execute("""
                DELETE FROM face_recognition_scans
                WHERE NOT EXISTS (
                    SELECT 1 FROM file_inventory fi WHERE fi.file_path = face_recognition_scans.file_path
                )
            """)
            conn.commit()
            cleanup_stats['face_recognition_scans'] = orphaned_count
            logger.info(f"Removed {orphaned_count} orphaned face_recognition_scans records", module="Cleanup")

        # Clean up downloads for files not in file_inventory
        cursor.execute("""
            SELECT COUNT(*) FROM downloads d
            WHERE d.file_path IS NOT NULL AND d.file_path != ''
            AND NOT EXISTS (
                SELECT 1 FROM file_inventory fi WHERE fi.file_path = d.file_path
            )
        """)
        orphaned_downloads = cursor.fetchone()[0]

        if orphaned_downloads > 0:
            cursor.execute("""
                DELETE FROM downloads
                WHERE file_path IS NOT NULL AND file_path != ''
                AND NOT EXISTS (
                    SELECT 1 FROM file_inventory fi WHERE fi.file_path = downloads.file_path
                )
            """)
            conn.commit()
            cleanup_stats['downloads'] = orphaned_downloads
            logger.info(f"Removed {orphaned_downloads} orphaned downloads records", module="Cleanup")

        # Read the valid paths once; both cache cleanups compare against this set.
        cursor.execute("SELECT file_path FROM file_inventory")
        valid_paths = {row[0] for row in cursor.fetchall()}
        conn.close()

        # Clean up media_metadata cache for files not in file_inventory
        try:
            meta_conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
            meta_cursor = meta_conn.cursor()
            meta_cursor.execute("SELECT file_path FROM media_metadata")
            orphaned_meta = [row[0] for row in meta_cursor.fetchall() if row[0] not in valid_paths]

            if orphaned_meta:
                _delete_paths(meta_cursor, 'media_metadata', orphaned_meta)
                meta_conn.commit()
                cleanup_stats['media_metadata'] = len(orphaned_meta)
                logger.info(f"Removed {len(orphaned_meta)} orphaned media_metadata records", module="Cleanup")

            meta_conn.close()
        except Exception as e:
            # Non-critical, but report it instead of hiding it.
            logger.warning(f"Skipping media_metadata cleanup: {e}", module="Cleanup")

        # Clean up thumbnail cache for files not in file_inventory
        try:
            thumb_conn = sqlite3.connect(str(self.db_path), timeout=30.0)
            thumb_cursor = thumb_conn.cursor()

            try:
                thumb_cursor.execute("SELECT file_path FROM thumbnails WHERE file_path IS NOT NULL")
                all_thumb_paths = [row[0] for row in thumb_cursor.fetchall()]
            except sqlite3.OperationalError:
                # Table structure may not have a file_path column
                all_thumb_paths = []

            orphaned_thumbs = [p for p in all_thumb_paths if p and p not in valid_paths]
            if orphaned_thumbs:
                _delete_paths(thumb_cursor, 'thumbnails', orphaned_thumbs)
                thumb_conn.commit()
                cleanup_stats['thumbnail_cache'] = len(orphaned_thumbs)
                logger.info(f"Removed {len(orphaned_thumbs)} orphaned thumbnail records", module="Cleanup")

            thumb_conn.close()
        except Exception as e:
            # Non-critical, but report it instead of hiding it.
            logger.warning(f"Skipping thumbnail cache cleanup: {e}", module="Cleanup")

        # Log summary
        total_cleaned = sum(cleanup_stats.values())
        logger.info("=" * 60, module="Cleanup")
        logger.info("Database Cleanup Complete", module="Cleanup")
        logger.info("=" * 60, module="Cleanup")
        logger.info(f"Total orphaned records removed: {total_cleaned}", module="Cleanup")
        for table, count in cleanup_stats.items():
            if count > 0:
                logger.info(f" - {table}: {count}", module="Cleanup")
        logger.info("=" * 60, module="Cleanup")

        return cleanup_stats

    except Exception as e:
        logger.error(f"Error during database cleanup: {e}", exc_info=True, module="Error")
        return cleanup_stats
    finally:
        # Ensure all database connections are closed (closing twice is harmless)
        for connection in (conn, meta_conn, thumb_conn):
            if connection:
                try:
                    connection.close()
                except Exception:
                    pass  # Best effort cleanup
|
||||
|
||||
|
||||
def main():
    """Run orphan cleanup, then the cache build.

    Returns:
        0 when both phases finish, 1 when an exception escapes either phase.
    """
    logger.info("Thumbnail Cache Builder starting...", module="Core")

    try:
        builder = ThumbnailCacheBuilder()

        # Cleanup runs first so the cache build starts from a tidy database.
        phases = (
            ("Phase 1: Database cleanup for orphaned records", builder.cleanup_orphaned_records),
            ("Phase 2: Thumbnail and metadata cache building", builder.scan_and_process),
        )
        for banner, run_phase in phases:
            logger.info(banner, module="Core")
            run_phase()

        logger.info("Thumbnail Cache Builder completed successfully", module="Core")
    except Exception as e:
        logger.error(f"Fatal error in Thumbnail Cache Builder: {e}", exc_info=True, module="Error")
        return 1
    return 0
|
||||
|
||||
|
||||
# Script entry point: exit with main()'s return code.
if __name__ == '__main__':
    raise SystemExit(main())
|
||||
102
modules/tiktok_db_adapter.py
Executable file
102
modules/tiktok_db_adapter.py
Executable file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
TikTok Database Adapter for Unified Database
|
||||
Provides compatibility layer between TikTok module and unified database
|
||||
"""
|
||||
|
||||
from typing import Optional, Dict
|
||||
from datetime import datetime
|
||||
import json
|
||||
|
||||
class TikTokDatabaseAdapter:
    """Adapter that lets the TikTok module talk to the unified database.

    Every download is stored under a synthetic URL of the form
    ``https://www.tiktok.com/@<username>/video/<video_id>#<filename>`` so that
    each file of a carousel post gets its own row.
    """

    # Extensions recorded with content_type 'image'; anything else is 'video'.
    _IMAGE_EXTENSIONS = frozenset(
        {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp', '.bmp', '.tiff'}
    )

    # Characters allowed directly after the video id in a stored URL.
    _ID_TERMINATORS = '#?/'

    def __init__(self, unified_db):
        """Initialize adapter with unified database instance"""
        self.unified_db = unified_db
        self.platform = 'tiktok'

    @staticmethod
    def _escape_like(value) -> str:
        """Escape ``\\``, ``%`` and ``_`` so *value* is matched literally by LIKE ... ESCAPE '\\'."""
        return str(value).replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')

    @classmethod
    def _url_has_video_id(cls, url, video_id) -> bool:
        """Return True when *url* contains ``/video/<video_id>`` as a complete segment.

        The id must be followed by the end of the string or by ``#``, ``?`` or
        ``/``, so id ``123`` does not match a URL for id ``1234567``.
        """
        if not url:
            return False
        marker = f"/video/{video_id}"
        start = url.find(marker)
        while start != -1:
            end = start + len(marker)
            if end == len(url) or url[end] in cls._ID_TERMINATORS:
                return True
            start = url.find(marker, start + 1)
        return False

    def get_file_hash(self, file_path: str) -> Optional[str]:
        """Calculate SHA256 hash of a file (delegates to UnifiedDatabase)"""
        return self.unified_db.get_file_hash(file_path)

    def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
        """Get download record by file hash (delegates to UnifiedDatabase)"""
        return self.unified_db.get_download_by_file_hash(file_hash)

    def record_download(self, video_id: str, username: str, filename: str,
                        post_date: Optional[datetime] = None, metadata: Dict = None,
                        file_path: str = None):
        """Record a TikTok download in the unified database.

        Args:
            video_id: TikTok post id.
            username: Account name; stored as the record's ``source``.
            filename: File name; appended to the URL as a fragment and used
                to derive the content type from its extension.
            post_date: Publication time of the post, if known.
            metadata: Extra metadata passed through unchanged.
            file_path: Location of the file; when it exists its hash is stored.

        Returns:
            Whatever ``unified_db.record_download`` returns.
        """
        from pathlib import Path

        # For carousel photos, the filename fragment keeps the URL (and therefore
        # the url_hash) unique per file.
        url = f"https://www.tiktok.com/@{username}/video/{video_id}#{filename}"

        file_hash = None
        if file_path:
            try:
                if Path(file_path).exists():
                    file_hash = self.unified_db.get_file_hash(file_path)
            except Exception:
                # Deliberately best effort: a record without a hash is still useful.
                file_hash = None

        ext = Path(filename).suffix.lower()
        content_type = 'image' if ext in self._IMAGE_EXTENSIONS else 'video'

        return self.unified_db.record_download(
            url=url,
            platform=self.platform,
            source=username,
            content_type=content_type,
            filename=filename,
            post_date=post_date,
            metadata=metadata,
            file_hash=file_hash,
            file_path=file_path
        )

    def is_downloaded(self, video_id: str, username: str = None) -> bool:
        """Check whether ANY file of the given post has been recorded.

        Carousel URLs carry a ``#filename`` fragment, so rows are matched by
        URL pattern. The pattern is anchored right after the id and LIKE
        wildcards inside the id/username are escaped, which avoids false
        positives for ids sharing a prefix or usernames containing ``_``.

        Args:
            video_id: TikTok post id.
            username: When given, only URLs of this account are considered.

        Returns:
            True if a matching row exists; False otherwise, including when
            the database lookup raises (best effort).
        """
        safe_id = self._escape_like(video_id)
        if username:
            prefix = f"https://www.tiktok.com/@{self._escape_like(username)}/video/{safe_id}"
        else:
            prefix = f"%/video/{safe_id}"

        # Exact end of URL, or the id followed by one of the allowed terminators.
        patterns = [prefix] + [f"{prefix}{mark}%" for mark in self._ID_TERMINATORS]
        where = " OR ".join("url LIKE ? ESCAPE '\\'" for _ in patterns)

        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute(
                    f"SELECT 1 FROM downloads WHERE platform = ? AND ({where}) LIMIT 1",
                    (self.platform, *patterns)
                )
                return cursor.fetchone() is not None
        except Exception:
            return False

    def is_already_downloaded(self, video_id: str) -> bool:
        """Check if a video has already been downloaded (alias for compatibility)"""
        return self.is_downloaded(video_id)

    def get_download_info(self, video_id: str) -> Optional[Dict]:
        """Return the first recorded download whose URL belongs to *video_id*.

        This is a simplified lookup: only the rows returned by
        ``unified_db.get_downloads(platform=..., limit=1000)`` are searched,
        so a record outside that window is reported as missing.
        """
        results = self.unified_db.get_downloads(platform=self.platform, limit=1000)

        for download in results:
            # Rows with a missing/NULL url are skipped instead of raising.
            if self._url_has_video_id(download.get('url'), video_id):
                return download

        return None

    def cleanup_old_downloads(self, days: int = 180):
        """Clean up old download records"""
        return self.unified_db.cleanup_old_downloads(days=days, platform=self.platform)
|
||||
603
modules/tiktok_module.py
Executable file
603
modules/tiktok_module.py
Executable file
@@ -0,0 +1,603 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
TikTok Download Module - Downloads TikTok videos with proper timestamp extraction
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import subprocess
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from modules.base_module import LoggingMixin
|
||||
|
||||
|
||||
class TikTokDownloader(LoggingMixin):
|
||||
"""Downloads TikTok videos and extracts metadata including timestamps"""
|
||||
|
||||
def __init__(self, base_path: Path = None, log_callback=None, use_database=True, unified_db=None):
    """
    Set up a TikTok downloader backed by the unified database.

    Args:
        base_path: Root directory for downloads; the current working
            directory is used when it is omitted or empty
        log_callback: Optional callback forwarded to the logging mixin
        use_database: Whether duplicate lookups and immediate recording use the database
        unified_db: UnifiedDatabase instance (mandatory despite the None default)

    Raises:
        ValueError: If unified_db is not supplied
    """
    # Logging comes from the mixin and must be ready before anything else logs.
    self._init_logger('TikTok', log_callback, default_module='Download')

    self.base_path = Path.cwd() if not base_path else Path(base_path)
    # filename -> datetime, filled in by the download methods
    self.file_timestamps = {}
    self.use_database = use_database

    # The standalone database is gone; refuse to run without the unified one.
    if not unified_db:
        raise ValueError("TikTok module requires unified_db - standalone database is no longer supported")

    from modules.tiktok_db_adapter import TikTokDatabaseAdapter
    self.db = TikTokDatabaseAdapter(unified_db)
    self.use_unified_db = True

    # Activity status manager provides the real-time progress updates.
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)

    # Downloads whose database recording was deferred by the caller.
    self.pending_downloads = []
|
||||
|
||||
def _is_already_downloaded(self, video_id: str, username: str = None) -> bool:
    """Tell whether the database already knows this post.

    Always False when database tracking is switched off. With a username
    the account-scoped adapter lookup is used, otherwise the id-only
    compatibility lookup.
    """
    if self.use_database:
        adapter = self.db
        if username:
            return adapter.is_downloaded(video_id, username)
        return adapter.is_already_downloaded(video_id)
    return False
|
||||
|
||||
def _record_download(self, video_id: str, username: str, filename: str,
                     post_date: Optional[datetime] = None, metadata: Dict = None,
                     deferred: bool = False):
    """Store a finished download, either immediately or in the pending list.

    Args:
        video_id: TikTok post id
        username: Account the post belongs to
        filename: Path (or bare name) of the downloaded file; the database
                  gets the final path component as ``filename`` and the
                  full string as ``file_path``
        post_date: Publication time of the post, if known
        metadata: Extra metadata stored with the record
        deferred: If True, don't record to database now - append an entry to
                  pending_downloads for later recording after the file move

    Returns:
        True for a deferred entry, None when database tracking is off,
        otherwise the adapter's ``record_download`` result.
    """
    full_path = str(filename)
    base_name = Path(filename).name

    if deferred:
        entry = {
            'video_id': video_id,
            'username': username,
            'filename': base_name,
            # Stored as ISO text so the pending entry is plain data
            'post_date': None if not post_date else post_date.isoformat(),
            'file_path': full_path,
            'metadata': metadata
        }
        self.pending_downloads.append(entry)
        self.log(f"Deferred recording for {video_id}", "debug")
        return True

    if self.use_database:
        return self.db.record_download(
            video_id=video_id,
            username=username,
            filename=base_name,
            post_date=post_date,
            metadata=metadata,
            file_path=full_path
        )
    return
|
||||
|
||||
def get_pending_downloads(self):
    """Return a shallow copy of the deferred-download entries awaiting recording"""
    return list(self.pending_downloads)

def clear_pending_downloads(self):
    """Start a fresh pending list once the deferred entries have been recorded"""
    self.pending_downloads = list()
|
||||
|
||||
def extract_date_from_info(self, info_dict: Dict) -> Optional[datetime]:
    """
    Extract upload date from yt-dlp info dictionary

    The Unix-timestamp fields are tried in order of preference
    (``timestamp``, ``release_timestamp``, ``modified_timestamp``); the
    first one that converts wins. ``upload_date`` (YYYYMMDD) is the last
    resort because it carries no time of day.

    Args:
        info_dict: yt-dlp info dictionary

    Returns:
        A naive datetime, or None when no usable field is present. For the
        timestamp fields the value is the UTC wall-clock time with the
        timezone stripped.
    """
    # NOTE(review): earlier comments here spoke of converting to local time,
    # but the value has always been UTC wall-clock time. Callers that feed
    # it to naive ``datetime.timestamp()`` will treat it as local - confirm
    # which convention is intended before changing either side.
    from datetime import timezone

    # (info key, wording used in the debug log), most precise source first.
    epoch_fields = (
        ('timestamp', 'full timestamp'),
        ('release_timestamp', 'release timestamp'),
        ('modified_timestamp', 'modified timestamp'),
    )
    for key, label in epoch_fields:
        raw = info_dict.get(key)
        if not raw:
            continue
        try:
            moment = datetime.fromtimestamp(raw, tz=timezone.utc).replace(tzinfo=None)
        except (TypeError, ValueError, OverflowError, OSError):
            # Unusable value (wrong type / out of range): try the next field.
            continue
        self.log(f"Extracted {label} (UTC): {moment}", "debug")
        return moment

    # Fall back to upload_date (YYYYMMDD format - only has date, no time)
    upload_date = info_dict.get('upload_date')
    if upload_date:
        day_text = str(upload_date)  # tolerate a numeric 20240102
        if len(day_text) == 8:
            try:
                day = datetime.strptime(day_text, '%Y%m%d')
            except ValueError:
                return None
            self.log(f"Only date available (no time): {day.date()}", "warning")
            return day

    return None
|
||||
|
||||
def download_profile(self,
                    username: str,
                    number_of_days: int = 7,
                    full_profile: bool = False,
                    output_dir: Path = None,
                    defer_database: bool = False) -> Tuple[Dict[str, datetime], List[Path]]:
    """
    Download TikTok profile videos

    Hybrid approach: yt-dlp lists the post ids quickly, then gallery-dl
    downloads each post individually (which also handles photo carousels).
    Afterwards the output directory is tidied, every metadata sidecar is
    turned into a database record (or a pending entry) and the post time is
    applied to the files' access/modification times.

    Args:
        username: TikTok username (a leading @ is stripped)
        number_of_days: Number of days to download (ignored if full_profile=True)
        full_profile: If True, download entire profile
        output_dir: Output directory (uses base_path/username if not specified)
        defer_database: If True, don't record to database immediately - store in
                       pending_downloads for later recording after file move is complete

    Returns:
        Tuple of (file_timestamps dict, list of downloaded files). Both are
        empty when the listing fails or yields no posts.
    """
    self.defer_database = defer_database  # Read back when sidecars are recorded
    username = username.lstrip('@')
    # Path() also accepts a plain string directory from callers.
    output_dir = Path(output_dir) if output_dir else self.base_path / username
    output_dir.mkdir(parents=True, exist_ok=True)

    self.log(f"Downloading TikTok profile: @{username}", "info")
    self.activity_manager.update_status("Checking videos")

    video_ids = self._list_profile_video_ids(username, number_of_days, full_profile)
    if video_ids is None:
        return {}, []
    if not video_ids:
        self.log("No videos found matching criteria", "info")
        return {}, []

    self._fetch_posts(username, video_ids, output_dir)
    self._tidy_download_dir(output_dir)
    downloaded_files, file_timestamps = self._harvest_downloads(username, output_dir)

    self.log(f"Downloaded {len(downloaded_files)} files from @{username}", "info")

    self._apply_file_timestamps(downloaded_files, file_timestamps)

    # Store timestamps for later use
    self.file_timestamps.update(file_timestamps)

    return file_timestamps, downloaded_files

def _list_profile_video_ids(self, username: str, number_of_days: int, full_profile: bool) -> Optional[List[str]]:
    """List the profile's post ids with yt-dlp; None means the listing itself failed."""
    profile_url = f"https://www.tiktok.com/@{username}"
    list_cmd = [
        "yt-dlp",
        "--flat-playlist",  # Don't download, just list
        "--print", "%(upload_date)s %(id)s",  # Print date and ID
        "--quiet",
        "--no-warnings",
        profile_url
    ]

    self.log("Getting video list with yt-dlp...", "debug")

    try:
        result = subprocess.run(list_cmd, capture_output=True, text=True, timeout=60)
    except (OSError, subprocess.SubprocessError) as e:
        self.log(f"Failed to get video list: {e}", "error")
        return None

    if result.returncode != 0 and result.stderr:
        # The listing may still be partially usable, so only warn.
        self.log(f"yt-dlp exited with code {result.returncode}: {result.stderr[:100]}", "warning")

    # Each useful line is "<upload_date> <id>".
    entries = [line.split() for line in result.stdout.splitlines()]
    entries = [parts for parts in entries if len(parts) >= 2]

    if not full_profile and number_of_days:
        from datetime import timedelta
        cutoff_str = (datetime.now() - timedelta(days=number_of_days)).strftime('%Y%m%d')
        # YYYYMMDD strings compare chronologically; only keep posts on/after the cutoff.
        video_ids = [parts[1] for parts in entries if parts[0] >= cutoff_str]
    else:
        # No filter, take all
        video_ids = [parts[1] for parts in entries]

    self.log(f"Found {len(video_ids)} posts to download", "info")
    return video_ids

def _fetch_posts(self, username: str, video_ids: List[str], output_dir: Path) -> None:
    """Download every listed post with gallery-dl, reporting progress and checkpointing."""
    total = len(video_ids)

    # Set initial progress so dashboard shows 0/N immediately
    self.activity_manager.update_status(
        "Downloading videos",
        progress_current=0,
        progress_total=total
    )

    # Crash recovery checkpoint
    from modules.task_checkpoint import TaskCheckpoint
    checkpoint = TaskCheckpoint(f'tiktok:{username}', 'scraping')
    checkpoint.start(total_items=total)
    if checkpoint.is_recovering():
        self.log(f"TikTok @{username}: recovering — skipping already-downloaded videos", "info")

    for i, video_id in enumerate(video_ids, 1):
        # Update progress at start of each iteration (fires even on skips)
        self.activity_manager.update_status(
            "Downloading videos",
            progress_current=i,
            progress_total=total
        )

        # Skip if already completed in a previous crashed run
        if checkpoint.is_completed(video_id):
            continue

        checkpoint.set_current(video_id)

        if self._is_already_downloaded(video_id, username):
            self.log(f"[{i}/{total}] Skipping already downloaded: {video_id}", "debug")
            checkpoint.mark_completed(video_id)
            continue

        video_url = f"https://www.tiktok.com/@{username}/video/{video_id}"
        self.log(f"[{i}/{total}] Downloading {video_id}", "debug")

        cmd = [
            "gallery-dl",
            "--write-metadata",
            "-D", str(output_dir),
            "-f", "{date:%Y%m%d}_{desc}_{id}_{num}.{extension}",
            video_url
        ]

        try:
            self.log(f"Calling gallery-dl for {video_id}", "debug")
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
            self.log(f"gallery-dl returned: code={result.returncode}, stdout lines={len(result.stdout.splitlines()) if result.stdout else 0}", "debug")
            if result.returncode != 0 and result.stderr:
                stderr = result.stderr
                if "not available" in stderr.lower() or "404" in stderr:
                    self.log(f"Video {video_id} not available (deleted or private)", "warning")
                else:
                    self.log(f"Failed to download {video_id}: {stderr[:100]}", "warning")
        except subprocess.TimeoutExpired:
            self.log(f"Timeout downloading {video_id}", "warning")
        except Exception as e:
            self.log(f"Error downloading {video_id}: {e}", "warning")

        # Failed attempts are marked completed as well, so a recovery run
        # does not retry them.
        checkpoint.mark_completed(video_id)

    checkpoint.finish()

def _tidy_download_dir(self, output_dir: Path) -> None:
    """Remove audio-only files and shorten over-long file names (with their JSON sidecars)."""
    audio_suffixes = {'.mp3', '.m4a', '.aac', '.wav', '.ogg'}

    # Snapshot the listing first: entries are deleted and renamed below, and
    # mutating a directory under a lazy glob can skip or repeat entries.
    for file in list(output_dir.glob("*")):
        if not file.is_file() or file.suffix == '.json':
            continue

        if file.suffix.lower() in audio_suffixes:
            self.log(f"Removing audio-only file: {file.name}", "debug")
            file.unlink()
            # Also remove corresponding JSON
            json_file = file.with_suffix(file.suffix + '.json')
            if json_file.exists():
                json_file.unlink()
            continue

        # Truncate long filenames (max 255 chars for Linux, 200 leaves margin)
        if len(file.name) <= 200:
            continue

        # Name layout: YYYYMMDD_description_ID_NUM.ext - split from the right
        # so underscores inside the description do not matter.
        parts = file.name.rsplit('_', 2)
        if len(parts) != 3:
            continue
        date_and_desc, video_id, num_and_ext = parts
        date_part = date_and_desc[:8]  # YYYYMMDD
        desc_part = date_and_desc[9:]  # Everything after "date_"

        fixed_length = len(date_part) + len(video_id) + len(num_and_ext) + 3  # 3 underscores
        max_desc_len = 200 - fixed_length
        if len(desc_part) <= max_desc_len:
            continue

        truncated_desc = desc_part[:max_desc_len-3] + "..."
        new_name = f"{date_part}_{truncated_desc}_{video_id}_{num_and_ext}"
        new_path = file.parent / new_name

        self.log(f"Truncating long filename: {file.name[:50]}... -> {new_name[:50]}...", "debug")
        file.rename(new_path)

        # Rename corresponding JSON file too
        json_file = Path(str(file) + '.json')
        if json_file.exists():
            json_file.rename(Path(str(new_path) + '.json'))

def _harvest_downloads(self, username: str, output_dir: Path) -> Tuple[List[Path], Dict[str, datetime]]:
    """Turn gallery-dl JSON sidecars into records; return (kept files, name -> post time)."""
    downloaded_files = []
    file_timestamps = {}
    processed_ids = set()  # Ids to skip: already in the database or removed as duplicates
    started_ids = set()  # Ids we've started processing in THIS run

    # Snapshot: sidecars are unlinked while we walk them.
    for json_file in list(output_dir.glob("*.json")):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                info = json.load(f)

            video_id = info.get('id', '')

            # Prefer gallery-dl's createTime field.
            timestamp = None
            create_time = info.get('createTime')
            if create_time:
                try:
                    timestamp = datetime.fromtimestamp(int(create_time))
                    self.log(f"Extracted timestamp {timestamp} from createTime", "debug")
                except (TypeError, ValueError, OverflowError, OSError):
                    timestamp = None
            if timestamp is None:
                # Fall back to the yt-dlp style fields when createTime is
                # missing as well as when it cannot be parsed.
                timestamp = self.extract_date_from_info(info)

            # gallery-dl names JSON files as: filename.ext.json
            media_file = Path(str(json_file)[:-5])  # Remove .json extension

            if not media_file.exists():
                self.log(f"Media file not found for {json_file.name}", "warning")
                json_file.unlink()
                continue

            # Check the database only ONCE per video_id per run
            # (don't re-check carousel photos #2, #3 after #1 was processed)
            if video_id and video_id not in started_ids:
                if self._is_already_downloaded(video_id, username):
                    self.log(f"Skipping already downloaded post: {video_id}", "debug")
                    processed_ids.add(video_id)
                    # Just remove JSON file, keep media files (they're already processed)
                    json_file.unlink()
                    continue
                started_ids.add(video_id)

            if video_id in processed_ids:
                json_file.unlink()
                continue

            # Check for duplicate hash before recording (hash blacklist persists even if original deleted)
            file_hash = self.db.get_file_hash(str(media_file)) if self.db else None
            if file_hash:
                existing = self.db.get_download_by_file_hash(file_hash)
                if existing and existing.get('file_path') and str(media_file) != existing.get('file_path'):
                    self.log(f"⚠ Duplicate content detected (hash match): {media_file.name} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                    # Delete the duplicate regardless of whether original file still exists
                    try:
                        media_file.unlink()
                    except OSError as e:
                        # Could not delete: keep and record the file instead.
                        self.log(f"Failed to delete duplicate {media_file.name}: {e}", "warning")
                    else:
                        self.log(f"Deleted duplicate (hash blacklist): {media_file.name}", "debug")
                        # Mark as processed so we don't try to download again
                        processed_ids.add(video_id)
                        json_file.unlink()
                        continue

            # Only files that survive the duplicate check are reported, so a
            # deleted duplicate is neither counted nor returned to the caller.
            downloaded_files.append(media_file)
            if timestamp:
                file_timestamps[media_file.name] = timestamp
                self.log(f"Extracted timestamp {timestamp} for {media_file.name}", "debug")

            # Record in database (each file gets its own entry, even for carousels)
            # NOTE(review): only the bare file name is passed, so the recorded
            # file_path is a name rather than a full path - confirm whether the
            # consumer of pending_downloads relies on that.
            if video_id:
                self._record_download(
                    video_id=video_id,
                    username=username,
                    filename=media_file.name,
                    post_date=timestamp,
                    metadata={"title": info.get('desc', ''), "description": info.get('desc', '')},
                    deferred=self.defer_database
                )

            # Remove JSON file after processing
            json_file.unlink()

        except Exception as e:
            self.log(f"Failed to process {json_file}: {e}", "error")

    return downloaded_files, file_timestamps

def _apply_file_timestamps(self, downloaded_files: List[Path], file_timestamps: Dict[str, datetime]) -> None:
    """Set access and modification time of each file to its extracted post time."""
    for file_path in downloaded_files:
        filename = file_path.name
        if filename not in file_timestamps:
            continue
        timestamp = file_timestamps[filename]
        try:
            # Convert datetime to unix timestamp
            unix_time = timestamp.timestamp()
            # Set both access time and modification time
            os.utime(str(file_path), (unix_time, unix_time))
            self.log(f"Applied timestamp {timestamp} to {filename}", "debug")
        except Exception as e:
            self.log(f"Failed to apply timestamp to {filename}: {e}", "warning")
|
||||
|
||||
def download_video(self, url: str, output_dir: Path = None) -> Tuple[Optional[datetime], Optional[Path]]:
    """
    Download a single TikTok video

    yt-dlp is called twice: once with --dump-json to read the metadata
    (timestamp, formats, id) and once to download. Photo posts, whose
    formats contain no video stream, are skipped.

    Args:
        url: TikTok video URL
        output_dir: Output directory (base_path when omitted)

    Returns:
        Tuple of (timestamp, downloaded file path). The path is None when
        nothing was downloaded or the file cannot be located; both are None
        when the metadata lookup fails or an unexpected error occurs.
    """
    # Path() also accepts a plain string directory from callers.
    output_dir = Path(output_dir) if output_dir else self.base_path
    output_dir.mkdir(parents=True, exist_ok=True)

    self.log(f"Downloading video: {url}", "info")

    # First, get video info without downloading
    cmd_info = [
        "yt-dlp",
        "--dump-json",
        "--no-warnings",
        "--quiet",
        url
    ]

    try:
        result = subprocess.run(cmd_info, capture_output=True, text=True)
        if result.returncode != 0:
            self.log(f"Failed to get video info: {result.stderr}", "error")
            return None, None

        info = json.loads(result.stdout)
        timestamp = self.extract_date_from_info(info)

        # A post whose formats are all audio-only is a photo/image post.
        formats = info.get('formats') or []
        has_video = any(f.get('vcodec') != 'none' for f in formats)
        if formats and not has_video:
            self.log("Skipping TikTok photo post (only videos are downloaded)", "info")
            return timestamp, None

        # Download video
        output_template = str(output_dir / "%(upload_date)s_%(title)s_%(id)s.%(ext)s")
        cmd_download = [
            "yt-dlp",
            "--format", "best",  # Explicitly request best video+audio format
            "--no-warnings",
            "--quiet",
            "-o", output_template,
            url
        ]

        result = subprocess.run(cmd_download, capture_output=True, text=True)
        if result.returncode != 0:
            self.log(f"Failed to download video: {result.stderr}", "error")
            return timestamp, None

        # Rebuild the expected file name from the template. Missing or null
        # metadata fields fall back to a placeholder instead of raising.
        expected_name = output_template
        for field, fallback in (('upload_date', 'unknown'), ('title', 'video'),
                                ('id', ''), ('ext', 'mp4')):
            expected_name = expected_name.replace(f'%({field})s', str(info.get(field) or fallback))

        downloaded_file = Path(expected_name)
        video_id = str(info.get('id') or '')
        if not downloaded_file.exists() and video_id:
            # yt-dlp may sanitise the title, so look the file up by its id.
            # Sidecars and partial downloads are not candidates.
            ignored = {'.json', '.part', '.ytdl'}
            candidates = sorted(
                p for p in output_dir.glob(f"*{video_id}*")
                if p.is_file() and p.suffix.lower() not in ignored
            )
            wanted_suffix = f".{info.get('ext') or 'mp4'}".lower()
            preferred = [p for p in candidates if p.suffix.lower() == wanted_suffix]
            if preferred or candidates:
                downloaded_file = (preferred or candidates)[0]

        if downloaded_file.exists():
            if timestamp:
                self.file_timestamps[downloaded_file.name] = timestamp
            return timestamp, downloaded_file

        return timestamp, None

    except Exception as e:
        self.log(f"Failed to download video: {e}", "error")
        return None, None
|
||||
|
||||
def get_file_timestamps(self) -> Dict[str, datetime]:
    """Return a shallow copy of the filename -> timestamp map collected so far"""
    return dict(self.file_timestamps)

def clear_timestamps(self):
    """Empty the stored timestamp map in place"""
    collected = self.file_timestamps
    collected.clear()
|
||||
|
||||
|
||||
def download_tiktok_profile(username: str,
                           days: int = 7,
                           base_path: Path = None,
                           log_callback=None,
                           unified_db=None) -> Dict[str, datetime]:
    """
    One-call convenience wrapper around TikTokDownloader.download_profile

    Args:
        username: TikTok username
        days: Number of days to download
        base_path: Base download path
        log_callback: Optional logging callback
        unified_db: UnifiedDatabase instance (mandatory despite the None default)

    Returns:
        Dictionary mapping filenames to timestamps

    Raises:
        ValueError: If unified_db is not supplied
    """
    if not unified_db:
        raise ValueError("unified_db is required for TikTok downloads")

    downloader = TikTokDownloader(
        base_path=base_path,
        log_callback=log_callback,
        unified_db=unified_db,
    )
    # download_profile returns (timestamps, files); only the first is exposed here.
    outcome = downloader.download_profile(username, number_of_days=days)
    return outcome[0]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: only checks that the downloader can be constructed.
    import tempfile

    # TikTokDownloader refuses to start without a unified database, so the
    # self-test has to supply one (it used to fail with ValueError).
    from modules.unified_database import UnifiedDatabase

    print("TikTok Downloader Module Test")
    print("="*60)

    # Test with a small profile
    with tempfile.TemporaryDirectory() as tmpdir:
        downloader = TikTokDownloader(base_path=Path(tmpdir), unified_db=UnifiedDatabase())

        # You can test with a real TikTok username
        # timestamps, files = downloader.download_profile("username", number_of_days=1)

    print("Module ready for integration")
|
||||
1512
modules/tmdb_client.py
Normal file
1512
modules/tmdb_client.py
Normal file
File diff suppressed because it is too large
Load Diff
1116
modules/toolzu_module.py
Normal file
1116
modules/toolzu_module.py
Normal file
File diff suppressed because it is too large
Load Diff
6350
modules/unified_database.py
Executable file
6350
modules/unified_database.py
Executable file
File diff suppressed because it is too large
Load Diff
348
modules/universal_logger.py
Normal file
348
modules/universal_logger.py
Normal file
@@ -0,0 +1,348 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Universal Logging Module for Media Downloader
|
||||
Provides consistent logging across all components with automatic rotation and 7-day retention
|
||||
"""
|
||||
|
||||
import logging
|
||||
import logging.handlers
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
import os
|
||||
import glob
|
||||
import sys
|
||||
|
||||
class UniversalLogger:
|
||||
"""
|
||||
Universal logger with automatic rotation and cleanup
|
||||
|
||||
Features:
|
||||
- Consistent log format across all components
|
||||
- Daily log rotation at midnight
|
||||
- Automatic cleanup of logs older than 7 days
|
||||
- Separate log files per component
|
||||
- Console and file output
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    component_name: str,
    log_dir: str = None,
    retention_days: int = 7,
    console_level: str = 'INFO',
    file_level: str = 'DEBUG'
):
    """
    Initialize universal logger for a component

    Creates (or re-configures) the ``MediaDownloader.<component_name>``
    logger with one date-stamped file handler and one console handler,
    quiets a set of noisy third-party loggers and removes expired log files.

    Args:
        component_name: Name of the component (e.g., 'API', 'Scheduler', 'MediaDownloader')
        log_dir: Directory to store logs (default: the ``logs`` directory next to
                 the package that contains this module)
        retention_days: Number of days to keep logs (default: 7)
        console_level: Logging level for console output (default: INFO)
        file_level: Logging level for file output (default: DEBUG)
    """
    self.component_name = component_name
    self.retention_days = retention_days

    # Set up log directory
    if log_dir is None:
        base_path = Path(__file__).parent.parent
        self.log_dir = base_path / 'logs'
    else:
        self.log_dir = Path(log_dir)

    self.log_dir.mkdir(exist_ok=True, parents=True)

    # Create logger
    self.logger = logging.getLogger(f'MediaDownloader.{component_name}')
    self.logger.setLevel(logging.DEBUG)

    # Remove existing handlers to prevent duplicates. They are closed as
    # well, otherwise every re-initialisation leaks the open log file.
    for stale_handler in list(self.logger.handlers):
        self.logger.removeHandler(stale_handler)
        stale_handler.close()

    # Create formatter - matches media-downloader.py format
    # Format: 2025-11-12 21:00:00.123456 [ComponentName] [Module] [LEVEL] message
    # Custom formatter to include microseconds for proper log sorting
    class MicrosecondFormatter(logging.Formatter):
        def formatTime(self, record, datefmt=None):
            ct = datetime.fromtimestamp(record.created)
            return ct.strftime('%Y-%m-%d %H:%M:%S.%f')

    formatter = MicrosecondFormatter(
        '%(asctime)s [%(name)s] %(message)s'
    )

    # File handler with date-stamped filename (one file per day)
    # Format: 20251113_component.log (all logs for the day append to same file)
    # NOTE: the date is fixed here, at construction time.
    date_stamp = datetime.now().strftime('%Y%m%d')
    log_file = self.log_dir / f'{date_stamp}_{component_name.lower()}.log'
    file_handler = logging.FileHandler(
        filename=str(log_file),
        mode='a',  # Append mode - preserves logs across restarts
        encoding='utf-8'
    )
    file_handler.setLevel(getattr(logging, file_level.upper()))
    file_handler.setFormatter(formatter)
    self.logger.addHandler(file_handler)

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(getattr(logging, console_level.upper()))
    console_handler.setFormatter(formatter)
    self.logger.addHandler(console_handler)

    # Suppress noisy third-party loggers
    noisy_loggers = {
        'asyncio': logging.WARNING,
        'selenium': logging.WARNING,
        'urllib3': logging.WARNING,
        'websocket': logging.WARNING,
        'requests': logging.WARNING,
        'PIL': logging.WARNING,
        'instaloader': logging.WARNING,
        'tensorflow': logging.ERROR,
        'deepface': logging.WARNING,
    }
    for logger_name, threshold in noisy_loggers.items():
        logging.getLogger(logger_name).setLevel(threshold)

    # Clean up old logs on initialization
    self._cleanup_old_logs()
|
||||
|
||||
def _cleanup_old_logs(self):
    """
    Remove this component's log files that are older than retention_days

    Candidates are files in log_dir named <stamp>_<component>.log where
    <stamp> consists only of digits and underscores. That covers the
    YYYYMMDD_component.log naming as well as a YYYYMMDD_HHMMSS_component.log
    style stamp, while skipping files of other components whose name merely
    ends with this component's name (e.g. '20250101_web_api.log' when the
    component is 'api').

    Age is judged by file modification time. Cleanup is best-effort: every
    failure is swallowed so that it can never break the caller.
    """
    try:
        cutoff_date = datetime.now() - timedelta(days=self.retention_days)
        suffix = f'_{self.component_name.lower()}.log'

        cleaned_count = 0
        # Escape the suffix so glob metacharacters in a component name are
        # matched literally; materialize the listing before deleting from it
        candidates = sorted(Path(self.log_dir).glob('*' + glob.escape(suffix)))
        for file_path in candidates:
            stamp = file_path.name[:-len(suffix)]
            if not stamp.replace('_', '').isdigit():
                # Prefix is not a date/time stamp: another component's file
                continue
            try:
                # Check file modification time
                mtime = datetime.fromtimestamp(file_path.stat().st_mtime)
                if mtime < cutoff_date:
                    file_path.unlink()
                    cleaned_count += 1
            except Exception:
                # Don't fail if we can't clean up a single file
                continue

        if cleaned_count > 0:
            # Report the cleanup through this logger's own info() method
            self.info(f"Cleaned up {cleaned_count} old {self.component_name} log file(s)", module='LogCleanup')
    except Exception:
        # Don't fail initialization if cleanup fails
        pass
|
||||
|
||||
def _format_message(self, module: str, level: str, message: str) -> str:
    """
    Build the text of a log line in the "[Module] [LEVEL] message" layout

    Args:
        module: Module name placed in the first bracket pair
        level: Log level name; it is upper-cased before being embedded
        message: Log message, appended unchanged

    Returns:
        The string "[<module>] [<LEVEL>] <message>"
    """
    module_tag = f"[{module}]"
    level_tag = f"[{level.upper()}]"
    return f"{module_tag} {level_tag} {message}"
|
||||
|
||||
def _broadcast_error(self, message: str, module: str, level: str = 'ERROR'):
    """
    Push an 'error_alert' payload to the API's WebSocket manager.

    Nothing is sent when the manager is falsy or has no active connections.
    Any exception (including a failed import of the API package) is
    swallowed so that logging is never disrupted.

    Args:
        message: Error text; only the first 200 characters are sent
        module: Module name reported in the payload
        level: Level name reported in the payload
    """
    try:
        # The manager lives in the API package, so it is imported lazily here;
        # if that import fails the except clause below turns this into a no-op
        from web.backend.api import manager

        if not manager or not manager.active_connections:
            return

        alert = {
            'module': module,
            'level': level,
            'message': message[:200],  # Truncate for notification
            'timestamp': datetime.now().isoformat(),
            'component': self.component_name
        }
        manager.broadcast_sync({'type': 'error_alert', 'error': alert})
    except Exception:
        # Fail silently - API may not be running or manager not available
        pass
|
||||
|
||||
def _record_error_to_db(self, message: str, module: str, level: str = 'ERROR'):
    """
    Record error to error_log database table for dashboard display.

    The message is normalized (file paths, URLs, UUIDs and bare numbers are
    replaced by placeholders) and hashed together with the module name, so
    repeated occurrences of the same error update one row instead of
    inserting a new one. After a successful write the error is handed to
    _broadcast_error.

    Uses a separate connection to avoid circular dependencies.
    Fails silently to not disrupt logging.

    Args:
        message: Error text; at most 500 characters are stored
        module: Module name stored with the error and used in the hash
        level: Level name stored with the error
    """
    try:
        import sqlite3
        import hashlib
        import re
        from pathlib import Path

        # Get database path
        db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
        if not db_path.exists():
            return

        # Slicing and regex substitution below need text, not e.g. an exception object
        message = str(message)

        # Normalize message for deduplication (remove variable parts like URLs, paths, numbers)
        normalized = message
        normalized = re.sub(r'/[\w/\-\.]+\.(jpg|png|mp4|webp|gif|heic|mov)', '{file}', normalized)
        normalized = re.sub(r'https?://[^\s]+', '{url}', normalized)
        # UUIDs must be collapsed before bare numbers: an all-digit group inside
        # a UUID would otherwise become '{n}' and the UUID would no longer match
        normalized = re.sub(r'[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}', '{uuid}', normalized)
        normalized = re.sub(r'\b\d+\b', '{n}', normalized)

        # Create error hash for deduplication (module + normalized message)
        error_key = f"{module}:{normalized[:200]}"
        error_hash = hashlib.sha256(error_key.encode()).hexdigest()

        now = datetime.now().isoformat()

        # Use a quick connection with short timeout
        conn = sqlite3.connect(str(db_path), timeout=2.0)
        try:
            conn.execute("PRAGMA busy_timeout = 2000")
            cursor = conn.cursor()

            # Upsert: insert new error or update occurrence count
            # Reset viewed_at and dismissed_at to NULL when error recurs so it shows as "new" on dashboard
            cursor.execute('''
                INSERT INTO error_log (error_hash, module, level, message, first_seen, last_seen, occurrence_count, log_file)
                VALUES (?, ?, ?, ?, ?, ?, 1, ?)
                ON CONFLICT(error_hash) DO UPDATE SET
                    last_seen = excluded.last_seen,
                    occurrence_count = error_log.occurrence_count + 1,
                    viewed_at = NULL,
                    dismissed_at = NULL
            ''', (error_hash, module, level, message[:500], now, now, self.component_name))

            conn.commit()
        finally:
            # Release the handle even when the statement or commit fails
            conn.close()

        # Broadcast to WebSocket clients for real-time notification
        self._broadcast_error(message, module, level)

    except Exception:
        # Fail silently - don't let error logging break the main logging
        pass
|
||||
|
||||
def debug(self, message: str, module: str = 'Core'):
    """
    Emit a DEBUG-level line

    Args:
        message: Log message
        module: Module name used as the line's first tag
    """
    formatted = self._format_message(module, 'DEBUG', message)
    self.logger.debug(formatted)
|
||||
|
||||
def info(self, message: str, module: str = 'Core'):
    """
    Emit an INFO-level line

    Args:
        message: Log message
        module: Module name used as the line's first tag
    """
    formatted = self._format_message(module, 'INFO', message)
    self.logger.info(formatted)
|
||||
|
||||
def warning(self, message: str, module: str = 'Core'):
    """
    Emit a WARNING-level line

    Args:
        message: Log message
        module: Module name used as the line's first tag
    """
    formatted = self._format_message(module, 'WARNING', message)
    self.logger.warning(formatted)
|
||||
|
||||
def error(self, message: str, module: str = 'Core'):
    """
    Emit an ERROR-level line, then pass the raw message to _record_error_to_db

    Args:
        message: Log message
        module: Module name used as the line's first tag and for the record
    """
    formatted = self._format_message(module, 'ERROR', message)
    self.logger.error(formatted)

    # The unformatted message is what gets recorded for the dashboard
    self._record_error_to_db(message, module)
|
||||
|
||||
def critical(self, message: str, module: str = 'Core'):
    """
    Emit a CRITICAL-level line, then pass the raw message to
    _record_error_to_db with level 'CRITICAL'

    Args:
        message: Log message
        module: Module name used as the line's first tag and for the record
    """
    formatted = self._format_message(module, 'CRITICAL', message)
    self.logger.critical(formatted)

    # The unformatted message is what gets recorded for the dashboard
    self._record_error_to_db(message, module, level='CRITICAL')
|
||||
|
||||
def success(self, message: str, module: str = 'Core'):
    """
    Emit a line tagged SUCCESS; it is sent through the logger at INFO level

    Args:
        message: Log message
        module: Module name used as the line's first tag
    """
    formatted = self._format_message(module, 'SUCCESS', message)
    self.logger.info(formatted)
|
||||
|
||||
def log(self, message: str, level: str = 'INFO', module: str = 'Core'):
    """
    Dispatch a message to the level-specific method named by *level*

    Args:
        message: Log message
        level: Level name, matched case-insensitively against DEBUG, INFO,
            WARNING, ERROR, CRITICAL and SUCCESS; any other name is
            handled by info()
        module: Module name
    """
    known_levels = ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL', 'SUCCESS')
    requested = level.upper()

    # Each level's method carries the lower-cased level name
    method_name = requested.lower() if requested in known_levels else 'info'
    getattr(self, method_name)(message, module)
|
||||
|
||||
def get_callback(self):
    """
    Get a callback function compatible with existing module signatures

    Returns:
        Callback function that can be passed to modules expecting log_callback
    """
    def callback(*args):
        """
        Flexible callback that handles multiple signature formats:
        - callback(message, level)
        - callback(message, level, module)
        - any other number of arguments: the arguments are joined with
          spaces and logged through info()
        """
        if len(args) == 2:
            message, level = args
            # Extract module from message if present
            if message.startswith('[') and ']' in message:
                end_bracket = message.index(']')
                module = message[1:end_bracket]
                message = message[end_bracket+1:].strip()
                # Remove level tag if present
                if message.startswith('[') and ']' in message:
                    message = message[message.index(']')+1:].strip()
                self.log(message, level, module)
            else:
                self.log(message, level)
        elif len(args) == 3:
            message, level, module = args
            self.log(message, level, module)
        else:
            # Log the arguments themselves rather than the tuple's repr, so
            # callback("msg") yields "msg" instead of "('msg',)"
            self.info(' '.join(str(part) for part in args))

    return callback
|
||||
|
||||
|
||||
# Registry of the loggers built so far, keyed by component name
_logger_instances = {}


def get_logger(
    component_name: str,
    log_dir: str = None,
    retention_days: int = 7,
    console_level: str = 'INFO',
    file_level: str = 'DEBUG'
) -> UniversalLogger:
    """
    Return the logger registered for a component, building it on first use

    The remaining arguments are only passed on when the logger is created;
    a later call with the same component_name returns the existing instance
    and its other arguments are not used.

    Args:
        component_name: Name of the component (also the registry key)
        log_dir: Directory to store logs
        retention_days: Number of days to keep logs
        console_level: Console logging level
        file_level: File logging level

    Returns:
        UniversalLogger instance
    """
    try:
        return _logger_instances[component_name]
    except KeyError:
        pass

    # First request for this component: build the logger and remember it
    instance = UniversalLogger(
        component_name=component_name,
        log_dir=log_dir,
        retention_days=retention_days,
        console_level=console_level,
        file_level=file_level
    )
    _logger_instances[component_name] = instance
    return instance
|
||||
1433
modules/universal_video_downloader.py
Normal file
1433
modules/universal_video_downloader.py
Normal file
File diff suppressed because it is too large
Load Diff
2179
modules/youtube_channel_monitor.py
Normal file
2179
modules/youtube_channel_monitor.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user