Files
media-downloader/modules/unified_database.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

6350 lines
292 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Unified Database Manager for Media Downloader System
Consolidates all platform databases into a single, optimized database
"""
import sqlite3
import json
import hashlib
import time
import functools
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple, Any
from contextlib import contextmanager
from threading import Lock, RLock
import queue
from modules.universal_logger import get_logger
logger = get_logger('Database')
# Database retry configuration constants.
# These are the default arguments of the retry_on_lock decorator below.
# Default value for retry_on_lock's max_retries (its loop runs range(max_retries) attempts)
DB_MAX_RETRIES = 3
DB_BASE_DELAY = 0.1 # Base delay in seconds for exponential backoff (doubled after each failed attempt)
DB_MAX_DELAY = 5.0 # Maximum delay in seconds (upper cap applied to the backoff delay)
def _is_lock_error(e: Exception) -> bool:
"""Check if an OperationalError is a database lock/busy/deadlock error."""
msg = str(e).lower()
return ("database is locked" in msg or "database is busy" in msg
or "deadlock detected" in msg or "could not obtain lock" in msg
or "lock timeout" in msg or "canceling statement due to lock" in msg)
def _safe_alter(cursor, sql: str) -> bool:
"""Execute an ALTER TABLE statement, skipping gracefully on lock timeout.
Returns True if executed, False if skipped due to lock contention."""
try:
cursor.execute(sql)
return True
except Exception as e:
msg = str(e).lower()
if _is_lock_error(e):
logger.debug(f"Skipped migration (lock contention): {sql[:80]}")
return False
# For "already exists" type errors, also skip gracefully
if 'already exists' in msg or 'duplicate column' in msg:
return True
raise
def retry_on_lock(operation_name: str = "database operation", max_retries: int = DB_MAX_RETRIES,
                  base_delay: float = DB_BASE_DELAY, max_delay: float = DB_MAX_DELAY):
    """
    Decorator for retrying database operations on lock errors.

    Only ``sqlite3.OperationalError`` exceptions whose message is recognised
    by ``_is_lock_error`` are retried; every other exception propagates
    immediately. Between attempts the wrapper sleeps for
    ``min(max_delay, base_delay * 2 ** attempt)`` seconds.

    Args:
        operation_name: Name for logging
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so the wrapped function is always called at least once.
        base_delay: Base delay in seconds for exponential backoff
        max_delay: Maximum delay cap in seconds

    Returns:
        A decorator; the decorated function returns whatever the wrapped
        function returns.

    Raises:
        sqlite3.OperationalError: The lock error from the final attempt, or
            any non-lock operational error from any attempt.
    """
    # Previously max_retries <= 0 skipped the loop entirely, so the wrapped
    # function was never invoked and the wrapper silently returned None.
    total_attempts = max(1, max_retries)

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(total_attempts):
                try:
                    return func(*args, **kwargs)
                except sqlite3.OperationalError as e:
                    if not _is_lock_error(e):
                        # Non-lock operational error, don't retry
                        raise
                    if attempt >= total_attempts - 1:
                        logger.error(f"{operation_name} failed after {total_attempts} attempts: {e}")
                        raise
                    delay = min(max_delay, base_delay * (2 ** attempt))
                    logger.warning(f"{operation_name} locked, retrying in {delay:.1f}s (attempt {attempt + 1}/{total_attempts})")
                    time.sleep(delay)
        return wrapper
    return decorator
class DatabasePool:
    """Connection pool for better concurrency.

    A fixed number of SQLite connections is opened up front and kept in a
    ``queue.Queue``. Borrowers obtain one through ``get_connection``; write
    borrowers are additionally serialized through ``write_lock`` and wrapped
    in a ``BEGIN IMMEDIATE`` ... ``COMMIT``/``ROLLBACK`` transaction.
    """

    def __init__(self, db_path: str, pool_size: int = 20):
        """Open ``pool_size`` connections to ``db_path`` and queue them.

        Args:
            db_path: Path of the database file.
            pool_size: Number of connections to create (also the queue capacity).
        """
        self.db_path = db_path
        self.pool = queue.Queue(maxsize=pool_size)
        self.write_lock = RLock()  # Reentrant lock for writes
        # Pre-populate pool with connections configured for better concurrency
        for _ in range(pool_size):
            conn = sqlite3.connect(
                db_path,
                check_same_thread=False,
                timeout=30.0,
                isolation_level=None  # Manual transaction control
            )
            conn.row_factory = sqlite3.Row
            # PRAGMAs are translated to no-ops by pg_adapter when using PostgreSQL
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA synchronous=NORMAL")
            conn.execute("PRAGMA cache_size=10000")
            conn.execute("PRAGMA temp_store=MEMORY")
            conn.execute("PRAGMA busy_timeout=30000")
            conn.execute("PRAGMA wal_checkpoint=TRUNCATE")
            conn.execute("PRAGMA foreign_keys=ON")
            self.pool.put(conn)

    def _release(self, conn):
        """Return ``conn`` to the pool, discarding any transaction left open.

        A connection handed back mid-transaction would make the next
        writer's ``BEGIN IMMEDIATE`` fail ("cannot start a transaction
        within a transaction"), so dangling transactions are rolled back.
        """
        try:
            # getattr: tolerate connection objects without this attribute.
            if getattr(conn, "in_transaction", False):
                conn.rollback()
        finally:
            self.pool.put(conn)

    @contextmanager
    def get_connection(self, for_write=False):
        """Get a connection from the pool.

        Args:
            for_write: If True, acquire write lock for serialized writes and
                run the body inside a ``BEGIN IMMEDIATE`` transaction that is
                committed on success and rolled back on ``Exception``.

        Yields:
            A pooled connection; it is returned to the pool on exit.

        Raises:
            queue.Empty: If no connection becomes available within 5 seconds.
        """
        if for_write:
            with self.write_lock:
                conn = self.pool.get(timeout=5)  # 5 second timeout to get connection
                try:
                    # Use IMMEDIATE mode for writes to fail fast if locked
                    conn.execute("BEGIN IMMEDIATE")
                    yield conn
                    conn.commit()
                except Exception:
                    conn.rollback()
                    raise
                finally:
                    self._release(conn)
        else:
            conn = self.pool.get(timeout=5)
            try:
                yield conn
            finally:
                self._release(conn)

    def close_all(self):
        """Close all connections currently sitting in the pool.

        Connections that are checked out at the time of the call are not
        affected. The queue is drained without blocking, so a concurrent
        borrower taking the last connection cannot make this method hang.
        """
        while True:
            try:
                conn = self.pool.get_nowait()
            except queue.Empty:
                break
            conn.close()

    def checkpoint(self):
        """Run a WAL checkpoint to merge WAL file into main database.

        Returns:
            The row returned by ``PRAGMA wal_checkpoint(TRUNCATE)``, or None
            if the checkpoint raised (the failure is logged as a warning).

        Raises:
            queue.Empty: If no connection becomes available within 5 seconds.
        """
        conn = self.pool.get(timeout=5)
        try:
            result = conn.execute("PRAGMA wal_checkpoint(TRUNCATE)").fetchone()
            # result is (busy, log_pages, checkpointed_pages)
            if result:
                logger.debug(f"WAL checkpoint: busy={result[0]}, log={result[1]}, checkpointed={result[2]}")
            return result
        except Exception as e:
            logger.warning(f"WAL checkpoint failed: {e}")
            return None
        finally:
            self.pool.put(conn)

    def get_write_connection(self):
        """Get a dedicated connection for writing (not from pool).

        The caller owns the returned connection and is responsible for
        closing it; it is never placed in the pool.
        """
        conn = sqlite3.connect(
            self.db_path,
            check_same_thread=False,
            timeout=60.0,  # Wait up to 60 seconds
            isolation_level='IMMEDIATE'  # Lock immediately for writes
        )
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        conn.execute("PRAGMA busy_timeout=60000")  # 60 second timeout
        conn.execute("PRAGMA foreign_keys=ON")  # Enable foreign key enforcement
        return conn
class UnifiedDatabase:
"""Unified database for all media downloads"""
def __init__(self, db_path: str = None, use_pool: bool = True, pool_size: int = 3):
    """
    Set up the unified database and make sure its schema exists.

    Args:
        db_path: Path to the unified database file. When None, defaults to
            ``database/media_downloader.db`` under the parent of the
            directory containing this module.
        use_pool: Whether to use connection pooling
        pool_size: Size of connection pool (default: 3); only used when
            ``use_pool`` is true.
    """
    # Fall back to the bundled default location if no path was supplied
    chosen_path = (
        db_path
        if db_path is not None
        else str(Path(__file__).parent.parent / 'database' / 'media_downloader.db')
    )
    self.db_path = Path(chosen_path)
    self.use_pool = use_pool
    self.pool = DatabasePool(str(self.db_path), pool_size=pool_size) if use_pool else None
    self._init_database()
@contextmanager
def get_connection(self, for_write=False):
    """Get a database connection.

    With a pool configured, this delegates to the pool's ``get_connection``.
    Otherwise a fresh connection is opened for the duration of the ``with``
    block and closed on exit.

    Args:
        for_write: If True, use write lock for serialized writes. On the
            non-pooled path this wraps the body in ``BEGIN IMMEDIATE`` and
            commits on success or rolls back on ``Exception``.

    Yields:
        A connection with ``sqlite3.Row`` as row factory.
    """
    if self.pool:
        with self.pool.get_connection(for_write=for_write) as conn:
            yield conn
        return

    conn = sqlite3.connect(
        self.db_path,
        timeout=10.0,  # Reduced timeout for faster failure
        isolation_level=None  # Manual transaction control
    )
    # Everything after connect() sits inside try/finally so the connection
    # is closed even if one of the setup PRAGMAs fails.
    try:
        conn.row_factory = sqlite3.Row
        # PRAGMAs are translated to no-ops by pg_adapter when using PostgreSQL
        conn.execute("PRAGMA busy_timeout=10000")
        conn.execute("PRAGMA journal_mode=WAL")
        # Match pooled connections, which enable foreign key enforcement;
        # must be set before BEGIN because it is a no-op inside a transaction.
        conn.execute("PRAGMA foreign_keys=ON")
        if for_write:
            try:
                conn.execute("BEGIN IMMEDIATE")
                yield conn
                conn.commit()
            except Exception:
                conn.rollback()
                raise
        else:
            yield conn
    finally:
        conn.close()
def get_dedicated_write_connection(self):
    """Get a dedicated write connection (not from pool) for critical writes.

    With a pool configured, this delegates to the pool's
    ``get_write_connection``. Otherwise a new connection is opened here with
    ``isolation_level='IMMEDIATE'``, WAL journaling, a 60 second busy
    timeout and foreign key enforcement.

    Returns:
        An open connection owned by the caller, who must close it.
    """
    if self.pool:
        return self.pool.get_write_connection()

    conn = sqlite3.connect(
        self.db_path,
        timeout=60.0,
        isolation_level='IMMEDIATE'
    )
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA busy_timeout=60000")
        # Keep parity with the pooled write connection, which enables
        # foreign key enforcement.
        conn.execute("PRAGMA foreign_keys=ON")
    except Exception:
        # Don't leak the handle if configuration fails.
        conn.close()
        raise
    return conn
def _init_database(self):
"""Initialize the unified database schema"""
with self.get_connection(for_write=True) as conn:
cursor = conn.cursor()
# PRAGMAs are translated to no-ops by pg_adapter when using PostgreSQL
cursor.execute("PRAGMA journal_mode = WAL")
cursor.execute("PRAGMA busy_timeout = 10000")
cursor.execute("PRAGMA synchronous = NORMAL")
cursor.execute("PRAGMA foreign_keys = ON")
# Set a short lock_timeout for ALTER TABLE migrations so they don't
# block forever if pg_dump holds ACCESS SHARE locks. Columns already
# exist after first run, so skipping on timeout is safe.
try:
cursor.execute("SET lock_timeout = '2s'")
except Exception:
pass # SQLite doesn't support SET
# Main downloads table - unified for all platforms
cursor.execute('''
CREATE TABLE IF NOT EXISTS downloads (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url_hash TEXT UNIQUE NOT NULL, -- SHA256 hash of URL for deduplication
url TEXT NOT NULL,
platform TEXT NOT NULL, -- 'instagram', 'tiktok', 'forum'
source TEXT, -- Username for social media, forum name for forums
content_type TEXT, -- 'post', 'story', 'reel', 'video', 'image', etc.
filename TEXT,
file_path TEXT,
file_size INTEGER,
file_hash TEXT, -- SHA256 of file content
method TEXT, -- Download method used
media_id TEXT, -- Platform-specific media identifier
post_date DATETIME,
download_date DATETIME DEFAULT CURRENT_TIMESTAMP,
status TEXT DEFAULT 'completed', -- 'completed', 'failed', 'pending'
attempts INTEGER DEFAULT 1,
error_message TEXT,
metadata TEXT -- JSON string for platform-specific data (includes intended_path when in review)
)
''')
# Forum-specific tables (still needed for thread tracking)
cursor.execute('''
CREATE TABLE IF NOT EXISTS forum_threads (
thread_id TEXT PRIMARY KEY,
forum_name TEXT NOT NULL,
thread_url TEXT UNIQUE NOT NULL,
thread_title TEXT,
author TEXT,
created_date DATETIME,
last_checked DATETIME,
last_post_date DATETIME,
post_count INTEGER DEFAULT 0,
status TEXT DEFAULT 'active', -- 'active', 'expired', 'completed'
monitor_until DATETIME,
metadata TEXT
)
''')
cursor.execute('''
CREATE TABLE IF NOT EXISTS forum_posts (
post_id TEXT PRIMARY KEY,
thread_id TEXT NOT NULL,
post_url TEXT,
author TEXT,
post_date DATETIME,
content_hash TEXT,
has_images BOOLEAN DEFAULT 0,
images_downloaded INTEGER DEFAULT 0,
metadata TEXT,
FOREIGN KEY (thread_id) REFERENCES forum_threads(thread_id)
)
''')
# Search monitoring table
cursor.execute('''
CREATE TABLE IF NOT EXISTS search_monitors (
search_id TEXT PRIMARY KEY,
platform TEXT NOT NULL,
source TEXT, -- Forum name or username
search_query TEXT,
search_url TEXT,
last_checked DATETIME,
check_frequency_hours INTEGER DEFAULT 24,
active BOOLEAN DEFAULT 1,
results_found INTEGER DEFAULT 0,
metadata TEXT
)
''')
# Scheduler state table
cursor.execute('''
CREATE TABLE IF NOT EXISTS scheduler_state (
task_id TEXT PRIMARY KEY,
last_run DATETIME,
next_run DATETIME,
run_count INTEGER DEFAULT 0,
status TEXT DEFAULT 'active', -- 'active', 'paused', 'disabled'
error_count INTEGER DEFAULT 0,
last_error TEXT,
metadata TEXT -- JSON string for additional data
)
''')
# Thread check history table to prevent duplicate checks
cursor.execute('''
CREATE TABLE IF NOT EXISTS thread_check_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
thread_id TEXT NOT NULL,
forum_name TEXT NOT NULL,
check_time DATETIME DEFAULT CURRENT_TIMESTAMP,
last_post_date DATETIME,
new_posts_found INTEGER DEFAULT 0,
images_downloaded INTEGER DEFAULT 0,
status TEXT DEFAULT 'completed', -- 'completed', 'failed', 'skipped'
UNIQUE(thread_id, check_time)
)
''')
# Index for efficient lookups
cursor.execute('''
CREATE INDEX IF NOT EXISTS idx_thread_check_history
ON thread_check_history(thread_id, check_time DESC)
''')
# Download queue (unified)
cursor.execute('''
CREATE TABLE IF NOT EXISTS download_queue (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE NOT NULL,
platform TEXT NOT NULL,
source TEXT,
referer TEXT,
save_path TEXT,
priority INTEGER DEFAULT 5, -- 1-10, lower is higher priority
status TEXT DEFAULT 'pending', -- 'pending', 'downloading', 'completed', 'failed'
attempts INTEGER DEFAULT 0,
max_attempts INTEGER DEFAULT 3,
created_date DATETIME DEFAULT CURRENT_TIMESTAMP,
download_date DATETIME,
error_message TEXT,
metadata TEXT
)
''')
# Notifications table (tracks sent Pushover notifications)
cursor.execute('''
CREATE TABLE IF NOT EXISTS notifications (
id INTEGER PRIMARY KEY AUTOINCREMENT,
platform TEXT NOT NULL,
source TEXT,
content_type TEXT,
message TEXT NOT NULL,
title TEXT,
priority INTEGER DEFAULT 0,
download_count INTEGER DEFAULT 1,
sent_at DATETIME DEFAULT CURRENT_TIMESTAMP,
status TEXT DEFAULT 'sent', -- 'sent', 'failed'
response_data TEXT, -- JSON response from Pushover API
metadata TEXT -- Additional JSON metadata
)
''')
# Recycle Bin table for soft deletes
cursor.execute('''
CREATE TABLE IF NOT EXISTS recycle_bin (
id TEXT PRIMARY KEY, -- UUID used as unique filename
original_path TEXT NOT NULL, -- Full original path for restore
original_filename TEXT NOT NULL, -- Original filename (shown in UI)
recycle_path TEXT NOT NULL, -- Current path in recycle bin
file_extension TEXT, -- .jpg, .mp4, etc.
file_size INTEGER, -- Size in bytes
file_hash TEXT, -- SHA256 of file content
original_mtime REAL, -- Original modification timestamp (Unix time)
deleted_from TEXT NOT NULL, -- 'downloads', 'media', 'review'
deleted_at DATETIME DEFAULT CURRENT_TIMESTAMP, -- When deleted
deleted_by TEXT, -- User who deleted (username)
metadata TEXT, -- JSON: platform, source, content_type, face_recognition, etc.
restore_count INTEGER DEFAULT 0 -- How many times restored
)
''')
# Instagram Perceptual Hashes table (for duplicate detection)
cursor.execute('''
CREATE TABLE IF NOT EXISTS instagram_perceptual_hashes (
id TEXT PRIMARY KEY,
file_path TEXT NOT NULL UNIQUE,
filename TEXT,
platform TEXT,
source TEXT,
content_type TEXT,
perceptual_hash TEXT NOT NULL,
text_overlay_count INTEGER DEFAULT 0,
text_overlay_chars INTEGER DEFAULT 0,
quality_score REAL DEFAULT 0,
clean_score REAL DEFAULT 0,
resolution INTEGER DEFAULT 0,
file_size INTEGER DEFAULT 0,
width INTEGER DEFAULT 0,
height INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
# File Inventory table (database-first file tracking)
cursor.execute('''
CREATE TABLE IF NOT EXISTS file_inventory (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL UNIQUE,
filename TEXT NOT NULL,
platform TEXT NOT NULL,
source TEXT,
content_type TEXT,
method TEXT, -- Download method used
video_id TEXT, -- Video identifier for video platforms
file_size INTEGER,
file_hash TEXT,
width INTEGER,
height INTEGER,
location TEXT NOT NULL DEFAULT 'final',
created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
last_verified TIMESTAMP,
metadata TEXT
)
''')
# Universal Video Downloads table (YouTube, Vimeo, Dailymotion, Bilibili, etc.)
cursor.execute('''
CREATE TABLE IF NOT EXISTS video_downloads (
id INTEGER PRIMARY KEY AUTOINCREMENT,
platform TEXT NOT NULL DEFAULT 'youtube',
video_id TEXT NOT NULL,
url TEXT NOT NULL,
title TEXT,
uploader TEXT,
upload_date DATETIME,
duration INTEGER,
file_path TEXT,
file_size INTEGER,
download_date DATETIME DEFAULT CURRENT_TIMESTAMP,
status TEXT DEFAULT 'completed',
metadata TEXT,
UNIQUE(platform, video_id)
)
''')
# Universal Video Preview List table (for multi-device sync)
cursor.execute('''
CREATE TABLE IF NOT EXISTS video_preview_list (
id INTEGER PRIMARY KEY AUTOINCREMENT,
platform TEXT NOT NULL DEFAULT 'youtube',
video_id TEXT NOT NULL,
url TEXT NOT NULL,
title TEXT,
uploader TEXT,
upload_date DATETIME,
duration INTEGER,
description TEXT,
thumbnail TEXT,
thumbnail_data BLOB,
view_count INTEGER,
like_count INTEGER,
already_downloaded INTEGER DEFAULT 0,
added_date DATETIME DEFAULT CURRENT_TIMESTAMP,
metadata TEXT,
UNIQUE(platform, video_id)
)
''')
# Add thumbnail_data column if not exists (migration for existing tables)
_safe_alter(cursor, "ALTER TABLE video_preview_list ADD COLUMN IF NOT EXISTS thumbnail_data BLOB")
_safe_alter(cursor, "ALTER TABLE video_downloads ADD COLUMN IF NOT EXISTS thumbnail_data BLOB")
# Migrate old youtube_downloads to video_downloads if exists
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='youtube_downloads'")
if cursor.fetchone():
cursor.execute('''
INSERT OR IGNORE INTO video_downloads
(platform, video_id, url, title, uploader, upload_date, duration, file_path, file_size, download_date, status, metadata)
SELECT 'youtube', video_id, url, title, uploader, upload_date, duration, file_path, file_size, download_date, status, metadata
FROM youtube_downloads
''')
# Migrate old youtube_preview_list to video_preview_list if exists
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='youtube_preview_list'")
if cursor.fetchone():
cursor.execute('''
INSERT OR IGNORE INTO video_preview_list
(platform, video_id, url, title, uploader, upload_date, duration, description, thumbnail, view_count, like_count, already_downloaded, added_date, metadata)
SELECT 'youtube', video_id, url, title, uploader, upload_date, duration, description, thumbnail, view_count, like_count, already_downloaded, added_date, metadata
FROM youtube_preview_list
''')
# Video downloads indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_video_platform ON video_downloads(platform)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_video_id ON video_downloads(video_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_video_platform_id ON video_downloads(platform, video_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_video_uploader ON video_downloads(uploader)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_video_upload_date ON video_downloads(upload_date)')
# Create optimized indexes
# Single column indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_url_hash ON downloads(url_hash)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_platform ON downloads(platform)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_source ON downloads(source)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_status ON downloads(status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_download_date ON downloads(download_date)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_post_date ON downloads(post_date)')
# Composite indexes for common queries
cursor.execute('CREATE INDEX IF NOT EXISTS idx_platform_source ON downloads(platform, source)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_platform_status ON downloads(platform, status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_source_content ON downloads(source, content_type)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_platform_date ON downloads(platform, post_date)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_source_date ON downloads(source, download_date)')
# File hash index for deduplication
cursor.execute('CREATE INDEX IF NOT EXISTS idx_file_hash ON downloads(file_hash)')
# Composite index for deduplication queries (file_hash + platform)
cursor.execute('''
CREATE INDEX IF NOT EXISTS idx_file_hash_platform
ON downloads(file_hash, platform)
WHERE file_hash IS NOT NULL
''')
# Add media_id column if it doesn't exist (for fast metadata queries)
_safe_alter(cursor, 'ALTER TABLE downloads ADD COLUMN IF NOT EXISTS media_id TEXT')
# Index for media_id (fast metadata queries)
cursor.execute('CREATE INDEX IF NOT EXISTS idx_media_id ON downloads(media_id)')
# Forum-specific indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_forum_threads_status ON forum_threads(status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_forum_threads_monitor ON forum_threads(monitor_until)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_forum_posts_thread ON forum_posts(thread_id)')
# Queue indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_queue_status ON download_queue(status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_queue_priority ON download_queue(priority, status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_queue_platform ON download_queue(platform, status)')
# Notifications indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_notifications_sent_at ON notifications(sent_at DESC)')
# Instagram Perceptual Hashes indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_perceptual_hash_platform_source ON instagram_perceptual_hashes(platform, source)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_perceptual_hash_file_path ON instagram_perceptual_hashes(file_path)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_perceptual_hash_value ON instagram_perceptual_hashes(perceptual_hash)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_notifications_platform ON notifications(platform)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_notifications_source ON notifications(source)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_notifications_status ON notifications(status)')
# Recycle Bin indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_recycle_deleted_at ON recycle_bin(deleted_at DESC)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_recycle_deleted_from ON recycle_bin(deleted_from)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_recycle_deleted_by ON recycle_bin(deleted_by)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_recycle_original_path ON recycle_bin(original_path)')
# File Inventory indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_fi_platform_location ON file_inventory(platform, location, created_date DESC)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_fi_source ON file_inventory(source, created_date DESC)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_fi_location ON file_inventory(location)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_fi_hash ON file_inventory(file_hash)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_fi_file_path ON file_inventory(file_path)')
# Add tracking columns for dashboard card exclusions (migration for existing databases)
_safe_alter(cursor, 'ALTER TABLE file_inventory ADD COLUMN IF NOT EXISTS moved_from_review INTEGER DEFAULT 0')
_safe_alter(cursor, 'ALTER TABLE file_inventory ADD COLUMN IF NOT EXISTS moved_from_media INTEGER DEFAULT 0')
_safe_alter(cursor, 'ALTER TABLE file_inventory ADD COLUMN IF NOT EXISTS from_discovery INTEGER DEFAULT 0')
# Indexes for downloads.filename (used in subqueries)
cursor.execute('CREATE INDEX IF NOT EXISTS idx_downloads_filename ON downloads(filename)')
# Face recognition scans table
cursor.execute('''
CREATE TABLE IF NOT EXISTS face_recognition_scans (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
has_match INTEGER DEFAULT 0,
matched_person TEXT,
confidence REAL,
face_count INTEGER DEFAULT 0,
scan_date DATETIME DEFAULT CURRENT_TIMESTAMP,
scan_type TEXT DEFAULT 'standard'
)
''')
# Indexes for face_recognition_scans table
cursor.execute('CREATE INDEX IF NOT EXISTS idx_face_scan_file_path ON face_recognition_scans(file_path)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_face_scan_date ON face_recognition_scans(scan_date DESC)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_face_scan_has_match ON face_recognition_scans(has_match)')
# Enable optimizations
cursor.execute('PRAGMA journal_mode=WAL')
cursor.execute('PRAGMA synchronous=NORMAL')
cursor.execute('PRAGMA cache_size=10000')
cursor.execute('PRAGMA temp_store=MEMORY')
# Create cleanup triggers
cursor.execute('''
CREATE TRIGGER IF NOT EXISTS cleanup_old_downloads
AFTER INSERT ON downloads
WHEN (SELECT COUNT(*) FROM downloads) > 100000
BEGIN
DELETE FROM downloads
WHERE download_date < datetime('now', '-180 days')
AND status = 'completed';
END
''')
cursor.execute('''
CREATE TRIGGER IF NOT EXISTS cleanup_failed_downloads
AFTER INSERT ON downloads
WHEN (SELECT COUNT(*) FROM downloads WHERE status = 'failed') > 10000
BEGIN
DELETE FROM downloads
WHERE download_date < datetime('now', '-30 days')
AND status = 'failed';
END
''')
cursor.execute('''
CREATE TRIGGER IF NOT EXISTS expire_forum_monitors
AFTER INSERT ON forum_threads
BEGIN
UPDATE forum_threads
SET status = 'expired'
WHERE monitor_until IS NOT NULL
AND monitor_until < datetime('now')
AND status = 'active';
END
''')
cursor.execute('''
CREATE TRIGGER IF NOT EXISTS cleanup_old_queue
AFTER INSERT ON download_queue
WHEN (SELECT COUNT(*) FROM download_queue WHERE status IN ('completed', 'failed')) > 50000
BEGIN
DELETE FROM download_queue
WHERE created_date < datetime('now', '-90 days')
AND status IN ('completed', 'failed');
END
''')
# ============================================================================
# SMART CONTENT ARCHIVE & DISCOVERY SYSTEM TABLES
# ============================================================================
# Tags table - hierarchical tagging system
cursor.execute('''
CREATE TABLE IF NOT EXISTS tags (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
slug TEXT NOT NULL UNIQUE,
parent_id INTEGER REFERENCES tags(id) ON DELETE SET NULL,
color TEXT DEFAULT '#6366f1',
icon TEXT DEFAULT 'tag',
description TEXT,
source TEXT DEFAULT 'auto',
created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
# File-Tag relationships (many-to-many)
cursor.execute('''
CREATE TABLE IF NOT EXISTS file_tags (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL REFERENCES file_inventory(id) ON DELETE CASCADE,
tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
created_by TEXT,
UNIQUE(file_id, tag_id)
)
''')
# Smart Folders - saved query filters (virtual folders)
cursor.execute('''
CREATE TABLE IF NOT EXISTS smart_folders (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
slug TEXT NOT NULL UNIQUE,
icon TEXT DEFAULT 'folder',
color TEXT DEFAULT '#6366f1',
description TEXT,
filters TEXT NOT NULL,
sort_by TEXT DEFAULT 'post_date',
sort_order TEXT DEFAULT 'desc',
is_system INTEGER DEFAULT 0,
display_order INTEGER DEFAULT 0,
created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
# Collections - manually curated groups (like albums)
cursor.execute('''
CREATE TABLE IF NOT EXISTS collections (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
slug TEXT NOT NULL UNIQUE,
description TEXT,
cover_file_id INTEGER REFERENCES file_inventory(id) ON DELETE SET NULL,
color TEXT DEFAULT '#6366f1',
is_public INTEGER DEFAULT 0,
created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
# Collection-File relationships (many-to-many with ordering)
cursor.execute('''
CREATE TABLE IF NOT EXISTS collection_files (
id INTEGER PRIMARY KEY AUTOINCREMENT,
collection_id INTEGER NOT NULL REFERENCES collections(id) ON DELETE CASCADE,
file_id INTEGER NOT NULL REFERENCES file_inventory(id) ON DELETE CASCADE,
display_order INTEGER DEFAULT 0,
added_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
added_by TEXT,
UNIQUE(collection_id, file_id)
)
''')
# Content Embeddings - CLIP vectors for semantic search
cursor.execute('''
CREATE TABLE IF NOT EXISTS content_embeddings (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL UNIQUE REFERENCES file_inventory(id) ON DELETE CASCADE,
embedding BLOB NOT NULL,
embedding_model TEXT DEFAULT 'clip-vit-base-patch32',
embedding_version INTEGER DEFAULT 1,
created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')
# NOTE: tags and file_tags tables are created earlier in the Smart Content Archive section
# (lines 658-682) - do not duplicate here
# Discovery Scan Queue - background processing for new files
cursor.execute('''
CREATE TABLE IF NOT EXISTS discovery_scan_queue (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL REFERENCES file_inventory(id) ON DELETE CASCADE,
file_path TEXT NOT NULL,
scan_type TEXT NOT NULL DEFAULT 'embedding',
priority INTEGER DEFAULT 5,
status TEXT DEFAULT 'pending',
attempts INTEGER DEFAULT 0,
error_message TEXT,
created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
started_date TIMESTAMP,
completed_date TIMESTAMP,
UNIQUE(file_id, scan_type)
)
''')
# User preferences - for storing per-user settings (dashboard state, etc.)
cursor.execute('''
CREATE TABLE IF NOT EXISTS user_preferences (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id TEXT NOT NULL,
preference_key TEXT NOT NULL,
preference_value TEXT,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(user_id, preference_key)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_user_preferences_user ON user_preferences(user_id)')
# Discovery System indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_tags_parent ON tags(parent_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_tags_slug ON tags(slug)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_file_tags_file ON file_tags(file_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_file_tags_tag ON file_tags(tag_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_smart_folders_order ON smart_folders(display_order)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_collections_slug ON collections(slug)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_collection_files_collection ON collection_files(collection_id, display_order)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_collection_files_file ON collection_files(file_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_content_embeddings_file ON content_embeddings(file_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_discovery_queue_status ON discovery_scan_queue(status, priority)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_discovery_queue_file ON discovery_scan_queue(file_id)')
# Insert default system smart folders
cursor.execute('''
INSERT OR IGNORE INTO smart_folders (name, slug, icon, filters, is_system, display_order)
VALUES
('Recent Downloads', 'recent-downloads', 'clock', '{"date_range": "7d"}', 1, 1),
('Images', 'images', 'image', '{"media_type": "image"}', 1, 2),
('Videos', 'videos', 'video', '{"media_type": "video"}', 1, 3),
('Instagram', 'instagram', 'instagram', '{"platform": "instagram"}', 1, 4),
('TikTok', 'tiktok', 'video', '{"platform": "tiktok"}', 1, 5),
('Large Files', 'large-files', 'hard-drive', '{"size_min": 10485760}', 1, 6)
''')
# ============================================================================
# SCRAPER PROXY CONFIGURATION SYSTEM
# See docs/SCRAPER_PROXY_SYSTEM.md for full documentation
# ============================================================================
# Scrapers table - centralized configuration for all download modules
cursor.execute('''
CREATE TABLE IF NOT EXISTS scrapers (
id TEXT PRIMARY KEY,
name TEXT NOT NULL,
type TEXT NOT NULL, -- 'direct', 'proxy', 'forum', 'cli_tool'
module TEXT, -- Python module name, NULL for cli_tool
base_url TEXT, -- Primary URL for the scraper
target_platform TEXT, -- 'instagram', 'snapchat', 'tiktok', NULL for forums/cli
enabled INTEGER DEFAULT 1, -- Enable/disable scraper
-- Proxy settings
proxy_enabled INTEGER DEFAULT 0,
proxy_url TEXT, -- e.g., "socks5://user:pass@host:port"
-- Cloudflare/Cookie settings
flaresolverr_required INTEGER DEFAULT 0,
cookies_json TEXT, -- JSON blob of cookies
cookies_updated_at TEXT, -- ISO timestamp of last cookie update
-- Test status
last_test_at TEXT, -- ISO timestamp of last test
last_test_status TEXT, -- 'success', 'failed', 'timeout'
last_test_message TEXT, -- Error message if failed
-- Module-specific settings
settings_json TEXT, -- Additional JSON settings per-scraper
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
# Scrapers indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_scrapers_type ON scrapers(type)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_scrapers_enabled ON scrapers(enabled)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_scrapers_target_platform ON scrapers(target_platform)')
# Seed default scrapers (INSERT OR IGNORE to not overwrite existing)
default_scrapers = [
('imginn', 'Imginn', 'proxy', 'imginn_module', 'https://imginn.com', 'instagram', 1),
('imginn_api', 'ImgInn API', 'proxy', 'imginn_api_module', 'https://imginn.com', 'instagram', 1),
('fastdl', 'FastDL', 'proxy', 'fastdl_module', 'https://fastdl.app', 'instagram', 1),
('toolzu', 'Toolzu', 'proxy', 'toolzu_module', 'https://toolzu.com', 'instagram', 1),
('snapchat', 'Snapchat Direct', 'direct', 'snapchat_scraper', 'https://snapchat.com', 'snapchat', 0),
('instagram', 'Instagram (Direct)', 'direct', 'instaloader_module', 'https://instagram.com', 'instagram', 0),
('tiktok', 'TikTok', 'direct', 'tiktok_module', 'https://tiktok.com', 'tiktok', 0),
('coppermine', 'Coppermine', 'direct', 'coppermine_module', 'https://hqdiesel.net', None, 1),
('forum_phun', 'Phun.org', 'forum', 'forum_downloader', 'https://forum.phun.org', None, 1),
('forum_hqcelebcorner', 'HQCelebCorner', 'forum', 'forum_downloader', 'https://hqcelebcorner.com', None, 0),
('forum_picturepub', 'PicturePub', 'forum', 'forum_downloader', 'https://picturepub.net', None, 0),
('ytdlp', 'yt-dlp', 'cli_tool', None, None, None, 0),
('gallerydl', 'gallery-dl', 'cli_tool', None, None, None, 0),
]
for scraper in default_scrapers:
cursor.execute('''
INSERT OR IGNORE INTO scrapers
(id, name, type, module, base_url, target_platform, flaresolverr_required)
VALUES (?, ?, ?, ?, ?, ?, ?)
''', scraper)
# Migrate cookies from files to database (one-time migration)
self._migrate_scraper_cookies(cursor)
# Error log table for tracking errors from log files
cursor.execute('''
CREATE TABLE IF NOT EXISTS error_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
error_hash TEXT UNIQUE NOT NULL, -- Hash of module+message for deduplication
module TEXT NOT NULL, -- Module name (e.g., 'FaceRecognition', 'Forum')
level TEXT DEFAULT 'ERROR', -- Log level (ERROR, CRITICAL)
message TEXT NOT NULL, -- Error message
first_seen DATETIME NOT NULL, -- First occurrence
last_seen DATETIME NOT NULL, -- Most recent occurrence
occurrence_count INTEGER DEFAULT 1, -- How many times this error occurred
log_file TEXT, -- Which log file/component it came from
line_context TEXT, -- JSON: lines before/after for context
dismissed_at DATETIME, -- NULL if not dismissed
viewed_at DATETIME, -- NULL if not viewed in dashboard
push_alert_sent_at DATETIME, -- When we last sent a push about this
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
''')
# Error log indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_error_log_hash ON error_log(error_hash)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_error_log_dismissed ON error_log(dismissed_at)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_error_log_last_seen ON error_log(last_seen)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_error_log_viewed ON error_log(viewed_at)')
# Error tracking settings (last dashboard visit, etc.)
cursor.execute('''
CREATE TABLE IF NOT EXISTS error_tracking (
id INTEGER PRIMARY KEY,
user_id TEXT UNIQUE NOT NULL DEFAULT 'default',
last_dashboard_visit DATETIME,
last_errors_viewed DATETIME,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
''')
# Add UNIQUE constraint to user_id if missing (migration for existing tables).
# Two strategies are tried in order:
#   1. Look the constraint up in information_schema and add it with ALTER TABLE.
#   2. If any part of step 1 raises, fall back to CREATE UNIQUE INDEX
#      IF NOT EXISTS.
try:
    # Try PostgreSQL information_schema approach first
    cursor.execute("""
SELECT COUNT(*) FROM information_schema.table_constraints
WHERE table_name = 'error_tracking' AND constraint_type = 'UNIQUE'
""")
    has_unique = cursor.fetchone()[0] > 0
    if not has_unique:
        cursor.execute('ALTER TABLE error_tracking ADD CONSTRAINT error_tracking_user_id_key UNIQUE (user_id)')
        logger.info("Migrated error_tracking table to add UNIQUE constraint on user_id")
except Exception:
    # Fallback: try CREATE UNIQUE INDEX (works on both SQLite and PostgreSQL)
    try:
        cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_error_tracking_user_id ON error_tracking(user_id)')
    except Exception as e:
        # Still best-effort (startup must not fail here), but IF NOT EXISTS
        # already covers the "index exists" case, so a failure at this point
        # is something else and should leave a trace.
        logger.warning(f"Could not ensure UNIQUE index on error_tracking(user_id): {e}")
# ============================================================================
# CELEBRITY DISCOVERY SYSTEM TABLES
# For searching and discovering content from talk shows, interviews, etc.
# ============================================================================
# Celebrity Profiles - people to search for
cursor.execute('''
CREATE TABLE IF NOT EXISTS celebrity_profiles (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
slug TEXT NOT NULL UNIQUE,
image_url TEXT,
notes TEXT,
enabled INTEGER DEFAULT 1,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
''')
# Search Presets - saved searches for each celebrity
cursor.execute('''
CREATE TABLE IF NOT EXISTS celebrity_search_presets (
id INTEGER PRIMARY KEY AUTOINCREMENT,
celebrity_id INTEGER NOT NULL REFERENCES celebrity_profiles(id) ON DELETE CASCADE,
name TEXT NOT NULL,
source_type TEXT NOT NULL, -- 'youtube_channel', 'youtube_search', 'youtube_rss'
source_value TEXT NOT NULL, -- channel_id, search query, or RSS URL
keywords TEXT, -- JSON array of filter keywords
content_type TEXT, -- 'interview', 'red_carpet', 'photoshoot', 'bts', 'premiere', 'all'
enabled INTEGER DEFAULT 1,
last_checked DATETIME,
check_frequency_hours INTEGER DEFAULT 24,
results_count INTEGER DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
''')
# Discovered Videos - videos found from search presets
cursor.execute('''
CREATE TABLE IF NOT EXISTS celebrity_discovered_videos (
id INTEGER PRIMARY KEY AUTOINCREMENT,
preset_id INTEGER NOT NULL REFERENCES celebrity_search_presets(id) ON DELETE CASCADE,
celebrity_id INTEGER NOT NULL REFERENCES celebrity_profiles(id) ON DELETE CASCADE,
video_id TEXT NOT NULL,
platform TEXT NOT NULL DEFAULT 'youtube',
url TEXT NOT NULL,
title TEXT,
channel_name TEXT,
channel_id TEXT,
thumbnail TEXT,
duration INTEGER,
upload_date DATETIME,
view_count INTEGER,
description TEXT,
content_type TEXT, -- detected or manual: 'interview', 'red_carpet', etc.
status TEXT DEFAULT 'new', -- 'new', 'queued', 'downloaded', 'ignored', 'watched'
discovered_at DATETIME DEFAULT CURRENT_TIMESTAMP,
status_updated_at DATETIME,
downloaded_path TEXT,
metadata TEXT, -- JSON for extra data
UNIQUE(celebrity_id, platform, video_id)
)
''')
# Celebrity Discovery indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_celeb_profile_slug ON celebrity_profiles(slug)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_celeb_profile_enabled ON celebrity_profiles(enabled)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_celeb_preset_celebrity ON celebrity_search_presets(celebrity_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_celeb_preset_enabled ON celebrity_search_presets(enabled)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_celeb_preset_type ON celebrity_search_presets(source_type)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_celeb_discovered_preset ON celebrity_discovered_videos(preset_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_celeb_discovered_celebrity ON celebrity_discovered_videos(celebrity_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_celeb_discovered_status ON celebrity_discovered_videos(status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_celeb_discovered_date ON celebrity_discovered_videos(discovered_at DESC)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_celeb_discovered_upload ON celebrity_discovered_videos(upload_date DESC)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_celeb_discovered_video_id ON celebrity_discovered_videos(platform, video_id)')
# ============================================================================
# CELEBRITY APPEARANCES TRACKING
# ============================================================================
# Celebrity Appearances - track upcoming media appearances (TV, podcasts, radio)
cursor.execute('''
CREATE TABLE IF NOT EXISTS celebrity_appearances (
id INTEGER PRIMARY KEY AUTOINCREMENT,
celebrity_id INTEGER NOT NULL,
celebrity_name TEXT NOT NULL,
appearance_type TEXT NOT NULL,
show_name TEXT NOT NULL,
episode_title TEXT,
network TEXT,
appearance_date DATETIME NOT NULL,
announcement_date DATETIME,
url TEXT,
watch_url TEXT,
description TEXT,
tmdb_show_id INTEGER,
tmdb_episode_id INTEGER,
season_number INTEGER,
episode_number INTEGER,
status TEXT DEFAULT 'upcoming',
notified BOOLEAN DEFAULT FALSE,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (celebrity_id) REFERENCES celebrity_profiles(id) ON DELETE CASCADE,
UNIQUE(celebrity_id, appearance_type, tmdb_show_id, season_number, episode_number)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearances_date ON celebrity_appearances(appearance_date)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearances_celebrity ON celebrity_appearances(celebrity_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearances_status ON celebrity_appearances(status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearances_type ON celebrity_appearances(appearance_type)')
# Partial unique index for movies (to prevent duplicates since season/episode are NULL)
cursor.execute('''
CREATE UNIQUE INDEX IF NOT EXISTS idx_appearances_movie_unique
ON celebrity_appearances(celebrity_id, appearance_type, tmdb_show_id)
WHERE appearance_type = 'Movie'
''')
# Partial unique index for podcasts (to prevent duplicates)
cursor.execute('''
CREATE UNIQUE INDEX IF NOT EXISTS idx_appearances_podcast_unique
ON celebrity_appearances(celebrity_id, appearance_type, tmdb_show_id)
WHERE appearance_type = 'Podcast'
''')
# Appearance notifications log - track all sent notifications
cursor.execute('''
CREATE TABLE IF NOT EXISTS appearance_notifications (
id INTEGER PRIMARY KEY AUTOINCREMENT,
appearance_id INTEGER,
celebrity_name TEXT NOT NULL,
show_name TEXT NOT NULL,
appearance_type TEXT NOT NULL,
appearance_date DATE NOT NULL,
notification_type TEXT NOT NULL,
message TEXT,
poster_url TEXT,
sent_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (appearance_id) REFERENCES celebrity_appearances(id) ON DELETE SET NULL
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearance_notifications_date ON appearance_notifications(sent_at)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearance_notifications_type ON appearance_notifications(notification_type)')
# Appearance configuration - singleton table for TMDb and other API settings
cursor.execute('''
CREATE TABLE IF NOT EXISTS appearance_config (
id INTEGER PRIMARY KEY CHECK (id = 1),
tmdb_api_key TEXT,
tmdb_enabled BOOLEAN DEFAULT TRUE,
tmdb_check_interval_hours INTEGER DEFAULT 12,
tmdb_last_check DATETIME,
podcast_enabled BOOLEAN DEFAULT FALSE,
podcast_sources TEXT,
radio_enabled BOOLEAN DEFAULT FALSE,
radio_sources TEXT,
notify_new_appearances BOOLEAN DEFAULT TRUE,
notify_days_before INTEGER DEFAULT 1,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
''')
cursor.execute('INSERT OR IGNORE INTO appearance_config (id) VALUES (1)')
# Extend celebrity_profiles table with TMDb and Podchaser fields.
# row[1] of each PRAGMA table_info row is compared against column names, so
# `columns` is the list of columns the table currently has.
cursor.execute('PRAGMA table_info(celebrity_profiles)')
columns = [row[1] for row in cursor.fetchall()]
if 'tmdb_person_id' not in columns:
    # The partial unique index is only attempted when the column was actually
    # added (_safe_alter returns False when it skipped on lock contention).
    if _safe_alter(cursor, 'ALTER TABLE celebrity_profiles ADD COLUMN IF NOT EXISTS tmdb_person_id INTEGER'):
        try:
            cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_celebrity_tmdb_person ON celebrity_profiles(tmdb_person_id) WHERE tmdb_person_id IS NOT NULL')
        except Exception as e:
            # Best-effort, but log it: this branch is not re-entered once the
            # column exists, so a silent failure would leave the index missing.
            logger.warning(f"Could not create index idx_celebrity_tmdb_person: {e}")
if 'tmdb_last_sync' not in columns:
    _safe_alter(cursor, 'ALTER TABLE celebrity_profiles ADD COLUMN IF NOT EXISTS tmdb_last_sync DATETIME')
if 'podchaser_creator_id' not in columns:
    if _safe_alter(cursor, 'ALTER TABLE celebrity_profiles ADD COLUMN IF NOT EXISTS podchaser_creator_id TEXT'):
        try:
            cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_celebrity_podchaser_creator ON celebrity_profiles(podchaser_creator_id) WHERE podchaser_creator_id IS NOT NULL')
        except Exception as e:
            # Same reasoning as for the TMDb index above.
            logger.warning(f"Could not create index idx_celebrity_podchaser_creator: {e}")
if 'podchaser_last_sync' not in columns:
    _safe_alter(cursor, 'ALTER TABLE celebrity_profiles ADD COLUMN IF NOT EXISTS podchaser_last_sync DATETIME')
# Extend appearance_config table with Podchaser fields.
# Each (column name, column definition) pair is added only when the column is
# not already reported by PRAGMA table_info.
cursor.execute('PRAGMA table_info(appearance_config)')
config_columns = [row[1] for row in cursor.fetchall()]
podchaser_config_fields = (
    ('podchaser_api_key', 'TEXT'),
    ('podchaser_enabled', 'BOOLEAN DEFAULT FALSE'),
    ('podchaser_last_check', 'DATETIME'),
)
for cfg_field_name, cfg_field_def in podchaser_config_fields:
    if cfg_field_name in config_columns:
        continue
    _safe_alter(cursor, f'ALTER TABLE appearance_config ADD COLUMN IF NOT EXISTS {cfg_field_name} {cfg_field_def}')
# Extend celebrity_appearances table with filmography/credit fields.
# Missing columns are added one at a time, in the order listed.
cursor.execute('PRAGMA table_info(celebrity_appearances)')
appearance_columns = [row[1] for row in cursor.fetchall()]
appearance_credit_fields = (
    ('credit_type', "TEXT DEFAULT 'acting'"),
    ('character_name', 'TEXT'),
    ('job_title', 'TEXT'),
    ('plex_rating_key', 'TEXT'),
    ('plex_library_id', 'INTEGER'),
)
for credit_field_name, credit_field_def in appearance_credit_fields:
    if credit_field_name in appearance_columns:
        continue
    _safe_alter(cursor, f"ALTER TABLE celebrity_appearances ADD COLUMN IF NOT EXISTS {credit_field_name} {credit_field_def}")
# Create index for credit_type filtering
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearances_credit_type ON celebrity_appearances(credit_type)')
# Update unique constraint to include credit_type (allows multiple credits per appearance).
# The table rebuild that follows runs only when needs_migration is True.
# NOTE(review): the test below inspects the table's column list, not its
# UNIQUE constraint. The ALTER above adds credit_type whenever it is missing,
# so unless that ALTER was skipped the column is already present here and the
# rebuild is never triggered. If the ALTER was skipped, the rebuild's SELECT
# reads credit_type from a table that lacks it. Confirm whether the
# constraint definition itself should be inspected instead.
cursor.execute("PRAGMA table_info(celebrity_appearances)")
col_names = [row[1] for row in cursor.fetchall()]
needs_migration = 'credit_type' not in col_names
if needs_migration:
# Need to recreate table with updated unique constraint
try:
cursor.execute('''
CREATE TABLE IF NOT EXISTS celebrity_appearances_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
celebrity_id INTEGER NOT NULL,
celebrity_name TEXT NOT NULL,
appearance_type TEXT NOT NULL,
show_name TEXT NOT NULL,
episode_title TEXT,
network TEXT,
appearance_date DATETIME NOT NULL,
announcement_date DATETIME,
url TEXT,
watch_url TEXT,
description TEXT,
tmdb_show_id INTEGER,
tmdb_episode_id INTEGER,
season_number INTEGER,
episode_number INTEGER,
status TEXT DEFAULT 'upcoming',
notified BOOLEAN DEFAULT FALSE,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
credit_type TEXT DEFAULT 'acting',
character_name TEXT,
job_title TEXT,
plex_rating_key TEXT,
plex_library_id INTEGER,
FOREIGN KEY (celebrity_id) REFERENCES celebrity_profiles(id) ON DELETE CASCADE,
UNIQUE(celebrity_id, appearance_type, tmdb_show_id, season_number, episode_number, credit_type)
)
''')
# Copy existing data
cursor.execute('''
INSERT OR IGNORE INTO celebrity_appearances_new
SELECT id, celebrity_id, celebrity_name, appearance_type, show_name, episode_title,
network, appearance_date, announcement_date, url, watch_url, description,
tmdb_show_id, tmdb_episode_id, season_number, episode_number, status,
CASE WHEN notified = 1 THEN TRUE WHEN notified = 0 THEN FALSE ELSE notified END,
created_at, updated_at,
COALESCE(credit_type, 'acting'), character_name, job_title,
plex_rating_key, plex_library_id
FROM celebrity_appearances
''')
cursor.execute('DROP TABLE celebrity_appearances')
cursor.execute('ALTER TABLE celebrity_appearances_new RENAME TO celebrity_appearances')
# Recreate indexes
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearances_date ON celebrity_appearances(appearance_date)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearances_celebrity ON celebrity_appearances(celebrity_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearances_status ON celebrity_appearances(status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearances_type ON celebrity_appearances(appearance_type)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_appearances_credit_type ON celebrity_appearances(credit_type)')
cursor.execute('''
CREATE UNIQUE INDEX IF NOT EXISTS idx_appearances_movie_unique
ON celebrity_appearances(celebrity_id, appearance_type, tmdb_show_id, credit_type)
WHERE appearance_type = 'Movie'
''')
cursor.execute('''
CREATE UNIQUE INDEX IF NOT EXISTS idx_appearances_podcast_unique
ON celebrity_appearances(celebrity_id, appearance_type, tmdb_show_id, credit_type)
WHERE appearance_type = 'Podcast'
''')
logger.info("Migrated celebrity_appearances table to include credit_type in unique constraint")
except Exception as e:
logger.warning(f"Could not migrate celebrity_appearances table: {e}")
# Extend appearance_config table with Plex settings.
# The column list is re-read here and each Plex column is added only if absent.
cursor.execute('PRAGMA table_info(appearance_config)')
config_columns = [row[1] for row in cursor.fetchall()]
for plex_field_name in ('plex_url', 'plex_token'):
    if plex_field_name not in config_columns:
        _safe_alter(cursor, f'ALTER TABLE appearance_config ADD COLUMN IF NOT EXISTS {plex_field_name} TEXT')
# ============================================================================
# UNIFIED VIDEO DOWNLOAD QUEUE
# ============================================================================
# Video download queue - unified queue for all video sources
cursor.execute('''
CREATE TABLE IF NOT EXISTS video_download_queue (
id INTEGER PRIMARY KEY AUTOINCREMENT,
platform TEXT NOT NULL DEFAULT 'youtube',
video_id TEXT NOT NULL,
url TEXT NOT NULL,
title TEXT NOT NULL,
custom_title TEXT, -- User-editable title for filename
channel_name TEXT,
thumbnail TEXT,
duration INTEGER,
upload_date DATETIME,
custom_date DATETIME, -- User-editable date
view_count INTEGER,
description TEXT,
source_type TEXT, -- 'celebrity', 'manual', 'search', etc.
source_id INTEGER, -- Reference to source (celebrity_id, etc.)
source_name TEXT, -- Display name of source
priority INTEGER DEFAULT 5, -- 1-10, lower is higher priority
status TEXT DEFAULT 'pending', -- 'pending', 'downloading', 'completed', 'failed', 'paused'
progress INTEGER DEFAULT 0, -- Download progress percentage
file_path TEXT,
file_size INTEGER,
error_message TEXT,
attempts INTEGER DEFAULT 0,
max_attempts INTEGER DEFAULT 3,
added_at DATETIME DEFAULT CURRENT_TIMESTAMP,
started_at DATETIME,
completed_at DATETIME,
metadata TEXT, -- JSON for extra data
UNIQUE(platform, video_id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_vdq_status ON video_download_queue(status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_vdq_priority ON video_download_queue(priority, status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_vdq_source ON video_download_queue(source_type, source_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_vdq_added ON video_download_queue(added_at DESC)')
# ============================================================================
# YOUTUBE CHANNEL MONITOR
# ============================================================================
# YouTube channel monitor global settings
cursor.execute('''
CREATE TABLE IF NOT EXISTS youtube_monitor_settings (
id INTEGER PRIMARY KEY CHECK (id = 1),
phrases TEXT NOT NULL DEFAULT '[]',
check_interval_hours INTEGER DEFAULT 6,
quality TEXT DEFAULT 'best',
enabled INTEGER DEFAULT 1,
last_checked TEXT,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
# Insert default settings row if not exists
cursor.execute('''
INSERT OR IGNORE INTO youtube_monitor_settings (id, phrases, check_interval_hours, quality, enabled)
VALUES (1, '[]', 6, 'best', 1)
''')
# YouTube channel monitors - just the channels to monitor
cursor.execute('''
CREATE TABLE IF NOT EXISTS youtube_channel_monitors (
id INTEGER PRIMARY KEY AUTOINCREMENT,
channel_url TEXT NOT NULL UNIQUE,
channel_name TEXT,
enabled INTEGER DEFAULT 1,
last_checked TEXT,
videos_found INTEGER DEFAULT 0,
created_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_ycm_enabled ON youtube_channel_monitors(enabled)')
# YouTube monitor history - track which videos we've already seen/processed
cursor.execute('''
CREATE TABLE IF NOT EXISTS youtube_monitor_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
monitor_id INTEGER,
video_id TEXT NOT NULL,
video_title TEXT,
matched_phrase TEXT,
action TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (monitor_id) REFERENCES youtube_channel_monitors(id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_ymh_monitor ON youtube_monitor_history(monitor_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_ymh_video ON youtube_monitor_history(video_id)')
cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_ymh_unique ON youtube_monitor_history(monitor_id, video_id)')
# ============================================================================
# EASYNEWS INTEGRATION TABLES
# ============================================================================
# Easynews configuration (singleton, id=1)
cursor.execute('''
CREATE TABLE IF NOT EXISTS easynews_config (
id INTEGER PRIMARY KEY CHECK (id = 1),
username TEXT,
password TEXT,
enabled INTEGER DEFAULT 0,
check_interval_hours INTEGER DEFAULT 12,
last_check TEXT,
auto_download INTEGER DEFAULT 0,
min_quality TEXT DEFAULT '720p',
proxy_enabled INTEGER DEFAULT 0,
proxy_type TEXT DEFAULT 'http',
proxy_host TEXT,
proxy_port INTEGER,
proxy_username TEXT,
proxy_password TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
# Insert default config row if not exists
cursor.execute('''
INSERT OR IGNORE INTO easynews_config (id, enabled, check_interval_hours, auto_download, min_quality)
VALUES (1, 0, 12, 0, '720p')
''')
# Easynews search terms to monitor
cursor.execute('''
CREATE TABLE IF NOT EXISTS easynews_searches (
id INTEGER PRIMARY KEY AUTOINCREMENT,
search_term TEXT NOT NULL,
media_type TEXT DEFAULT 'any',
tmdb_id INTEGER,
tmdb_title TEXT,
poster_url TEXT,
enabled INTEGER DEFAULT 1,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_ens_enabled ON easynews_searches(enabled)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_ens_term ON easynews_searches(search_term)')
# Easynews discovered results
cursor.execute('''
CREATE TABLE IF NOT EXISTS easynews_results (
id INTEGER PRIMARY KEY AUTOINCREMENT,
search_id INTEGER,
filename TEXT NOT NULL,
download_url TEXT NOT NULL,
size_bytes INTEGER,
post_date TEXT,
discovered_at TEXT DEFAULT CURRENT_TIMESTAMP,
parsed_title TEXT,
parsed_season INTEGER,
parsed_episode INTEGER,
parsed_year INTEGER,
tmdb_id INTEGER,
tmdb_title TEXT,
poster_url TEXT,
quality TEXT,
status TEXT DEFAULT 'new',
download_path TEXT,
file_hash TEXT,
FOREIGN KEY (search_id) REFERENCES easynews_searches(id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_enr_search ON easynews_results(search_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_enr_status ON easynews_results(status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_enr_filename ON easynews_results(filename)')
cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_enr_unique ON easynews_results(filename, download_url)')
# ============================================================================
# YOUTUBE CHANNEL MONITOR MIGRATIONS (v11.20.0)
# ============================================================================
# Add new columns to youtube_channel_monitors for status management
# Step 1: Add all columns first (the UPDATE in step 2 writes to some of them)
ycm_status_columns = (
    "status TEXT DEFAULT 'active'",
    "always_active INTEGER DEFAULT 0",
    "last_video_date TEXT",
    "last_check_date TEXT",
    "paused_date TEXT",
    "paused_reason TEXT",
    "total_videos_found INTEGER DEFAULT 0",
)
for ycm_column_def in ycm_status_columns:
    _safe_alter(cursor, f"ALTER TABLE youtube_channel_monitors ADD COLUMN IF NOT EXISTS {ycm_column_def}")
# Step 2: Migrate existing enabled=0 channels to paused_manual status (after columns exist).
# Only rows whose status is still NULL or 'active' are touched.
cursor.execute("""
UPDATE youtube_channel_monitors
SET status = 'paused_manual',
paused_date = datetime('now'),
paused_reason = 'Manually disabled before v11.20.0'
WHERE enabled = 0 AND (status IS NULL OR status = 'active')
""")
migrated_count = cursor.rowcount
if migrated_count > 0:
    self.log(f"Migrated {migrated_count} disabled channels to paused_manual status", "info")
# Create index for status queries
cursor.execute('CREATE INDEX IF NOT EXISTS idx_ycm_status ON youtube_channel_monitors(status)')
# Add new settings columns to youtube_monitor_settings
yms_settings_columns = (
    "auto_pause_threshold_months INTEGER DEFAULT 24",
    "paused_check_interval_days INTEGER DEFAULT 14",
    "auto_start_queue INTEGER DEFAULT 0",
    "notifications_enabled INTEGER DEFAULT 1",
    "max_results_per_phrase INTEGER DEFAULT 100",
)
for yms_column_def in yms_settings_columns:
    _safe_alter(cursor, f"ALTER TABLE youtube_monitor_settings ADD COLUMN IF NOT EXISTS {yms_column_def}")
# ============================================================================
# PAID CONTENT FEATURE TABLES
# For tracking content from subscription-based creator platforms
# (OnlyFans, Fansly, Patreon, Fanbox, etc.) via Coomer.su and Kemono.su APIs
# ============================================================================
# Table 1: paid_content_services - API Configuration for Coomer/Kemono
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_services (
id TEXT PRIMARY KEY, -- 'coomer', 'kemono'
name TEXT NOT NULL,
base_url TEXT NOT NULL, -- e.g., https://coomer.party
enabled INTEGER DEFAULT 1,
session_cookie TEXT,
session_updated_at TEXT,
last_health_check TEXT,
health_status TEXT DEFAULT 'unknown', -- 'healthy', 'degraded', 'down'
supported_services TEXT, -- JSON: ['onlyfans', 'fansly']
rate_limit_requests INTEGER DEFAULT 2,
rate_limit_window_seconds INTEGER DEFAULT 1,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
# Seed paid_content_services data
# Note: These services change domains frequently. The api_client has fallback defaults.
# One (id, name, base_url, supported_services) tuple per service;
# supported_services is a JSON array stored as text. INSERT OR IGNORE is used,
# as for the other seed data in this method, so existing rows are kept.
paid_content_service_seeds = [
    ('coomer', 'Coomer', 'https://coomer.party', '["onlyfans", "fansly", "candfans"]'),
    ('kemono', 'Kemono', 'https://kemono.party', '["patreon", "fanbox", "gumroad", "subscribestar", "discord"]'),
    ('youtube', 'YouTube', 'https://www.youtube.com', '["youtube"]'),
    ('fansly_direct', 'Fansly Direct', 'https://apiv3.fansly.com/api/v1', '["fansly"]'),
    ('pornhub', 'Pornhub', 'https://www.pornhub.com', '["pornhub"]'),
    ('onlyfans_direct', 'OnlyFans Direct', 'https://onlyfans.com/api2/v2', '["onlyfans"]'),
    ('xhamster', 'xHamster', 'https://xhamster.com', '["xhamster"]'),
    ('tiktok', 'TikTok', 'https://www.tiktok.com', '["tiktok"]'),
    ('instagram', 'Instagram', 'https://www.instagram.com', '["instagram"]'),
    ('soundgasm', 'Soundgasm', 'https://soundgasm.net', '["soundgasm"]'),
    ('snapchat', 'Snapchat', 'https://www.snapchat.com', '["snapchat"]'),
    ('hqcelebcorner', 'HQCelebCorner', 'https://www.hqcelebcorner.net', '["hqcelebcorner"]'),
    ('picturepub', 'PicturePub', 'https://picturepub.net', '["picturepub"]'),
    ('reddit', 'Reddit', 'https://www.reddit.com', '["reddit"]'),
]
for paid_content_service_seed in paid_content_service_seeds:
    cursor.execute('''
INSERT OR IGNORE INTO paid_content_services (id, name, base_url, supported_services)
VALUES (?, ?, ?, ?)
''', paid_content_service_seed)
# Migrate existing old URLs to new domains
cursor.execute('''
UPDATE paid_content_services SET base_url = 'https://coomer.party'
WHERE id = 'coomer' AND (base_url LIKE '%coomer.su%' OR base_url LIKE '%coomer.st%')
''')
cursor.execute('''
UPDATE paid_content_services SET base_url = 'https://kemono.party'
WHERE id = 'kemono' AND (base_url LIKE '%kemono.su%' OR base_url LIKE '%kemono.cr%')
''')
# Table 2: paid_content_identities - Creator Linking (same person across platforms)
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_identities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL, -- User-defined unified name
slug TEXT NOT NULL UNIQUE,
profile_image_url TEXT,
notes TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
# Table 3: paid_content_creators - Tracked Creators
# Created before the creator-group tables because
# paid_content_creator_group_members declares a foreign key to this table;
# creating the parent first keeps the DDL valid on backends that resolve
# REFERENCES at CREATE TABLE time (SQLite does not require it).
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_creators (
id INTEGER PRIMARY KEY AUTOINCREMENT,
service_id TEXT NOT NULL, -- 'coomer' or 'kemono'
platform TEXT NOT NULL, -- 'onlyfans', 'patreon', etc.
creator_id TEXT NOT NULL, -- Platform-specific ID
username TEXT NOT NULL,
display_name TEXT,
profile_image_url TEXT,
banner_image_url TEXT,
bio TEXT, -- Creator bio/description
joined_date TEXT, -- When creator joined the platform
location TEXT, -- Creator's location
external_links TEXT, -- JSON array of social links
identity_id INTEGER REFERENCES paid_content_identities(id) ON DELETE SET NULL,
enabled INTEGER DEFAULT 1,
last_checked TEXT,
last_post_date TEXT,
post_count INTEGER DEFAULT 0,
downloaded_count INTEGER DEFAULT 0,
total_size_bytes INTEGER DEFAULT 0,
auto_download INTEGER DEFAULT 1,
download_embeds INTEGER DEFAULT 1,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
UNIQUE(service_id, platform, creator_id),
FOREIGN KEY (service_id) REFERENCES paid_content_services(id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_creators_identity ON paid_content_creators(identity_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_creators_service_platform ON paid_content_creators(service_id, platform)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_creators_enabled ON paid_content_creators(enabled)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_creators_last_checked ON paid_content_creators(last_checked)')
# Creator Groups (named collections of creators for filtering)
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_creator_groups (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
description TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
# Group membership: one row per (group, creator) pair; rows are removed
# automatically when either side is deleted (ON DELETE CASCADE).
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_creator_group_members (
group_id INTEGER NOT NULL REFERENCES paid_content_creator_groups(id) ON DELETE CASCADE,
creator_id INTEGER NOT NULL REFERENCES paid_content_creators(id) ON DELETE CASCADE,
added_at TEXT DEFAULT CURRENT_TIMESTAMP,
filter_tagged_users TEXT DEFAULT NULL,
filter_tag_ids TEXT DEFAULT NULL,
PRIMARY KEY (group_id, creator_id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_group_members_creator ON paid_content_creator_group_members(creator_id)')
# Column migrations for the creator and group-member tables.
# Every ALTER goes through _safe_alter, which tolerates columns that already
# exist and skips on lock contention. Tables and columns are processed in the
# order listed.
paid_content_column_migrations = (
    ('paid_content_creators', (
        # Profile details
        'bio TEXT',
        'joined_date TEXT', 'location TEXT', 'external_links TEXT',
        # Last Coomer check marker
        'last_coomer_check TEXT',
        # Sync settings and tagged user filter
        'sync_posts INTEGER DEFAULT 1', 'sync_stories INTEGER DEFAULT 1',
        'sync_highlights INTEGER DEFAULT 1', 'filter_tagged_users TEXT DEFAULT NULL',
        'use_authenticated_api INTEGER DEFAULT 0',
    )),
    ('paid_content_creator_group_members', (
        # Per-membership filters
        'filter_tagged_users TEXT DEFAULT NULL', 'filter_tag_ids TEXT DEFAULT NULL',
    )),
)
for pc_table_name, pc_column_defs in paid_content_column_migrations:
    for col in pc_column_defs:
        _safe_alter(cursor, f'ALTER TABLE {pc_table_name} ADD COLUMN IF NOT EXISTS {col}')
# Table 4: paid_content_posts - Individual Posts
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_posts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
creator_id INTEGER NOT NULL,
post_id TEXT NOT NULL, -- ID from Coomer/Kemono API
title TEXT,
content TEXT, -- Post text
published_at TEXT,
added_at TEXT, -- When added to archive
edited_at TEXT,
has_attachments INTEGER DEFAULT 0,
attachment_count INTEGER DEFAULT 0,
downloaded INTEGER DEFAULT 0,
download_date TEXT,
embed_count INTEGER DEFAULT 0,
embed_downloaded INTEGER DEFAULT 0,
is_favorited INTEGER DEFAULT 0,
is_viewed INTEGER DEFAULT 0,
view_date TEXT,
local_path TEXT, -- Directory where post files are stored
metadata TEXT, -- JSON for additional data
deleted_at TEXT DEFAULT NULL, -- Soft delete timestamp
UNIQUE(creator_id, post_id),
FOREIGN KEY (creator_id) REFERENCES paid_content_creators(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_posts_creator ON paid_content_posts(creator_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_posts_published ON paid_content_posts(published_at DESC)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_posts_downloaded ON paid_content_posts(downloaded)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_posts_favorited ON paid_content_posts(is_favorited)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_posts_viewed ON paid_content_posts(is_viewed)')
# Add soft-delete column (migration)
_safe_alter(cursor, 'ALTER TABLE paid_content_posts ADD COLUMN IF NOT EXISTS deleted_at TEXT DEFAULT NULL')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_posts_deleted ON paid_content_posts(deleted_at)')
# Table 5: paid_content_attachments - Post Attachments
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_attachments (
id INTEGER PRIMARY KEY AUTOINCREMENT,
post_id INTEGER NOT NULL,
attachment_index INTEGER DEFAULT 0, -- Order in post
name TEXT NOT NULL,
file_type TEXT, -- 'image', 'video', 'archive', 'document'
extension TEXT,
server_path TEXT, -- Path on Coomer/Kemono server
download_url TEXT,
file_size INTEGER,
width INTEGER,
height INTEGER,
duration INTEGER, -- For videos (seconds)
status TEXT DEFAULT 'pending', -- 'pending', 'downloading', 'completed', 'failed', 'duplicate', 'skipped'
local_path TEXT,
local_filename TEXT,
file_hash TEXT, -- SHA256
perceptual_hash TEXT,
error_message TEXT,
download_attempts INTEGER DEFAULT 0,
last_attempt TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
downloaded_at TEXT,
UNIQUE(post_id, server_path),
FOREIGN KEY (post_id) REFERENCES paid_content_posts(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_attachments_post ON paid_content_attachments(post_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_attachments_status ON paid_content_attachments(status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_attachments_hash ON paid_content_attachments(file_hash)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_attachments_phash ON paid_content_attachments(perceptual_hash)')
# Add thumbnail_data column if it doesn't exist (migration)
_safe_alter(cursor, 'ALTER TABLE paid_content_attachments ADD COLUMN IF NOT EXISTS thumbnail_data BLOB')
# Add quality recheck tracking columns (migration)
_safe_alter(cursor, 'ALTER TABLE paid_content_attachments ADD COLUMN IF NOT EXISTS needs_quality_recheck INTEGER DEFAULT 0')
_safe_alter(cursor, 'ALTER TABLE paid_content_attachments ADD COLUMN IF NOT EXISTS last_quality_check TEXT')
_safe_alter(cursor, 'ALTER TABLE paid_content_attachments ADD COLUMN IF NOT EXISTS quality_recheck_count INTEGER DEFAULT 0')
# Table 6a: paid_content_messages - Chat messages from creators
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_messages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
creator_id INTEGER NOT NULL,
message_id TEXT NOT NULL,
text TEXT,
sent_at TEXT,
is_from_creator INTEGER DEFAULT 1,
is_tip INTEGER DEFAULT 0,
tip_amount REAL,
price REAL,
is_free INTEGER DEFAULT 1,
is_purchased INTEGER DEFAULT 0,
has_attachments INTEGER DEFAULT 0,
attachment_count INTEGER DEFAULT 0,
is_read INTEGER DEFAULT 0,
reply_to_message_id TEXT,
metadata TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
UNIQUE(creator_id, message_id),
FOREIGN KEY (creator_id) REFERENCES paid_content_creators(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_messages_creator ON paid_content_messages(creator_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_messages_sent_at ON paid_content_messages(sent_at DESC)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_messages_from_creator ON paid_content_messages(is_from_creator)')
# Add message_id column to paid_content_attachments (migration)
_safe_alter(cursor, 'ALTER TABLE paid_content_attachments ADD COLUMN IF NOT EXISTS message_id INTEGER REFERENCES paid_content_messages(id) ON DELETE CASCADE')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_attachments_message ON paid_content_attachments(message_id)')
# Table 6: paid_content_embeds - Embedded Videos (YouTube, etc.)
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_embeds (
id INTEGER PRIMARY KEY AUTOINCREMENT,
post_id INTEGER NOT NULL,
url TEXT NOT NULL,
platform TEXT, -- 'youtube', 'vimeo', etc.
video_id TEXT,
title TEXT,
status TEXT DEFAULT 'pending', -- 'pending', 'downloading', 'completed', 'failed', 'skipped'
local_path TEXT,
local_filename TEXT,
file_size INTEGER,
duration INTEGER,
error_message TEXT,
download_attempts INTEGER DEFAULT 0,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
downloaded_at TEXT,
UNIQUE(post_id, url),
FOREIGN KEY (post_id) REFERENCES paid_content_posts(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_embeds_post ON paid_content_embeds(post_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_embeds_status ON paid_content_embeds(status)')
# Table 7: paid_content_favorites - User Favorites
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_favorites (
id INTEGER PRIMARY KEY AUTOINCREMENT,
item_type TEXT NOT NULL, -- 'creator', 'post', 'attachment'
item_id INTEGER NOT NULL,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
UNIQUE(item_type, item_id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_favorites_type_item ON paid_content_favorites(item_type, item_id)')
# Table 8: paid_content_download_history - For Retry Tracking
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_download_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
attachment_id INTEGER,
embed_id INTEGER,
url TEXT,
attempt_date TEXT DEFAULT CURRENT_TIMESTAMP,
status TEXT NOT NULL, -- 'success', 'failed', 'skipped'
error_message TEXT,
response_code INTEGER,
duration_seconds REAL,
FOREIGN KEY (attachment_id) REFERENCES paid_content_attachments(id) ON DELETE CASCADE,
FOREIGN KEY (embed_id) REFERENCES paid_content_embeds(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_history_attachment ON paid_content_download_history(attachment_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_history_status ON paid_content_download_history(status)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_history_date ON paid_content_download_history(attempt_date DESC)')
# Table 9: paid_content_notifications - Notification History
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_notifications (
id INTEGER PRIMARY KEY AUTOINCREMENT,
notification_type TEXT NOT NULL, -- 'new_content', 'download_complete', 'sync_complete', 'error'
creator_id INTEGER,
post_id INTEGER,
title TEXT NOT NULL,
message TEXT NOT NULL,
download_count INTEGER DEFAULT 0,
file_count INTEGER DEFAULT 0,
is_read INTEGER DEFAULT 0,
read_at TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (creator_id) REFERENCES paid_content_creators(id) ON DELETE SET NULL,
FOREIGN KEY (post_id) REFERENCES paid_content_posts(id) ON DELETE SET NULL
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_notifications_created ON paid_content_notifications(created_at DESC)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_notifications_read ON paid_content_notifications(is_read)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_notifications_creator ON paid_content_notifications(creator_id)')
# Add metadata column if not exists (for storing media_files)
_safe_alter(cursor, 'ALTER TABLE paid_content_notifications ADD COLUMN IF NOT EXISTS metadata TEXT')
# Table 10: paid_content_config - Settings (Singleton)
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_config (
id INTEGER PRIMARY KEY CHECK (id = 1), -- Singleton
base_download_path TEXT DEFAULT '/paid-content',
organize_by_date INTEGER DEFAULT 1,
organize_by_post INTEGER DEFAULT 1,
check_interval_hours INTEGER DEFAULT 6,
max_concurrent_downloads INTEGER DEFAULT 3,
download_embeds INTEGER DEFAULT 1,
embed_quality TEXT DEFAULT 'best',
notifications_enabled INTEGER DEFAULT 1,
push_notifications_enabled INTEGER DEFAULT 1,
perceptual_duplicate_detection INTEGER DEFAULT 1,
perceptual_threshold INTEGER DEFAULT 12, -- Hamming distance
auto_retry_failed INTEGER DEFAULT 1,
retry_max_attempts INTEGER DEFAULT 3,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
# Insert default paid content config
cursor.execute('INSERT OR IGNORE INTO paid_content_config (id) VALUES (1)')
# Table 11: paid_content_recycle_bin - Soft-deleted content
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_recycle_bin (
id INTEGER PRIMARY KEY AUTOINCREMENT,
item_type TEXT NOT NULL, -- 'post', 'attachment', 'creator'
original_id INTEGER NOT NULL,
original_data TEXT NOT NULL, -- JSON of original record
deleted_at TEXT DEFAULT CURRENT_TIMESTAMP,
deleted_by TEXT,
restore_path TEXT,
UNIQUE(item_type, original_id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_recycle_type ON paid_content_recycle_bin(item_type)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_recycle_deleted_at ON paid_content_recycle_bin(deleted_at DESC)')
# Table 12: paid_content_tags - Tag definitions
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_tags (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
slug TEXT NOT NULL UNIQUE,
color TEXT DEFAULT '#6b7280', -- Hex color for UI display
description TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_tags_slug ON paid_content_tags(slug)')
# Insert default PPV tag
cursor.execute('''
INSERT OR IGNORE INTO paid_content_tags (name, slug, color, description)
VALUES ('PPV', 'ppv', '#f59e0b', 'Pay-per-view content')
''')
# Insert default Short tag (for xHamster moments/shorts)
cursor.execute('''
INSERT OR IGNORE INTO paid_content_tags (name, slug, color, description)
VALUES ('Short', 'short', '#8b5cf6', 'Short-form content (moments, clips)')
''')
# Table 13: paid_content_post_tags - Junction table for post-tag relationships
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_post_tags (
post_id INTEGER NOT NULL,
tag_id INTEGER NOT NULL,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (post_id, tag_id),
FOREIGN KEY (post_id) REFERENCES paid_content_posts(id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES paid_content_tags(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_post_tags_post ON paid_content_post_tags(post_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_post_tags_tag ON paid_content_post_tags(tag_id)')
# Table 13b: paid_content_post_tagged_users - Junction table for Instagram tagged users
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_post_tagged_users (
id INTEGER PRIMARY KEY AUTOINCREMENT,
post_id INTEGER NOT NULL,
username TEXT NOT NULL,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
UNIQUE(post_id, username),
FOREIGN KEY (post_id) REFERENCES paid_content_posts(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_tagged_users_username ON paid_content_post_tagged_users(username)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_tagged_users_post_id ON paid_content_post_tagged_users(post_id)')
# Table 14: paid_content_auto_tag_rules - Auto-tagging rules for sync
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_auto_tag_rules (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
enabled INTEGER DEFAULT 1,
conditions TEXT NOT NULL,
tag_ids TEXT NOT NULL,
priority INTEGER DEFAULT 0,
match_count INTEGER DEFAULT 0,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_auto_tag_rules_enabled ON paid_content_auto_tag_rules(enabled)')
# Table 15: paid_content_watch_later - Watch later playlist queue
cursor.execute('''
CREATE TABLE IF NOT EXISTS paid_content_watch_later (
id INTEGER PRIMARY KEY AUTOINCREMENT,
attachment_id INTEGER NOT NULL UNIQUE,
post_id INTEGER NOT NULL,
creator_id INTEGER NOT NULL,
position INTEGER NOT NULL DEFAULT 0,
added_at TEXT DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (attachment_id) REFERENCES paid_content_attachments(id) ON DELETE CASCADE,
FOREIGN KEY (post_id) REFERENCES paid_content_posts(id) ON DELETE CASCADE,
FOREIGN KEY (creator_id) REFERENCES paid_content_creators(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_watch_later_position ON paid_content_watch_later(position)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pc_watch_later_attachment ON paid_content_watch_later(attachment_id)')
# ============================================================================
# PRIVATE GALLERY TABLES
# Encrypted private media storage with person tracking
# ============================================================================
# Private Gallery Config - Singleton settings table
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_config (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
''')
# Insert default config values
default_config = [
('storage_path', '/opt/immich/private'),
('thumbnail_path', '/opt/immich/private/thumbnails'),
('organize_by_person', 'true'),
('organize_by_date', 'true'),
('auto_lock_minutes', '30'),
('duplicate_auto_select_distance', '2'),
('is_setup_complete', 'false'),
]
for key, value in default_config:
cursor.execute('''
INSERT OR IGNORE INTO private_media_config (key, value)
VALUES (?, ?)
''', (key, value))
# Private Gallery Relationships - Configurable relationship types
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_relationships (
id INTEGER PRIMARY KEY AUTOINCREMENT,
encrypted_name TEXT NOT NULL,
color TEXT DEFAULT '#6366f1',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
''')
# Private Gallery Persons - People tracked in the gallery
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_persons (
id INTEGER PRIMARY KEY AUTOINCREMENT,
encrypted_name TEXT NOT NULL,
encrypted_sort_name TEXT,
relationship_id INTEGER NOT NULL,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (relationship_id) REFERENCES private_media_relationships(id) ON DELETE RESTRICT
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_persons_relationship ON private_media_persons(relationship_id)')
# Private Gallery Posts - Groups multiple media items together
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_posts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
person_id INTEGER,
encrypted_description TEXT,
encrypted_media_date TEXT NOT NULL,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (person_id) REFERENCES private_media_persons(id) ON DELETE SET NULL
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_posts_person ON private_media_posts(person_id)')
# Private Gallery Media - The actual media items
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media (
id INTEGER PRIMARY KEY AUTOINCREMENT,
post_id INTEGER,
storage_id TEXT NOT NULL UNIQUE,
encrypted_filename TEXT NOT NULL,
encrypted_description TEXT,
file_hash TEXT NOT NULL,
file_size INTEGER NOT NULL,
file_type TEXT NOT NULL,
mime_type TEXT NOT NULL,
width INTEGER,
height INTEGER,
duration REAL,
person_id INTEGER,
encrypted_media_date TEXT NOT NULL,
source_type TEXT,
encrypted_source_path TEXT,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (post_id) REFERENCES private_media_posts(id) ON DELETE CASCADE,
FOREIGN KEY (person_id) REFERENCES private_media_persons(id) ON DELETE SET NULL
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_media_person ON private_media(person_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_media_hash ON private_media(file_hash)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_media_file_type ON private_media(file_type)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_media_storage_id ON private_media(storage_id)')
# Migration: Add perceptual_hash column for perceptual duplicate detection
_safe_alter(cursor, 'ALTER TABLE private_media ADD COLUMN IF NOT EXISTS perceptual_hash TEXT')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_media_phash ON private_media(perceptual_hash)')
# Migration: Add post_id column to existing private_media table
_safe_alter(cursor, 'ALTER TABLE private_media ADD COLUMN IF NOT EXISTS post_id INTEGER REFERENCES private_media_posts(id) ON DELETE CASCADE')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_media_post_id ON private_media(post_id)')
# Migration: Add is_read column to private_media_posts
_safe_alter(cursor, 'ALTER TABLE private_media_posts ADD COLUMN IF NOT EXISTS is_read INTEGER DEFAULT 0')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_posts_is_read ON private_media_posts(is_read)')
# Private Gallery Tags - Encrypted tag definitions for private gallery
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_gallery_tags (
id INTEGER PRIMARY KEY AUTOINCREMENT,
encrypted_name TEXT NOT NULL,
color TEXT DEFAULT '#6b7280',
encrypted_description TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
# Private Gallery Tags Junction Table
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_tags (
media_id INTEGER NOT NULL,
tag_id INTEGER NOT NULL,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (media_id, tag_id),
FOREIGN KEY (media_id) REFERENCES private_media(id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES private_gallery_tags(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_tags_media ON private_media_tags(media_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_tags_tag ON private_media_tags(tag_id)')
# Private Gallery Post Tags - Tags linked to posts (for grouped uploads)
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_post_tags (
post_id INTEGER NOT NULL,
tag_id INTEGER NOT NULL,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (post_id, tag_id),
FOREIGN KEY (post_id) REFERENCES private_media_posts(id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES private_gallery_tags(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_post_tags_post ON private_media_post_tags(post_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_post_tags_tag ON private_media_post_tags(tag_id)')
# Private Gallery Person Default Tags - Default tags auto-applied when person is selected
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_person_default_tags (
person_id INTEGER NOT NULL,
tag_id INTEGER NOT NULL,
PRIMARY KEY (person_id, tag_id),
FOREIGN KEY (person_id) REFERENCES private_media_persons(id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES private_gallery_tags(id) ON DELETE CASCADE
)
''')
# Private Gallery Person Groups (named collections of persons for filtering)
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_person_groups (
id INTEGER PRIMARY KEY AUTOINCREMENT,
encrypted_name TEXT NOT NULL,
encrypted_description TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_person_group_members (
group_id INTEGER NOT NULL REFERENCES private_media_person_groups(id) ON DELETE CASCADE,
person_id INTEGER NOT NULL REFERENCES private_media_persons(id) ON DELETE CASCADE,
added_at TEXT DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (group_id, person_id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_person_group_members_person ON private_media_person_group_members(person_id)')
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_person_group_tag_members (
group_id INTEGER NOT NULL REFERENCES private_media_person_groups(id) ON DELETE CASCADE,
tag_id INTEGER NOT NULL REFERENCES private_gallery_tags(id) ON DELETE CASCADE,
added_at TEXT DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (group_id, tag_id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_person_group_tag_members_tag ON private_media_person_group_tag_members(tag_id)')
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_person_group_excluded_tags (
group_id INTEGER NOT NULL REFERENCES private_media_person_groups(id) ON DELETE CASCADE,
tag_id INTEGER NOT NULL REFERENCES private_gallery_tags(id) ON DELETE CASCADE,
added_at TEXT DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (group_id, tag_id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_person_group_excluded_tags_tag ON private_media_person_group_excluded_tags(tag_id)')
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_person_group_relationship_members (
group_id INTEGER NOT NULL REFERENCES private_media_person_groups(id) ON DELETE CASCADE,
relationship_id INTEGER NOT NULL REFERENCES private_media_relationships(id) ON DELETE CASCADE,
added_at TEXT DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (group_id, relationship_id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pm_person_group_rel_members_rel ON private_media_person_group_relationship_members(relationship_id)')
# Migration: Add min_resolution column to person groups
_safe_alter(cursor, 'ALTER TABLE private_media_person_groups ADD COLUMN IF NOT EXISTS min_resolution INTEGER DEFAULT 0')
# Reddit community monitoring for private gallery
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_reddit_communities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
subreddit_name TEXT NOT NULL,
person_id INTEGER NOT NULL,
enabled INTEGER DEFAULT 1,
last_checked TEXT,
total_media_found INTEGER DEFAULT 0,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (person_id) REFERENCES private_media_persons(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_reddit_communities_person ON private_media_reddit_communities(person_id)')
cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_reddit_communities_unique ON private_media_reddit_communities(subreddit_name, person_id)')
# Track which Reddit posts have been processed (avoid re-downloading)
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_reddit_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
community_id INTEGER NOT NULL,
reddit_post_id TEXT NOT NULL,
media_count INTEGER DEFAULT 0,
processed_at TEXT DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (community_id) REFERENCES private_media_reddit_communities(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_reddit_history_unique ON private_media_reddit_history(community_id, reddit_post_id)')
# Private Gallery Import Auth - Per-domain authentication for URL imports
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_gallery_import_auth (
id INTEGER PRIMARY KEY AUTOINCREMENT,
domain TEXT NOT NULL UNIQUE,
auth_type TEXT NOT NULL DEFAULT 'basic',
encrypted_username TEXT,
encrypted_password TEXT,
encrypted_cookies_json TEXT,
encrypted_user_agent TEXT,
notes TEXT,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_pg_import_auth_domain ON private_gallery_import_auth(domain)')
# Scraper account → person mappings for Instagram, TikTok, Snapchat
cursor.execute('''
CREATE TABLE IF NOT EXISTS private_media_scraper_accounts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
platform TEXT NOT NULL,
username TEXT NOT NULL,
person_id INTEGER NOT NULL,
enabled INTEGER DEFAULT 1,
last_imported_at TEXT,
last_imported_file_id INTEGER DEFAULT 0,
total_media_imported INTEGER DEFAULT 0,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (person_id) REFERENCES private_media_persons(id) ON DELETE CASCADE,
UNIQUE (platform, username, person_id)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_scraper_accounts_platform ON private_media_scraper_accounts(platform)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_scraper_accounts_person ON private_media_scraper_accounts(person_id)')
# Migration: Add last_imported_file_id column if missing
_safe_alter(cursor, 'ALTER TABLE private_media_scraper_accounts ADD COLUMN IF NOT EXISTS last_imported_file_id INTEGER DEFAULT 0')
# Migration: Add original_post_id column to track media moved between posts
_safe_alter(cursor, 'ALTER TABLE private_media ADD COLUMN IF NOT EXISTS original_post_id INTEGER REFERENCES private_media_posts(id) ON DELETE SET NULL')
# ============================================================================
# PRESS MONITOR TABLES
# For tracking news articles about celebrities from GDELT
# ============================================================================
cursor.execute('''
CREATE TABLE IF NOT EXISTS press_config (
id INTEGER PRIMARY KEY CHECK (id = 1),
enabled INTEGER DEFAULT 1,
check_interval_hours INTEGER DEFAULT 6,
max_records_per_query INTEGER DEFAULT 25,
notify_new_articles INTEGER DEFAULT 1,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
cursor.execute('''
INSERT OR IGNORE INTO press_config (id, enabled, check_interval_hours, max_records_per_query, notify_new_articles)
VALUES (1, 1, 6, 25, 1)
''')
cursor.execute('''
CREATE TABLE IF NOT EXISTS press_articles (
id INTEGER PRIMARY KEY AUTOINCREMENT,
celebrity_id INTEGER NOT NULL,
title TEXT,
url TEXT NOT NULL,
url_hash TEXT NOT NULL,
domain TEXT,
published_date TEXT,
image_url TEXT,
language TEXT,
country TEXT,
article_content TEXT,
snippet TEXT,
fetched_at TEXT DEFAULT CURRENT_TIMESTAMP,
notified INTEGER DEFAULT 0,
read INTEGER DEFAULT 0,
FOREIGN KEY (celebrity_id) REFERENCES celebrity_profiles(id) ON DELETE CASCADE
)
''')
cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_press_url_hash ON press_articles(url_hash)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_press_celebrity ON press_articles(celebrity_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_press_published ON press_articles(published_date DESC)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_press_domain ON press_articles(domain)')
# Add celebrity_ids column to press_config if not exists (migration)
_safe_alter(cursor, "ALTER TABLE press_config ADD COLUMN IF NOT EXISTS celebrity_ids TEXT")
# ============================================================================
# KEY-VALUE STORE TABLE
# For general application settings (fingerprints, cache, etc.)
# ============================================================================
cursor.execute('''
CREATE TABLE IF NOT EXISTS key_value_store (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
)
''')
conn.commit()
def get_url_hash(self, url: str) -> str:
    """Return the hex-encoded SHA256 digest of ``url`` (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(url.encode('utf-8'))
    return digest.hexdigest()
# ============================================================================
# KEY-VALUE STORE METHODS
# Simple key-value storage for application settings
# ============================================================================
def get_setting(self, key: str) -> Optional[str]:
    """
    Look up a single value in the key-value store.

    Args:
        key: Setting key name

    Returns:
        The stored value, or None when no row exists for ``key``
    """
    lookup_sql = 'SELECT value FROM key_value_store WHERE key = ?'
    with self.get_connection() as conn:
        cur = conn.cursor()
        cur.execute(lookup_sql, (key,))
        match = cur.fetchone()
        if not match:
            return None
        return match[0]
def set_setting(self, key: str, value: str) -> bool:
    """
    Insert a setting, or overwrite it when the key already exists.

    The row's ``updated_at`` is refreshed on both insert and overwrite.

    Args:
        key: Setting key name
        value: Setting value (as string)

    Returns:
        True if successful
    """
    # Assembled from adjacent literals; the resulting text is the same
    # statement the cursor has always received.
    upsert_sql = (
        '\n'
        'INSERT INTO key_value_store (key, value, updated_at)\n'
        'VALUES (?, ?, CURRENT_TIMESTAMP)\n'
        'ON CONFLICT(key) DO UPDATE SET\n'
        'value = excluded.value,\n'
        'updated_at = CURRENT_TIMESTAMP\n'
    )
    bindings = (key, value)
    with self.get_connection(for_write=True) as conn:
        cur = conn.cursor()
        cur.execute(upsert_sql, bindings)
        conn.commit()
        return True
def delete_setting(self, key: str) -> bool:
    """
    Remove a setting from the key-value store.

    Args:
        key: Setting key name

    Returns:
        True if a row was deleted, False if the key didn't exist
    """
    with self.get_connection(for_write=True) as conn:
        cur = conn.cursor()
        cur.execute('DELETE FROM key_value_store WHERE key = ?', (key,))
        conn.commit()
        removed = cur.rowcount
        return removed > 0
@staticmethod
def get_file_hash(file_path: str) -> Optional[str]:
"""
Calculate SHA256 hash of a file
Args:
file_path: Path to the file
Returns:
SHA256 hash of file content, or None if file doesn't exist or error occurs
"""
try:
file_path = Path(file_path)
if not file_path.exists() or not file_path.is_file():
return None
sha256_hash = hashlib.sha256()
with open(file_path, "rb") as f:
# Read file in chunks to handle large files efficiently
for byte_block in iter(lambda: f.read(65536), b""):
sha256_hash.update(byte_block)
return sha256_hash.hexdigest()
except Exception as e:
logger.error(f"Failed to calculate file hash for {file_path}: {e}")
return None
# ============================================================================
# SCRAPER CONFIGURATION METHODS
# See docs/SCRAPER_PROXY_SYSTEM.md for full documentation
# ============================================================================
def _migrate_scraper_cookies(self, cursor):
"""
Migrate cookies from JSON files to database (one-time migration).
Called during database initialization.
"""
import os
cookie_files = {
'coppermine': '/opt/media-downloader/cookies/coppermine_cookies.json',
'imginn': '/opt/media-downloader/cookies/imginn_cookies.json',
'fastdl': '/opt/media-downloader/cookies/fastdl_cookies.json',
'snapchat': '/opt/media-downloader/cookies/snapchat_cookies.json',
'forum_phun': '/opt/media-downloader/cookies/forum_cookies_phun.org.json',
'forum_hqcelebcorner': '/opt/media-downloader/cookies/forum_cookies_HQCelebCorner.json',
'forum_picturepub': '/opt/media-downloader/cookies/forum_cookies_PicturePub.json',
}
for scraper_id, cookie_file in cookie_files.items():
if os.path.exists(cookie_file):
try:
# Check if scraper already has cookies (don't overwrite)
cursor.execute(
'SELECT cookies_json FROM scrapers WHERE id = ?',
(scraper_id,)
)
row = cursor.fetchone()
if row and row[0]:
# Already has cookies, skip
continue
with open(cookie_file, 'r') as f:
data = json.load(f)
# Store in database
cursor.execute('''
UPDATE scrapers
SET cookies_json = ?, cookies_updated_at = ?
WHERE id = ?
''', (json.dumps(data), datetime.now().isoformat(), scraper_id))
logger.info(f"Migrated cookies for {scraper_id} from {cookie_file}")
except Exception as e:
logger.warning(f"Failed to migrate cookies for {scraper_id}: {e}")
def get_all_scrapers(self, type_filter: str = None) -> List[Dict]:
    """
    List scraper configurations, optionally restricted to one type.

    Each returned dict is the raw scrapers row plus two derived keys,
    'cookies_count' and 'cookies_fresh', and with the three flag columns
    converted to real booleans.

    Args:
        type_filter: Optional filter by type ('direct', 'proxy', 'forum', 'cli_tool')

    Returns:
        List of scraper configurations ordered by type, then name
    """
    if type_filter:
        query = '''
            SELECT * FROM scrapers WHERE type = ? ORDER BY type, name
        '''
        query_args = (type_filter,)
    else:
        query = 'SELECT * FROM scrapers ORDER BY type, name'
        query_args = ()

    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query, query_args)
        records = [dict(entry) for entry in cursor.fetchall()]

        for record in records:
            count = 0
            fresh = False
            raw_cookies = record.get('cookies_json')
            if raw_cookies:
                try:
                    parsed = json.loads(raw_cookies)
                    # Stored payload is either a bare list or {'cookies': [...]}
                    if isinstance(parsed, list):
                        jar = parsed
                    elif isinstance(parsed, dict):
                        jar = parsed.get('cookies', [])
                    else:
                        jar = []
                    count = len(jar)

                    # Fresh means updated less than 24 hours ago
                    stamp = record.get('cookies_updated_at')
                    if stamp:
                        elapsed = datetime.now() - datetime.fromisoformat(stamp)
                        fresh = elapsed.total_seconds() / 3600 < 24
                except (json.JSONDecodeError, ValueError, TypeError, AttributeError):
                    pass  # Invalid JSON or datetime format

            record['cookies_count'] = count
            record['cookies_fresh'] = fresh

            # Booleans instead of 0/1 for the frontend
            for flag, fallback in (('enabled', 1),
                                   ('proxy_enabled', 0),
                                   ('flaresolverr_required', 0)):
                record[flag] = bool(record.get(flag, fallback))

        return records
def get_scraper(self, scraper_id: str) -> Optional[Dict]:
    """
    Look up one scraper configuration by its ID.

    Args:
        scraper_id: Scraper ID (e.g., 'imginn', 'forum_phun')

    Returns:
        The scrapers row as a dict with 'enabled', 'proxy_enabled' and
        'flaresolverr_required' converted to booleans, or None if not found
    """
    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM scrapers WHERE id = ?', (scraper_id,))
        found = cursor.fetchone()
        if not found:
            return None

        config = dict(found)
        # Normalise the 0/1 flag columns; the fallback applies when a key is absent
        flag_fallbacks = (
            ('enabled', 1),
            ('proxy_enabled', 0),
            ('flaresolverr_required', 0),
        )
        for flag, fallback in flag_fallbacks:
            config[flag] = bool(config.get(flag, fallback))
        return config
def update_scraper(self, scraper_id: str, updates: Dict) -> bool:
    """
    Apply a partial update to a scraper's settings.

    Only whitelisted columns are written; any other keys in `updates` are
    ignored. 'updated_at' is always refreshed alongside the accepted fields.

    Args:
        scraper_id: Scraper ID
        updates: Dictionary of fields to update

    Returns:
        True if a row was updated, False if the scraper was not found or
        `updates` contained no allowed field
    """
    allowed_fields = [
        'name', 'base_url', 'enabled', 'proxy_enabled', 'proxy_url',
        'flaresolverr_required', 'settings_json'
    ]

    accepted = {}
    for field, value in updates.items():
        if field in allowed_fields:
            accepted[field] = value
    if not accepted:
        return False

    # Column names come only from the whitelist above, so interpolating
    # them into the statement is safe; values stay parameterised.
    assignments = [f"{field} = ?" for field in accepted]
    assignments.append("updated_at = ?")
    params = [*accepted.values(), datetime.now().isoformat(), scraper_id]

    with self.get_connection(for_write=True) as conn:
        cursor = conn.cursor()
        cursor.execute(f'''
            UPDATE scrapers
            SET {", ".join(assignments)}
            WHERE id = ?
        ''', params)
        return cursor.rowcount > 0
def get_scraper_cookies(self, scraper_id: str) -> Optional[List[Dict]]:
    """
    Get cookies for a scraper.

    The stored payload may be either a bare list of cookie dicts or a dict
    with a 'cookies' key; both layouts are accepted here.

    Args:
        scraper_id: Scraper ID

    Returns:
        List of cookie dicts (empty when a dict payload has no 'cookies'
        key), or None if the scraper is unknown, has no cookies stored, or
        the stored payload is not valid cookie JSON
    """
    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
            'SELECT cookies_json FROM scrapers WHERE id = ?',
            (scraper_id,)
        )
        row = cursor.fetchone()
        if not row or not row['cookies_json']:
            return None

        try:
            data = json.loads(row['cookies_json'])
        except (json.JSONDecodeError, TypeError):
            return None  # Invalid JSON format

        # Bare-list payload: the list itself is the cookie jar
        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            return data.get('cookies', [])
        # Any other JSON value (number, string, ...) is not a cookie payload
        return None
def get_scraper_cookies_user_agent(self, scraper_id: str) -> Optional[str]:
    """
    Get the user_agent stored with a scraper's cookies.

    This is critical for Cloudflare cf_clearance cookies which are
    fingerprinted to the browser that solved the challenge.

    Args:
        scraper_id: Scraper ID

    Returns:
        User agent string, or None if not stored. A bare-list cookie
        payload carries no user agent and therefore yields None as well.
    """
    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
            'SELECT cookies_json FROM scrapers WHERE id = ?',
            (scraper_id,)
        )
        row = cursor.fetchone()
        if not row or not row['cookies_json']:
            return None

        try:
            data = json.loads(row['cookies_json'])
        except (json.JSONDecodeError, TypeError):
            return None

        # Only the dict layout has room for a 'user_agent' entry
        if not isinstance(data, dict):
            return None
        return data.get('user_agent')
def save_scraper_cookies(self, scraper_id: str, cookies: List[Dict],
                         user_agent: str = None, merge: bool = True) -> bool:
    """
    Save cookies for a scraper.

    IMPORTANT: By default, this MERGES with existing cookies to preserve
    session/auth cookies while updating Cloudflare cookies. Cookies are
    keyed by their 'name'; a new cookie replaces an existing one with the
    same name.

    When merging and no `user_agent` is supplied, the user agent already
    stored with the existing cookies is kept, so cookies preserved by the
    merge do not lose the user agent saved next to them.

    Args:
        scraper_id: Scraper ID
        cookies: List of cookie dicts
        user_agent: Optional user agent (important for cf_clearance)
        merge: If True, merge with existing cookies (default). If False, replace all.

    Returns:
        True if a scraper row was updated, False if the scraper does not exist
    """
    with self.get_connection(for_write=True) as conn:
        cursor = conn.cursor()

        final_cookies = cookies
        stored_user_agent = None

        if merge:
            # Get existing cookies
            cursor.execute(
                'SELECT cookies_json FROM scrapers WHERE id = ?',
                (scraper_id,)
            )
            row = cursor.fetchone()
            if row and row['cookies_json']:
                try:
                    existing_data = json.loads(row['cookies_json'])
                    # Accept both stored layouts: bare list or {'cookies': [...]}
                    if isinstance(existing_data, list):
                        existing_cookies = existing_data
                    elif isinstance(existing_data, dict):
                        existing_cookies = existing_data.get('cookies', [])
                        stored_user_agent = existing_data.get('user_agent')
                    else:
                        existing_cookies = []

                    # Merge: existing cookies as base, new cookies override
                    cookie_map = {c['name']: c for c in existing_cookies}
                    for cookie in cookies:
                        cookie_map[cookie['name']] = cookie
                    final_cookies = list(cookie_map.values())
                    logger.debug(f"Merged {len(cookies)} new cookies with {len(existing_cookies)} existing -> {len(final_cookies)} total")
                except (json.JSONDecodeError, TypeError, KeyError, AttributeError):
                    # Invalid existing cookies, use new ones only; the old
                    # user agent belongs to the discarded cookies
                    final_cookies = cookies
                    stored_user_agent = None

        # One timestamp so the payload and both columns agree exactly
        now = datetime.now().isoformat()

        # Prepare data to save
        data = {
            'cookies': final_cookies,
            'timestamp': now
        }
        effective_user_agent = user_agent or stored_user_agent
        if effective_user_agent:
            data['user_agent'] = effective_user_agent

        cursor.execute('''
            UPDATE scrapers
            SET cookies_json = ?, cookies_updated_at = ?, updated_at = ?
            WHERE id = ?
        ''', (json.dumps(data), now, now, scraper_id))
        return cursor.rowcount > 0
def clear_scraper_cookies(self, scraper_id: str) -> bool:
    """
    Remove every stored cookie for a scraper.

    Sets cookies_json and cookies_updated_at to NULL and refreshes
    updated_at.

    Args:
        scraper_id: Scraper ID

    Returns:
        True if a scraper row was updated, False if no row matched
    """
    reset_sql = '''
        UPDATE scrapers
        SET cookies_json = NULL, cookies_updated_at = NULL, updated_at = ?
        WHERE id = ?
    '''
    with self.get_connection(for_write=True) as conn:
        cursor = conn.cursor()
        touched_at = datetime.now().isoformat()
        cursor.execute(reset_sql, (touched_at, scraper_id))
        affected = cursor.rowcount
        return affected > 0
def update_scraper_test_status(self, scraper_id: str, status: str,
                               message: str = None) -> bool:
    """
    Store the outcome of the latest connectivity test for a scraper.

    Writes last_test_at, last_test_status and last_test_message, and
    refreshes updated_at.

    Args:
        scraper_id: Scraper ID
        status: Status string ('success', 'failed', 'timeout')
        message: Optional error message

    Returns:
        True if a scraper row was updated, False if no row matched
    """
    status_sql = '''
        UPDATE scrapers
        SET last_test_at = ?, last_test_status = ?, last_test_message = ?, updated_at = ?
        WHERE id = ?
    '''
    with self.get_connection(for_write=True) as conn:
        cursor = conn.cursor()
        tested_at = datetime.now().isoformat()
        touched_at = datetime.now().isoformat()
        cursor.execute(status_sql, (tested_at, status, message, touched_at, scraper_id))
        return cursor.rowcount > 0
def create_scraper(self, scraper: Dict) -> bool:
    """
    Insert a new scraper row (used when new forums are added).

    The insert uses INSERT OR IGNORE, so an existing scraper with the same
    ID is left untouched and reported as not created.

    Args:
        scraper: Scraper configuration dict with at least 'id', 'name', 'type'

    Returns:
        True if a row was inserted; False if a required field is missing
        or the insert was ignored
    """
    required_fields = ['id', 'name', 'type']
    if any(field not in scraper for field in required_fields):
        logger.error(f"Missing required fields for scraper: {required_fields}")
        return False

    def as_flag(key, default):
        # Flags are stored as 1/0 integers
        return 1 if scraper.get(key, default) else 0

    settings = scraper.get('settings')
    settings_json = json.dumps(settings) if settings else None

    values = (
        scraper['id'],
        scraper['name'],
        scraper['type'],
        scraper.get('module'),
        scraper.get('base_url'),
        scraper.get('target_platform'),
        as_flag('enabled', True),
        as_flag('proxy_enabled', False),
        scraper.get('proxy_url'),
        as_flag('flaresolverr_required', False),
        settings_json,
    )

    with self.get_connection(for_write=True) as conn:
        cursor = conn.cursor()
        cursor.execute('''
            INSERT OR IGNORE INTO scrapers
            (id, name, type, module, base_url, target_platform, enabled,
            proxy_enabled, proxy_url, flaresolverr_required, settings_json)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', values)
        return cursor.rowcount > 0
def delete_scraper(self, scraper_id: str) -> bool:
    """
    Remove a scraper row (used when forums are removed).

    Args:
        scraper_id: Scraper ID to delete

    Returns:
        True if a row was deleted, False if no scraper had that ID
    """
    with self.get_connection(for_write=True) as conn:
        cursor = conn.cursor()
        cursor.execute('DELETE FROM scrapers WHERE id = ?', (scraper_id,))
        removed = cursor.rowcount
        return removed > 0
def get_scraper_cookies_dict(self, scraper_id: str) -> Dict[str, str]:
    """
    Flatten a scraper's cookies into a name->value mapping (for the
    requests library).

    Args:
        scraper_id: Scraper ID

    Returns:
        Dictionary of cookie name->value pairs; empty when
        get_scraper_cookies() returns nothing
    """
    jar = self.get_scraper_cookies(scraper_id)
    flattened = {}
    # `or ()` covers both None and an empty list
    for cookie in jar or ():
        flattened[cookie['name']] = cookie['value']
    return flattened
def is_downloaded(self, url: str, platform: str = None) -> bool:
    """
    Tell whether a URL already has a row in the downloads table.

    The lookup goes through the URL hash, so it depends only on the
    downloads row being present, not on the file still being on disk.

    Args:
        url: URL to check
        platform: Optional platform filter

    Returns:
        True if already downloaded (prevents redownload of deleted content)
    """
    url_hash = self.get_url_hash(url)

    if platform:
        lookup = "SELECT 1 FROM downloads WHERE url_hash = ? AND platform = ? LIMIT 1"
        lookup_args = (url_hash, platform)
    else:
        lookup = "SELECT 1 FROM downloads WHERE url_hash = ? LIMIT 1"
        lookup_args = (url_hash,)

    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(lookup, lookup_args)
        match = cursor.fetchone()
        return match is not None
def is_file_hash_downloaded(self, file_hash: str) -> bool:
    """
    Tell whether a content hash is already known to the system.

    Three tables are consulted in order, stopping at the first hit:
    downloads (ignoring rows whose path is inside a temp directory, to
    avoid false positives during the move operation), recycle_bin, and
    file_inventory (review queue, media files, etc.).

    Args:
        file_hash: SHA256 hash of file content

    Returns:
        True if file with this hash exists in downloads, recycle_bin, or
        file_inventory; False otherwise or when `file_hash` is empty
    """
    if not file_hash:
        return False

    lookups = (
        "SELECT 1 FROM downloads WHERE file_hash = ? AND file_path NOT LIKE '%/temp/%' AND file_path NOT LIKE '%\\temp\\%' LIMIT 1",
        "SELECT 1 FROM recycle_bin WHERE file_hash = ? LIMIT 1",
        "SELECT 1 FROM file_inventory WHERE file_hash = ? LIMIT 1",
    )

    with self.get_connection() as conn:
        cursor = conn.cursor()
        for lookup in lookups:
            cursor.execute(lookup, (file_hash,))
            if cursor.fetchone():
                return True
        return False
def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
    """
    Fetch the most recent download row for a content hash.

    Rows without a file_path, or whose path lies inside a temp directory,
    are ignored. 'post_date' is parsed into a datetime and 'metadata' into
    a Python object when possible; values that fail to parse are left as
    stored.

    Args:
        file_hash: SHA256 hash of file content

    Returns:
        Dictionary with download record, or None if not found
    """
    if not file_hash:
        return None

    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT
            id, url, platform, source, content_type,
            filename, file_path, post_date, download_date,
            file_size, file_hash, metadata
            FROM downloads
            WHERE file_hash = ?
            AND file_path IS NOT NULL
            AND file_path NOT LIKE '%/temp/%'
            AND file_path NOT LIKE '%\\temp\\%'
            ORDER BY download_date DESC
            LIMIT 1
        ''', (file_hash,))
        newest = cursor.fetchone()
        if not newest:
            return None

        record = dict(newest)

        raw_post_date = record.get('post_date')
        if raw_post_date:
            try:
                record['post_date'] = datetime.fromisoformat(raw_post_date)
            except (ValueError, TypeError) as e:
                logger.debug(f"Failed to parse post_date: {e}")

        raw_metadata = record.get('metadata')
        if raw_metadata:
            try:
                record['metadata'] = json.loads(raw_metadata)
            except (ValueError, TypeError, json.JSONDecodeError) as e:
                logger.debug(f"Failed to parse metadata JSON: {e}")

        return record
def get_download_by_media_id(self, media_id: str, platform: str = 'instagram', method: str = None) -> Optional[Dict]:
    """
    Fetch a download row by its media ID within a platform.

    'post_date' is parsed into a datetime and 'metadata' into a Python
    object when possible; values that fail to parse are left as stored.

    Args:
        media_id: Instagram media ID to search for
        platform: Platform filter (default 'instagram')
        method: Optional method filter (fastdl, imginn, toolzu, instaloader)

    Returns:
        Dictionary with download record including post_date and filename, or None if not found
    """
    with self.get_connection() as conn:
        cursor = conn.cursor()
        # Use indexed media_id column for fast lookup (10-100x faster than LIKE)
        logger.debug(f"Searching for media_id={media_id}, platform={platform}, method={method}")

        if method:
            lookup = '''
                SELECT
                id, url, platform, source, content_type,
                filename, file_path, post_date, download_date,
                metadata, method
                FROM downloads
                WHERE platform = ?
                AND method = ?
                AND media_id = ?
                LIMIT 1
            '''
            lookup_args = (platform, method, media_id)
        else:
            lookup = '''
                SELECT
                id, url, platform, source, content_type,
                filename, file_path, post_date, download_date,
                metadata, method
                FROM downloads
                WHERE platform = ?
                AND media_id = ?
                LIMIT 1
            '''
            lookup_args = (platform, media_id)

        cursor.execute(lookup, lookup_args)
        found = cursor.fetchone()
        if not found:
            return None

        # Convert Row object to dictionary
        record = dict(found)

        raw_post_date = record.get('post_date')
        if raw_post_date:
            try:
                record['post_date'] = datetime.fromisoformat(raw_post_date)
            except (ValueError, TypeError) as e:
                logger.debug(f"Failed to parse post_date: {e}")

        raw_metadata = record.get('metadata')
        if raw_metadata:
            try:
                record['metadata'] = json.loads(raw_metadata)
            except (ValueError, TypeError, json.JSONDecodeError) as e:
                logger.debug(f"Failed to parse metadata JSON: {e}")

        return record
def mark_fastdl_upgraded(self, media_id: str) -> bool:
    """
    Flag a FastDL download as upgraded with the Toolzu high-res version.

    Finds the row with platform='instagram' and method='fastdl' for the
    given media ID and adds 'upgraded' and 'upgraded_date' to its metadata
    JSON. Metadata that cannot be parsed is replaced by a fresh dict.

    Args:
        media_id: Instagram media ID

    Returns:
        True if successfully marked, False when no matching record exists
    """
    with self.get_connection(for_write=True) as conn:
        cursor = conn.cursor()

        # Find the FastDL record (using indexed media_id column)
        cursor.execute('''
            SELECT id, metadata FROM downloads
            WHERE platform = 'instagram'
            AND method = 'fastdl'
            AND media_id = ?
            LIMIT 1
        ''', (media_id,))
        match = cursor.fetchone()
        if not match:
            return False

        record_id = match['id']
        raw_metadata = match['metadata']
        try:
            metadata = json.loads(raw_metadata) if raw_metadata else {}
        except (ValueError, TypeError, json.JSONDecodeError) as e:
            logger.debug(f"Failed to parse metadata for record {record_id}: {e}")
            metadata = {}

        metadata['upgraded'] = True
        metadata['upgraded_date'] = datetime.now().isoformat()

        cursor.execute('''
            UPDATE downloads
            SET metadata = ?
            WHERE id = ?
        ''', (json.dumps(metadata), record_id))
        conn.commit()
        return True
def delete_downloads_by_date_range(self, platform: str, source: str = None,
                                   days_back: int = 7) -> Dict[str, Any]:
    """
    Delete downloads from the last N days for a specific platform/source.

    Only database rows are removed here; the paths of the affected files
    are returned so the caller can delete them from disk.

    Args:
        platform: Platform to delete from ('instagram', 'tiktok', 'forum')
        source: Optional source filter (username, forum name, etc.)
        days_back: Number of days back to delete (default: 7)

    Returns:
        Dictionary with deletion statistics and file paths to delete:
        'deleted_count', 'file_paths', 'cutoff_date' (ISO string),
        'platform' and 'source'
    """
    # SQLite caps the number of bound parameters per statement, so the
    # ID-based delete is issued in batches instead of one huge IN (...).
    batch_size = 500

    cutoff_date = datetime.now() - timedelta(days=days_back)
    cutoff_iso = cutoff_date.isoformat()

    with self.get_connection(for_write=True) as conn:
        cursor = conn.cursor()

        # First, get the records to be deleted (so we can delete files)
        if source:
            cursor.execute('''
                SELECT id, file_path, filename, download_date, metadata
                FROM downloads
                WHERE platform = ?
                AND source = ?
                AND download_date >= ?
                ORDER BY download_date DESC
            ''', (platform, source, cutoff_iso))
        else:
            cursor.execute('''
                SELECT id, file_path, filename, download_date, metadata
                FROM downloads
                WHERE platform = ?
                AND download_date >= ?
                ORDER BY download_date DESC
            ''', (platform, cutoff_iso))
        records = cursor.fetchall()

        # Collect IDs to delete and the file paths the caller must clean up
        record_ids = [row['id'] for row in records]
        file_paths = [row['file_path'] for row in records if row['file_path']]

        # Delete exactly the rows selected above, one batch at a time
        deleted_count = 0
        for start in range(0, len(record_ids), batch_size):
            batch = record_ids[start:start + batch_size]
            placeholders = ','.join('?' * len(batch))
            cursor.execute(f'''
                DELETE FROM downloads
                WHERE id IN ({placeholders})
            ''', batch)
            deleted_count += cursor.rowcount

        conn.commit()

        return {
            'deleted_count': deleted_count,
            'file_paths': file_paths,
            'cutoff_date': cutoff_iso,
            'platform': platform,
            'source': source
        }
def record_download(self,
                    url: str,
                    platform: str,
                    source: str,
                    content_type: str = None,
                    filename: str = None,
                    file_path: str = None,
                    file_size: int = None,
                    file_hash: str = None,
                    post_date: datetime = None,
                    status: str = 'completed',
                    error_message: str = None,
                    metadata: Dict = None,
                    method: str = None,
                    max_retries: int = 3) -> bool:
    """
    Insert a row into the downloads table, retrying on lock errors.

    The media_id column is filled from the first truthy value among
    metadata['media_id'], metadata['video_id'] and metadata['post_id'].
    download_date is the current local time with a 'T' separator.

    Note: Duplicate file hash checking is handled by MoveManager before recording

    Returns:
        True if successfully recorded; False for a duplicate
        (IntegrityError), for any other database or unexpected error, or
        once the retries on a locked database are used up
    """
    url_hash = self.get_url_hash(url)

    # Extract media_id from metadata for fast queries
    media_id = None
    if metadata and isinstance(metadata, dict):
        media_id = metadata.get('media_id') or metadata.get('video_id') or metadata.get('post_id')

    # Use local time with T separator for download_date
    download_date = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            with self.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT INTO downloads (
                    url_hash, url, platform, source, content_type,
                    filename, file_path, file_size, file_hash,
                    post_date, download_date, status, error_message, metadata, media_id, method
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    url_hash, url, platform, source, content_type,
                    filename, file_path, file_size, file_hash,
                    post_date.isoformat() if post_date else None,
                    download_date,
                    status, error_message,
                    json.dumps(metadata) if metadata else None,
                    media_id, method
                ))
                conn.commit()
                return True
        except sqlite3.IntegrityError:
            # Duplicate entry - this is expected, not an error
            return False
        except sqlite3.OperationalError as e:
            if not _is_lock_error(e):
                logger.error(f"Database error: {e}")
                return False
            if attempt >= final_attempt:
                logger.error(f"Failed to record download after {max_retries} attempts: {e}")
                return False
            # Wait with exponential backoff, capped at 5 seconds
            wait_time = min(5, (2 ** attempt) * 0.5)
            logger.warning(f"Database locked, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
            time.sleep(wait_time)
        except Exception as e:
            logger.error(f"Failed to record download: {e}")
            return False

    return False
def get_platform_stats(self, platform: str = None) -> Dict:
    """
    Get download statistics by platform.

    Args:
        platform: When given, statistics are computed for this platform only

    Returns:
        With `platform`: a single dict with 'total', 'completed', 'failed',
        'total_size', 'first_download' and 'last_download' (or {} if the
        query yields no row).
        Without `platform`: a list of dicts, one per platform, each with
        'platform', 'total', 'completed', 'failed' and 'total_size'.
    """
    with self.get_connection() as conn:
        cursor = conn.cursor()

        if not platform:
            # Per-platform breakdown
            cursor.execute('''
                SELECT
                platform,
                COUNT(*) as total,
                SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
                SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed,
                SUM(file_size) as total_size
                FROM downloads
                GROUP BY platform
            ''')
            return [dict(entry) for entry in cursor.fetchall()]

        # Totals for the requested platform
        cursor.execute('''
            SELECT
            COUNT(*) as total,
            SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
            SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed,
            SUM(file_size) as total_size,
            MIN(download_date) as first_download,
            MAX(download_date) as last_download
            FROM downloads
            WHERE platform = ?
        ''', (platform,))
        summary = cursor.fetchone()
        return dict(summary) if summary else {}
def update_file_location(self, url: str, platform: str, final_path: str,
                         final_hash: str = None, max_retries: int = 3) -> bool:
    """
    Update the file_path and file_hash for a download record after moving.

    When no hash is supplied and none can be calculated, the hash already
    stored on the record is kept instead of being overwritten with NULL.
    Failures are logged rather than silently swallowed.

    Args:
        url: Original download URL
        platform: Platform name
        final_path: Final destination path after moving
        final_hash: Optional SHA256 hash of final file (will calculate if not provided)
        max_retries: Number of retries for locked database

    Returns:
        True if updated successfully, False otherwise
    """
    import os

    url_hash = self.get_url_hash(url)

    # Calculate hash if not provided
    if not final_hash and os.path.exists(final_path):
        try:
            final_hash = self.get_file_hash(final_path)
        except Exception as e:
            # Continue without hash if calculation fails
            logger.debug(f"Could not hash {final_path}: {e}")

    for attempt in range(max_retries):
        try:
            with self.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # COALESCE keeps the stored hash when no new one is available
                cursor.execute('''
                    UPDATE downloads
                    SET file_path = ?, file_hash = COALESCE(?, file_hash)
                    WHERE url_hash = ? AND platform = ?
                ''', (final_path, final_hash or None, url_hash, platform))
                conn.commit()
                return cursor.rowcount > 0
        except sqlite3.OperationalError as e:
            if _is_lock_error(e) and attempt < max_retries - 1:
                time.sleep(0.1 * (2 ** attempt))  # Exponential backoff
                continue
            logger.error(f"Failed to update file location for {url}: {e}")
            return False
        except Exception as e:
            logger.error(f"Error updating file location for {url}: {e}")
            return False

    return False
def update_file_location_by_filename(self, filename: str, platform: str, source: str,
                                     final_path: str, final_hash: str = None,
                                     max_retries: int = 3) -> bool:
    """
    Update the file_path and file_hash for a download record by filename.

    When no hash is supplied and none can be calculated, the hash already
    stored on the record is kept instead of being overwritten with NULL.
    Failures are logged rather than silently swallowed.

    Args:
        filename: Original filename in database
        platform: Platform name
        source: Source (username/forum name)
        final_path: Final destination path after moving
        final_hash: Optional SHA256 hash of final file (will calculate if not provided)
        max_retries: Number of retries for locked database

    Returns:
        True if updated successfully, False otherwise
    """
    import os

    # Calculate hash if not provided
    if not final_hash and os.path.exists(final_path):
        try:
            final_hash = self.get_file_hash(final_path)
        except Exception as e:
            # Continue without hash if calculation fails
            logger.debug(f"Could not hash {final_path}: {e}")

    for attempt in range(max_retries):
        try:
            with self.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # COALESCE keeps the stored hash when no new one is available
                cursor.execute('''
                    UPDATE downloads
                    SET file_path = ?, file_hash = COALESCE(?, file_hash)
                    WHERE filename = ? AND platform = ? AND source = ?
                ''', (final_path, final_hash or None, filename, platform, source))
                conn.commit()
                return cursor.rowcount > 0
        except sqlite3.OperationalError as e:
            if _is_lock_error(e) and attempt < max_retries - 1:
                time.sleep(0.1 * (2 ** attempt))  # Exponential backoff
                continue
            logger.error(f"Failed to update file location for {filename}: {e}")
            return False
        except Exception as e:
            logger.error(f"Error updating file location for {filename}: {e}")
            return False

    return False
def get_scheduler_state(self, task_id: str) -> Optional[Dict]:
    """Return the scheduler_state row for a task as a dict, or None if absent"""
    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT * FROM scheduler_state WHERE task_id = ?
        ''', (task_id,))
        state = cursor.fetchone()
        if not state:
            return None
        return dict(state)
def update_scheduler_state(self, task_id: str, last_run: datetime = None,
                           next_run: datetime = None, status: str = None,
                           error: str = None, metadata: Dict = None) -> bool:
    """
    Create or update the scheduler state row for a task.

    For an existing task only the supplied (truthy) fields are written.
    A truthy `error` stores last_error and increments error_count;
    otherwise run_count is incremented. For a new task a row is inserted
    with status defaulting to 'active'; `error` is not stored on insert.

    Returns:
        Always True once the statement has been committed
    """
    with self.get_connection(for_write=True) as conn:
        cursor = conn.cursor()

        cursor.execute('SELECT task_id FROM scheduler_state WHERE task_id = ?', (task_id,))
        already_tracked = cursor.fetchone() is not None

        if not already_tracked:
            # Insert new record
            metadata_json = json.dumps(metadata) if metadata else None
            cursor.execute('''
                INSERT INTO scheduler_state (task_id, last_run, next_run, status, metadata)
                VALUES (?, ?, ?, ?, ?)
            ''', (task_id, last_run, next_run, status or 'active', metadata_json))
        else:
            # Update existing record with whichever fields were supplied
            assignments = []
            params = []
            for column, value in (('last_run', last_run),
                                  ('next_run', next_run),
                                  ('status', status)):
                if value:
                    assignments.append(f'{column} = ?')
                    params.append(value)

            if error:
                assignments.extend(['last_error = ?', 'error_count = error_count + 1'])
                params.append(error)
            else:
                assignments.append('run_count = run_count + 1')

            if metadata:
                assignments.append('metadata = ?')
                params.append(json.dumps(metadata))

            params.append(task_id)
            cursor.execute(f'''
                UPDATE scheduler_state
                SET {', '.join(assignments)}
                WHERE task_id = ?
            ''', params)

        conn.commit()
        return True
def get_all_scheduler_states(self) -> List[Dict]:
    """Return every scheduler_state row as a dict, ordered by task_id"""
    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM scheduler_state ORDER BY task_id')
        states = []
        for entry in cursor.fetchall():
            states.append(dict(entry))
        return states
def was_thread_checked_recently(self, thread_id: str, hours: int = 6) -> bool:
    """
    Tell whether a thread has a 'completed' check newer than `hours` ago.

    The cutoff is computed inside SQLite as datetime('now', '-<hours> hours').
    """
    window = f'-{hours}'
    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT COUNT(*) as count FROM thread_check_history
            WHERE thread_id = ?
            AND check_time > datetime('now', ? || ' hours')
            AND status = 'completed'
        ''', (thread_id, window))
        tally = cursor.fetchone()
        if not tally:
            return False
        return tally['count'] > 0
def record_thread_check(self, thread_id: str, forum_name: str,
                        last_post_date: datetime = None,
                        new_posts: int = 0, images: int = 0,
                        status: str = 'completed') -> bool:
    """
    Append a row to thread_check_history for a checked thread.

    Returns:
        True once the row is committed, False if the insert failed (the
        error is logged)
    """
    check_row = (thread_id, forum_name, last_post_date, new_posts, images, status)
    with self.get_connection(for_write=True) as conn:
        cursor = conn.cursor()
        try:
            cursor.execute('''
                INSERT INTO thread_check_history
                (thread_id, forum_name, last_post_date, new_posts_found,
                images_downloaded, status)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', check_row)
            conn.commit()
        except Exception as e:
            logger.error(f"Error recording thread check: {e}", module="Forum")
            return False
        return True
def get_thread_last_check(self, thread_id: str) -> Optional[Dict]:
    """Return the newest thread_check_history row for a thread, or None if it was never checked"""
    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT * FROM thread_check_history
            WHERE thread_id = ?
            ORDER BY check_time DESC
            LIMIT 1
        ''', (thread_id,))
        latest = cursor.fetchone()
        if not latest:
            return None
        return dict(latest)
def add_to_queue(self,
                 url: str,
                 platform: str,
                 source: str = None,
                 referer: str = None,
                 save_path: str = None,
                 priority: int = 5,
                 metadata: Dict = None) -> bool:
    """
    Add item to download queue with retry logic.

    Lock errors are retried up to three times with exponential backoff.
    An IntegrityError is treated as "already queued" and is not logged;
    every other failure is logged before False is returned.

    Args:
        url: URL to queue
        platform: Platform name
        source: Optional source (username, forum name, etc.)
        referer: Optional referer stored with the queue item
        save_path: Optional destination path stored with the queue item
        priority: Queue priority (default 5)
        metadata: Optional dict stored as JSON

    Returns:
        True if the item was queued, False if it was already queued or
        the insert failed
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            with self.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT INTO download_queue (
                    url, platform, source, referer, save_path,
                    priority, metadata
                    ) VALUES (?, ?, ?, ?, ?, ?, ?)
                ''', (
                    url, platform, source, referer, save_path,
                    priority, json.dumps(metadata) if metadata else None
                ))
                conn.commit()
                return True
        except sqlite3.IntegrityError:
            # Already in queue - this is expected
            return False
        except sqlite3.OperationalError as e:
            if _is_lock_error(e) and attempt < max_attempts - 1:
                time.sleep(0.5 * (2 ** attempt))
                continue
            logger.error(f"Failed to add to queue: {e}")
            return False
        except Exception as e:
            logger.error(f"Error adding to queue: {e}")
            return False

    return False
def get_queue_items(self, platform: str = None, status: str = 'pending', limit: int = 100) -> List[Dict]:
    """
    Read items from the download queue.

    Items are filtered by status (and by platform when given), ordered by
    ascending priority and then creation date, and capped at `limit`.
    """
    if platform:
        selection = '''
            SELECT * FROM download_queue
            WHERE platform = ? AND status = ?
            ORDER BY priority ASC, created_date ASC
            LIMIT ?
        '''
        selection_args = (platform, status, limit)
    else:
        selection = '''
            SELECT * FROM download_queue
            WHERE status = ?
            ORDER BY priority ASC, created_date ASC
            LIMIT ?
        '''
        selection_args = (status, limit)

    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(selection, selection_args)
        return [dict(item) for item in cursor.fetchall()]
def update_queue_status(self, queue_id: int, status: str, error_message: str = None):
    """
    Set the status of a queue item, retrying on lock errors.

    With an error message the message is stored and 'attempts' is
    incremented; without one, 'download_date' is set to the current
    timestamp. Failures are logged; nothing is returned either way.
    """
    if error_message:
        statement = '''
            UPDATE download_queue
            SET status = ?, error_message = ?, attempts = attempts + 1
            WHERE id = ?
        '''
        statement_args = (status, error_message, queue_id)
    else:
        statement = '''
            UPDATE download_queue
            SET status = ?, download_date = CURRENT_TIMESTAMP
            WHERE id = ?
        '''
        statement_args = (status, queue_id)

    for attempt in range(3):
        try:
            with self.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute(statement, statement_args)
                conn.commit()
                return
        except sqlite3.OperationalError as e:
            retryable = _is_lock_error(e) and attempt < 2
            if not retryable:
                logger.error(f"Failed to update queue status: {e}")
                return
            time.sleep(0.5 * (2 ** attempt))
        except Exception as e:
            logger.error(f"Error updating queue status: {e}")
            return
def migrate_from_old_databases(self,
                               fastdl_db: str = None,
                               tiktok_db: str = None,
                               forum_dbs: List[str] = None,
                               verbose: bool = True) -> Dict[str, int]:
    """
    Migrate data from old separate databases to unified database.

    Each legacy database is opened read-side with sqlite3 and closed again
    even if reading it fails. A record is counted as migrated only when
    record_download() reports that it was actually stored, so duplicates
    and failed inserts do not inflate the totals.

    Args:
        fastdl_db: Path to FastDL database
        tiktok_db: Path to TikTok database
        forum_dbs: List of paths to forum databases
        verbose: Print progress messages

    Returns:
        Dictionary with migration statistics: 'fastdl', 'tiktok' and
        'forum' (records stored) and 'errors' (records that raised)
    """
    stats = {
        'fastdl': 0,
        'tiktok': 0,
        'forum': 0,
        'errors': 0
    }

    # Migrate FastDL
    if fastdl_db and Path(fastdl_db).exists():
        if verbose:
            print(f"Migrating FastDL database: {fastdl_db}")
        old_conn = sqlite3.connect(fastdl_db)
        try:
            old_conn.row_factory = sqlite3.Row
            cursor = old_conn.cursor()
            cursor.execute("SELECT * FROM downloads")
            for row in cursor.fetchall():
                try:
                    # Convert media_id to URL (best effort)
                    url = row['download_url'] if row['download_url'] else f"instagram://{row['media_id']}"
                    recorded = self.record_download(
                        url=url,
                        platform='instagram',
                        source=row['username'],
                        content_type=row['content_type'],
                        filename=row['filename'],
                        post_date=datetime.fromisoformat(row['post_date']) if row['post_date'] else None,
                        metadata={'media_id': row['media_id'], 'original_metadata': row['metadata']}
                    )
                    if recorded:
                        stats['fastdl'] += 1
                except Exception as e:
                    if verbose:
                        print(f" Error migrating FastDL record: {e}")
                    stats['errors'] += 1
        finally:
            old_conn.close()

    # Migrate TikTok
    if tiktok_db and Path(tiktok_db).exists():
        if verbose:
            print(f"Migrating TikTok database: {tiktok_db}")
        old_conn = sqlite3.connect(tiktok_db)
        try:
            old_conn.row_factory = sqlite3.Row
            cursor = old_conn.cursor()
            cursor.execute("SELECT * FROM downloads")
            for row in cursor.fetchall():
                try:
                    url = f"https://www.tiktok.com/@{row['username']}/video/{row['video_id']}"
                    recorded = self.record_download(
                        url=url,
                        platform='tiktok',
                        source=row['username'],
                        content_type='video',
                        filename=row['filename'],
                        post_date=datetime.fromisoformat(row['post_date']) if row['post_date'] else None,
                        metadata={'video_id': row['video_id'], 'original_metadata': row['metadata']}
                    )
                    if recorded:
                        stats['tiktok'] += 1
                except Exception as e:
                    if verbose:
                        print(f" Error migrating TikTok record: {e}")
                    stats['errors'] += 1
        finally:
            old_conn.close()

    # Migrate Forum databases
    for forum_db in forum_dbs or []:
        if not Path(forum_db).exists():
            continue
        if verbose:
            print(f"Migrating Forum database: {forum_db}")
        old_conn = sqlite3.connect(forum_db)
        try:
            old_conn.row_factory = sqlite3.Row
            cursor = old_conn.cursor()
            # Extract forum name from filename (e.g., forum_PicturePub.db -> PicturePub)
            forum_name = Path(forum_db).stem.replace('forum_', '')

            # Migrate threads
            try:
                cursor.execute("SELECT * FROM threads")
                with self.get_connection(for_write=True) as conn:
                    new_cursor = conn.cursor()
                    for row in cursor.fetchall():
                        try:
                            new_cursor.execute('''
                                INSERT OR IGNORE INTO forum_threads (
                                thread_id, forum_name, thread_url, thread_title,
                                author, created_date, last_checked, last_post_date,
                                post_count, status, monitor_until, metadata
                                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                            ''', (
                                row['thread_id'], forum_name, row['thread_url'],
                                row['thread_title'], row['author'], row['created_date'],
                                row['last_checked'], row['last_post_date'],
                                row['post_count'], row['status'],
                                row['monitor_until'], row['metadata']
                            ))
                        except Exception as e:
                            if verbose:
                                print(f" Error migrating thread: {e}")
                            stats['errors'] += 1
                    conn.commit()
            except Exception as e:
                if verbose:
                    print(f" Error migrating threads: {e}")
                logger.warning(f"Failed to migrate threads from {forum_db}: {e}")

            # Migrate download queue items as downloads
            try:
                cursor.execute("SELECT * FROM download_queue WHERE status = 'completed'")
                for row in cursor.fetchall():
                    try:
                        recorded = self.record_download(
                            url=row['url'],
                            platform='forums',
                            source=forum_name,
                            content_type='image',
                            file_path=row['save_path'],
                            metadata={
                                'thread_id': row['thread_id'],
                                'post_id': row['post_id'],
                                'original_metadata': row['metadata']
                            }
                        )
                        if recorded:
                            stats['forum'] += 1
                    except Exception as e:
                        if verbose:
                            print(f" Error migrating forum download: {e}")
                        stats['errors'] += 1
            except Exception as e:
                if verbose:
                    print(f" Error migrating forum downloads: {e}")
                logger.warning(f"Failed to migrate forum downloads from {forum_db}: {e}")
        finally:
            old_conn.close()

    if verbose:
        print("\nMigration complete:")
        print(f" FastDL records: {stats['fastdl']}")
        print(f" TikTok records: {stats['tiktok']}")
        print(f" Forum records: {stats['forum']}")
        print(f" Errors: {stats['errors']}")
        print(f" Total migrated: {stats['fastdl'] + stats['tiktok'] + stats['forum']}")

    return stats
def log_face_recognition_scan(self, file_path: str, has_match: bool,
                              matched_person: str = None, confidence: float = None,
                              face_count: int = 0, scan_type: str = 'auto') -> bool:
    """
    Log a face recognition scan result

    The scan is linked to a downloads row when one exists for file_path;
    otherwise download_id is stored as NULL.

    Args:
        file_path: Path to the scanned file
        has_match: Whether a face was matched
        matched_person: Name of matched person (if any)
        confidence: Match confidence (0-1)
        face_count: Number of faces detected
        scan_type: Type of scan ('auto', 'manual', 'retroactive')
    Returns:
        True if logged successfully, False if anything failed (the error is logged)
    """
    try:
        # Coerce every numeric argument to a built-in Python type before binding.
        # numpy scalars are not native int/float/bool objects, so binding them
        # directly does not give proper numeric storage. None stays None.
        has_match_value = bool(has_match) if has_match is not None else None
        confidence_value = float(confidence) if confidence is not None else None
        face_count_value = int(face_count) if face_count is not None else None

        with self.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            # Try to find download_id from file_path
            cursor.execute('SELECT id FROM downloads WHERE file_path = ? LIMIT 1', (file_path,))
            row = cursor.fetchone()
            download_id = row['id'] if row else None
            # Insert scan result
            cursor.execute('''
                INSERT INTO face_recognition_scans
                (download_id, file_path, has_match, matched_person, confidence, face_count, scan_type)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', (download_id, file_path, has_match_value, matched_person,
                  confidence_value, face_count_value, scan_type))
            conn.commit()
            return True
    except Exception as e:
        logger.error(f"Failed to log face recognition scan: {e}")
        return False
@staticmethod
def _convert_face_scan_row(row) -> Dict:
    """Convert a face_recognition_scans row, handling bytes→proper types.

    A bytes 'confidence' value is unpacked as a native-order float: 8 bytes
    as a double, 4 bytes as a single-precision float. Any other length
    becomes None. Every other bytes value is decoded as UTF-8, or set to
    None when it is not valid UTF-8. Non-bytes values pass through unchanged.

    Args:
        row: Mapping-like row (anything accepted by dict())
    Returns:
        A new dict with bytes values converted
    """
    import struct
    # Blob length -> struct format for a raw floating-point value
    float_formats = {8: 'd', 4: 'f'}
    result = dict(row)
    for key, value in result.items():
        if not isinstance(value, bytes):
            continue
        if key == 'confidence':
            fmt = float_formats.get(len(value))
            # Length already matches the format, so unpack cannot fail here
            result[key] = struct.unpack(fmt, value)[0] if fmt else None
        else:
            try:
                result[key] = value.decode('utf-8')
            except UnicodeDecodeError:
                result[key] = None
    return result
def get_face_recognition_result(self, file_path: str) -> Optional[Dict]:
    """
    Get face recognition result for a file

    Looks for the most recent scan stored under the exact path. If none
    exists, falls back to the most recent scan whose stored path has the same
    filename (in case the file was moved).

    Args:
        file_path: Path to the file
    Returns:
        Dictionary with scan result (has_match, matched_person, confidence,
        face_count, scan_date, scan_type) or None if no scan was found or an
        error occurred (the error is logged)
    """
    import os
    try:
        with self.get_connection() as conn:
            cursor = conn.cursor()
            # Try exact path match first
            cursor.execute('''
                SELECT has_match, matched_person, confidence, face_count, scan_date, scan_type
                FROM face_recognition_scans
                WHERE file_path = ?
                ORDER BY scan_date DESC
                LIMIT 1
            ''', (file_path,))
            row = cursor.fetchone()
            if row:
                return self._convert_face_scan_row(row)

            # If no exact match, try matching by filename (in case file was moved)
            filename = os.path.basename(file_path)
            if not filename:
                # An empty basename would turn the pattern into '%' and match
                # an unrelated file's scan.
                return None
            # '_' and '%' are LIKE wildcards and '_' is common in filenames,
            # so escape them to match the name literally.
            escaped = filename.replace('!', '!!').replace('%', '!%').replace('_', '!_')
            # Anchor the pattern at a path separator so "foo.jpg" does not
            # match "xfoo.jpg"; the equality arm covers a stored bare filename.
            cursor.execute('''
                SELECT has_match, matched_person, confidence, face_count, scan_date, scan_type
                FROM face_recognition_scans
                WHERE file_path = ? OR file_path LIKE ? ESCAPE '!'
                ORDER BY scan_date DESC
                LIMIT 1
            ''', (filename, f'%/{escaped}'))
            row = cursor.fetchone()
            if row:
                return self._convert_face_scan_row(row)
            return None
    except Exception as e:
        logger.error(f"Failed to get face recognition result: {e}")
        return None
def get_face_recognition_results_batch(self, file_paths: List[str]) -> Dict[str, Dict]:
    """
    Get face recognition results for multiple files with batched IN queries.

    When a file has several scans, the most recent one (by scan_date) is
    returned, matching get_face_recognition_result(). Rows go through
    _convert_face_scan_row() so bytes values are converted the same way as in
    the single-file lookup.

    Args:
        file_paths: List of file paths to look up
    Returns:
        Dictionary mapping file_path -> face recognition result dict
        (has_match, matched_person, confidence, face_count, scan_date,
        scan_type). Paths without a scan are omitted. Returns {} on error
        (the error is logged).
    """
    if not file_paths:
        return {}
    # Keep each statement well under the database's bound-parameter limit
    chunk_size = 500
    try:
        # De-duplicate while preserving order so a path lands in one chunk only
        unique_paths = list(dict.fromkeys(file_paths))
        results = {}
        with self.get_connection() as conn:
            cursor = conn.cursor()
            for start in range(0, len(unique_paths), chunk_size):
                chunk = unique_paths[start:start + chunk_size]
                placeholders = ','.join('?' * len(chunk))
                # Ascending order: a later scan overwrites an earlier one below
                cursor.execute(f'''
                    SELECT file_path, has_match, matched_person, confidence, face_count, scan_date, scan_type
                    FROM face_recognition_scans
                    WHERE file_path IN ({placeholders})
                    ORDER BY scan_date ASC
                ''', chunk)
                for row in cursor.fetchall():
                    converted = self._convert_face_scan_row(row)
                    results[converted.pop('file_path')] = converted
        return results
    except Exception as e:
        logger.error(f"Failed to batch get face recognition results: {e}")
        return {}
def _get_dimensions_batch(self, file_paths: List[str]) -> Dict[str, tuple]:
    """
    Batch lookup dimensions from media_metadata.db - avoids N+1 queries.

    media_metadata.db is expected next to self.db_path. Rows are keyed by the
    SHA-256 hex digest of the file path string.

    Args:
        file_paths: List of file paths to look up
    Returns:
        Dictionary mapping file_path -> (width, height) tuple. Paths with no
        metadata row are omitted. Returns {} when the metadata database does
        not exist or on error (errors are logged).
    """
    import hashlib
    import sqlite3
    if not file_paths:
        return {}
    # Keep each statement well under SQLite's bound-parameter limit
    chunk_size = 500
    try:
        metadata_db_path = self.db_path.parent / 'media_metadata.db'
        if not metadata_db_path.exists():
            # sqlite3.connect() would create an empty database file here and
            # the query would then fail on the missing table.
            return {}

        # Build hash -> path mapping
        hash_to_path = {hashlib.sha256(fp.encode()).hexdigest(): fp for fp in file_paths}
        hashes = list(hash_to_path)

        result = {}
        conn = sqlite3.connect(str(metadata_db_path))
        try:
            for start in range(0, len(hashes), chunk_size):
                chunk = hashes[start:start + chunk_size]
                placeholders = ','.join('?' * len(chunk))
                cursor = conn.execute(
                    f"SELECT file_hash, width, height FROM media_metadata WHERE file_hash IN ({placeholders})",
                    chunk
                )
                # Build result mapping by file_path
                for file_hash, width, height in cursor.fetchall():
                    if file_hash in hash_to_path:
                        result[hash_to_path[file_hash]] = (width, height)
        finally:
            # Close the connection even when a query raises
            conn.close()
        return result
    except Exception as e:
        logger.error(f"Failed to batch get dimensions: {e}")
        return {}
# ==================== Recycle Bin Methods ====================
def move_to_recycle_bin(self, file_path: str, deleted_from: str, deleted_by: str = None, metadata: dict = None) -> Optional[str]:
    """
    Move a file to the recycle bin (soft delete)

    The file is moved to /opt/immich/recycle under a "<uuid><extension>" name
    and a recycle_bin row is written with enough metadata to restore it.
    Rows in instagram_perceptual_hashes, face_recognition_scans,
    semantic_embeddings and file_inventory that pointed at the file are
    re-pointed to the recycle path. Its content_embeddings and downloads rows
    are deleted.

    If a file with the same hash is already in the recycle bin, the source
    file is deleted instead of moved, its file_inventory and downloads rows
    are removed, and the existing entry's ID is returned.

    If anything fails after the file has been moved, the file is moved back
    to its original location before the error propagates, mirroring the
    move-back handling in restore_from_recycle_bin().

    Args:
        file_path: Original file path to delete
        deleted_from: Where file was deleted from ('downloads', 'media', 'review')
        deleted_by: Username of person who deleted
        metadata: Additional metadata (platform, source, content_type, etc.)
    Returns:
        UUID of recycle bin entry, or None if failed (the error is logged)
    """
    import uuid
    import shutil
    import os
    import re
    try:
        source_path = Path(file_path)
        if not source_path.exists():
            logger.error(f"File not found for recycle: {file_path}")
            return None

        # Generate UUID for unique filename
        recycle_id = str(uuid.uuid4())
        file_extension = source_path.suffix
        original_filename = source_path.name
        source_stat = source_path.stat()
        file_size = source_stat.st_size
        original_mtime = source_stat.st_mtime

        # Create recycle directory if needed
        recycle_dir = Path("/opt/immich/recycle")
        recycle_dir.mkdir(parents=True, exist_ok=True)

        # Build recycle path with UUID
        recycle_path = recycle_dir / f"{recycle_id}{file_extension}"

        # Record in database (do this BEFORE moving file)
        with self.get_connection(for_write=True) as conn:
            cursor = conn.cursor()

            # Get metadata from file_inventory to preserve it for restoration
            cursor.execute('''
                SELECT platform, source, content_type, width, height, created_date FROM file_inventory
                WHERE file_path = ?
            ''', (file_path,))
            inventory_row = cursor.fetchone()

            # Get dates from downloads table to preserve for restoration
            cursor.execute('''
                SELECT post_date, download_date FROM downloads
                WHERE file_path = ?
                ORDER BY download_date DESC LIMIT 1
            ''', (file_path,))
            downloads_row = cursor.fetchone()

            # Merge file_inventory metadata with passed metadata
            # (values passed by the caller take precedence)
            full_metadata = metadata.copy() if metadata else {}
            if inventory_row:
                full_metadata.setdefault('platform', inventory_row['platform'])
                full_metadata.setdefault('source', inventory_row['source'])
                full_metadata.setdefault('content_type', inventory_row['content_type'])
                if inventory_row['width']:
                    full_metadata.setdefault('width', inventory_row['width'])
                if inventory_row['height']:
                    full_metadata.setdefault('height', inventory_row['height'])
            if downloads_row:
                full_metadata.setdefault('post_date', downloads_row['post_date'])
                full_metadata.setdefault('download_date', downloads_row['download_date'])

            # Fallback: Use file_inventory.created_date as download_date if not from downloads.
            # A stored None counts as missing, so a NULL downloads.download_date
            # does not block the fallback.
            if not full_metadata.get('download_date') and inventory_row and inventory_row['created_date']:
                full_metadata['download_date'] = inventory_row['created_date']

            # Fallback: Extract post_date from filename if not available (format: YYYYMMDD_HHMMSS_...)
            if not full_metadata.get('post_date'):
                date_match = re.match(r'^[^_]*_(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})_', original_filename)
                if date_match:
                    full_metadata['post_date'] = f"{date_match.group(1)}-{date_match.group(2)}-{date_match.group(3)}T{date_match.group(4)}:{date_match.group(5)}:{date_match.group(6)}"

            # CRITICAL: Force content_type to be 'image' or 'video' based on file extension
            ext = source_path.suffix.lower()
            image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp', '.bmp', '.tiff'}
            full_metadata['content_type'] = 'image' if ext in image_exts else 'video'

            # Calculate file hash BEFORE moving file (for duplicate detection)
            file_hash = None
            try:
                file_hash = UnifiedDatabase.get_file_hash(str(source_path))
            except Exception as e:
                logger.warning(f"Failed to calculate hash for recycle bin: {e}")

            # Check if this hash already exists in recycle bin (prevent internal duplicates)
            if file_hash:
                cursor.execute("SELECT id, original_filename FROM recycle_bin WHERE file_hash = ? LIMIT 1", (file_hash,))
                existing = cursor.fetchone()
                if existing:
                    logger.info(f"File hash already in recycle bin (existing: {existing['original_filename']}), skipping duplicate: {original_filename}")
                    # Delete the source file since we already have this in recycle bin
                    try:
                        source_path.unlink()
                        logger.debug(f"Deleted duplicate file: {source_path}")
                    except Exception as e:
                        logger.warning(f"Failed to delete duplicate file {source_path}: {e}")
                    # Still need to clean up the database records (downloads, file_inventory)
                    cursor.execute('DELETE FROM file_inventory WHERE file_path = ?', (file_path,))
                    # Delete from downloads table
                    cursor.execute('SELECT url FROM downloads WHERE file_path = ? LIMIT 1', (file_path,))
                    url_row = cursor.fetchone()
                    if url_row and url_row['url']:
                        cursor.execute('DELETE FROM downloads WHERE url = ?', (url_row['url'],))
                    else:
                        cursor.execute('DELETE FROM downloads WHERE file_path = ?', (file_path,))
                    return existing['id']  # Return existing recycle bin ID

            # Update perceptual hash path BEFORE moving file (so duplicate detection still works)
            cursor.execute('''
                UPDATE instagram_perceptual_hashes SET file_path = ? WHERE file_path = ?
            ''', (str(recycle_path), file_path))
            if cursor.rowcount > 0:
                logger.debug(f"Updated perceptual hash path to recycle: {original_filename}")

            # Update face recognition scan path
            cursor.execute('''
                UPDATE face_recognition_scans SET file_path = ? WHERE file_path = ?
            ''', (str(recycle_path), file_path))

            # Update semantic embeddings path
            try:
                cursor.execute('''
                    UPDATE semantic_embeddings SET file_path = ? WHERE file_path = ?
                ''', (str(recycle_path), file_path))
            except sqlite3.OperationalError:
                pass  # Table may not exist

            # Move file to recycle bin (preserves mtime)
            shutil.move(str(source_path), str(recycle_path))

            try:
                # Restore original mtime in case shutil.move changed it
                os.utime(str(recycle_path), (original_mtime, original_mtime))

                # Insert into recycle_bin
                cursor.execute('''
                    INSERT INTO recycle_bin
                    (id, original_path, original_filename, recycle_path, file_extension,
                     file_size, original_mtime, deleted_from, deleted_by, metadata, restore_count, file_hash)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0, ?)
                ''', (
                    recycle_id,
                    str(source_path),
                    original_filename,
                    str(recycle_path),
                    file_extension,
                    file_size,
                    original_mtime,
                    deleted_from,
                    deleted_by,
                    json.dumps(full_metadata, default=str),
                    file_hash
                ))

                # Update file_inventory to 'recycle' location (preserves tags and collections)
                # Don't delete - just update location and path so tags/collections are preserved for restore
                cursor.execute('''
                    UPDATE file_inventory
                    SET location = 'recycle', file_path = ?, last_verified = CURRENT_TIMESTAMP
                    WHERE file_path = ?
                ''', (str(recycle_path), file_path))
                if cursor.rowcount > 0:
                    logger.debug(f"Updated file_inventory location to recycle: {original_filename}")

                # Delete content embeddings when moving to recycle bin
                # (will be regenerated if file is restored to final)
                cursor.execute('''
                    DELETE FROM content_embeddings
                    WHERE file_id IN (SELECT id FROM file_inventory WHERE file_path = ?)
                ''', (str(recycle_path),))
                if cursor.rowcount > 0:
                    logger.debug(f"Deleted content embedding for recycled file: {original_filename}")

                # Delete from downloads table - for carousel posts, delete ALL entries with same URL
                # First get the URL for this file
                cursor.execute('SELECT url FROM downloads WHERE file_path = ? LIMIT 1', (file_path,))
                url_row = cursor.fetchone()
                if url_row and url_row['url']:
                    # Delete all downloads with this URL (handles carousel posts with multiple files)
                    cursor.execute('DELETE FROM downloads WHERE url = ?', (url_row['url'],))
                    logger.info(f"Deleted {cursor.rowcount} downloads records for URL: {url_row['url']}")
                else:
                    # Fallback to just deleting by file_path
                    cursor.execute('DELETE FROM downloads WHERE file_path = ?', (file_path,))
                    logger.info(f"Deleted {cursor.rowcount} downloads records")
            except Exception as db_error:
                # The file is already in the recycle directory but its records
                # could not be written. Put it back so it is not stranded
                # without a recycle_bin row.
                logger.error(f"Database operation failed during recycle, moving file back to original location: {db_error}")
                try:
                    shutil.move(str(recycle_path), str(source_path))
                except Exception as move_error:
                    logger.error(f"CRITICAL: Failed to move file back from recycle bin: {move_error}")
                raise

            logger.info(f"Moved to recycle bin: {original_filename} (ID: {recycle_id})")
            return recycle_id
    except Exception as e:
        logger.error(f"Failed to move file to recycle bin: {e}")
        return None
def restore_from_recycle_bin(self, recycle_id: str) -> bool:
    """
    Restore a file from recycle bin to its original location

    The destination is normally the recorded original path. Two adjustments
    are made:
      * an original path inside a temp folder is redirected to a
        platform-specific destination directory;
      * an Instagram/TikTok path whose content folder (stories, posts,
        reels, tagged) directly contains the file gets a username
        subdirectory inserted.

    After the file is moved, file_inventory, downloads, perceptual hashes,
    face recognition scans and semantic embeddings are re-pointed to the
    restored path and the recycle_bin row is deleted. If any of those steps
    fails, the file is moved back to the recycle bin and the error propagates
    to the outer handler.

    Args:
        recycle_id: UUID of recycle bin entry
    Returns:
        True if restored successfully. False if the entry or its file is
        missing, the destination already exists, or any error occurred
        (errors are logged).
    """
    import shutil
    import os
    import re
    try:
        with self.get_connection(for_write=True) as conn:
            cursor = conn.cursor()

            # Get recycle bin record
            cursor.execute('''
                SELECT original_path, recycle_path, original_mtime, restore_count, original_filename,
                       deleted_from, file_size, metadata
                FROM recycle_bin WHERE id = ?
            ''', (recycle_id,))
            row = cursor.fetchone()
            if not row:
                logger.error(f"Recycle bin entry not found: {recycle_id}")
                return False

            original_path = Path(row['original_path'])
            recycle_path = Path(row['recycle_path'])
            original_mtime = row['original_mtime']
            restore_count = row['restore_count']
            original_filename = row['original_filename']
            deleted_from = row['deleted_from']
            file_size = row['file_size']
            # A stored JSON "null" decodes to None; treat it like missing metadata
            metadata = (json.loads(row['metadata']) if row['metadata'] else {}) or {}

            if not recycle_path.exists():
                logger.error(f"Recycle file not found: {recycle_path}")
                return False

            # Extract metadata. The recycle step can store platform/source as
            # JSON null, so a None value must fall back to 'unknown' too.
            # Otherwise the inference below is skipped and NULL is written to
            # downloads/file_inventory.
            platform = metadata.get('platform') or 'unknown'
            # Normalize platform name (forum -> forums for consistency)
            if platform == 'forum':
                platform = 'forums'
            source = metadata.get('source') or 'unknown'

            # Infer platform/source from original path if not in metadata
            original_path_str_lower = str(original_path).lower()
            if platform == 'unknown':
                if '/instagram/' in original_path_str_lower:
                    platform = 'instagram'
                elif '/tiktok/' in original_path_str_lower:
                    platform = 'tiktok'
                elif '/snapchat/' in original_path_str_lower:
                    platform = 'snapchat'
                elif '/reddit/' in original_path_str_lower:
                    platform = 'reddit'
            if source == 'unknown' and original_filename:
                # Extract username from filename pattern: username_YYYYMMDD_...
                src_match = re.match(r'^(.+?)[-_](\d{8})[-_]', original_filename)
                if src_match:
                    source = src_match.group(1)

            # CRITICAL: Determine content_type from file extension only (must be 'image' or 'video')
            ext = original_path.suffix.lower()
            image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp', '.bmp', '.tiff'}
            content_type = 'image' if ext in image_exts else 'video'

            # Check if original_path is a temp folder - if so, redirect to proper destination
            original_path_str = str(original_path)
            if '/temp/' in original_path_str or original_path_str.startswith('/opt/media-downloader/temp/'):
                # File was never moved from temp - determine correct destination
                logger.warning(f"Original path is temp folder, redirecting to proper destination: {original_path}")
                # Determine destination based on platform and content type from path
                if '/instagram/' in original_path_str or platform == 'instagram':
                    if '/stories/' in original_path_str:
                        dest_base = Path('/opt/immich/md/social media/instagram/stories')
                    elif '/tagged/' in original_path_str:
                        dest_base = Path('/opt/immich/review/social media/instagram/tagged')
                    else:
                        dest_base = Path('/opt/immich/md/social media/instagram/posts')
                elif '/tiktok/' in original_path_str or platform == 'tiktok':
                    dest_base = Path('/opt/immich/md/social media/tiktok/reels')
                elif '/snapchat/' in original_path_str or platform == 'snapchat':
                    dest_base = Path('/opt/immich/md/social media/snapchat')
                else:
                    # Default to review folder
                    dest_base = Path('/opt/immich/review')

                # For Instagram/TikTok, add username subdirectory
                if platform in ('instagram', 'tiktok') or '/instagram/' in original_path_str or '/tiktok/' in original_path_str:
                    username_match = re.match(r'^(.+?)_(\d{8})_', original_filename)
                    # For tagged content, extract username from filename (poster's username)
                    # because source in database is the monitored account, not the poster
                    if '/tagged/' in original_path_str and username_match:
                        dest_base = dest_base / username_match.group(1)
                    elif source and source != 'unknown':
                        dest_base = dest_base / source
                    elif username_match:
                        # Fallback: extract username from filename
                        dest_base = dest_base / username_match.group(1)

                final_path = dest_base / original_filename
                logger.info(f"Redirected restore path from temp to: {final_path}")
            else:
                # Normal case - restore to original path
                # But check if Instagram/TikTok file needs username subdirectory added
                final_path = original_path
                if platform in ('instagram', 'tiktok') or '/instagram/' in original_path_str or '/tiktok/' in original_path_str:
                    # Check if path is missing username subdirectory
                    # Pattern: .../instagram/stories/filename (missing username)
                    # vs: .../instagram/stories/username/filename (has username)
                    path_parts = original_path_str.split('/')
                    # Find the content type folder (stories, posts, reels, tagged)
                    content_folders = ['stories', 'posts', 'reels', 'tagged']
                    for i, part in enumerate(path_parts):
                        if part in content_folders and i + 1 < len(path_parts):
                            # Check if next part is the filename (no username subdirectory)
                            next_part = path_parts[i + 1]
                            if next_part == original_filename:
                                # Missing username subdirectory - add it
                                # For tagged content, extract username from filename (poster's username)
                                # because source in database is the monitored account, not the poster
                                username_match = re.match(r'^(.+?)_(\d{8})_', original_filename)
                                if part == 'tagged' and username_match:
                                    # Tagged content - use poster's username from filename
                                    username = username_match.group(1)
                                elif source and source != 'unknown':
                                    username = source
                                elif username_match:
                                    # Fallback: extract from filename
                                    username = username_match.group(1)
                                else:
                                    username = None
                                if username:
                                    # Insert username subdirectory
                                    new_parts = path_parts[:i+1] + [username] + path_parts[i+1:]
                                    final_path = Path('/'.join(new_parts))
                                    logger.info(f"Added username subdirectory to restore path: {final_path}")
                            # Only the first content-type folder in the path is examined
                            break

            # Check if final path already exists
            if final_path.exists():
                logger.warning(f"File already exists at destination: {final_path}")
                return False

            # Ensure parent directory exists
            final_path.parent.mkdir(parents=True, exist_ok=True)

            # Move file from recycle bin to final location
            shutil.move(str(recycle_path), str(final_path))

            try:
                # Restore original mtime
                os.utime(str(final_path), (original_mtime, original_mtime))

                # Update file_inventory with restored path (preserves tags and collections)
                # Use T separator for ISO 8601 format (consistent timezone handling in frontend)
                from datetime import datetime
                created_date = datetime.fromtimestamp(original_mtime).strftime("%Y-%m-%dT%H:%M:%S")

                # Determine location based on where it was deleted from
                location = 'review' if deleted_from == 'review' else 'final'

                # First try to update existing record (file_inventory row kept during recycle)
                # Also restore platform/source from metadata in case they were lost
                cursor.execute('''
                    UPDATE file_inventory
                    SET file_path = ?, location = ?, last_verified = CURRENT_TIMESTAMP,
                        platform = CASE WHEN platform = 'unknown' OR platform IS NULL THEN ? ELSE platform END,
                        source = CASE WHEN source = 'unknown' OR source IS NULL THEN ? ELSE source END
                    WHERE file_path = ?
                ''', (str(final_path), location, platform, source, str(recycle_path)))

                if cursor.rowcount == 0:
                    # No existing record - insert new one (for legacy entries before this fix)
                    cursor.execute('''
                        INSERT INTO file_inventory (
                            file_path, filename, platform, source, content_type,
                            file_size, location, created_date, last_verified
                        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                        ON CONFLICT(file_path) DO UPDATE SET
                            filename = excluded.filename,
                            platform = excluded.platform,
                            source = excluded.source,
                            content_type = excluded.content_type,
                            file_size = excluded.file_size,
                            location = excluded.location,
                            created_date = excluded.created_date,
                            last_verified = CURRENT_TIMESTAMP
                    ''', (str(final_path), original_filename, platform, source, content_type,
                          file_size, location, created_date))
                else:
                    logger.debug(f"Updated existing file_inventory record for restored file: {original_filename}")

                # Re-add to downloads table so file appears in UI
                media_id = metadata.get('media_id', original_filename.split('_')[-1].split('.')[0])
                url = metadata.get('url', '')
                # Use original dates from metadata if available, otherwise use created_date
                post_date = metadata.get('post_date') or created_date
                download_date = metadata.get('download_date') or created_date
                # Generate url_hash for the unique constraint
                # (hash of the filename when no URL was recorded)
                import hashlib
                url_hash = hashlib.sha256(url.encode()).hexdigest() if url else hashlib.sha256(original_filename.encode()).hexdigest()
                cursor.execute('''
                    INSERT INTO downloads (
                        url_hash, url, media_id, platform, source, filename, file_path, file_size,
                        post_date, download_date, status
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'completed')
                    ON CONFLICT(url_hash) DO UPDATE SET
                        filename = excluded.filename,
                        file_path = excluded.file_path,
                        file_size = excluded.file_size,
                        post_date = excluded.post_date,
                        download_date = excluded.download_date,
                        status = 'completed'
                ''', (url_hash, url, media_id, platform, source, original_filename, str(final_path),
                      file_size, post_date, download_date))

                # Update perceptual hash path (from recycle to restored location)
                cursor.execute('''
                    UPDATE instagram_perceptual_hashes SET file_path = ? WHERE file_path = ?
                ''', (str(final_path), str(recycle_path)))
                if cursor.rowcount > 0:
                    logger.debug(f"Updated perceptual hash path from recycle: {original_filename}")

                # Update face recognition scan path
                cursor.execute('''
                    UPDATE face_recognition_scans SET file_path = ? WHERE file_path = ?
                ''', (str(final_path), str(recycle_path)))

                # Update semantic embeddings path
                try:
                    cursor.execute('''
                        UPDATE semantic_embeddings SET file_path = ? WHERE file_path = ?
                    ''', (str(final_path), str(recycle_path)))
                except sqlite3.OperationalError:
                    pass  # Table may not exist

                # Queue for discovery scan if restoring to 'final' (media library)
                if location == 'final':
                    cursor.execute('SELECT id FROM file_inventory WHERE file_path = ?', (str(final_path),))
                    inv_row = cursor.fetchone()
                    if inv_row:
                        file_id = inv_row['id']
                        cursor.execute('''
                            INSERT INTO discovery_scan_queue (file_id, file_path, scan_type, priority, status)
                            VALUES (?, ?, 'embedding', 8, 'pending')
                            ON CONFLICT(file_id, scan_type) DO NOTHING
                        ''', (file_id, str(final_path)))
                        if cursor.rowcount > 0:
                            logger.debug(f"Queued for discovery scan after restore: {original_filename}")

                # Delete from recycle_bin table (do this last so if file_inventory insert fails, transaction rolls back)
                cursor.execute('DELETE FROM recycle_bin WHERE id = ?', (recycle_id,))

                logger.info(f"Restored from recycle bin: {original_filename} (restored {restore_count + 1} times)")
                return True
            except Exception as db_error:
                # If database operations fail, move file back to recycle bin
                logger.error(f"Database operation failed during restore, moving file back to recycle bin: {db_error}")
                try:
                    shutil.move(str(final_path), str(recycle_path))
                except Exception as move_error:
                    logger.error(f"CRITICAL: Failed to move file back to recycle bin: {move_error}")
                raise
    except Exception as e:
        logger.error(f"Failed to restore from recycle bin: {e}")
        return False
def permanently_delete_from_recycle_bin(self, recycle_id: str) -> bool:
    """
    Permanently delete a file from recycle bin

    Removes the recycled file from disk (when it is still there) and then
    deletes its recycle_bin row.

    Args:
        recycle_id: UUID of recycle bin entry
    Returns:
        True if deleted successfully; False when the entry does not exist
        or an error occurred (both cases are logged)
    """
    try:
        with self.get_connection(for_write=True) as conn:
            cur = conn.cursor()
            # Look up where the recycled file lives
            cur.execute('SELECT recycle_path, original_filename FROM recycle_bin WHERE id = ?', (recycle_id,))
            entry = cur.fetchone()
            if entry:
                stored_file = Path(entry['recycle_path'])
                display_name = entry['original_filename']
                # Remove the physical file first; a missing file is not an error
                if stored_file.exists():
                    stored_file.unlink()
                # Then drop the database record
                cur.execute('DELETE FROM recycle_bin WHERE id = ?', (recycle_id,))
                logger.info(f"Permanently deleted: {display_name}")
                return True
            logger.error(f"Recycle bin entry not found: {recycle_id}")
            return False
    except Exception as exc:
        logger.error(f"Failed to permanently delete: {exc}")
        return False
def list_recycle_bin(self, deleted_from: str = None, platform: str = None, source: str = None,
                     search: str = None, media_type: str = None, date_from: str = None,
                     date_to: str = None, size_min: int = None, size_max: int = None,
                     sort_by: str = 'deleted_at', sort_order: str = 'desc',
                     limit: int = 100, offset: int = 0) -> Dict:
    """
    List files in recycle bin

    Each item is the recycle_bin row as a dict, enriched with width/height,
    platform, source, download_date, post_date and video_id when they can be
    determined, plus a 'face_recognition' dict when a scan exists for the
    recycled file. Only a file's most recent scan is joined, so a file that
    was scanned several times still appears once.

    Args:
        deleted_from: Filter by source ('downloads', 'media', 'review'), or None for all
        platform: Filter by platform (instagram, tiktok, etc.)
        source: Filter by source/username
        search: Search in filename
        media_type: Filter by type ('image', 'video', or None for all)
        date_from: Filter by deletion date (YYYY-MM-DD)
        date_to: Filter by deletion date (YYYY-MM-DD)
        size_min: Minimum file size in bytes
        size_max: Maximum file size in bytes
        sort_by: Column to sort by ('deleted_at', 'file_size', 'filename', 'deleted_from',
                 'download_date', 'post_date', 'confidence'); any other value sorts by
                 download date
        sort_order: Sort direction ('asc' or 'desc')
        limit: Maximum number of results
        offset: Offset for pagination
    Returns:
        Dict with 'items' (list of recycle bin entries) and 'total' (total count for filter).
        On error an empty listing is returned and the error is logged.
    """
    try:
        with self.get_connection() as conn:
            cursor = conn.cursor()

            # Build WHERE clause with table alias 'r.' from the start
            # to avoid fragile chained .replace() calls
            conditions = []
            params = []
            if deleted_from:
                conditions.append('r.deleted_from = ?')
                params.append(deleted_from)
            if platform:
                # Platform is stored in metadata JSON
                conditions.append("json_extract(r.metadata, '$.platform') = ?")
                params.append(platform)
            if source:
                # Source is stored in metadata JSON
                conditions.append("json_extract(r.metadata, '$.source') = ?")
                params.append(source)
            if search:
                conditions.append('r.original_filename LIKE ?')
                params.append(f'%{search}%')
            if media_type == 'image':
                conditions.append("LOWER(r.file_extension) IN ('.jpg', '.jpeg', '.png', '.heic', '.heif', '.webp', '.bmp', '.avif', '.tiff', '.tif', '.gif')")
            elif media_type == 'video':
                conditions.append("LOWER(r.file_extension) IN ('.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv', '.m4v')")
            if date_from:
                conditions.append('r.deleted_at >= ?')
                params.append(date_from)
            if date_to:
                conditions.append('r.deleted_at <= ?')
                params.append(date_to + ' 23:59:59')
            if size_min is not None:
                conditions.append('r.file_size >= ?')
                params.append(size_min)
            if size_max is not None:
                conditions.append('r.file_size <= ?')
                params.append(size_max)
            where_clause = ' AND '.join(conditions) if conditions else '1=1'

            # Validate and build ORDER BY clause. Only expressions from this
            # whitelist are interpolated into the SQL text.
            valid_sort_columns = {
                'deleted_at': 'r.deleted_at',
                'file_size': 'r.file_size',
                'filename': 'r.original_filename',
                'deleted_from': 'r.deleted_from',
                'download_date': "COALESCE(json_extract(r.metadata, '$.download_date'), CAST(r.deleted_at AS TEXT))",
                'post_date': "COALESCE(json_extract(r.metadata, '$.post_date'), json_extract(r.metadata, '$.download_date'), CAST(r.deleted_at AS TEXT))",
                'confidence': "COALESCE(fr.confidence, 0)"
            }
            sort_column = valid_sort_columns.get(sort_by, "COALESCE(json_extract(r.metadata, '$.download_date'), CAST(r.deleted_at AS TEXT))")
            sort_dir = 'ASC' if sort_order == 'asc' else 'DESC'

            # Get total count for the filter
            cursor.execute(f'SELECT COUNT(*) as count FROM recycle_bin r WHERE {where_clause}', params)
            total_count = cursor.fetchone()['count']

            # Get paginated items with face recognition data.
            # A file can have several scan rows. Joining all of them would
            # repeat the recycle entry once per scan and make the page
            # disagree with total_count, so only scans with no strictly newer
            # scan for the same path are joined.
            cursor.execute(f'''
                SELECT r.id, r.original_path, r.original_filename, r.file_extension, r.file_size,
                       r.original_mtime, r.deleted_from, r.deleted_at, r.deleted_by, r.metadata, r.restore_count, r.recycle_path,
                       fr.has_match as face_has_match, fr.matched_person as face_matched_person,
                       fr.confidence as face_confidence, fr.face_count
                FROM recycle_bin r
                LEFT JOIN face_recognition_scans fr
                    ON r.recycle_path = fr.file_path
                    AND NOT EXISTS (
                        SELECT 1 FROM face_recognition_scans newer
                        WHERE newer.file_path = fr.file_path
                          AND newer.scan_date > fr.scan_date
                    )
                WHERE {where_clause}
                ORDER BY {sort_column} {sort_dir}
                LIMIT ? OFFSET ?
            ''', params + [limit, offset])
            rows = cursor.fetchall()

            # Parse each row's metadata JSON once; reused by both passes below
            parsed_metadata = []
            for row in rows:
                meta = {}
                if row['metadata']:
                    try:
                        meta = json.loads(row['metadata'])
                    except (json.JSONDecodeError, TypeError):
                        pass
                # Valid JSON that is not an object carries no usable fields
                if not isinstance(meta, dict):
                    meta = {}
                parsed_metadata.append(meta)

            # Batch lookup dimensions from media_metadata.db for items missing width/height
            dimensions_cache = {}
            paths_needing_dimensions = [
                row['original_path']  # Use original_path for dimension lookup
                for row, meta in zip(rows, parsed_metadata)
                if (not meta.get('width') or not meta.get('height')) and row['original_path']
            ]
            if paths_needing_dimensions:
                dimensions_cache = self._get_dimensions_batch(paths_needing_dimensions)

            items = []
            for row, meta in zip(rows, parsed_metadata):
                item = dict(row)

                # Try to get metadata from JSON first
                width = meta.get('width')
                height = meta.get('height')
                platform = meta.get('platform')
                source = meta.get('source')
                download_date = meta.get('download_date')
                post_date = meta.get('post_date')
                video_id = meta.get('video_id')

                # If dimensions not in metadata, check batch cache
                if (not width or not height) and item.get('original_path'):
                    cached_dims = dimensions_cache.get(item['original_path'])
                    if cached_dims:
                        width, height = cached_dims

                # Try to extract platform/source from filename if not in metadata
                if not platform or not source:
                    filename = item.get('original_filename', '')
                    if filename and '_' in filename:
                        # Pattern: username_date_id... -> source is first part
                        parts = filename.split('_')
                        if len(parts) >= 2:
                            potential_source = parts[0]
                            # Validate it looks like a username (not a random prefix)
                            if len(potential_source) >= 3 and potential_source.replace('.', '').isalnum():
                                if not source:
                                    source = potential_source
                                if not platform:
                                    platform = 'instagram'  # Default assumption for this filename pattern

                # Only add keys that have a value
                if width:
                    item['width'] = width
                if height:
                    item['height'] = height
                if platform:
                    item['platform'] = platform
                if source:
                    item['source'] = source
                if download_date:
                    item['download_date'] = download_date
                if post_date:
                    item['post_date'] = post_date
                if video_id:
                    item['video_id'] = video_id

                # Add face recognition data if available
                if item.get('face_has_match') is not None or item.get('face_confidence') is not None:
                    item['face_recognition'] = {
                        'scanned': True,
                        'matched': bool(item.get('face_has_match')),
                        'matched_person': item.get('face_matched_person'),
                        'confidence': item.get('face_confidence'),
                        'face_count': item.get('face_count')
                    }

                # Clean up raw face fields
                for key in ['face_has_match', 'face_matched_person', 'face_confidence', 'face_count']:
                    item.pop(key, None)
                items.append(item)

            return {
                'items': items,
                'total': total_count
            }
    except Exception as e:
        logger.error(f"Failed to list recycle bin: {e}")
        return {'items': [], 'total': 0}
def get_recycle_bin_filters(self, platform: str = None) -> Dict:
    """
    List the platform and source values available for filtering the recycle bin.

    Both values are read from the JSON ``metadata`` column of ``recycle_bin``.

    Args:
        platform: When truthy, only sources whose metadata platform equals
            this value are returned

    Returns:
        Dict with 'platforms' and 'sources' lists; both are empty when the
        lookup fails (the failure is logged, not raised)
    """
    # Names accepted as real platforms. Download methods such as 'fastdl' or
    # 'imginn' are left out on purpose so they never appear as a platform.
    known_platforms = frozenset((
        'instagram', 'tiktok', 'twitter', 'youtube', 'reddit',
        'forums', 'erome', 'fapello', 'bunkr', 'coomer', 'kemono',
        'onlyfans', 'fansly', 'patreon', 'pornhub', 'xvideos',
        'redgifs', 'imgur', 'gfycat', 'streamable', 'vimeo',
        'bilibili', 'coppermine', 'snapchat', 'facebook', 'pinterest',
        'tumblr', 'flickr', 'dailymotion', 'twitch',
    ))
    platform_sql = '''
        SELECT DISTINCT json_extract(metadata, '$.platform') as platform
        FROM recycle_bin
        WHERE json_extract(metadata, '$.platform') IS NOT NULL
        ORDER BY platform
    '''
    sources_for_platform_sql = '''
        SELECT DISTINCT json_extract(metadata, '$.source') as source
        FROM recycle_bin
        WHERE json_extract(metadata, '$.platform') = ?
        AND json_extract(metadata, '$.source') IS NOT NULL
        ORDER BY source
    '''
    all_sources_sql = '''
        SELECT DISTINCT json_extract(metadata, '$.source') as source
        FROM recycle_bin
        WHERE json_extract(metadata, '$.source') IS NOT NULL
        ORDER BY source
    '''
    try:
        with self.get_connection() as conn:
            cur = conn.cursor()

            cur.execute(platform_sql)
            found_platforms = []
            for record in cur.fetchall():
                name = record['platform']
                # Membership is tested case-insensitively, but the stored
                # spelling is what gets returned.
                if name and name.lower() in known_platforms:
                    found_platforms.append(name)

            if platform:
                cur.execute(sources_for_platform_sql, (platform,))
            else:
                cur.execute(all_sources_sql)
            found_sources = []
            for record in cur.fetchall():
                if record['source']:
                    found_sources.append(record['source'])

            return {'platforms': found_platforms, 'sources': found_sources}
    except Exception as e:
        logger.error(f"Failed to get recycle bin filters: {e}")
        return {'platforms': [], 'sources': []}
def get_recycle_bin_stats(self) -> Dict:
    """
    Summarise the contents of the recycle bin.

    Returns:
        Dict with 'total_count', 'total_size' and 'by_source'; the latter maps
        each ``deleted_from`` value to its own {'count', 'size'} pair. Zeroed
        values are returned when the lookup fails (the failure is logged).
    """
    try:
        with self.get_connection() as conn:
            cur = conn.cursor()

            # Overall totals; COALESCE keeps the size at 0 for an empty table
            cur.execute('SELECT COUNT(*), COALESCE(SUM(file_size), 0) FROM recycle_bin')
            totals = cur.fetchone()

            # Same aggregates, broken down by where the file was deleted from
            cur.execute('''
                SELECT deleted_from, COUNT(*), COALESCE(SUM(file_size), 0)
                FROM recycle_bin
                GROUP BY deleted_from
            ''')
            breakdown = {
                record[0]: {'count': record[1], 'size': record[2]}
                for record in cur.fetchall()
            }

            return {
                'total_count': totals[0],
                'total_size': totals[1],
                'by_source': breakdown,
            }
    except Exception as e:
        logger.error(f"Failed to get recycle bin stats: {e}")
        return {'total_count': 0, 'total_size': 0, 'by_source': {}}
def empty_recycle_bin(self, older_than_days: int = None) -> int:
    """
    Empty recycle bin (delete all or files older than X days)

    Each entry is handled independently: a file that cannot be removed from
    disk keeps its database row (so it can be retried later) and does not
    stop the remaining entries from being processed. A file that is already
    missing on disk is treated as deleted.

    Args:
        older_than_days: Only delete files older than this many days, or
            None (or 0) for all

    Returns:
        Number of recycle bin entries removed from the database; 0 if the
        operation as a whole fails (the failure is logged)
    """
    try:
        with self.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            if older_than_days:
                # Select files older than X days
                cursor.execute('''
                    SELECT id, recycle_path FROM recycle_bin
                    WHERE deleted_at < datetime('now', ? || ' days')
                ''', (f'-{older_than_days}',))
            else:
                # Select all files
                cursor.execute('SELECT id, recycle_path FROM recycle_bin')
            rows = cursor.fetchall()

            deleted_count = 0
            for row in rows:
                recycle_id = row['id']
                recycle_path = row['recycle_path']

                # Delete the physical file first. Unlinking directly (instead of
                # exists() + unlink()) avoids a race with concurrent deletion.
                if recycle_path:
                    try:
                        Path(recycle_path).unlink()
                    except FileNotFoundError:
                        pass  # already gone from disk; still drop the row
                    except OSError as e:
                        # Keep the row so the file is not orphaned on disk,
                        # and carry on with the remaining entries.
                        logger.warning(f"Could not delete recycled file {recycle_path}: {e}")
                        continue

                # Delete from database
                cursor.execute('DELETE FROM recycle_bin WHERE id = ?', (recycle_id,))
                deleted_count += 1

            logger.info(f"Emptied recycle bin: {deleted_count} files deleted")
            return deleted_count
    except Exception as e:
        logger.error(f"Failed to empty recycle bin: {e}")
        return 0
# ==================== File Inventory Methods ====================
def upsert_file_inventory(self, file_path: str, filename: str, platform: str,
                          source: str = None, content_type: str = None,
                          file_size: int = None, file_hash: str = None,
                          width: int = None, height: int = None,
                          location: str = 'final', metadata: dict = None,
                          created_date: str = None, method: str = None,
                          video_id: str = None,
                          max_retries: int = DB_MAX_RETRIES) -> bool:
    """
    Create the inventory row for a file, or refresh it if the path is already known.

    On conflict every column is overwritten with the new values, except that
    ``created_date`` is only replaced when one is supplied and ``video_id``
    keeps its stored value when the new one is NULL. Files whose location is
    'final' are additionally queued for an 'embedding' discovery scan.

    Args:
        file_path: Absolute path to file (the conflict key)
        filename: Filename for display
        platform: Platform name (instagram, tiktok, snapchat, forum, coppermine)
        source: Username, forum name, etc.
        content_type: 'image' or 'video'
        file_size: File size in bytes
        file_hash: SHA256 hash for deduplication
        width: Image/video width
        height: Image/video height
        location: 'final', 'review', or 'recycle'
        metadata: Additional metadata as dict (stored JSON encoded; an empty
            dict is stored as NULL)
        created_date: Optional creation date (ISO format string or timestamp);
            CURRENT_TIMESTAMP is used for new rows when omitted
        method: Download method (fastdl, imginn, toolzu, instaloader) for Instagram
        video_id: Video ID for YouTube/other platforms (for thumbnail lookup)
        max_retries: Maximum attempts when the database reports a lock error

    Returns:
        bool: True if successful, False on any error (errors are logged)
    """
    # Variant used when the caller supplies created_date
    upsert_with_date_sql = '''
        INSERT INTO file_inventory (
            file_path, filename, platform, source, content_type,
            file_size, file_hash, width, height, location, metadata,
            created_date, last_verified, method, video_id
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, ?, ?)
        ON CONFLICT(file_path) DO UPDATE SET
            filename = excluded.filename,
            platform = excluded.platform,
            source = excluded.source,
            content_type = excluded.content_type,
            file_size = excluded.file_size,
            file_hash = excluded.file_hash,
            width = excluded.width,
            height = excluded.height,
            location = excluded.location,
            metadata = excluded.metadata,
            created_date = excluded.created_date,
            last_verified = CURRENT_TIMESTAMP,
            method = excluded.method,
            video_id = COALESCE(excluded.video_id, file_inventory.video_id)
    '''
    # Variant that stamps new rows with CURRENT_TIMESTAMP and leaves an
    # existing row's created_date untouched
    upsert_default_date_sql = '''
        INSERT INTO file_inventory (
            file_path, filename, platform, source, content_type,
            file_size, file_hash, width, height, location, metadata,
            created_date, last_verified, method, video_id
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, ?, ?)
        ON CONFLICT(file_path) DO UPDATE SET
            filename = excluded.filename,
            platform = excluded.platform,
            source = excluded.source,
            content_type = excluded.content_type,
            file_size = excluded.file_size,
            file_hash = excluded.file_hash,
            width = excluded.width,
            height = excluded.height,
            location = excluded.location,
            metadata = excluded.metadata,
            last_verified = CURRENT_TIMESTAMP,
            method = excluded.method,
            video_id = COALESCE(excluded.video_id, file_inventory.video_id)
    '''
    # Low priority (8): embedding generation is background work
    enqueue_sql = '''
        INSERT INTO discovery_scan_queue (file_id, file_path, scan_type, priority, status)
        VALUES (?, ?, 'embedding', 8, 'pending')
        ON CONFLICT(file_id, scan_type) DO NOTHING
    '''
    last_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            with self.get_connection(for_write=True) as conn:
                cur = conn.cursor()

                encoded_metadata = json.dumps(metadata) if metadata else None
                leading_values = (file_path, filename, platform, source, content_type,
                                  file_size, file_hash, width, height, location,
                                  encoded_metadata)
                if created_date:
                    cur.execute(upsert_with_date_sql,
                                leading_values + (created_date, method, video_id))
                else:
                    cur.execute(upsert_default_date_sql,
                                leading_values + (method, video_id))

                # Media-library files get queued for a discovery scan
                if location == 'final':
                    cur.execute('SELECT id FROM file_inventory WHERE file_path = ?', (file_path,))
                    inventory_row = cur.fetchone()
                    if inventory_row:
                        cur.execute(enqueue_sql, (inventory_row['id'], file_path))
                        if cur.rowcount > 0:
                            logger.debug(f"Queued for discovery scan: {filename}")

                return True
        except sqlite3.OperationalError as e:
            if not _is_lock_error(e):
                logger.error(f"Database error during file inventory upsert: {e}")
                return False
            if attempt >= last_attempt:
                logger.error(f"Failed to upsert file inventory after {max_retries} attempts: {e}")
                return False
            # Exponential backoff, capped at DB_MAX_DELAY, before the next attempt
            delay = min(DB_MAX_DELAY, DB_BASE_DELAY * (2 ** attempt))
            logger.warning(f"Database locked during file inventory upsert, retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries})")
            time.sleep(delay)
        except Exception as e:
            logger.error(f"Failed to upsert file inventory for {file_path}: {e}")
            return False
    return False
def delete_file_inventory(self, file_path: str, max_retries: int = DB_MAX_RETRIES) -> bool:
    """
    Drop a file's row from the inventory (used when it is permanently deleted).

    Args:
        file_path: Absolute path to file
        max_retries: Maximum attempts when the database reports a lock error

    Returns:
        bool: True if a row was deleted; False if nothing matched or an
        error occurred (errors are logged)
    """
    last_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            with self.get_connection(for_write=True) as conn:
                cur = conn.cursor()
                cur.execute('DELETE FROM file_inventory WHERE file_path = ?', (file_path,))
                removed = cur.rowcount > 0
                return removed
        except sqlite3.OperationalError as e:
            if not _is_lock_error(e):
                logger.error(f"Database error during file inventory delete: {e}")
                return False
            if attempt >= last_attempt:
                logger.error(f"Failed to delete file inventory after {max_retries} attempts: {e}")
                return False
            # Exponential backoff, capped at DB_MAX_DELAY, before the next attempt
            delay = min(DB_MAX_DELAY, DB_BASE_DELAY * (2 ** attempt))
            logger.warning(f"Database locked during file inventory delete, retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries})")
            time.sleep(delay)
        except Exception as e:
            logger.error(f"Failed to delete file inventory for {file_path}: {e}")
            return False
    return False
def query_file_inventory(self, location: str = None, platform: str = None,
                         source: str = None, content_type: str = None,
                         limit: int = 50, offset: int = 0) -> List[Dict]:
    """
    Fetch one page of inventory rows, newest ``created_date`` first.

    Every filter is optional; a falsy value means "do not filter on this column".

    Args:
        location: Filter by location ('final', 'review', 'recycle')
        platform: Filter by platform
        source: Filter by source (username, forum name, etc.)
        content_type: Filter by content type ('image', 'video')
        limit: Maximum number of results
        offset: Offset for pagination

    Returns:
        List of file records as dictionaries; empty on error (logged)
    """
    # (column, requested value) pairs, in the order the clauses are appended
    requested = (
        ('location', location),
        ('platform', platform),
        ('source', source),
        ('content_type', content_type),
    )
    try:
        with self.get_connection() as conn:
            cur = conn.cursor()

            pieces = ['SELECT * FROM file_inventory WHERE 1=1']
            args = []
            for column, wanted in requested:
                if wanted:
                    pieces.append(f' AND {column} = ?')
                    args.append(wanted)
            pieces.append(' ORDER BY created_date DESC LIMIT ? OFFSET ?')
            args += [limit, offset]

            cur.execute(''.join(pieces), args)
            return [dict(record) for record in cur.fetchall()]
    except Exception as e:
        logger.error(f"Failed to query file inventory: {e}")
        return []
def update_file_inventory_location(self, file_path: str, new_location: str, new_file_path: str = None,
                                   max_retries: int = DB_MAX_RETRIES) -> bool:
    """
    Move an inventory row to another location and, optionally, another path.

    Besides the location change this method:
      * sets ``moved_from_review`` on a review -> final move and
        ``moved_from_media`` on a final -> review move;
      * deletes the file's content embeddings when it leaves 'final' for
        'review' or 'recycle';
      * queues an 'embedding' discovery scan when the new location is 'final'.

    Args:
        file_path: Current absolute path to file
        new_location: New location ('final', 'review', 'recycle')
        new_file_path: Optional new file path (if file was moved)
        max_retries: Maximum attempts when the database reports a lock error

    Returns:
        bool: True if a row was updated; False if nothing matched or an
        error occurred (errors are logged)
    """
    # Statements that also rewrite file_path, keyed by the moved_* flag to raise
    relocate_sql = {
        'review': '''
            UPDATE file_inventory
            SET location = ?, file_path = ?, last_verified = CURRENT_TIMESTAMP, moved_from_review = 1
            WHERE file_path = ?
        ''',
        'media': '''
            UPDATE file_inventory
            SET location = ?, file_path = ?, last_verified = CURRENT_TIMESTAMP, moved_from_media = 1
            WHERE file_path = ?
        ''',
        None: '''
            UPDATE file_inventory
            SET location = ?, file_path = ?, last_verified = CURRENT_TIMESTAMP
            WHERE file_path = ?
        ''',
    }
    # Statements that keep file_path as is, keyed the same way
    relabel_sql = {
        'review': '''
            UPDATE file_inventory
            SET location = ?, last_verified = CURRENT_TIMESTAMP, moved_from_review = 1
            WHERE file_path = ?
        ''',
        'media': '''
            UPDATE file_inventory
            SET location = ?, last_verified = CURRENT_TIMESTAMP, moved_from_media = 1
            WHERE file_path = ?
        ''',
        None: '''
            UPDATE file_inventory
            SET location = ?, last_verified = CURRENT_TIMESTAMP
            WHERE file_path = ?
        ''',
    }
    enqueue_sql = '''
        INSERT INTO discovery_scan_queue (file_id, file_path, scan_type, priority, status)
        VALUES (?, ?, 'embedding', 8, 'pending')
        ON CONFLICT(file_id, scan_type) DO NOTHING
    '''
    last_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            with self.get_connection(for_write=True) as conn:
                cur = conn.cursor()

                # Capture id and location as they were before the update
                cur.execute('SELECT id, location FROM file_inventory WHERE file_path = ?', (file_path,))
                current = cur.fetchone()
                if current:
                    file_id, old_location = current['id'], current['location']
                else:
                    file_id = old_location = None

                # Which moved_* flag, if any, this transition raises
                if old_location == 'review' and new_location == 'final':
                    origin_flag = 'review'
                elif old_location == 'final' and new_location == 'review':
                    origin_flag = 'media'
                else:
                    origin_flag = None

                if new_file_path:
                    cur.execute(relocate_sql[origin_flag], (new_location, new_file_path, file_path))
                else:
                    cur.execute(relabel_sql[origin_flag], (new_location, file_path))
                updated = cur.rowcount > 0

                if updated and file_id:
                    if old_location == 'final' and new_location in ('review', 'recycle'):
                        # Leaving the media library: its embedding is dropped
                        cur.execute('DELETE FROM content_embeddings WHERE file_id = ?', (file_id,))
                        if cur.rowcount > 0:
                            logger.debug(f"Deleted embedding for file moved to {new_location}: {Path(file_path).name}")
                    elif new_location == 'final':
                        # Entering the media library: queue a discovery scan
                        actual_path = new_file_path or file_path
                        cur.execute(enqueue_sql, (file_id, actual_path))
                        if cur.rowcount > 0:
                            logger.debug(f"Queued for discovery scan after move to final: {Path(actual_path).name}")

                return updated
        except sqlite3.OperationalError as e:
            if not _is_lock_error(e):
                logger.error(f"Database error during location update: {e}")
                return False
            if attempt >= last_attempt:
                logger.error(f"Failed to update file inventory location after {max_retries} attempts: {e}")
                return False
            # Exponential backoff, capped at DB_MAX_DELAY, before the next attempt
            delay = min(DB_MAX_DELAY, DB_BASE_DELAY * (2 ** attempt))
            logger.warning(f"Database locked during location update, retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries})")
            time.sleep(delay)
        except Exception as e:
            logger.error(f"Failed to update file inventory location for {file_path}: {e}")
            return False
    return False
def get_file_inventory_count(self, location: str = None, platform: str = None) -> int:
    """
    Count inventory rows, optionally narrowed by location and/or platform.

    Args:
        location: Filter by location (ignored when falsy)
        platform: Filter by platform (ignored when falsy)

    Returns:
        int: Count of files; 0 on error (logged)
    """
    try:
        with self.get_connection() as conn:
            cur = conn.cursor()

            pieces = ['SELECT COUNT(*) FROM file_inventory WHERE 1=1']
            args = []
            for column, wanted in (('location', location), ('platform', platform)):
                if wanted:
                    pieces.append(f' AND {column} = ?')
                    args.append(wanted)

            cur.execute(''.join(pieces), args)
            return cur.fetchone()[0]
    except Exception as e:
        logger.error(f"Failed to get file inventory count: {e}")
        return 0
# ==================== Discovery Scan Queue Methods ====================
def queue_discovery_scan(self, file_id: int, file_path: str, scan_type: str = 'embedding', priority: int = 5) -> bool:
    """
    Put a file on the discovery scan queue for background processing.

    If the (file_id, scan_type) pair is already queued, the existing entry is
    reset to 'pending' with zero attempts and no error message, and keeps the
    numerically lower of the old and new priority.

    Args:
        file_id: ID from file_inventory table
        file_path: Path to the file
        scan_type: Type of scan ('embedding', 'perceptual_hash', 'face_recognition')
        priority: 1-10, lower = higher priority

    Returns:
        bool: True if queued successfully, False on error (logged)
    """
    enqueue_sql = '''
        INSERT INTO discovery_scan_queue (file_id, file_path, scan_type, priority, status)
        VALUES (?, ?, ?, ?, 'pending')
        ON CONFLICT(file_id, scan_type) DO UPDATE SET
            status = 'pending',
            priority = MIN(excluded.priority, discovery_scan_queue.priority),
            attempts = 0,
            error_message = NULL
    '''
    entry = (file_id, file_path, scan_type, priority)
    try:
        with self.get_connection(for_write=True) as conn:
            conn.cursor().execute(enqueue_sql, entry)
            return True
    except Exception as e:
        logger.error(f"Failed to queue discovery scan for file_id {file_id}: {e}")
        return False
def queue_file_for_discovery(self, file_path: str, scan_types: List[str] = None, priority: int = 5) -> bool:
    """
    Queue a file for all discovery scans (convenience method).

    The file's inventory id is looked up first; the read connection used for
    that lookup is released before any queue entry is written, so the write
    connections opened by queue_discovery_scan() never have to be acquired
    while this method is still holding a connection of its own.

    Args:
        file_path: Path to the file
        scan_types: List of scan types (default: ['embedding'])
        priority: 1-10, lower = higher priority

    Returns:
        bool: True if every requested scan type was queued; False if the file
        is not in the inventory, any scan type failed to queue, or an error
        occurred (logged)
    """
    if scan_types is None:
        scan_types = ['embedding']
    try:
        # Step 1: resolve the inventory id, holding the connection only for the lookup
        with self.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT id FROM file_inventory WHERE file_path = ?', (file_path,))
            row = cursor.fetchone()
            file_id = row['id'] if row else None

        if row is None or not row:
            logger.debug(f"File not in inventory, cannot queue for discovery: {file_path}")
            return False

        # Step 2: queue every scan type. All of them are attempted even if one
        # fails; the overall result reports whether all succeeded.
        success = True
        for scan_type in scan_types:
            if not self.queue_discovery_scan(file_id, file_path, scan_type, priority):
                success = False
        return success
    except Exception as e:
        logger.error(f"Failed to queue file for discovery: {e}")
        return False
def get_pending_discovery_scans(self, limit: int = 50, scan_type: str = None) -> List[Dict]:
    """
    Read the next pending entries from the discovery scan queue.

    Entries are ordered by ascending priority value, then by ascending
    created_date.

    Args:
        limit: Maximum number of items to return
        scan_type: Filter by scan type (optional)

    Returns:
        List of queue items (id, file_id, file_path, scan_type, priority,
        attempts) as dictionaries; empty on error (logged)
    """
    if scan_type:
        select_sql = '''
            SELECT id, file_id, file_path, scan_type, priority, attempts
            FROM discovery_scan_queue
            WHERE status = 'pending' AND scan_type = ?
            ORDER BY priority ASC, created_date ASC
            LIMIT ?
        '''
        args = (scan_type, limit)
    else:
        select_sql = '''
            SELECT id, file_id, file_path, scan_type, priority, attempts
            FROM discovery_scan_queue
            WHERE status = 'pending'
            ORDER BY priority ASC, created_date ASC
            LIMIT ?
        '''
        args = (limit,)
    try:
        with self.get_connection() as conn:
            cur = conn.cursor()
            cur.execute(select_sql, args)
            return [dict(record) for record in cur.fetchall()]
    except Exception as e:
        logger.error(f"Failed to get pending discovery scans: {e}")
        return []
def mark_discovery_scan_started(self, queue_id: int) -> bool:
    """
    Flag a queue entry as being processed.

    Sets the status to 'processing', stamps started_date and counts the attempt.

    Args:
        queue_id: ID of the discovery_scan_queue row

    Returns:
        bool: True if a row was updated; False if the id is unknown or an
        error occurred (logged)
    """
    start_sql = '''
        UPDATE discovery_scan_queue
        SET status = 'processing', started_date = CURRENT_TIMESTAMP, attempts = attempts + 1
        WHERE id = ?
    '''
    try:
        with self.get_connection(for_write=True) as conn:
            cur = conn.cursor()
            cur.execute(start_sql, (queue_id,))
            touched = cur.rowcount
            return touched > 0
    except Exception as e:
        logger.error(f"Failed to mark discovery scan started: {e}")
        return False
def mark_discovery_scan_completed(self, queue_id: int) -> bool:
    """
    Complete a queue entry by deleting it from the discovery scan queue.

    Args:
        queue_id: ID of the discovery_scan_queue row

    Returns:
        bool: True if a row was removed; False if the id is unknown or an
        error occurred (logged)
    """
    try:
        with self.get_connection(for_write=True) as conn:
            cur = conn.cursor()
            cur.execute('DELETE FROM discovery_scan_queue WHERE id = ?', (queue_id,))
            removed = cur.rowcount
            return removed > 0
    except Exception as e:
        logger.error(f"Failed to mark discovery scan completed: {e}")
        return False
def mark_discovery_scan_failed(self, queue_id: int, error_message: str, max_attempts: int = 3) -> bool:
    """
    Record a failed scan on a queue entry.

    While the entry has used fewer than ``max_attempts`` attempts it goes back
    to 'pending' so it will be retried; otherwise it is marked 'failed' and
    stamped with a completed_date. The error message is stored either way.

    Args:
        queue_id: ID of the discovery_scan_queue row
        error_message: Description of the failure
        max_attempts: Attempt count at which the entry is given up on

    Returns:
        bool: True if the entry was updated; False if the id is unknown or an
        error occurred (logged)
    """
    give_up_sql = '''
        UPDATE discovery_scan_queue
        SET status = 'failed', error_message = ?, completed_date = CURRENT_TIMESTAMP
        WHERE id = ?
    '''
    retry_sql = '''
        UPDATE discovery_scan_queue
        SET status = 'pending', error_message = ?
        WHERE id = ?
    '''
    try:
        with self.get_connection(for_write=True) as conn:
            cur = conn.cursor()
            cur.execute('SELECT attempts FROM discovery_scan_queue WHERE id = ?', (queue_id,))
            entry = cur.fetchone()
            if not entry:
                return False

            exhausted = entry['attempts'] >= max_attempts
            cur.execute(give_up_sql if exhausted else retry_sql, (error_message, queue_id))
            return True
    except Exception as e:
        logger.error(f"Failed to mark discovery scan failed: {e}")
        return False
def get_discovery_queue_stats(self) -> Dict[str, Any]:
    """
    Count discovery scan queue entries by status and by scan type.

    Returns:
        Dict with overall 'pending', 'processing' and 'failed' counts plus
        'by_type', which maps each scan type to the same three counters.
        All-zero values are returned on error (logged).
    """
    try:
        with self.get_connection() as conn:
            cur = conn.cursor()
            cur.execute('''
                SELECT
                    status,
                    scan_type,
                    COUNT(*) as count
                FROM discovery_scan_queue
                GROUP BY status, scan_type
            ''')

            stats = {'pending': 0, 'processing': 0, 'failed': 0, 'by_type': {}}
            per_type = stats['by_type']
            for record in cur.fetchall():
                status = record['status']
                kind = record['scan_type']
                n = record['count']

                # Overall counter for this status
                if status in stats:
                    stats[status] += n

                # Per-type counters; the GROUP BY yields each (status, type)
                # pair once, so a plain assignment is enough
                bucket = per_type.setdefault(kind, {'pending': 0, 'processing': 0, 'failed': 0})
                if status in bucket:
                    bucket[status] = n
            return stats
    except Exception as e:
        logger.error(f"Failed to get discovery queue stats: {e}")
        return {'pending': 0, 'processing': 0, 'failed': 0, 'by_type': {}}
def clear_discovery_queue(self, status: str = None, scan_type: str = None) -> int:
    """
    Delete entries from the discovery scan queue.

    With no filter given, every entry is deleted.

    Args:
        status: Only clear items with this status (optional)
        scan_type: Only clear items with this scan type (optional)

    Returns:
        int: Number of items cleared; 0 on error (logged)
    """
    try:
        with self.get_connection(for_write=True) as conn:
            cur = conn.cursor()

            pieces = ['DELETE FROM discovery_scan_queue WHERE 1=1']
            args = []
            for column, wanted in (('status', status), ('scan_type', scan_type)):
                if wanted:
                    pieces.append(f' AND {column} = ?')
                    args.append(wanted)

            cur.execute(''.join(pieces), args)
            return cur.rowcount
    except Exception as e:
        logger.error(f"Failed to clear discovery queue: {e}")
        return 0
# ==================== End Discovery Scan Queue Methods ====================
def validate_database_sync(self, fix_issues: bool = False) -> Dict[str, Any]:
    """
    Check the downloads table against recycle_bin and file_inventory.

    Four checks run in a fixed order; each stores its issue count under its
    own key and, when ``fix_issues`` is set and the count is non-zero,
    immediately runs the matching repair statement (so later checks see the
    repaired data).

    Args:
        fix_issues: If True, automatically fix detected sync issues

    Returns:
        Dict with one count per check plus 'issues_fixed' (total rows changed
        by repairs). If validation fails part-way, the counts gathered so far
        are returned and the error is logged.
    """
    # (result key, counting query, repair statement) - order matters
    checks = (
        # Check 1: Files in recycle_bin but downloads table shows completed
        (
            'recycled_out_of_sync',
            '''
                SELECT COUNT(*) FROM downloads d
                INNER JOIN recycle_bin rb ON d.filename = rb.original_filename
                WHERE d.status = 'completed'
            ''',
            '''
                UPDATE downloads
                SET status = 'recycled',
                file_path = (SELECT recycle_path FROM recycle_bin WHERE recycle_bin.original_filename = downloads.filename)
                WHERE filename IN (SELECT original_filename FROM recycle_bin)
                AND status = 'completed'
            ''',
        ),
        # Check 2: Completed downloads with temp paths (orphaned)
        (
            'temp_files_orphaned',
            '''
                SELECT COUNT(*) FROM downloads
                WHERE status = 'completed'
                AND file_path LIKE '%/temp/%'
                AND filename NOT IN (SELECT filename FROM file_inventory WHERE location = 'final')
            ''',
            '''
                UPDATE downloads
                SET status = 'failed',
                metadata = json_set(COALESCE(metadata, '{}'), '$.failure_reason', 'Orphaned temp file')
                WHERE status = 'completed'
                AND file_path LIKE '%/temp/%'
                AND filename NOT IN (SELECT filename FROM file_inventory WHERE location = 'final')
            ''',
        ),
        # Check 3: Files with review paths but actually in final location
        (
            'review_files_moved',
            '''
                SELECT COUNT(*) FROM downloads d
                WHERE d.file_path LIKE '%/review/%'
                AND d.status = 'completed'
                AND EXISTS (SELECT 1 FROM file_inventory fi WHERE fi.filename = d.filename AND fi.location = 'final')
            ''',
            '''
                UPDATE downloads
                SET file_path = (
                    SELECT fi.file_path FROM file_inventory fi
                    WHERE fi.filename = downloads.filename AND fi.location = 'final' LIMIT 1
                )
                WHERE file_path LIKE '%/review/%'
                AND status = 'completed'
                AND filename IN (SELECT filename FROM file_inventory WHERE location = 'final')
            ''',
        ),
        # Check 4: Completed downloads without file_inventory entry
        (
            'completed_without_inventory',
            '''
                SELECT COUNT(*) FROM downloads d
                WHERE d.status = 'completed'
                AND d.file_path IS NOT NULL AND d.file_path <> ''
                AND NOT EXISTS (SELECT 1 FROM file_inventory fi WHERE fi.filename = d.filename)
            ''',
            '''
                UPDATE downloads
                SET status = 'failed',
                metadata = json_set(COALESCE(metadata, '{}'), '$.failure_reason', 'File not in inventory')
                WHERE status = 'completed'
                AND file_path IS NOT NULL AND file_path <> ''
                AND NOT EXISTS (SELECT 1 FROM file_inventory fi WHERE fi.filename = downloads.filename)
            ''',
        ),
    )

    results = {key: 0 for key, _count_sql, _repair_sql in checks}
    results['issues_fixed'] = 0

    try:
        # A write connection is only requested when repairs may be made
        with self.pool.get_connection(for_write=fix_issues) as conn:
            cur = conn.cursor()
            for key, count_sql, repair_sql in checks:
                cur.execute(count_sql)
                results[key] = cur.fetchone()[0]
                if fix_issues and results[key] > 0:
                    cur.execute(repair_sql)
                    results['issues_fixed'] += cur.rowcount

            if fix_issues:
                conn.commit()
            logger.info(f"Database validation complete: {results}")
            return results
    except Exception as e:
        logger.error(f"Database validation failed: {e}")
        return results
# ==================== Error Monitoring Methods ====================
def scan_logs_for_errors(self, since: datetime = None, max_hours: int = 24,
                         log_dir: Path = None) -> List[Dict]:
    """
    Scan log files for ERROR level entries

    Files named ``YYYYMMDD_<anything>.log`` are selected by the date in their
    name; every other ``*.log`` file (e.g. service.log, web_api.log) is
    selected by its modification time. Files larger than 50MB are skipped.

    Args:
        since: Only look for errors at or after this timestamp
            (default: max_hours before now)
        max_hours: Hours to look back; only used when since is None (default: 24)
        log_dir: Directory containing the log files
            (default: /opt/media-downloader/logs)

    Returns:
        List of error dictionaries with timestamp, module, message, log_file,
        line_number and context (the error line preceded by up to 10 earlier
        lines from the same file)
    """
    import re
    from collections import deque

    now = datetime.now()
    if since is None:
        since = now - timedelta(hours=max_hours)
    log_dir = Path('/opt/media-downloader/logs') if log_dir is None else Path(log_dir)
    errors = []

    # Log line pattern: 2025-12-03 06:30:01 [MediaDownloader.Module] [Module] [ERROR] Message
    error_pattern = re.compile(
        r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) '
        r'\[MediaDownloader\.(\w+)\] '
        r'\[(\w+)\] '
        r'\[ERROR\] '
        r'(.+)$'
    )

    # Dated files are relevant from the day of `since` onwards. Yesterday is
    # always included so the window is never narrower than today + yesterday.
    latest_day = now.date()
    earliest_day = min(since.date(), (now - timedelta(days=1)).date())

    # Max file size to scan (50MB) - avoids spending time on huge log files
    MAX_LOG_SIZE = 50 * 1024 * 1024
    CONTEXT_SIZE = 5

    for log_file in log_dir.glob('*.log'):
        # Only an 8-digit, parseable prefix counts as a date; names such as
        # "web_api.log" fall through to the modification-time check.
        file_day = None
        prefix, separator, _rest = log_file.stem.partition('_')
        if separator and len(prefix) == 8 and prefix.isdigit():
            try:
                file_day = datetime.strptime(prefix, '%Y%m%d').date()
            except ValueError:
                file_day = None

        try:
            file_stat = log_file.stat()
            if file_day is not None:
                if not (earliest_day <= file_day <= latest_day):
                    continue
            elif datetime.fromtimestamp(file_stat.st_mtime) < since:
                # Undated file that has not been written to since the cutoff
                continue
        except (OSError, TypeError, ValueError, OverflowError):
            continue

        file_size = file_stat.st_size
        if file_size > MAX_LOG_SIZE:
            logger.debug(f"Skipping large log file {log_file.name} ({file_size / 1024 / 1024:.0f}MB > {MAX_LOG_SIZE / 1024 / 1024:.0f}MB limit)")
            continue

        try:
            # Read line-by-line; the bounded deque keeps only the most recent
            # lines so the whole file is never held in memory.
            context_window = deque(maxlen=CONTEXT_SIZE * 2 + 1)
            with open(log_file, 'r', errors='replace') as f:
                for line_number, line in enumerate(f, start=1):
                    line = line.rstrip('\n')
                    context_window.append(line)

                    match = error_pattern.match(line)
                    if not match:
                        continue
                    timestamp_str, _module_full, module, message = match.groups()
                    try:
                        timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S')
                    except ValueError:
                        continue
                    if timestamp < since:
                        continue

                    errors.append({
                        'timestamp': timestamp,
                        'module': module,
                        'message': message,
                        'log_file': log_file.name,
                        'line_number': line_number,
                        # Snapshot of the window: the error line and the lines before it
                        'context': list(context_window)
                    })
        except Exception as e:
            logger.error(f"Error reading log file {log_file}: {e}")
            continue
    return errors
def process_and_store_errors(self, since: datetime = None) -> int:
    """
    Scan logs for errors and store/update them in the database

    Errors are grouped by a hash of their module and a normalised message
    (file paths, URLs, UUIDs and numbers replaced by placeholders). A new
    group is inserted; an existing group is updated - and un-dismissed /
    marked unviewed again - only when the occurrence is newer than the
    group's stored last_seen.

    Args:
        since: Only process errors after this timestamp

    Returns:
        Number of new/updated errors handled before the method finished. If
        storing fails part-way the error is logged and the count reached so
        far is returned.
    """
    import re

    errors = self.scan_logs_for_errors(since=since)
    if not errors:
        return 0

    # Normalize error messages for deduplication
    def normalize_message(msg: str) -> str:
        """Remove variable parts from error messages for grouping"""
        # Replace file paths
        msg = re.sub(r'/[\w/\-\.]+\.(jpg|png|mp4|webp|gif)', '{file}', msg)
        # Replace URLs
        msg = re.sub(r'https?://[^\s]+', '{url}', msg)
        # Replace UUIDs. This must happen before numbers: an all-digit UUID
        # group would otherwise become '{n}' and the UUID would no longer match.
        msg = re.sub(r'[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}', '{uuid}', msg)
        # Replace numbers
        msg = re.sub(r'\b\d+\b', '{n}', msg)
        return msg

    def as_datetime(value):
        """Return value as a datetime; None if it is missing or unparseable."""
        if value is None or isinstance(value, datetime):
            return value
        try:
            # Accepts both 'T' and space separated timestamps
            return datetime.fromisoformat(str(value))
        except ValueError:
            return None

    processed = 0
    try:
        with self.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            for error in errors:
                normalized = normalize_message(error['message'])
                error_hash = hashlib.sha256(f"{error['module']}:{normalized}".encode()).hexdigest()

                # Check if this error already exists (including dismissed ones)
                cursor.execute('''
                    SELECT id, last_seen
                    FROM error_log
                    WHERE error_hash = ?
                ''', (error_hash,))
                existing = cursor.fetchone()

                if existing:
                    # The stored value may come back as text rather than a
                    # datetime, so convert before comparing.
                    last_seen = as_datetime(existing['last_seen'])
                    # Only update if this error occurrence is newer than what we've seen
                    if last_seen and error['timestamp'] <= last_seen:
                        # This is an old error we've already processed, skip it
                        continue
                    # This is a new occurrence - update and un-dismiss
                    cursor.execute('''
                        UPDATE error_log
                        SET last_seen = ?,
                        occurrence_count = occurrence_count + 1,
                        message = ?,
                        log_file = ?,
                        line_context = ?,
                        dismissed_at = NULL,
                        viewed_at = NULL
                        WHERE id = ?
                    ''', (
                        error['timestamp'],
                        error['message'],
                        error['log_file'],
                        json.dumps(error['context']),
                        existing['id']
                    ))
                else:
                    # Insert new error
                    cursor.execute('''
                        INSERT INTO error_log
                        (error_hash, module, message, first_seen, last_seen,
                        occurrence_count, log_file, line_context)
                        VALUES (?, ?, ?, ?, ?, 1, ?, ?)
                    ''', (
                        error_hash,
                        error['module'],
                        error['message'],
                        error['timestamp'],
                        error['timestamp'],
                        error['log_file'],
                        json.dumps(error['context'])
                    ))
                processed += 1
            conn.commit()
    except Exception as e:
        logger.error(f"Error storing errors: {e}")
    return processed
def get_recent_errors(self, since: datetime = None, include_dismissed: bool = False, limit: int = None) -> List[Dict]:
    """
    Read stored errors, most recently seen first.

    Args:
        since: Only get errors last seen at or after this timestamp
        include_dismissed: When False (the default) only errors that are
            neither dismissed nor viewed are returned
        limit: Maximum number of errors to return (None for no limit)

    Returns:
        List of error dictionaries; 'line_context' is decoded from JSON
        ([] when nothing is stored). Empty list on error (logged).
    """
    plain_fields = ('id', 'error_hash', 'module', 'message', 'first_seen',
                    'last_seen', 'occurrence_count', 'log_file')
    try:
        with self.get_connection() as conn:
            cur = conn.cursor()

            sql = '''
                SELECT id, error_hash, module, message, first_seen, last_seen,
                occurrence_count, log_file, line_context, dismissed_at, viewed_at
                FROM error_log
                WHERE 1=1
            '''
            args = []
            if since:
                # datetime() normalizes the stored format (handles both 'T' and space separators)
                sql += ' AND datetime(last_seen) >= datetime(?)'
                args.append(since.isoformat())
            if not include_dismissed:
                sql += ' AND dismissed_at IS NULL'
                # Only unviewed errors, so the list matches the count shown in the banner
                sql += ' AND viewed_at IS NULL'
            sql += ' ORDER BY last_seen DESC'
            if limit:
                sql += ' LIMIT ?'
                args.append(limit)
            cur.execute(sql, args)

            found = []
            for record in cur.fetchall():
                entry = {name: record[name] for name in plain_fields}
                raw_context = record['line_context']
                entry['line_context'] = json.loads(raw_context) if raw_context else []
                entry['dismissed_at'] = record['dismissed_at']
                entry['viewed_at'] = record['viewed_at']
                found.append(entry)
            return found
    except Exception as e:
        logger.error(f"Error getting recent errors: {e}")
        return []
def get_unviewed_error_count(self, since: datetime = None) -> int:
    """
    Count errors that are neither viewed nor dismissed.

    Args:
        since: When given, only errors whose last_seen is at or after this
            timestamp are counted

    Returns:
        The number of matching errors, or 0 if the query fails
    """
    base_sql = '''
SELECT COUNT(*) as count
FROM error_log
WHERE dismissed_at IS NULL
AND viewed_at IS NULL
'''
    try:
        with self.get_connection() as conn:
            cur = conn.cursor()
            sql = base_sql
            bind_values = []
            if since:
                # datetime() normalizes both 'T' and space separated timestamps
                sql += ' AND datetime(last_seen) >= datetime(?)'
                bind_values.append(since.isoformat())
            cur.execute(sql, bind_values)
            row = cur.fetchone()
            if not row:
                return 0
            return row['count']
    except Exception as e:
        logger.error(f"Error getting unviewed error count: {e}")
        return 0
def dismiss_errors(self, error_ids: List[int] = None, dismiss_all: bool = False) -> int:
    """
    Dismiss errors by ID, or every error that is not dismissed yet.

    Args:
        error_ids: IDs of the errors to dismiss; any iterable of ints is
            accepted (list, tuple, set, generator). Ignored when dismiss_all
            is True
        dismiss_all: If True, dismiss all undismissed errors

    Returns:
        Number of errors dismissed (already dismissed rows are not counted);
        0 when there is nothing to do or the update fails
    """
    try:
        ids = []
        if not dismiss_all:
            # Materialize once: the IDs are needed both for the placeholder
            # count and as bound values, and `[now] + error_ids` only works
            # for real lists.
            ids = list(error_ids) if error_ids is not None else []
            if not ids:
                # Nothing requested, so do not take a write connection
                return 0

        with self.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            now = datetime.now().isoformat()
            if dismiss_all:
                cursor.execute('''
                    UPDATE error_log
                    SET dismissed_at = ?
                    WHERE dismissed_at IS NULL
                ''', (now,))
            else:
                placeholders = ','.join('?' * len(ids))
                cursor.execute(f'''
                    UPDATE error_log
                    SET dismissed_at = ?
                    WHERE id IN ({placeholders}) AND dismissed_at IS NULL
                ''', [now, *ids])
            dismissed = cursor.rowcount
            conn.commit()
            return dismissed
    except Exception as e:
        logger.error(f"Error dismissing errors: {e}")
        return 0
def mark_errors_viewed(self, error_ids: List[int] = None, mark_all: bool = False) -> int:
    """
    Mark errors as viewed, by ID or all at once.

    Args:
        error_ids: IDs of the errors to mark; any iterable of ints is accepted
            (list, tuple, set, generator). Ignored when mark_all is True
        mark_all: If True, mark every error that is neither viewed nor
            dismissed. Marking by ID does not check the dismissed state

    Returns:
        Number of errors newly marked as viewed; 0 when there is nothing to
        do or the update fails
    """
    try:
        ids = []
        if not mark_all:
            # Materialize once: `[now] + error_ids` only works for real lists
            ids = list(error_ids) if error_ids is not None else []
            if not ids:
                # Nothing requested, so do not take a write connection
                return 0

        with self.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            now = datetime.now().isoformat()
            if mark_all:
                cursor.execute('''
                    UPDATE error_log
                    SET viewed_at = ?
                    WHERE viewed_at IS NULL AND dismissed_at IS NULL
                ''', (now,))
            else:
                placeholders = ','.join('?' * len(ids))
                cursor.execute(f'''
                    UPDATE error_log
                    SET viewed_at = ?
                    WHERE id IN ({placeholders}) AND viewed_at IS NULL
                ''', [now, *ids])
            marked = cursor.rowcount
            conn.commit()
            return marked
    except Exception as e:
        logger.error(f"Error marking errors as viewed: {e}")
        return 0
def get_last_dashboard_visit(self, user_id: str = 'default') -> Optional[datetime]:
    """
    Look up when the given user last opened the dashboard.

    Args:
        user_id: Key of the error_tracking row to read

    Returns:
        The stored visit time as a datetime, or None when no visit is
        recorded or the lookup fails
    """
    lookup_sql = '''
SELECT last_dashboard_visit
FROM error_tracking
WHERE user_id = ?
'''
    try:
        with self.get_connection() as conn:
            cur = conn.cursor()
            cur.execute(lookup_sql, (user_id,))
            row = cur.fetchone()
            if not row:
                return None
            stored = row['last_dashboard_visit']
            if not stored:
                return None
            # The value may already be a datetime; otherwise parse the ISO text
            if isinstance(stored, datetime):
                return stored
            return datetime.fromisoformat(stored)
    except Exception as e:
        logger.error(f"Error getting last dashboard visit: {e}")
        return None
def update_dashboard_visit(self, user_id: str = 'default') -> bool:
    """
    Record "now" as the user's latest dashboard visit.

    Inserts the error_tracking row for user_id, or overwrites its
    last_dashboard_visit and updated_at when the row already exists.

    Args:
        user_id: Key of the error_tracking row to write

    Returns:
        True when the row was written, False if the write failed
    """
    upsert_sql = '''
INSERT INTO error_tracking (user_id, last_dashboard_visit, updated_at)
VALUES (?, ?, ?)
ON CONFLICT(user_id) DO UPDATE SET
last_dashboard_visit = excluded.last_dashboard_visit,
updated_at = excluded.updated_at
'''
    try:
        with self.get_connection(for_write=True) as conn:
            cur = conn.cursor()
            stamp = datetime.now().isoformat()
            # The same timestamp feeds both columns
            cur.execute(upsert_sql, (user_id, stamp, stamp))
            conn.commit()
    except Exception as e:
        logger.error(f"Error updating dashboard visit: {e}")
        return False
    else:
        return True
def get_errors_needing_push_alert(self, delay_hours: int = 24) -> List[Dict]:
    """
    Get errors that are older than delay_hours, unviewed, undismissed, and
    have not had a push alert within the last delay_hours.

    Args:
        delay_hours: Number of hours errors must be unviewed before sending
            push alert (default 24). The same window is used as the minimum
            gap between two alerts for one error

    Returns:
        List of error dictionaries (id, module, message, occurrence_count,
        first_seen, last_seen) that need push alerts; an empty list if the
        query fails
    """
    try:
        with self.get_connection() as conn:
            cursor = conn.cursor()
            # Bound as a parameter rather than formatted into the SQL text
            delay_modifier = f'-{delay_hours} hours'
            # Use 'localtime' since timestamps are stored in local time.
            # Both sides go through datetime(): stored values use the ISO 'T'
            # separator while datetime('now', ...) yields a space, and as raw
            # text 'T' sorts after ' ', so same-day rows would never compare
            # as older than the cutoff.
            cursor.execute('''
                SELECT id, module, message, occurrence_count, first_seen, last_seen
                FROM error_log
                WHERE dismissed_at IS NULL
                  AND viewed_at IS NULL
                  AND datetime(first_seen) < datetime('now', 'localtime', ?)
                  AND (push_alert_sent_at IS NULL
                       OR datetime(push_alert_sent_at) < datetime('now', 'localtime', ?))
            ''', (delay_modifier, delay_modifier))
            return [dict(row) for row in cursor.fetchall()]
    except Exception as e:
        logger.error(f"Error getting errors needing push alert: {e}")
        return []
def mark_push_alert_sent(self, error_ids: List[int]) -> bool:
    """
    Mark that a push alert was sent for these errors.

    Sets push_alert_sent_at to the current local time on every listed row.

    Args:
        error_ids: IDs of the errors that were alerted on; any iterable of
            ints is accepted (list, tuple, set, generator)

    Returns:
        True when the update succeeded or there was nothing to mark,
        False if it failed
    """
    try:
        # Materialize once: the IDs are needed for the placeholder count and
        # as bound values, and `[now] + error_ids` only works for real lists.
        ids = list(error_ids)
        if not ids:
            # Avoid emitting `IN ()`, which is not portable SQL
            return True
        with self.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            now = datetime.now().isoformat()
            placeholders = ','.join('?' * len(ids))
            cursor.execute(f'''
                UPDATE error_log
                SET push_alert_sent_at = ?
                WHERE id IN ({placeholders})
            ''', [now, *ids])
            conn.commit()
            return True
    except Exception as e:
        logger.error(f"Error marking push alert sent: {e}")
        return False
def cleanup_old_errors(self, days: int = 7) -> int:
    """
    Delete error records older than specified days.

    Args:
        days: Delete errors whose last_seen is older than this many days

    Returns:
        Number of errors deleted; 0 if nothing matched or the delete failed
    """
    try:
        with self.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            # 'localtime' keeps the cutoff in the same clock as the stored
            # timestamps (written with naive local datetime.now()), and
            # datetime(last_seen) normalizes 'T' vs space separators so the
            # comparison is chronological rather than textual.
            # NOTE(review): rows whose last_seen SQLite cannot parse yield
            # NULL here and are therefore kept.
            cursor.execute('''
                DELETE FROM error_log
                WHERE datetime(last_seen) < datetime('now', 'localtime', ?)
            ''', (f'-{days} days',))
            deleted = cursor.rowcount
            conn.commit()
            if deleted > 0:
                logger.info(f"Cleaned up {deleted} old error records")
            return deleted
    except Exception as e:
        logger.error(f"Error cleaning up old errors: {e}")
        return 0
def close(self):
    """Release the database connections held by this instance.

    With a connection pool configured, every pooled connection is closed.
    Without a pool there is nothing to release here: per the original design
    note, connections are then created and closed per use by the
    get_connection context manager.
    """
    pool = self.pool
    if not pool:
        return
    pool.close_all()
def checkpoint(self):
    """Run a WAL checkpoint to merge WAL file into main database.

    Should be called periodically (e.g., every 5 minutes) to prevent
    WAL file from growing too large and to ensure data durability.

    Returns:
        Whatever the pool's checkpoint() returns when a pool is configured;
        None in non-pool mode. Failures in non-pool mode are logged at debug
        level and never raised, because the checkpoint is best-effort.
    """
    if self.pool:
        return self.pool.checkpoint()
    # For non-pool mode, run checkpoint via a temporary connection
    try:
        conn = sqlite3.connect(str(self.db_path), timeout=10.0)
        try:
            conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
        finally:
            # Close even when the PRAGMA fails (e.g. database busy) so the
            # temporary connection is never leaked
            conn.close()
    except Exception as e:
        logger.debug(f"WAL checkpoint skipped: {e}")
    return None
# Adapter classes for backward compatibility
class FastDLDatabaseAdapter:
    """Adapter to make unified database compatible with FastDL module"""

    def __init__(self, unified_db: UnifiedDatabase):
        self.db = unified_db
        self.platform = 'instagram'  # Normalized to instagram (was 'fastdl')
        self.method = 'fastdl'  # Download method for tracking
        self.unified_db = unified_db  # For compatibility with modules expecting this attribute

    def get_file_hash(self, file_path: str) -> Optional[str]:
        """Calculate SHA256 hash of a file (delegates to UnifiedDatabase)"""
        return UnifiedDatabase.get_file_hash(file_path)

    def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
        """Get download record by file hash (delegates to UnifiedDatabase)"""
        return self.db.get_download_by_file_hash(file_hash)

    def is_already_downloaded(self, media_id: str) -> bool:
        """Check if a media_id is already recorded for the Instagram platform.

        The lookup is by platform only, not by download method (all methods
        now use platform='instagram').
        """
        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT 1 FROM downloads
                WHERE platform = 'instagram'
                  AND media_id = ?
                LIMIT 1
            ''', (media_id,))
            return cursor.fetchone() is not None

    def record_download(self, media_id: str, username: str, content_type: str,
                        filename: str, download_url: str = None,
                        post_date: datetime = None, metadata: Dict = None, file_path: str = None):
        """Record a FastDL download in the unified database.

        Args:
            media_id: Media identifier; always stored in the metadata and used
                to build an "instagram://<media_id>" URL when download_url is
                not given
            username: Stored as the download's source
            content_type: Passed through to UnifiedDatabase.record_download
            filename: Name of the downloaded file
            download_url: URL the file came from (optional)
            post_date: Passed through to UnifiedDatabase.record_download
            metadata: Extra metadata; its keys override the default
                {'media_id': media_id}
            file_path: Location of the file on disk; when given, its hash is
                calculated and stored with the record

        Returns:
            The result of UnifiedDatabase.record_download (truthy on success)
        """
        url = download_url if download_url else f"instagram://{media_id}"
        full_metadata = {'media_id': media_id}
        if metadata:
            full_metadata.update(metadata)

        # Calculate file hash if file_path provided
        file_hash = None
        if file_path:
            try:
                file_hash = UnifiedDatabase.get_file_hash(file_path)
            except Exception as e:
                logger.debug(f"[FastDLAdapter] Failed to calculate hash for {filename}: {e}")
            else:
                # get_file_hash is declared Optional[str]; only slice a real
                # hash so a None result is not reported as a TypeError failure
                if file_hash:
                    logger.debug(f"[FastDLAdapter] Calculated hash for {filename}: {file_hash[:16]}...")
                else:
                    logger.debug(f"[FastDLAdapter] No hash available for {filename}")

        logger.debug(f"[FastDLAdapter] Recording download: filename={filename}, platform={self.platform}, method={self.method}, source={username}, file_path={file_path}")
        result = self.db.record_download(
            url=url,
            platform=self.platform,
            source=username,
            content_type=content_type,
            filename=filename,
            file_path=file_path,
            file_hash=file_hash,
            post_date=post_date,
            metadata=full_metadata,
            method=self.method
        )
        if result:
            logger.debug(f"[FastDLAdapter] Successfully recorded download for {filename}")
        else:
            logger.debug(f"[FastDLAdapter] Failed to record download for {filename} (possibly duplicate)")
        return result
class ToolzuDatabaseAdapter:
    """Adapter to make unified database compatible with Toolzu module"""

    def __init__(self, unified_db: UnifiedDatabase):
        self.db = unified_db
        self.platform = 'instagram'  # Toolzu downloads Instagram content
        self.method = 'toolzu'  # Download method for tracking
        self.unified_db = unified_db  # For compatibility

    def get_connection(self, for_write=False):
        """Get database connection (delegates to UnifiedDatabase)"""
        return self.db.get_connection(for_write)

    def get_download_by_media_id(self, media_id: str, platform: str = None, method: str = None) -> Optional[Dict]:
        """Get download record by media_id (delegates to UnifiedDatabase).

        platform and method fall back to this adapter's own values when omitted.
        """
        return self.db.get_download_by_media_id(media_id, platform or self.platform, method or self.method)

    def get_file_hash(self, file_path: str) -> Optional[str]:
        """Calculate SHA256 hash of a file (delegates to UnifiedDatabase)"""
        return UnifiedDatabase.get_file_hash(file_path)

    def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
        """Get download record by file hash (delegates to UnifiedDatabase)"""
        return self.db.get_download_by_file_hash(file_hash)

    def is_already_downloaded(self, media_id: str) -> bool:
        """Check if content is already downloaded by media_id (all methods now use platform='instagram')"""
        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT 1 FROM downloads
                WHERE platform = 'instagram'
                  AND media_id = ?
                LIMIT 1
            ''', (media_id,))
            return cursor.fetchone() is not None

    def record_download(self, media_id: str, username: str, content_type: str,
                        filename: str, download_url: str = None,
                        post_date: datetime = None, metadata: Dict = None, file_path: str = None):
        """Record a download in the database.

        Args:
            media_id: Media identifier; stored in the metadata and used to
                build an "instagram://<media_id>" URL when download_url is
                not given
            username: Stored as the download's source
            content_type: Passed through to UnifiedDatabase.record_download
            filename: Name of the downloaded file
            download_url: URL the file came from (optional)
            post_date: Passed through to UnifiedDatabase.record_download
            metadata: Extra metadata; its keys override the Toolzu defaults
                (media_id, source, resolution)
            file_path: Location of the file on disk; when given, its hash is
                calculated and stored with the record

        Returns:
            The result of UnifiedDatabase.record_download
        """
        url = download_url if download_url else f"instagram://{media_id}"
        full_metadata = {'media_id': media_id, 'source': 'toolzu', 'resolution': '1920x1440'}
        if metadata:
            full_metadata.update(metadata)

        # Calculate file hash if file_path provided
        file_hash = None
        if file_path:
            try:
                file_hash = UnifiedDatabase.get_file_hash(file_path)
            except Exception as e:
                # If hash fails, continue without it - but leave a trace
                logger.debug(f"[ToolzuAdapter] Failed to calculate hash for {filename}: {e}")

        return self.db.record_download(
            url=url,
            platform=self.platform,
            source=username,
            content_type=content_type,
            filename=filename,
            file_path=file_path,
            file_hash=file_hash,
            post_date=post_date,
            metadata=full_metadata,
            method=self.method
        )
class SnapchatDatabaseAdapter:
    """Adapter to make unified database compatible with Snapchat module"""

    def __init__(self, unified_db: UnifiedDatabase):
        self.db = unified_db
        self.platform = 'snapchat'
        self.unified_db = unified_db  # For compatibility

    def get_connection(self, for_write=False):
        """Get database connection (delegates to UnifiedDatabase)"""
        return self.db.get_connection(for_write)

    def get_file_hash(self, file_path: str) -> Optional[str]:
        """Calculate SHA256 hash of a file (delegates to UnifiedDatabase)"""
        return UnifiedDatabase.get_file_hash(file_path)

    def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
        """Get download record by file hash (delegates to UnifiedDatabase)"""
        return self.db.get_download_by_file_hash(file_hash)

    def is_downloaded(self, username: str, url: str, post_date: datetime = None) -> bool:
        """Check if content is already downloaded.

        Only the URL is checked; username and post_date are accepted for
        interface compatibility and are not used in the lookup.
        """
        # Check by URL
        return self.db.is_downloaded(url, self.platform)

    def mark_downloaded(self, username: str, url: str, filename: str,
                        post_date: datetime = None, metadata: dict = None, file_path: str = None) -> bool:
        """Mark content as downloaded in database.

        Args:
            username: Stored as the download's source
            url: URL of the downloaded content
            filename: Name of the downloaded file
            post_date: Passed through to UnifiedDatabase.record_download
            metadata: Extra metadata; copied so the caller's dict is never
                shared with the record
            file_path: Location of the file on disk; when given, its hash is
                calculated and stored with the record

        Returns:
            The result of UnifiedDatabase.record_download; the content type
            is always recorded as 'story'
        """
        meta = metadata.copy() if metadata else {}

        # Calculate file hash if file_path provided
        file_hash = None
        if file_path:
            try:
                file_hash = UnifiedDatabase.get_file_hash(file_path)
            except Exception as e:
                # If hash fails, continue without it - but leave a trace
                logger.debug(f"[SnapchatAdapter] Failed to calculate hash for {filename}: {e}")

        return self.db.record_download(
            url=url,
            platform=self.platform,
            source=username,
            content_type='story',
            filename=filename,
            file_path=file_path,
            file_hash=file_hash,
            post_date=post_date,
            metadata=meta
        )
class CoppermineDatabaseAdapter:
    """Bridge that lets the Coppermine module talk to the unified database"""

    def __init__(self, unified_db: UnifiedDatabase):
        # Both attribute names point at the same object; `unified_db` is kept
        # for compatibility
        self.unified_db = unified_db
        self.db = unified_db
        self.platform = 'coppermine'

    def get_file_hash(self, file_path: str) -> Optional[str]:
        """Return the SHA256 hash of a file, as computed by UnifiedDatabase"""
        digest = UnifiedDatabase.get_file_hash(file_path)
        return digest

    def get_download_by_file_hash(self, file_hash: str) -> Optional[Dict]:
        """Look up a download record by file hash via UnifiedDatabase"""
        record = self.db.get_download_by_file_hash(file_hash)
        return record

    def is_downloaded(self, url: str, platform: str = None) -> bool:
        """Tell whether the URL is already recorded.

        Falls back to this adapter's platform when none (or an empty one)
        is supplied.
        """
        target_platform = platform if platform else self.platform
        return self.db.is_downloaded(url, target_platform)

    def add_download(self, url: str, platform: str, source: str, content_type: str,
                     filename: str, file_path: str = None, file_size: int = None,
                     file_hash: str = None, post_date: datetime = None,
                     metadata: dict = None) -> bool:
        """Store a download record; every argument is forwarded unchanged"""
        record_fields = {
            'url': url,
            'platform': platform,
            'source': source,
            'content_type': content_type,
            'filename': filename,
            'file_path': file_path,
            'file_size': file_size,
            'file_hash': file_hash,
            'post_date': post_date,
            'metadata': metadata,
        }
        return self.db.record_download(**record_fields)
if __name__ == "__main__":
    # Test and migration script: optionally migrates the legacy per-platform
    # databases into the unified one and/or prints per-platform statistics.
    import argparse
    from pathlib import Path

    # Default to the standard database location
    DEFAULT_DB_PATH = str(Path(__file__).parent.parent / 'database' / 'media_downloader.db')

    parser = argparse.ArgumentParser(description="Unified Database Manager")
    parser.add_argument("--migrate", action="store_true", help="Migrate from old databases")
    parser.add_argument("--fastdl-db", help="Path to FastDL database")
    parser.add_argument("--tiktok-db", help="Path to TikTok database")
    parser.add_argument("--forum-dbs", nargs="+", help="Paths to forum databases")
    parser.add_argument("--stats", action="store_true", help="Show database statistics")
    parser.add_argument("--db-path", default=DEFAULT_DB_PATH, help="Path to unified database")
    args = parser.parse_args()

    # Create unified database
    db = UnifiedDatabase(args.db_path)
    try:
        if args.migrate:
            print("Starting database migration...")
            db.migrate_from_old_databases(
                fastdl_db=args.fastdl_db,
                tiktok_db=args.tiktok_db,
                forum_dbs=args.forum_dbs,
                verbose=True
            )

        if args.stats:
            print("\nDatabase Statistics:")
            print("-" * 40)
            for platform_stats in db.get_platform_stats():
                # NOTE(review): total_size presumably comes from an aggregate
                # and may be None when no sizes are recorded - treat as 0
                total_size = platform_stats['total_size'] or 0
                print(f"Platform: {platform_stats['platform']}")
                print(f" Total: {platform_stats['total']}")
                print(f" Completed: {platform_stats['completed']}")
                print(f" Failed: {platform_stats['failed']}")
                print(f" Total Size: {total_size / (1024**3):.2f} GB")
                print()
    finally:
        # Release connections even when migration or reporting fails
        db.close()