""" Shared Utility Functions Common helper functions used across multiple routers. """ import io import sqlite3 import hashlib import subprocess from collections import OrderedDict from contextlib import closing from pathlib import Path from threading import Lock from typing import Dict, List, Optional, Tuple, Union from fastapi import HTTPException from PIL import Image from .config import settings # ============================================================================ # THUMBNAIL LRU CACHE # ============================================================================ class ThumbnailLRUCache: """Thread-safe LRU cache for thumbnail binary data. Avoids SQLite lookups for frequently accessed thumbnails. Used by media.py and recycle.py routers. """ def __init__(self, max_size: int = 500, max_memory_mb: int = 100): self._cache: OrderedDict[str, bytes] = OrderedDict() self._lock = Lock() self._max_size = max_size self._max_memory = max_memory_mb * 1024 * 1024 # Convert to bytes self._current_memory = 0 def get(self, key: str) -> Optional[bytes]: with self._lock: if key in self._cache: # Move to end (most recently used) self._cache.move_to_end(key) return self._cache[key] return None def put(self, key: str, data: bytes) -> None: with self._lock: data_size = len(data) # Don't cache if single item is too large (>1MB) if data_size > 1024 * 1024: return # Remove old entry if exists if key in self._cache: self._current_memory -= len(self._cache[key]) del self._cache[key] # Evict oldest entries if needed while (len(self._cache) >= self._max_size or self._current_memory + data_size > self._max_memory) and self._cache: oldest_key, oldest_data = self._cache.popitem(last=False) self._current_memory -= len(oldest_data) # Add new entry self._cache[key] = data self._current_memory += data_size def clear(self) -> None: with self._lock: self._cache.clear() self._current_memory = 0 # ============================================================================ # SQL FILTER 
# ============================================================================
# SQL FILTER CONSTANTS
# ============================================================================

# Valid media file filters (excluding phrase checks, must have valid extension)
# Used by downloads, health, and analytics endpoints
MEDIA_FILTERS = """
    (filename NOT LIKE '%_phrase_checked_%' OR filename IS NULL)
    AND (file_path IS NOT NULL AND file_path != '' OR platform = 'forums')
    AND (LENGTH(filename) > 20 OR filename LIKE '%_%_%')
    AND (
        filename LIKE '%.jpg' OR filename LIKE '%.jpeg' OR filename LIKE '%.png'
        OR filename LIKE '%.gif' OR filename LIKE '%.heic' OR filename LIKE '%.heif'
        OR filename LIKE '%.mp4' OR filename LIKE '%.mov' OR filename LIKE '%.webm'
        OR filename LIKE '%.m4a' OR filename LIKE '%.mp3' OR filename LIKE '%.avi'
        OR filename LIKE '%.mkv' OR filename LIKE '%.flv'
    )
"""

# ============================================================================
# PATH VALIDATION
# ============================================================================

# Allowed base paths for file operations
ALLOWED_PATHS = [
    settings.MEDIA_BASE_PATH,
    settings.REVIEW_PATH,
    settings.RECYCLE_PATH,
    Path('/opt/media-downloader/temp/manual_import'),
    Path('/opt/immich/paid'),
    Path('/opt/immich/el'),
    Path('/opt/immich/elv'),
]


def validate_file_path(
    file_path: str,
    allowed_bases: Optional[List[Path]] = None,
    require_exists: bool = False
) -> Path:
    """
    Validate file path is within allowed directories.

    Prevents path traversal attacks.

    Args:
        file_path: Path to validate
        allowed_bases: List of allowed base paths (defaults to ALLOWED_PATHS)
        require_exists: If True, also verify the file exists

    Returns:
        Resolved Path object

    Raises:
        HTTPException: If path is invalid or outside allowed directories
    """
    if allowed_bases is None:
        allowed_bases = ALLOWED_PATHS

    requested_path = Path(file_path)
    try:
        # resolve() collapses '..' and symlinks, so the containment check
        # below cannot be bypassed by traversal sequences
        resolved_path = requested_path.resolve()

        is_allowed = False
        for allowed_base in allowed_bases:
            try:
                resolved_path.relative_to(allowed_base.resolve())
                is_allowed = True
                break
            except ValueError:
                # Not under this base — try the next one
                continue

        if not is_allowed:
            raise HTTPException(status_code=403, detail="Access denied")

        if require_exists and not resolved_path.exists():
            raise HTTPException(status_code=404, detail="File not found")
    except HTTPException:
        raise
    except Exception:
        # Any resolution failure (malformed path, OS error) maps to 400
        raise HTTPException(status_code=400, detail="Invalid file path")

    return resolved_path


# ============================================================================
# QUERY FILTER BUILDER
# ============================================================================

def build_media_filter_query(
    platform: Optional[str] = None,
    source: Optional[str] = None,
    media_type: Optional[str] = None,
    location: Optional[str] = None,
    date_from: Optional[str] = None,
    date_to: Optional[str] = None,
    table_alias: str = "fi"
) -> Tuple[str, List]:
    """
    Build SQL filter clause for common media queries.

    This centralizes the filter building logic that was duplicated across
    media.py, downloads.py, and review.py routers.

    Args:
        platform: Filter by platform (e.g., 'instagram', 'tiktok')
        source: Filter by source (e.g., 'stories', 'posts')
        media_type: Filter by media type ('image', 'video', 'all')
        location: Filter by location ('final', 'review', 'recycle')
        date_from: Start date filter (ISO format)
        date_to: End date filter (ISO format)
        table_alias: SQL table alias (default 'fi' for file_inventory)

    Returns:
        Tuple of (SQL WHERE clause conditions, list of parameters)

    Example:
        conditions, params = build_media_filter_query(platform="instagram", media_type="video")
        query = f"SELECT * FROM file_inventory fi WHERE {conditions}"
        cursor.execute(query, params)
    """
    # table_alias is interpolated into SQL text, so restrict it to a plain
    # identifier to rule out injection through this parameter
    if not table_alias.isidentifier():
        table_alias = "fi"

    conditions = []
    params = []

    if platform:
        conditions.append(f"{table_alias}.platform = ?")
        params.append(platform)

    if source:
        conditions.append(f"{table_alias}.source = ?")
        params.append(source)

    if media_type and media_type != 'all':
        conditions.append(f"{table_alias}.media_type = ?")
        params.append(media_type)

    if location:
        conditions.append(f"{table_alias}.location = ?")
        params.append(location)

    if date_from:
        # Use COALESCE to handle both post_date from downloads and created_date from file_inventory
        conditions.append(f"""
            DATE(COALESCE(
                (SELECT MAX(d.post_date) FROM downloads d WHERE d.file_path = {table_alias}.file_path),
                {table_alias}.created_date
            )) >= ?
        """)
        params.append(date_from)

    if date_to:
        conditions.append(f"""
            DATE(COALESCE(
                (SELECT MAX(d.post_date) FROM downloads d WHERE d.file_path = {table_alias}.file_path),
                {table_alias}.created_date
            )) <= ?
        """)
        params.append(date_to)

    # '1=1' keeps callers' 'WHERE {conditions}' syntactically valid when
    # no filters were requested
    return " AND ".join(conditions) if conditions else "1=1", params


def build_platform_list_filter(
    platforms: Optional[List[str]] = None,
    table_alias: str = "fi"
) -> Tuple[str, List[str]]:
    """
    Build SQL IN clause for filtering by multiple platforms.

    Args:
        platforms: List of platform names to filter by
        table_alias: SQL table alias

    Returns:
        Tuple of (SQL condition string, list of parameters)
    """
    if not platforms:
        return "1=1", []
    placeholders = ",".join(["?"] * len(platforms))
    return f"{table_alias}.platform IN ({placeholders})", platforms


# ============================================================================
# THUMBNAIL GENERATION
# ============================================================================

def generate_image_thumbnail(file_path: Path, max_size: Tuple[int, int] = (300, 300)) -> Optional[bytes]:
    """
    Generate thumbnail for image file.

    Args:
        file_path: Path to image file
        max_size: Maximum thumbnail dimensions

    Returns:
        JPEG bytes or None if generation fails
    """
    try:
        # Context manager closes the underlying file handle promptly
        # (the original leaked it until GC)
        with Image.open(file_path) as img:
            img.thumbnail(max_size, Image.Resampling.LANCZOS)

            # Flatten transparency onto white, since JPEG has no alpha channel
            if img.mode in ('RGBA', 'LA', 'P'):
                background = Image.new('RGB', img.size, (255, 255, 255))
                if img.mode == 'P':
                    img = img.convert('RGBA')
                background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
                img = background

            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
    except Exception:
        # Best-effort: corrupt/unsupported images simply yield no thumbnail
        return None


def generate_video_thumbnail(file_path: Path, max_size: Tuple[int, int] = (300, 300)) -> Optional[bytes]:
    """
    Generate thumbnail for video file using ffmpeg.

    Args:
        file_path: Path to video file
        max_size: Maximum thumbnail dimensions

    Returns:
        JPEG bytes or None if generation fails
    """
    # Try seeking to 1s first, then fall back to first frame
    # (very short clips may have nothing at t=1s)
    for seek_time in ['00:00:01.000', '00:00:00.000']:
        try:
            # List-form argv (shell=False) — file_path is never shell-interpreted
            result = subprocess.run([
                'ffmpeg', '-ss', seek_time, '-i', str(file_path),
                '-vframes', '1', '-f', 'image2pipe', '-vcodec', 'mjpeg', '-'
            ], capture_output=True, timeout=30)

            if result.returncode != 0 or not result.stdout:
                continue

            # Resize the frame
            img = Image.open(io.BytesIO(result.stdout))
            img.thumbnail(max_size, Image.Resampling.LANCZOS)

            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
        except Exception:
            continue

    return None


def get_or_create_thumbnail(
    file_path: Union[str, Path],
    media_type: str,
    content_hash: Optional[str] = None,
    max_size: Tuple[int, int] = (300, 300)
) -> Optional[bytes]:
    """
    Get thumbnail from cache or generate and cache it.

    Uses the thumbnails.db schema: file_hash (PK), file_path,
    thumbnail_data, created_at, file_mtime.

    Lookup strategy:
    1. Try content_hash against file_hash column (survives file moves)
    2. Fall back to file_path lookup (legacy thumbnails)
    3. Generate and cache if not found

    Args:
        file_path: Path to media file
        media_type: 'image' or 'video'
        content_hash: Optional pre-computed hash (computed from path if not provided)
        max_size: Maximum thumbnail dimensions

    Returns:
        JPEG bytes or None if generation fails
    """
    file_path = Path(file_path)
    thumb_db_path = settings.PROJECT_ROOT / 'database' / 'thumbnails.db'

    # Compute hash if not provided. NOTE: this hashes the path string, not
    # file contents — despite the 'content_hash' name, a path-derived key
    # only survives moves when the caller supplies a real content hash.
    file_hash = content_hash if content_hash else hashlib.sha256(str(file_path).encode()).hexdigest()

    # Try to get from cache (skip mtime check — downloaded media files don't change)
    try:
        with sqlite3.connect(str(thumb_db_path), timeout=30.0) as conn:
            cursor = conn.cursor()

            # 1. Try file_hash lookup (primary key)
            cursor.execute(
                "SELECT thumbnail_data FROM thumbnails WHERE file_hash = ?",
                (file_hash,)
            )
            result = cursor.fetchone()
            if result and result[0]:
                return result[0]

            # 2. Fall back to file_path lookup (legacy thumbnails)
            cursor.execute(
                "SELECT thumbnail_data FROM thumbnails WHERE file_path = ?",
                (str(file_path),)
            )
            result = cursor.fetchone()
            if result and result[0]:
                return result[0]
    except Exception:
        # Cache DB unavailable — fall through to regeneration
        pass

    # Only proceed with generation if the file actually exists
    if not file_path.exists():
        return None

    # Get mtime only when we need to generate and cache a new thumbnail
    try:
        file_mtime = file_path.stat().st_mtime
    except OSError:
        file_mtime = 0

    # Generate thumbnail
    if media_type == 'video':
        thumbnail_data = generate_video_thumbnail(file_path, max_size)
    else:
        thumbnail_data = generate_image_thumbnail(file_path, max_size)

    # Cache the thumbnail (best-effort; a failed insert still returns data)
    if thumbnail_data:
        try:
            from .responses import now_iso8601
            with sqlite3.connect(str(thumb_db_path), timeout=30.0) as conn:
                conn.execute("""
                    INSERT OR REPLACE INTO thumbnails
                    (file_hash, file_path, thumbnail_data, created_at, file_mtime)
                    VALUES (?, ?, ?, ?, ?)
                """, (file_hash, str(file_path), thumbnail_data, now_iso8601(), file_mtime))
                conn.commit()
        except Exception:
            pass

    return thumbnail_data


def get_media_dimensions(
    file_path: str,
    width: Optional[int] = None,
    height: Optional[int] = None
) -> Tuple[Optional[int], Optional[int]]:
    """
    Get media dimensions, falling back to metadata cache if not provided.

    Args:
        file_path: Path to media file
        width: Width from file_inventory (may be None)
        height: Height from file_inventory (may be None)

    Returns:
        Tuple of (width, height), or (None, None) if not available
    """
    if width is not None and height is not None:
        return (width, height)

    try:
        metadata_db_path = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'
        # media_metadata keys rows by sha256 of the path string
        file_hash = hashlib.sha256(file_path.encode()).hexdigest()
        with closing(sqlite3.connect(str(metadata_db_path))) as conn:
            cursor = conn.execute(
                "SELECT width, height FROM media_metadata WHERE file_hash = ?",
                (file_hash,)
            )
            result = cursor.fetchone()
            if result:
                return (result[0], result[1])
    except Exception:
        # Metadata cache unavailable — return whatever the caller passed
        pass

    return (width, height)


def get_media_dimensions_batch(file_paths: List[str]) -> Dict[str, Tuple[int, int]]:
    """
    Get media dimensions for multiple files in a single query (batch lookup).

    Avoids N+1 query problem by fetching all dimensions at once.

    Args:
        file_paths: List of file paths to look up

    Returns:
        Dict mapping file_path -> (width, height); paths with no cached
        metadata are simply absent from the result
    """
    if not file_paths:
        return {}

    result = {}
    try:
        metadata_db_path = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'

        # Build hash -> path mapping (same path-string hashing scheme as
        # get_media_dimensions)
        hash_to_path = {}
        for fp in file_paths:
            file_hash = hashlib.sha256(fp.encode()).hexdigest()
            hash_to_path[file_hash] = fp

        # Query all at once
        with closing(sqlite3.connect(str(metadata_db_path))) as conn:
            placeholders = ','.join('?' * len(hash_to_path))
            cursor = conn.execute(
                f"SELECT file_hash, width, height FROM media_metadata WHERE file_hash IN ({placeholders})",
                list(hash_to_path.keys())
            )
            for row in cursor.fetchall():
                file_hash, width, height = row
                if file_hash in hash_to_path:
                    result[hash_to_path[file_hash]] = (width, height)
    except Exception:
        # Best-effort: on any DB error return whatever was collected so far
        pass

    return result


# ============================================================================
# DATABASE UTILITIES
# ============================================================================

def update_file_path_in_all_tables(db, old_path: str, new_path: str):
    """
    Update file path in all relevant database tables.

    Used when moving files between locations (review -> final, etc.)
    to keep database references consistent.

    Args:
        db: UnifiedDatabase instance
        old_path: The old file path to replace
        new_path: The new file path to use
    """
    from modules.universal_logger import get_logger
    logger = get_logger('API')

    try:
        with db.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            cursor.execute('UPDATE downloads SET file_path = ? WHERE file_path = ?',
                           (new_path, old_path))
            cursor.execute('UPDATE instagram_perceptual_hashes SET file_path = ? WHERE file_path = ?',
                           (new_path, old_path))
            cursor.execute('UPDATE face_recognition_scans SET file_path = ? WHERE file_path = ?',
                           (new_path, old_path))
            try:
                cursor.execute('UPDATE semantic_embeddings SET file_path = ? WHERE file_path = ?',
                               (new_path, old_path))
            except sqlite3.OperationalError:
                pass  # Table may not exist
            conn.commit()
    except Exception as e:
        # Non-fatal: a failed path update is logged, not raised, so the
        # file move itself still succeeds
        logger.warning(f"Failed to update file paths in tables: {e}", module="Database")


# ============================================================================
# FACE RECOGNITION UTILITIES
# ============================================================================

# Cached FaceRecognitionModule singleton to avoid loading InsightFace models on every request.
# NOTE(review): keyed by id(db) — if a db object is garbage-collected, a new
# object could reuse its id and alias a stale module; acceptable while db
# instances are long-lived, but confirm.
_face_module_cache: Dict[int, 'FaceRecognitionModule'] = {}


def get_face_module(db, module_name: str = "FaceAPI"):
    """
    Get or create a cached FaceRecognitionModule instance for the given database.

    Uses singleton pattern to avoid reloading heavy InsightFace models on each request.

    Args:
        db: UnifiedDatabase instance
        module_name: Name to use in log messages

    Returns:
        FaceRecognitionModule instance
    """
    from modules.face_recognition_module import FaceRecognitionModule
    from modules.universal_logger import get_logger
    logger = get_logger('API')

    db_id = id(db)
    if db_id not in _face_module_cache:
        logger.info("Creating cached FaceRecognitionModule instance", module=module_name)
        _face_module_cache[db_id] = FaceRecognitionModule(unified_db=db)
    return _face_module_cache[db_id]