Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

582
web/backend/core/utils.py Normal file
View File

@@ -0,0 +1,582 @@
"""
Shared Utility Functions
Common helper functions used across multiple routers.
"""
import io
import sqlite3
import hashlib
import subprocess
from collections import OrderedDict
from contextlib import closing
from pathlib import Path
from threading import Lock
from typing import Dict, List, Optional, Tuple, Union
from fastapi import HTTPException
from PIL import Image
from .config import settings
# ============================================================================
# THUMBNAIL LRU CACHE
# ============================================================================
class ThumbnailLRUCache:
    """Thread-safe LRU cache holding raw thumbnail bytes in memory.

    Keeps hot thumbnails out of SQLite; bounded both by entry count and by
    total cached bytes. Shared by the media.py and recycle.py routers.
    """

    def __init__(self, max_size: int = 500, max_memory_mb: int = 100):
        self._store: OrderedDict[str, bytes] = OrderedDict()
        self._guard = Lock()
        self._capacity = max_size
        self._byte_limit = max_memory_mb * 1024 * 1024  # MB -> bytes
        self._bytes_used = 0

    def get(self, key: str) -> Optional[bytes]:
        """Return cached bytes for *key* (marking it most-recently-used), else None."""
        with self._guard:
            data = self._store.get(key)
            if data is None:
                return None
            self._store.move_to_end(key)
            return data

    def put(self, key: str, data: bytes) -> None:
        """Insert *key* -> *data*, evicting LRU entries to stay within bounds."""
        size = len(data)
        # Oversized single items (>1MB) would dominate the budget: never cache them.
        if size > 1024 * 1024:
            return
        with self._guard:
            # Replace semantics: drop any previous value and its byte count first.
            previous = self._store.pop(key, None)
            if previous is not None:
                self._bytes_used -= len(previous)
            # Evict from the LRU end until both count and byte budgets fit.
            while self._store and (
                len(self._store) >= self._capacity
                or self._bytes_used + size > self._byte_limit
            ):
                _, evicted = self._store.popitem(last=False)
                self._bytes_used -= len(evicted)
            self._store[key] = data
            self._bytes_used += size

    def clear(self) -> None:
        """Drop every cached entry and reset the byte counter."""
        with self._guard:
            self._store.clear()
            self._bytes_used = 0
# ============================================================================
# SQL FILTER CONSTANTS
# ============================================================================
# Valid media file filters (excluding phrase checks, must have valid extension)
# Used by downloads, health, and analytics endpoints
MEDIA_FILTERS = """
(filename NOT LIKE '%_phrase_checked_%' OR filename IS NULL)
AND (file_path IS NOT NULL AND file_path != '' OR platform = 'forums')
AND (LENGTH(filename) > 20 OR filename LIKE '%_%_%')
AND (
filename LIKE '%.jpg' OR filename LIKE '%.jpeg' OR
filename LIKE '%.png' OR filename LIKE '%.gif' OR
filename LIKE '%.heic' OR filename LIKE '%.heif' OR
filename LIKE '%.mp4' OR filename LIKE '%.mov' OR
filename LIKE '%.webm' OR filename LIKE '%.m4a' OR
filename LIKE '%.mp3' OR filename LIKE '%.avi' OR
filename LIKE '%.mkv' OR filename LIKE '%.flv'
)
"""
# ============================================================================
# PATH VALIDATION
# ============================================================================
# Allowed base paths for file operations
# Allow-list of base directories for file operations; validate_file_path()
# resolves each entry, so both Path objects and settings-provided paths work.
ALLOWED_PATHS: List[Path] = [
    settings.MEDIA_BASE_PATH,
    settings.REVIEW_PATH,
    settings.RECYCLE_PATH,
    Path('/opt/media-downloader/temp/manual_import'),  # staging area for manual imports
    Path('/opt/immich/paid'),
    Path('/opt/immich/el'),
    Path('/opt/immich/elv'),
]
def validate_file_path(
    file_path: str,
    allowed_bases: Optional[List[Path]] = None,
    require_exists: bool = False
) -> Path:
    """
    Resolve *file_path* and ensure it lives under one of the allowed base
    directories, guarding against path-traversal attacks.

    Args:
        file_path: Candidate path string to validate
        allowed_bases: Permitted base directories (defaults to ALLOWED_PATHS)
        require_exists: When True, additionally require the file to exist

    Returns:
        The fully resolved Path object

    Raises:
        HTTPException: 403 if outside every allowed base, 404 if missing
            (with require_exists), 400 if the path cannot be resolved
    """
    bases = ALLOWED_PATHS if allowed_bases is None else allowed_bases
    try:
        resolved = Path(file_path).resolve()

        def _under(base: Path) -> bool:
            # relative_to raises ValueError when resolved is outside base.
            try:
                resolved.relative_to(base.resolve())
                return True
            except ValueError:
                return False

        if not any(_under(base) for base in bases):
            raise HTTPException(status_code=403, detail="Access denied")
        if require_exists and not resolved.exists():
            raise HTTPException(status_code=404, detail="File not found")
    except HTTPException:
        # Re-raise our own deliberate errors untouched.
        raise
    except Exception:
        # Anything else (bad path syntax, OS errors during resolve) → 400.
        raise HTTPException(status_code=400, detail="Invalid file path")
    return resolved
# ============================================================================
# QUERY FILTER BUILDER
# ============================================================================
def build_media_filter_query(
    platform: Optional[str] = None,
    source: Optional[str] = None,
    media_type: Optional[str] = None,
    location: Optional[str] = None,
    date_from: Optional[str] = None,
    date_to: Optional[str] = None,
    table_alias: str = "fi"
) -> Tuple[str, List]:
    """
    Build a parameterized SQL filter clause for common media queries.

    Centralizes the filter-building logic previously duplicated across the
    media.py, downloads.py, and review.py routers.

    Args:
        platform: Filter by platform (e.g., 'instagram', 'tiktok')
        source: Filter by source (e.g., 'stories', 'posts')
        media_type: Filter by media type ('image', 'video', 'all')
        location: Filter by location ('final', 'review', 'recycle')
        date_from: Start date filter (ISO format, inclusive)
        date_to: End date filter (ISO format, inclusive)
        table_alias: SQL table alias (default 'fi' for file_inventory)

    Returns:
        Tuple of (SQL WHERE-clause conditions, list of bind parameters);
        ("1=1", []) when no filters are active.

    Example:
        conditions, params = build_media_filter_query(platform="instagram", media_type="video")
        query = f"SELECT * FROM file_inventory fi WHERE {conditions}"
        cursor.execute(query, params)
    """
    # The alias is interpolated into SQL text, so reject anything that is
    # not a plain identifier and fall back to the default.
    alias = table_alias if table_alias.isidentifier() else "fi"
    clauses: List[str] = []
    args: List = []

    # Simple equality filters; 'all' means "no media_type restriction".
    equality_filters = (
        (platform, f"{alias}.platform = ?"),
        (source, f"{alias}.source = ?"),
        (media_type if media_type != 'all' else None, f"{alias}.media_type = ?"),
        (location, f"{alias}.location = ?"),
    )
    for value, clause in equality_filters:
        if value:
            clauses.append(clause)
            args.append(value)

    # Date bounds: COALESCE prefers the latest downloads.post_date for the
    # file, falling back to file_inventory.created_date.
    for operator, bound in ((">=", date_from), ("<=", date_to)):
        if bound:
            clauses.append(f"""
            DATE(COALESCE(
                (SELECT MAX(d.post_date) FROM downloads d WHERE d.file_path = {alias}.file_path),
                {alias}.created_date
            )) {operator} ?
            """)
            args.append(bound)

    return (" AND ".join(clauses) if clauses else "1=1", args)
def build_platform_list_filter(
    platforms: Optional[List[str]] = None,
    table_alias: str = "fi"
) -> Tuple[str, List[str]]:
    """
    Build a parameterized SQL IN clause for filtering by multiple platforms.

    Args:
        platforms: List of platform names to filter by
        table_alias: SQL table alias (must be a plain identifier; falls
            back to 'fi' otherwise)

    Returns:
        Tuple of (SQL condition string, list of bind parameters);
        ("1=1", []) when no platforms are given.
    """
    if not platforms:
        return "1=1", []
    # Consistency/safety fix: validate the alias before interpolating it into
    # SQL text, exactly as build_media_filter_query already does — an
    # arbitrary alias string would otherwise be injected verbatim.
    if not table_alias.isidentifier():
        table_alias = "fi"
    placeholders = ",".join(["?"] * len(platforms))
    return f"{table_alias}.platform IN ({placeholders})", platforms
# ============================================================================
# THUMBNAIL GENERATION
# ============================================================================
def generate_image_thumbnail(file_path: Path, max_size: Tuple[int, int] = (300, 300)) -> Optional[bytes]:
    """
    Generate a JPEG thumbnail for an image file.

    Args:
        file_path: Path to image file
        max_size: Maximum thumbnail dimensions (aspect ratio preserved)

    Returns:
        JPEG bytes or None if generation fails
    """
    try:
        # Fix: use Image.open as a context manager so the underlying file
        # handle is released promptly (the original left it open until GC).
        with Image.open(file_path) as src:
            src.thumbnail(max_size, Image.Resampling.LANCZOS)
            img = src
            # JPEG has no alpha channel: flatten transparent modes onto white.
            if img.mode in ('RGBA', 'LA', 'P'):
                background = Image.new('RGB', img.size, (255, 255, 255))
                if img.mode == 'P':
                    img = img.convert('RGBA')
                background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
                img = background
            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
    except Exception:
        # Best-effort: callers treat None as "no thumbnail available".
        return None
def generate_video_thumbnail(file_path: Path, max_size: Tuple[int, int] = (300, 300)) -> Optional[bytes]:
    """
    Extract one frame from a video with ffmpeg and return it as a JPEG
    thumbnail.

    Tries a frame at the 1-second mark first; if that yields nothing
    (e.g. the clip is shorter than a second), retries from the first frame.

    Args:
        file_path: Path to video file
        max_size: Maximum thumbnail dimensions (aspect ratio preserved)

    Returns:
        JPEG bytes or None if generation fails
    """
    for seek in ('00:00:01.000', '00:00:00.000'):
        cmd = [
            'ffmpeg',
            '-ss', seek,
            '-i', str(file_path),
            '-vframes', '1',
            '-f', 'image2pipe',
            '-vcodec', 'mjpeg',
            '-',
        ]
        try:
            proc = subprocess.run(cmd, capture_output=True, timeout=30)
            if proc.returncode != 0 or not proc.stdout:
                continue
            # Shrink the raw MJPEG frame down to thumbnail size.
            frame = Image.open(io.BytesIO(proc.stdout))
            frame.thumbnail(max_size, Image.Resampling.LANCZOS)
            out = io.BytesIO()
            frame.save(out, format='JPEG', quality=85)
            return out.getvalue()
        except Exception:
            # Includes subprocess timeouts and undecodable output; try the
            # next seek point (or give up after the last one).
            continue
    return None
def get_or_create_thumbnail(
    file_path: Union[str, Path],
    media_type: str,
    content_hash: Optional[str] = None,
    max_size: Tuple[int, int] = (300, 300)
) -> Optional[bytes]:
    """
    Get thumbnail from cache or generate and cache it.

    Uses the thumbnails.db schema: file_hash (PK), file_path, thumbnail_data,
    created_at, file_mtime.

    Lookup strategy:
        1. Try content_hash against file_hash column (survives file moves)
        2. Fall back to file_path lookup (legacy thumbnails)
        3. Generate and cache if not found

    Args:
        file_path: Path to media file
        media_type: 'image' or 'video'
        content_hash: Optional pre-computed hash (computed from path if not provided)
        max_size: Maximum thumbnail dimensions

    Returns:
        JPEG bytes or None if generation fails
    """
    file_path = Path(file_path)
    thumb_db_path = settings.PROJECT_ROOT / 'database' / 'thumbnails.db'
    # Compute hash if not provided.
    # NOTE(review): the fallback hashes the PATH string, not the file's
    # contents — so only caller-supplied content_hash values actually
    # survive file moves, despite the docstring's point 1.
    file_hash = content_hash if content_hash else hashlib.sha256(str(file_path).encode()).hexdigest()
    # Try to get from cache (skip mtime check — downloaded media files don't change)
    try:
        with sqlite3.connect(str(thumb_db_path), timeout=30.0) as conn:
            cursor = conn.cursor()
            # 1. Try file_hash lookup (primary key)
            cursor.execute(
                "SELECT thumbnail_data FROM thumbnails WHERE file_hash = ?",
                (file_hash,)
            )
            result = cursor.fetchone()
            if result and result[0]:
                return result[0]
            # 2. Fall back to file_path lookup (legacy thumbnails)
            cursor.execute(
                "SELECT thumbnail_data FROM thumbnails WHERE file_path = ?",
                (str(file_path),)
            )
            result = cursor.fetchone()
            if result and result[0]:
                return result[0]
    except Exception:
        # Cache lookup is best-effort; any DB error falls through to
        # regeneration below.
        pass
    # Only proceed with generation if the file actually exists
    if not file_path.exists():
        return None
    # Get mtime only when we need to generate and cache a new thumbnail
    try:
        file_mtime = file_path.stat().st_mtime
    except OSError:
        # Race: file disappeared/unstatable between exists() and stat().
        file_mtime = 0
    # Generate thumbnail
    thumbnail_data: Optional[bytes] = None
    if media_type == 'video':
        thumbnail_data = generate_video_thumbnail(file_path, max_size)
    else:
        # Anything that is not explicitly 'video' is treated as an image.
        thumbnail_data = generate_image_thumbnail(file_path, max_size)
    # Cache the thumbnail
    if thumbnail_data:
        try:
            from .responses import now_iso8601
            with sqlite3.connect(str(thumb_db_path), timeout=30.0) as conn:
                conn.execute("""
                    INSERT OR REPLACE INTO thumbnails
                    (file_hash, file_path, thumbnail_data, created_at, file_mtime)
                    VALUES (?, ?, ?, ?, ?)
                """, (file_hash, str(file_path), thumbnail_data, now_iso8601(), file_mtime))
                conn.commit()
        except Exception:
            # Caching failure must not block serving the freshly generated bytes.
            pass
    return thumbnail_data
def get_media_dimensions(
    file_path: str,
    width: Optional[int] = None,
    height: Optional[int] = None
) -> Tuple[Optional[int], Optional[int]]:
    """
    Get media dimensions, falling back to the metadata cache if not provided.

    Fix: the width/height parameters were annotated ``int = None``; they are
    genuinely optional, so they are now ``Optional[int]`` (no runtime change).

    Args:
        file_path: Path to media file (hashed to look up the metadata cache)
        width: Width from file_inventory (may be None)
        height: Height from file_inventory (may be None)

    Returns:
        Tuple of (width, height); falls back to the passed-in values
        (possibly None) when no cache entry is found or the lookup fails.
    """
    # Fast path: caller already has both dimensions.
    if width is not None and height is not None:
        return (width, height)
    try:
        metadata_db_path = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'
        # Cache rows are keyed by sha256 of the path string.
        file_hash = hashlib.sha256(file_path.encode()).hexdigest()
        with closing(sqlite3.connect(str(metadata_db_path))) as conn:
            cursor = conn.execute(
                "SELECT width, height FROM media_metadata WHERE file_hash = ?",
                (file_hash,)
            )
            result = cursor.fetchone()
            if result:
                return (result[0], result[1])
    except Exception:
        # Lookup is best-effort: missing DB/table degrades to the fallback.
        pass
    return (width, height)
def get_media_dimensions_batch(file_paths: List[str]) -> Dict[str, Tuple[int, int]]:
    """
    Fetch media dimensions for many files in one query (batch lookup).

    Avoids the N+1 query problem of calling get_media_dimensions per file.

    Args:
        file_paths: List of file paths to look up

    Returns:
        Dict mapping file_path -> (width, height); paths without a cache
        entry (or any lookup failure) are simply absent from the result.
    """
    if not file_paths:
        return {}
    dimensions: Dict[str, Tuple[int, int]] = {}
    try:
        db_file = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'
        # Cache rows are keyed by sha256 of the path string; remember which
        # digest belongs to which original path.
        path_by_hash = {
            hashlib.sha256(fp.encode()).hexdigest(): fp for fp in file_paths
        }
        with closing(sqlite3.connect(str(db_file))) as conn:
            marks = ','.join('?' * len(path_by_hash))
            rows = conn.execute(
                f"SELECT file_hash, width, height FROM media_metadata WHERE file_hash IN ({marks})",
                list(path_by_hash.keys())
            ).fetchall()
        for digest, width, height in rows:
            if digest in path_by_hash:
                dimensions[path_by_hash[digest]] = (width, height)
    except Exception:
        # Best-effort: any failure yields whatever was gathered so far.
        pass
    return dimensions
# ============================================================================
# DATABASE UTILITIES
# ============================================================================
def update_file_path_in_all_tables(db, old_path: str, new_path: str):
    """
    Propagate a file rename/move to every table that stores the path.

    Keeps downloads, perceptual-hash, face-scan, and (when present)
    semantic-embedding rows pointing at the file's new location, e.g.
    after moving a file from review to final storage.

    Args:
        db: UnifiedDatabase instance
        old_path: The old file path to replace
        new_path: The new file path to use
    """
    from modules.universal_logger import get_logger
    logger = get_logger('API')
    params = (new_path, old_path)
    try:
        with db.get_connection(for_write=True) as conn:
            cur = conn.cursor()
            cur.execute('UPDATE downloads SET file_path = ? WHERE file_path = ?', params)
            cur.execute('UPDATE instagram_perceptual_hashes SET file_path = ? WHERE file_path = ?', params)
            cur.execute('UPDATE face_recognition_scans SET file_path = ? WHERE file_path = ?', params)
            try:
                cur.execute('UPDATE semantic_embeddings SET file_path = ? WHERE file_path = ?', params)
            except sqlite3.OperationalError:
                pass  # Table may not exist
            conn.commit()
    except Exception as e:
        # Path bookkeeping is non-critical; log and carry on.
        logger.warning(f"Failed to update file paths in tables: {e}", module="Database")
# ============================================================================
# FACE RECOGNITION UTILITIES
# ============================================================================
# Cached FaceRecognitionModule singletons, keyed by id(db), so the heavy
# InsightFace models are not reloaded on every request.
# NOTE(review): entries live for the process lifetime — fine for a handful
# of long-lived DB objects, worth confirming if databases are ever transient.
_face_module_cache: Dict[int, 'FaceRecognitionModule'] = {}


def get_face_module(db, module_name: str = "FaceAPI"):
    """
    Return the FaceRecognitionModule bound to *db*, creating it on first use.

    Uses a module-level cache (singleton per database object) to avoid
    reloading heavy InsightFace models on each request.

    Args:
        db: UnifiedDatabase instance
        module_name: Name to use in log messages

    Returns:
        FaceRecognitionModule instance
    """
    from modules.face_recognition_module import FaceRecognitionModule
    from modules.universal_logger import get_logger
    logger = get_logger('API')
    cache_key = id(db)
    module = _face_module_cache.get(cache_key)
    if module is None:
        logger.info("Creating cached FaceRecognitionModule instance", module=module_name)
        module = FaceRecognitionModule(unified_db=db)
        _face_module_cache[cache_key] = module
    return module