Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,639 @@
#!/usr/bin/env python3
"""
Background worker to pre-generate thumbnails and cache metadata for all media files.
This improves performance by generating thumbnails in advance rather than on-demand.
"""
import sys
import os
import time
import hashlib
from pathlib import Path
from datetime import datetime
from PIL import Image
import io
# Add parent directory to path so we can import modules
sys.path.insert(0, str(Path(__file__).parent.parent))
# Bootstrap database backend (must be before any database imports)
import modules.db_bootstrap # noqa: E402,F401
import sqlite3
from modules.universal_logger import get_logger
logger = get_logger('ThumbnailCacheBuilder')
class ThumbnailCacheBuilder:
    """Build and maintain thumbnail and metadata cache for media files.

    Thumbnails (JPEG, bounded to ``max_thumb_size``) go into thumbnails.db
    and image/video metadata into media_metadata.db, keyed by a file hash
    so callers can serve them without re-opening the original media.
    """

    def __init__(self):
        # Directories used only by the filesystem fallback scan; the normal
        # path reads file_inventory from the unified database instead.
        self.scan_dirs = [
            Path('/opt/immich/md'),
            Path('/opt/immich/review'),
            Path('/opt/immich/recycle')
        ]
        self.db_path = Path(__file__).parent.parent / 'database' / 'thumbnails.db'
        self.metadata_db_path = Path(__file__).parent.parent / 'database' / 'media_metadata.db'
        self.unified_db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
        self.max_thumb_size = (300, 300)
        # Recognized media extensions; always compared lowercased.
        self.image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp'}
        self.video_extensions = {'.mp4', '.mov', '.webm', '.avi', '.mkv', '.flv', '.m4v'}
        self.stats = {
            'processed': 0,
            'thumbnails_created': 0,
            'thumbnails_cached': 0,
            'metadata_cached': 0,
            'errors': 0,
            'skipped': 0
        }
        self._init_metadata_db()
        self._init_thumbnail_db()

    def _init_metadata_db(self):
        """Initialize metadata cache database (creates table/index if missing)."""
        self.metadata_db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute("""
            CREATE TABLE IF NOT EXISTS media_metadata (
                file_hash TEXT PRIMARY KEY,
                file_path TEXT NOT NULL,
                width INTEGER,
                height INTEGER,
                file_size INTEGER,
                duration REAL,
                format TEXT,
                created_at TEXT,
                file_mtime DOUBLE PRECISION
            )
        """)
        conn.execute("CREATE INDEX IF NOT EXISTS idx_meta_file_path ON media_metadata(file_path)")
        conn.commit()
        conn.close()
        logger.info(f"Metadata database initialized at {self.metadata_db_path}", module="Database")

    def _init_thumbnail_db(self):
        """Ensure the thumbnails table exists.

        Fix: _cache_thumbnail() INSERTs into `thumbnails`, but this script
        never created that table, so a fresh deployment failed on the first
        insert. Schema matches the columns used by _cache_thumbnail();
        CREATE TABLE IF NOT EXISTS is a no-op when another component has
        already created the table (possibly with extra columns).
        """
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(str(self.db_path), timeout=30.0)
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute("""
            CREATE TABLE IF NOT EXISTS thumbnails (
                file_hash TEXT PRIMARY KEY,
                file_path TEXT NOT NULL,
                thumbnail_data BLOB,
                created_at TEXT,
                file_mtime DOUBLE PRECISION
            )
        """)
        conn.execute("CREATE INDEX IF NOT EXISTS idx_thumb_file_path ON thumbnails(file_path)")
        conn.commit()
        conn.close()

    def _get_file_hash(self, file_path: Path, content_hash: str = None) -> str:
        """Generate cache key: content hash when available, else path hash.

        Args:
            file_path: Path to the file
            content_hash: Optional SHA256 content hash from database
                (preferred for recycle bin: key survives file moves)
        """
        if content_hash:
            # Use first 64 chars of content hash (full SHA256 for cache key)
            return content_hash[:64]
        # Fall back to path-based hash
        return hashlib.sha256(str(file_path).encode()).hexdigest()

    def _generate_image_thumbnail(self, file_path: Path) -> tuple:
        """Generate thumbnail and extract metadata for image.

        Returns: (thumbnail_data, width, height, format) — all None on error.
        """
        try:
            with Image.open(file_path) as img:
                # Original dimensions/format, before thumbnail() shrinks in place
                width, height = img.size
                img_format = img.format
                # JPEG output has no alpha: composite RGBA over white
                if img.mode == 'RGBA':
                    background = Image.new('RGB', img.size, (255, 255, 255))
                    background.paste(img, mask=img.split()[3])
                    img = background
                elif img.mode != 'RGB':
                    img = img.convert('RGB')
                # thumbnail() preserves aspect ratio within max_thumb_size
                img.thumbnail(self.max_thumb_size, Image.Resampling.LANCZOS)
                buffer = io.BytesIO()
                img.save(buffer, format='JPEG', quality=85, optimize=True)
                thumbnail_data = buffer.getvalue()
                return thumbnail_data, width, height, img_format
        except Exception as e:
            logger.error(f"Error generating image thumbnail for {file_path}: {e}", module="Error")
            return None, None, None, None

    def _generate_video_thumbnail(self, file_path: Path) -> tuple:
        """Generate thumbnail and extract metadata for video using ffmpeg.

        Returns: (thumbnail_data, width, height, duration); thumbnail_data is
        None if ffmpeg fails even when dimensions/duration were probed.
        """
        try:
            import subprocess
            import json
            import tempfile
            # Probe container + streams for dimensions and duration
            probe_cmd = [
                'ffprobe',
                '-v', 'quiet',
                '-print_format', 'json',
                '-show_format',
                '-show_streams',
                str(file_path)
            ]
            result = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=30)
            if result.returncode != 0:
                logger.error(f"ffprobe failed for {file_path}", module="Error")
                return None, None, None, None
            metadata = json.loads(result.stdout)
            video_stream = next((s for s in metadata.get('streams', []) if s.get('codec_type') == 'video'), None)
            if not video_stream:
                return None, None, None, None
            width = video_stream.get('width')
            height = video_stream.get('height')
            duration = float(metadata.get('format', {}).get('duration', 0))
            # Fix: the old fixed /tmp/thumb_<pid>.jpg name could collide
            # between runs and leaked the file on the ffmpeg-failure path;
            # mkstemp gives a unique name and the finally guarantees cleanup.
            fd, temp_output = tempfile.mkstemp(prefix='thumb_', suffix='.jpg')
            os.close(fd)
            thumb_file = Path(temp_output)
            try:
                # Seek to 1s for normal videos, 0s for very short clips
                seek_time = '00:00:01' if duration > 1.5 else '00:00:00'
                thumb_cmd = [
                    'ffmpeg',
                    '-ss', seek_time,
                    '-i', str(file_path),
                    '-vframes', '1',
                    '-vf', f'scale={self.max_thumb_size[0]}:{self.max_thumb_size[1]}:force_original_aspect_ratio=decrease',
                    '-y',
                    temp_output
                ]
                result = subprocess.run(thumb_cmd, capture_output=True, timeout=30)
                # mkstemp pre-creates the file, so existence is meaningless;
                # an empty file means ffmpeg produced no frame.
                if result.returncode != 0 or thumb_file.stat().st_size == 0:
                    logger.error(f"ffmpeg thumbnail generation failed for {file_path}", module="Error")
                    return None, width, height, duration
                thumbnail_data = thumb_file.read_bytes()
            finally:
                thumb_file.unlink(missing_ok=True)
            return thumbnail_data, width, height, duration
        except Exception as e:
            logger.error(f"Error generating video thumbnail for {file_path}: {e}", module="Error")
            return None, None, None, None

    def _cache_thumbnail(self, file_path: Path, thumbnail_data: bytes, content_hash: str = None):
        """Store thumbnail in cache database.

        Args:
            file_path: Path to the file
            thumbnail_data: JPEG thumbnail data
            content_hash: Optional SHA256 content hash from database

        Returns: True on success, False on error (logged, not raised).
        """
        try:
            file_hash = self._get_file_hash(file_path, content_hash)
            file_mtime = file_path.stat().st_mtime
            conn = sqlite3.connect(str(self.db_path), timeout=30.0)
            conn.execute('PRAGMA journal_mode=WAL')
            conn.execute("""
                INSERT OR REPLACE INTO thumbnails
                (file_hash, file_path, thumbnail_data, created_at, file_mtime)
                VALUES (?, ?, ?, ?, ?)
            """, (file_hash, str(file_path), thumbnail_data, datetime.now().isoformat(), file_mtime))
            conn.commit()
            conn.close()
            return True
        except Exception as e:
            logger.error(f"Error caching thumbnail for {file_path}: {e}", module="Error")
            return False

    def _cache_metadata(self, file_path: Path, width: int, height: int, duration: float = None, format_type: str = None, content_hash: str = None):
        """Store metadata in cache database.

        Args:
            file_path: Path to the file
            width: Image/video width
            height: Image/video height
            duration: Video duration (seconds)
            format_type: Media format
            content_hash: Optional SHA256 content hash from database

        Returns: True on success, False on error (logged, not raised).
        """
        try:
            file_hash = self._get_file_hash(file_path, content_hash)
            file_mtime = file_path.stat().st_mtime
            file_size = file_path.stat().st_size
            conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
            conn.execute('PRAGMA journal_mode=WAL')
            conn.execute("""
                INSERT OR REPLACE INTO media_metadata
                (file_hash, file_path, width, height, file_size, duration, format, created_at, file_mtime)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (file_hash, str(file_path), width, height, file_size, duration, format_type,
                  datetime.now().isoformat(), file_mtime))
            conn.commit()
            conn.close()
            return True
        except Exception as e:
            logger.error(f"Error caching metadata for {file_path}: {e}", module="Error")
            return False

    def _is_cached_valid(self, file_path: Path, content_hash: str = None) -> bool:
        """Check if file already has valid cached thumbnail AND metadata.

        A cache entry is valid when its stored mtime is within 1 second of
        the file's current mtime (tolerates filesystem timestamp rounding).

        Args:
            file_path: Path to the file
            content_hash: Optional SHA256 content hash from database
        """
        try:
            file_hash = self._get_file_hash(file_path, content_hash)
            file_mtime = file_path.stat().st_mtime
            # Check thumbnail cache
            conn = sqlite3.connect(str(self.db_path), timeout=30.0)
            conn.execute('PRAGMA journal_mode=WAL')
            cursor = conn.execute(
                "SELECT file_mtime FROM thumbnails WHERE file_hash = ?",
                (file_hash,)
            )
            thumb_result = cursor.fetchone()
            conn.close()
            if not thumb_result or abs(thumb_result[0] - file_mtime) > 1:
                return False
            # Check metadata cache
            conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
            conn.execute('PRAGMA journal_mode=WAL')
            cursor = conn.execute(
                "SELECT file_mtime FROM media_metadata WHERE file_hash = ?",
                (file_hash,)
            )
            meta_result = cursor.fetchone()
            conn.close()
            if not meta_result or abs(meta_result[0] - file_mtime) > 1:
                return False
            return True
        except Exception as e:
            logger.error(f"Error checking cache for {file_path}: {e}", module="Error")
            return False

    def process_file(self, file_path: Path, content_hash: str = None) -> bool:
        """Process a single file: generate thumbnail and cache metadata.

        Args:
            file_path: Path to the file
            content_hash: Optional SHA256 content hash (preferred cache key)

        Returns: True on success or skip, False on error. Updates self.stats.
        """
        try:
            # Vanished or already-current files count as skipped, not errors
            if not file_path.exists():
                self.stats['skipped'] += 1
                return True
            if self._is_cached_valid(file_path, content_hash):
                self.stats['skipped'] += 1
                return True
            file_ext = file_path.suffix.lower()
            if file_ext in self.image_extensions:
                thumbnail_data, width, height, format_type = self._generate_image_thumbnail(file_path)
                if thumbnail_data and width and height:
                    if self._cache_thumbnail(file_path, thumbnail_data, content_hash):
                        self.stats['thumbnails_created'] += 1
                    if self._cache_metadata(file_path, width, height, format_type=format_type, content_hash=content_hash):
                        self.stats['metadata_cached'] += 1
                    return True
                else:
                    self.stats['errors'] += 1
                    return False
            elif file_ext in self.video_extensions:
                thumbnail_data, width, height, duration = self._generate_video_thumbnail(file_path)
                if thumbnail_data:
                    if self._cache_thumbnail(file_path, thumbnail_data, content_hash):
                        self.stats['thumbnails_created'] += 1
                if width and height:
                    if self._cache_metadata(file_path, width, height, duration=duration, format_type='video', content_hash=content_hash):
                        self.stats['metadata_cached'] += 1
                # Successful if we at least got dimensions, even when the
                # thumbnail frame extraction failed.
                if width and height:
                    return True
                else:
                    self.stats['errors'] += 1
                    return False
            # Non-media extension: nothing to do (callers pre-filter anyway)
            return True
        except Exception as e:
            logger.error(f"Error processing file {file_path}: {e}", module="Error")
            self.stats['errors'] += 1
            return False

    def _get_files_from_inventory(self) -> list:
        """Query file_inventory table for all media files (database-first).

        Returns: List of tuples (file_path, content_hash or None). Falls back
        to a filesystem scan if the database query fails.
        """
        try:
            conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            # All files regardless of location (final, review, recycle);
            # recycled files carry a content hash so their cache survives moves.
            cursor.execute("""
                SELECT
                    fi.file_path,
                    fi.content_type,
                    fi.location,
                    rb.file_hash as content_hash
                FROM file_inventory fi
                LEFT JOIN recycle_bin rb ON fi.file_path = rb.recycle_path
                ORDER BY fi.created_date DESC
            """)
            rows = cursor.fetchall()
            conn.close()
            # Set membership: O(1) per file instead of a list scan
            media_extensions = self.image_extensions | self.video_extensions
            files = []
            for row in rows:
                file_path = Path(row['file_path'])
                if file_path.suffix.lower() in media_extensions and file_path.exists():
                    content_hash = row['content_hash'] if row['content_hash'] else None
                    files.append((file_path, content_hash))
            return files
        except Exception as e:
            logger.error(f"Error querying file_inventory: {e}", module="Error")
            logger.warning("Falling back to filesystem scan...", module="Warning")
            return self._fallback_filesystem_scan()

    def _fallback_filesystem_scan(self) -> list:
        """Fallback: Scan filesystem if database query fails.

        Returns: List of tuples (file_path, None) — no content_hash available.

        Fix: the old per-extension rglob matched literal lowercase patterns
        only, silently skipping files like IMG_0001.JPG even though the
        inventory path compares suffixes lowercased. One rglob('*') pass with
        a lowercased suffix check finds them (and walks each tree once).
        """
        media_extensions = self.image_extensions | self.video_extensions
        all_files = []
        for scan_dir in self.scan_dirs:
            if not scan_dir.exists():
                continue
            for candidate in scan_dir.rglob('*'):
                if candidate.is_file() and candidate.suffix.lower() in media_extensions:
                    all_files.append((candidate, None))
        return all_files

    def scan_and_process(self):
        """Query file_inventory and process all files (database-first)."""
        logger.info("Starting thumbnail and metadata cache build...", module="Core")
        logger.info("Querying file_inventory table (database-first architecture)...", module="Core")
        start_time = time.time()
        # Returns list of tuples: (file_path, content_hash or None)
        all_files = self._get_files_from_inventory()
        total_files = len(all_files)
        logger.info(f"Found {total_files} media files to process from file_inventory", module="Core")
        # Count how many have content hashes (from recycle bin)
        files_with_hash = sum(1 for _, content_hash in all_files if content_hash)
        if files_with_hash > 0:
            logger.info(f" - {files_with_hash} files have content hash (from recycle bin - cache survives moves)", module="Core")
        # Process files with progress updates every 100 files
        for i, (file_path, content_hash) in enumerate(all_files, 1):
            self.process_file(file_path, content_hash)
            self.stats['processed'] += 1
            if i % 100 == 0 or i == total_files:
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                eta = (total_files - i) / rate if rate > 0 else 0
                logger.info(f"Progress: {i}/{total_files} ({i/total_files*100:.1f}%) - "
                            f"Rate: {rate:.1f} files/sec - ETA: {eta/60:.1f} min", module="Core")
        # Final statistics (guard elapsed: a zero-file run can finish in ~0s)
        elapsed = time.time() - start_time
        avg_rate = self.stats['processed'] / elapsed if elapsed > 0 else 0.0
        logger.info("=" * 60, module="Core")
        logger.info("Thumbnail and Metadata Cache Build Complete", module="Core")
        logger.info("=" * 60, module="Core")
        logger.info(f"Total files processed: {self.stats['processed']}", module="Core")
        logger.info(f"Thumbnails created: {self.stats['thumbnails_created']}", module="Core")
        logger.info(f"Metadata cached: {self.stats['metadata_cached']}", module="Core")
        logger.info(f"Files skipped (already cached): {self.stats['skipped']}", module="Core")
        logger.info(f"Errors: {self.stats['errors']}", module="Core")
        logger.info(f"Total time: {elapsed/60:.1f} minutes", module="Core")
        logger.info(f"Average rate: {avg_rate:.1f} files/sec", module="Core")
        logger.info("=" * 60, module="Core")

    def cleanup_orphaned_records(self):
        """Clean up orphaned database records for files that no longer exist.

        Removes rows from face_recognition_scans, downloads, media_metadata,
        and thumbnails whose file_path is no longer present in file_inventory.
        Cache cleanups are best-effort: failures are logged, never fatal.

        Returns: dict of per-table removal counts.
        """
        logger.info("Starting database cleanup for orphaned records...", module="Cleanup")
        cleanup_stats = {
            'face_recognition_scans': 0,
            'downloads': 0,
            'media_metadata': 0,
            'thumbnail_cache': 0
        }
        conn = None
        meta_conn = None
        thumb_conn = None
        main_conn = None
        try:
            # Clean up face_recognition_scans for files not in file_inventory
            conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
            cursor = conn.cursor()
            cursor.execute("""
                SELECT COUNT(*) FROM face_recognition_scans frs
                WHERE NOT EXISTS (
                    SELECT 1 FROM file_inventory fi WHERE fi.file_path = frs.file_path
                )
            """)
            orphaned_count = cursor.fetchone()[0]
            if orphaned_count > 0:
                cursor.execute("""
                    DELETE FROM face_recognition_scans
                    WHERE NOT EXISTS (
                        SELECT 1 FROM file_inventory fi WHERE fi.file_path = face_recognition_scans.file_path
                    )
                """)
                conn.commit()
                cleanup_stats['face_recognition_scans'] = orphaned_count
                logger.info(f"Removed {orphaned_count} orphaned face_recognition_scans records", module="Cleanup")
            # Clean up downloads for files not in file_inventory
            cursor.execute("""
                SELECT COUNT(*) FROM downloads d
                WHERE d.file_path IS NOT NULL AND d.file_path != ''
                AND NOT EXISTS (
                    SELECT 1 FROM file_inventory fi WHERE fi.file_path = d.file_path
                )
            """)
            orphaned_downloads = cursor.fetchone()[0]
            if orphaned_downloads > 0:
                cursor.execute("""
                    DELETE FROM downloads
                    WHERE file_path IS NOT NULL AND file_path != ''
                    AND NOT EXISTS (
                        SELECT 1 FROM file_inventory fi WHERE fi.file_path = downloads.file_path
                    )
                """)
                conn.commit()
                cleanup_stats['downloads'] = orphaned_downloads
                logger.info(f"Removed {orphaned_downloads} orphaned downloads records", module="Cleanup")
            conn.close()
            # Clean up media_metadata cache for files not in file_inventory
            # (cross-database, so compare path sets in Python)
            try:
                meta_conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
                main_conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
                main_cursor = main_conn.cursor()
                main_cursor.execute("SELECT file_path FROM file_inventory")
                valid_paths = set(row[0] for row in main_cursor.fetchall())
                main_conn.close()
                meta_cursor = meta_conn.cursor()
                meta_cursor.execute("SELECT file_path FROM media_metadata")
                all_meta_paths = [row[0] for row in meta_cursor.fetchall()]
                orphaned_meta = [p for p in all_meta_paths if p not in valid_paths]
                if orphaned_meta:
                    placeholders = ','.join(['?' for _ in orphaned_meta])
                    meta_cursor.execute(f"DELETE FROM media_metadata WHERE file_path IN ({placeholders})", orphaned_meta)
                    meta_conn.commit()
                    cleanup_stats['media_metadata'] = len(orphaned_meta)
                    logger.info(f"Removed {len(orphaned_meta)} orphaned media_metadata records", module="Cleanup")
                meta_conn.close()
            except Exception as e:
                # Non-critical, but don't hide it completely (was a bare pass)
                logger.warning(f"media_metadata cleanup skipped: {e}", module="Cleanup")
            # Clean up thumbnail cache for files not in file_inventory
            # (use self.db_path instead of rebuilding the same path)
            try:
                thumb_conn = sqlite3.connect(str(self.db_path), timeout=30.0)
                main_conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
                main_cursor = main_conn.cursor()
                main_cursor.execute("SELECT file_path FROM file_inventory")
                valid_paths = set(row[0] for row in main_cursor.fetchall())
                main_conn.close()
                thumb_cursor = thumb_conn.cursor()
                # Thumbnails are keyed by file_hash; match orphans via the
                # stored file_path column when it exists.
                try:
                    thumb_cursor.execute("SELECT file_path FROM thumbnails WHERE file_path IS NOT NULL")
                    all_thumb_paths = [row[0] for row in thumb_cursor.fetchall()]
                    orphaned_thumbs = [p for p in all_thumb_paths if p and p not in valid_paths]
                    if orphaned_thumbs:
                        placeholders = ','.join(['?' for _ in orphaned_thumbs])
                        thumb_cursor.execute(f"DELETE FROM thumbnails WHERE file_path IN ({placeholders})", orphaned_thumbs)
                        thumb_conn.commit()
                        cleanup_stats['thumbnail_cache'] = len(orphaned_thumbs)
                        logger.info(f"Removed {len(orphaned_thumbs)} orphaned thumbnail records", module="Cleanup")
                except sqlite3.OperationalError:
                    # Table structure may not have file_path column
                    pass
                thumb_conn.close()
            except Exception as e:
                # Non-critical, but don't hide it completely (was a bare pass)
                logger.warning(f"thumbnail cleanup skipped: {e}", module="Cleanup")
            # Log summary
            total_cleaned = sum(cleanup_stats.values())
            logger.info("=" * 60, module="Cleanup")
            logger.info("Database Cleanup Complete", module="Cleanup")
            logger.info("=" * 60, module="Cleanup")
            logger.info(f"Total orphaned records removed: {total_cleaned}", module="Cleanup")
            for table, count in cleanup_stats.items():
                if count > 0:
                    logger.info(f" - {table}: {count}", module="Cleanup")
            logger.info("=" * 60, module="Cleanup")
            return cleanup_stats
        except Exception as e:
            logger.error(f"Error during database cleanup: {e}", exc_info=True, module="Error")
            return cleanup_stats
        finally:
            # Ensure all database connections are closed (double-close is a
            # harmless no-op for connections closed on the happy path)
            for connection in [conn, meta_conn, thumb_conn, main_conn]:
                if connection:
                    try:
                        connection.close()
                    except Exception:
                        pass  # Best effort cleanup
def main():
    """Entry point: clean orphaned DB records, then build caches.

    Returns a process exit code: 0 on success, 1 on any fatal error.
    """
    logger.info("Thumbnail Cache Builder starting...", module="Core")
    try:
        builder = ThumbnailCacheBuilder()
        # Phase 1 removes stale rows so phase 2 doesn't process dead paths
        logger.info("Phase 1: Database cleanup for orphaned records", module="Core")
        builder.cleanup_orphaned_records()
        # Phase 2 generates thumbnails and metadata for everything current
        logger.info("Phase 2: Thumbnail and metadata cache building", module="Core")
        builder.scan_and_process()
    except Exception as e:
        logger.error(f"Fatal error in Thumbnail Cache Builder: {e}", exc_info=True, module="Error")
        return 1
    logger.info("Thumbnail Cache Builder completed successfully", module="Core")
    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == '__main__':
    sys.exit(main())