#!/usr/bin/env python3
"""
Background worker to pre-generate thumbnails and cache metadata for all media files.

This improves performance by generating thumbnails in advance rather than on-demand.
"""
import sys
import os
import io
import time
import json
import hashlib
import tempfile
import subprocess
from pathlib import Path
from datetime import datetime
from PIL import Image

# Add parent directory to path so we can import modules
sys.path.insert(0, str(Path(__file__).parent.parent))

# Bootstrap database backend (must be before any database imports)
import modules.db_bootstrap  # noqa: E402,F401
import sqlite3  # noqa: E402  -- deliberately imported after db_bootstrap

from modules.universal_logger import get_logger  # noqa: E402

logger = get_logger('ThumbnailCacheBuilder')


class ThumbnailCacheBuilder:
    """Build and maintain thumbnail and metadata cache for media files.

    Database-first: files are discovered via the ``file_inventory`` table of the
    unified database, falling back to a filesystem scan only on query failure.
    Thumbnails go to ``thumbnails.db``, dimensions/duration to ``media_metadata.db``.
    """

    # Stay well under SQLite's default limit of 999 host parameters per statement.
    _SQLITE_MAX_VARS = 500

    def __init__(self):
        # Directories used only by the filesystem fallback scan.
        self.scan_dirs = [
            Path('/opt/immich/md'),
            Path('/opt/immich/review'),
            Path('/opt/immich/recycle')
        ]
        db_dir = Path(__file__).parent.parent / 'database'
        self.db_path = db_dir / 'thumbnails.db'
        self.metadata_db_path = db_dir / 'media_metadata.db'
        self.unified_db_path = db_dir / 'media_downloader.db'
        self.max_thumb_size = (300, 300)

        # Image and video extensions (lowercase, with leading dot)
        self.image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp'}
        self.video_extensions = {'.mp4', '.mov', '.webm', '.avi', '.mkv', '.flv', '.m4v'}

        self.stats = {
            'processed': 0,
            'thumbnails_created': 0,
            'thumbnails_cached': 0,
            'metadata_cached': 0,
            'errors': 0,
            'skipped': 0
        }

        self._init_metadata_db()

    def _init_metadata_db(self):
        """Initialize metadata cache database (schema is create-if-missing)."""
        self.metadata_db_path.parent.mkdir(parents=True, exist_ok=True)

        conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
        try:
            conn.execute('PRAGMA journal_mode=WAL')
            conn.execute("""
                CREATE TABLE IF NOT EXISTS media_metadata (
                    file_hash TEXT PRIMARY KEY,
                    file_path TEXT NOT NULL,
                    width INTEGER,
                    height INTEGER,
                    file_size INTEGER,
                    duration REAL,
                    format TEXT,
                    created_at TEXT,
                    file_mtime DOUBLE PRECISION
                )
            """)
            conn.execute("CREATE INDEX IF NOT EXISTS idx_meta_file_path ON media_metadata(file_path)")
            conn.commit()
        finally:
            conn.close()

        logger.info(f"Metadata database initialized at {self.metadata_db_path}", module="Database")

    def _get_file_hash(self, file_path: Path, content_hash: str = None) -> str:
        """Generate hash for file path or use content hash

        Args:
            file_path: Path to the file
            content_hash: Optional SHA256 content hash from database (preferred for recycle bin)
        """
        if content_hash:
            # Use first 64 chars of content hash (full SHA256 for cache key)
            return content_hash[:64]
        # Fall back to path-based hash
        return hashlib.sha256(str(file_path).encode()).hexdigest()

    def _generate_image_thumbnail(self, file_path: Path) -> tuple:
        """Generate thumbnail and extract metadata for image

        Returns:
            (thumbnail_data, width, height, format) or (None, None, None, None) on error
        """
        try:
            with Image.open(file_path) as img:
                # Get original dimensions before any conversion
                width, height = img.size
                img_format = img.format

                # Flatten any alpha channel onto white; JPEG has no transparency.
                # Covers RGBA, LA and palette images carrying transparency.
                has_alpha = img.mode in ('RGBA', 'LA') or (
                    img.mode == 'P' and 'transparency' in img.info
                )
                if has_alpha:
                    rgba = img.convert('RGBA')
                    background = Image.new('RGB', rgba.size, (255, 255, 255))
                    background.paste(rgba, mask=rgba.getchannel('A'))
                    img = background
                elif img.mode != 'RGB':
                    img = img.convert('RGB')

                # Generate thumbnail (in place, preserving aspect ratio)
                img.thumbnail(self.max_thumb_size, Image.Resampling.LANCZOS)

                # Encode as JPEG bytes
                buffer = io.BytesIO()
                img.save(buffer, format='JPEG', quality=85, optimize=True)
                thumbnail_data = buffer.getvalue()

                return thumbnail_data, width, height, img_format
        except Exception as e:
            logger.error(f"Error generating image thumbnail for {file_path}: {e}", module="Error")
            return None, None, None, None

    def _generate_video_thumbnail(self, file_path: Path) -> tuple:
        """Generate thumbnail and extract metadata for video using ffmpeg

        Returns:
            (thumbnail_data, width, height, duration); thumbnail_data may be None
            while dimensions/duration are still returned if only the frame grab failed.
        """
        try:
            # Get video metadata using ffprobe
            probe_cmd = [
                'ffprobe', '-v', 'quiet', '-print_format', 'json',
                '-show_format', '-show_streams', str(file_path)
            ]
            result = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=30)
            if result.returncode != 0:
                logger.error(f"ffprobe failed for {file_path}", module="Error")
                return None, None, None, None

            metadata = json.loads(result.stdout)

            # Extract the first video stream's info
            video_stream = next(
                (s for s in metadata.get('streams', []) if s.get('codec_type') == 'video'),
                None
            )
            if not video_stream:
                return None, None, None, None

            width = video_stream.get('width')
            height = video_stream.get('height')
            duration = float(metadata.get('format', {}).get('duration', 0))

            # Seek to 1s for normal videos, 0s for very short clips
            seek_time = '00:00:01' if duration > 1.5 else '00:00:00'

            # Use a unique temp file (safe under threads, unlike a pid-based name)
            # and guarantee cleanup even if ffmpeg or the read below raises.
            fd, temp_output = tempfile.mkstemp(prefix='thumb_', suffix='.jpg')
            os.close(fd)
            try:
                thumb_cmd = [
                    'ffmpeg', '-ss', seek_time, '-i', str(file_path),
                    '-vframes', '1',
                    '-vf', f'scale={self.max_thumb_size[0]}:{self.max_thumb_size[1]}:force_original_aspect_ratio=decrease',
                    '-y', temp_output
                ]
                result = subprocess.run(thumb_cmd, capture_output=True, timeout=30)

                # mkstemp pre-creates the file, so check size rather than existence.
                if result.returncode != 0 or os.path.getsize(temp_output) == 0:
                    logger.error(f"ffmpeg thumbnail generation failed for {file_path}", module="Error")
                    return None, width, height, duration

                with open(temp_output, 'rb') as f:
                    thumbnail_data = f.read()
            finally:
                Path(temp_output).unlink(missing_ok=True)

            return thumbnail_data, width, height, duration
        except Exception as e:
            logger.error(f"Error generating video thumbnail for {file_path}: {e}", module="Error")
            return None, None, None, None

    def _cache_thumbnail(self, file_path: Path, thumbnail_data: bytes, content_hash: str = None):
        """Store thumbnail in cache database

        Args:
            file_path: Path to the file
            thumbnail_data: JPEG thumbnail data
            content_hash: Optional SHA256 content hash from database

        Returns:
            True on success, False on error.
        """
        try:
            file_hash = self._get_file_hash(file_path, content_hash)
            file_mtime = file_path.stat().st_mtime

            conn = sqlite3.connect(str(self.db_path), timeout=30.0)
            try:
                conn.execute('PRAGMA journal_mode=WAL')
                conn.execute("""
                    INSERT OR REPLACE INTO thumbnails
                    (file_hash, file_path, thumbnail_data, created_at, file_mtime)
                    VALUES (?, ?, ?, ?, ?)
                """, (file_hash, str(file_path), thumbnail_data,
                      datetime.now().isoformat(), file_mtime))
                conn.commit()
            finally:
                conn.close()
            return True
        except Exception as e:
            logger.error(f"Error caching thumbnail for {file_path}: {e}", module="Error")
            return False

    def _cache_metadata(self, file_path: Path, width: int, height: int,
                        duration: float = None, format_type: str = None,
                        content_hash: str = None):
        """Store metadata in cache database

        Args:
            file_path: Path to the file
            width: Image/video width
            height: Image/video height
            duration: Video duration (seconds)
            format_type: Media format
            content_hash: Optional SHA256 content hash from database

        Returns:
            True on success, False on error.
        """
        try:
            file_hash = self._get_file_hash(file_path, content_hash)
            # Single stat() call supplies both mtime and size.
            stat = file_path.stat()
            file_mtime = stat.st_mtime
            file_size = stat.st_size

            conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
            try:
                conn.execute('PRAGMA journal_mode=WAL')
                conn.execute("""
                    INSERT OR REPLACE INTO media_metadata
                    (file_hash, file_path, width, height, file_size, duration,
                     format, created_at, file_mtime)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (file_hash, str(file_path), width, height, file_size,
                      duration, format_type, datetime.now().isoformat(), file_mtime))
                conn.commit()
            finally:
                conn.close()
            return True
        except Exception as e:
            logger.error(f"Error caching metadata for {file_path}: {e}", module="Error")
            return False

    def _cached_mtime(self, db_path: Path, table: str, file_hash: str):
        """Return the cached file_mtime for file_hash in the given table, or None."""
        conn = sqlite3.connect(str(db_path), timeout=30.0)
        try:
            conn.execute('PRAGMA journal_mode=WAL')
            cursor = conn.execute(
                f"SELECT file_mtime FROM {table} WHERE file_hash = ?",
                (file_hash,)
            )
            row = cursor.fetchone()
            return row[0] if row else None
        finally:
            conn.close()

    def _is_cached_valid(self, file_path: Path, content_hash: str = None) -> bool:
        """Check if file already has valid cached thumbnail and metadata

        Both caches must exist and their stored mtime must match the file's
        current mtime within 1 second tolerance.

        Args:
            file_path: Path to the file
            content_hash: Optional SHA256 content hash from database
        """
        try:
            file_hash = self._get_file_hash(file_path, content_hash)
            file_mtime = file_path.stat().st_mtime

            # Check thumbnail cache
            thumb_mtime = self._cached_mtime(self.db_path, 'thumbnails', file_hash)
            if thumb_mtime is None or abs(thumb_mtime - file_mtime) > 1:
                return False

            # Check metadata cache
            meta_mtime = self._cached_mtime(self.metadata_db_path, 'media_metadata', file_hash)
            if meta_mtime is None or abs(meta_mtime - file_mtime) > 1:
                return False

            return True
        except Exception as e:
            logger.error(f"Error checking cache for {file_path}: {e}", module="Error")
            return False

    def process_file(self, file_path: Path, content_hash: str = None) -> bool:
        """Process a single file - generate thumbnail and cache metadata

        Args:
            file_path: Path to the file
            content_hash: Optional SHA256 content hash from database (preferred for cache key)

        Returns:
            True on success or skip, False when generation/caching failed.
        """
        try:
            if not file_path.exists():
                self.stats['skipped'] += 1
                return True

            # Check if already cached and up-to-date
            if self._is_cached_valid(file_path, content_hash):
                self.stats['skipped'] += 1
                return True

            file_ext = file_path.suffix.lower()

            if file_ext in self.image_extensions:
                # Process image
                thumbnail_data, width, height, format_type = self._generate_image_thumbnail(file_path)
                if thumbnail_data and width and height:
                    # Cache thumbnail
                    if self._cache_thumbnail(file_path, thumbnail_data, content_hash):
                        self.stats['thumbnails_created'] += 1
                    # Cache metadata
                    if self._cache_metadata(file_path, width, height,
                                            format_type=format_type,
                                            content_hash=content_hash):
                        self.stats['metadata_cached'] += 1
                    return True
                else:
                    self.stats['errors'] += 1
                    return False

            elif file_ext in self.video_extensions:
                # Process video
                thumbnail_data, width, height, duration = self._generate_video_thumbnail(file_path)
                # Cache thumbnail if generated
                if thumbnail_data:
                    if self._cache_thumbnail(file_path, thumbnail_data, content_hash):
                        self.stats['thumbnails_created'] += 1
                # Cache metadata if we have dimensions
                if width and height:
                    if self._cache_metadata(file_path, width, height,
                                            duration=duration, format_type='video',
                                            content_hash=content_hash):
                        self.stats['metadata_cached'] += 1
                # Consider successful even if thumbnail failed (metadata might still be cached)
                if width and height:
                    return True
                else:
                    self.stats['errors'] += 1
                    return False

            # Unrecognized extension: nothing to do, counted as success.
            return True
        except Exception as e:
            logger.error(f"Error processing file {file_path}: {e}", module="Error")
            self.stats['errors'] += 1
            return False

    def _get_files_from_inventory(self) -> list:
        """Query file_inventory table for all media files (database-first)

        Returns:
            List of tuples (file_path, content_hash or None)
        """
        try:
            conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
            try:
                conn.row_factory = sqlite3.Row
                cursor = conn.cursor()
                # Query all files from file_inventory (any location: final, review, recycle)
                # Include file_hash from recycle_bin if file is in recycle location
                cursor.execute("""
                    SELECT fi.file_path, fi.content_type, fi.location,
                           rb.file_hash as content_hash
                    FROM file_inventory fi
                    LEFT JOIN recycle_bin rb ON fi.file_path = rb.recycle_path
                    ORDER BY fi.created_date DESC
                """)
                rows = cursor.fetchall()
            finally:
                conn.close()

            # Filter by extension; set union gives O(1) membership per file.
            all_extensions = self.image_extensions | self.video_extensions
            files = []
            for row in rows:
                file_path = Path(row['file_path'])
                if file_path.suffix.lower() in all_extensions and file_path.exists():
                    # Return tuple: (file_path, content_hash or None)
                    content_hash = row['content_hash'] if row['content_hash'] else None
                    files.append((file_path, content_hash))

            return files
        except Exception as e:
            logger.error(f"Error querying file_inventory: {e}", module="Error")
            # Fallback to filesystem scan if database query fails
            logger.warning("Falling back to filesystem scan...", module="Warning")
            return self._fallback_filesystem_scan()

    def _fallback_filesystem_scan(self) -> list:
        """Fallback: Scan filesystem if database query fails

        Returns:
            List of tuples (file_path, None) - no content_hash available from filesystem
        """
        all_files = []
        for scan_dir in self.scan_dirs:
            if not scan_dir.exists():
                continue
            for ext in list(self.image_extensions) + list(self.video_extensions):
                # Return tuples: (file_path, None) - no content hash from filesystem scan
                all_files.extend((f, None) for f in scan_dir.rglob(f"*{ext}"))
        return all_files

    def scan_and_process(self):
        """Query file_inventory and process all files (database-first)"""
        logger.info("Starting thumbnail and metadata cache build...", module="Core")
        logger.info("Querying file_inventory table (database-first architecture)...", module="Core")

        start_time = time.time()

        # Query file_inventory instead of scanning filesystem
        # Returns list of tuples: (file_path, content_hash or None)
        all_files = self._get_files_from_inventory()

        total_files = len(all_files)
        logger.info(f"Found {total_files} media files to process from file_inventory", module="Core")

        # Count how many have content hashes (from recycle bin)
        files_with_hash = sum(1 for _, content_hash in all_files if content_hash)
        if files_with_hash > 0:
            logger.info(f"  - {files_with_hash} files have content hash (from recycle bin - cache survives moves)", module="Core")

        # Process files with progress updates
        for i, (file_path, content_hash) in enumerate(all_files, 1):
            self.process_file(file_path, content_hash)
            self.stats['processed'] += 1

            # Progress update every 100 files
            if i % 100 == 0 or i == total_files:
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                eta = (total_files - i) / rate if rate > 0 else 0
                logger.info(f"Progress: {i}/{total_files} ({i/total_files*100:.1f}%) - "
                            f"Rate: {rate:.1f} files/sec - ETA: {eta/60:.1f} min", module="Core")

        # Final statistics
        elapsed = time.time() - start_time
        # Guard against a near-instant run producing a divide-by-zero.
        avg_rate = self.stats['processed'] / elapsed if elapsed > 0 else 0.0
        logger.info("=" * 60, module="Core")
        logger.info("Thumbnail and Metadata Cache Build Complete", module="Core")
        logger.info("=" * 60, module="Core")
        logger.info(f"Total files processed: {self.stats['processed']}", module="Core")
        logger.info(f"Thumbnails created: {self.stats['thumbnails_created']}", module="Core")
        logger.info(f"Metadata cached: {self.stats['metadata_cached']}", module="Core")
        logger.info(f"Files skipped (already cached): {self.stats['skipped']}", module="Core")
        logger.info(f"Errors: {self.stats['errors']}", module="Core")
        logger.info(f"Total time: {elapsed/60:.1f} minutes", module="Core")
        logger.info(f"Average rate: {avg_rate:.1f} files/sec", module="Core")
        logger.info("=" * 60, module="Core")

    @staticmethod
    def _delete_paths_chunked(cursor, table: str, paths: list):
        """Delete rows by file_path in chunks to respect SQLite's host-parameter limit."""
        for start in range(0, len(paths), ThumbnailCacheBuilder._SQLITE_MAX_VARS):
            chunk = paths[start:start + ThumbnailCacheBuilder._SQLITE_MAX_VARS]
            placeholders = ','.join('?' for _ in chunk)
            cursor.execute(f"DELETE FROM {table} WHERE file_path IN ({placeholders})", chunk)

    def _valid_inventory_paths(self) -> set:
        """Return the set of file paths currently present in file_inventory."""
        main_conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
        try:
            main_cursor = main_conn.cursor()
            main_cursor.execute("SELECT file_path FROM file_inventory")
            return set(row[0] for row in main_cursor.fetchall())
        finally:
            main_conn.close()

    def cleanup_orphaned_records(self):
        """Clean up orphaned database records for files that no longer exist"""
        logger.info("Starting database cleanup for orphaned records...", module="Cleanup")
        cleanup_stats = {
            'face_recognition_scans': 0,
            'downloads': 0,
            'media_metadata': 0,
            'thumbnail_cache': 0
        }

        conn = None
        meta_conn = None
        thumb_conn = None
        try:
            # Clean up face_recognition_scans for files not in file_inventory
            conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
            cursor = conn.cursor()

            # Find orphaned face_recognition_scans (files not in file_inventory)
            cursor.execute("""
                SELECT COUNT(*) FROM face_recognition_scans frs
                WHERE NOT EXISTS (
                    SELECT 1 FROM file_inventory fi WHERE fi.file_path = frs.file_path
                )
            """)
            orphaned_count = cursor.fetchone()[0]
            if orphaned_count > 0:
                cursor.execute("""
                    DELETE FROM face_recognition_scans
                    WHERE NOT EXISTS (
                        SELECT 1 FROM file_inventory fi
                        WHERE fi.file_path = face_recognition_scans.file_path
                    )
                """)
                conn.commit()
                cleanup_stats['face_recognition_scans'] = orphaned_count
                logger.info(f"Removed {orphaned_count} orphaned face_recognition_scans records", module="Cleanup")

            # Clean up downloads for files not in file_inventory
            cursor.execute("""
                SELECT COUNT(*) FROM downloads d
                WHERE d.file_path IS NOT NULL AND d.file_path != ''
                AND NOT EXISTS (
                    SELECT 1 FROM file_inventory fi WHERE fi.file_path = d.file_path
                )
            """)
            orphaned_downloads = cursor.fetchone()[0]
            if orphaned_downloads > 0:
                cursor.execute("""
                    DELETE FROM downloads
                    WHERE file_path IS NOT NULL AND file_path != ''
                    AND NOT EXISTS (
                        SELECT 1 FROM file_inventory fi
                        WHERE fi.file_path = downloads.file_path
                    )
                """)
                conn.commit()
                cleanup_stats['downloads'] = orphaned_downloads
                logger.info(f"Removed {orphaned_downloads} orphaned downloads records", module="Cleanup")
            conn.close()
            conn = None

            # Clean up media_metadata cache for files not in file_inventory
            # (best effort - a failure here must not abort the main cleanup)
            try:
                valid_paths = self._valid_inventory_paths()
                meta_conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
                meta_cursor = meta_conn.cursor()
                meta_cursor.execute("SELECT file_path FROM media_metadata")
                orphaned_meta = [row[0] for row in meta_cursor.fetchall()
                                 if row[0] not in valid_paths]
                if orphaned_meta:
                    # Chunked delete: one IN(...) per 500 paths keeps us under
                    # SQLite's 999 host-parameter limit.
                    self._delete_paths_chunked(meta_cursor, 'media_metadata', orphaned_meta)
                    meta_conn.commit()
                    cleanup_stats['media_metadata'] = len(orphaned_meta)
                    logger.info(f"Removed {len(orphaned_meta)} orphaned media_metadata records", module="Cleanup")
                meta_conn.close()
                meta_conn = None
            except Exception as e:
                # metadata cleanup is non-critical; log instead of crashing
                logger.warning(f"Skipping media_metadata cleanup: {e}", module="Cleanup")

            # Clean up thumbnail cache for files not in file_inventory (best effort)
            try:
                valid_paths = self._valid_inventory_paths()
                thumb_conn = sqlite3.connect(str(self.db_path), timeout=30.0)
                thumb_cursor = thumb_conn.cursor()
                # Thumbnails use file_hash as key; match on file_path when the
                # column exists (older schemas may lack it).
                try:
                    thumb_cursor.execute("SELECT file_path FROM thumbnails WHERE file_path IS NOT NULL")
                    orphaned_thumbs = [row[0] for row in thumb_cursor.fetchall()
                                       if row[0] and row[0] not in valid_paths]
                    if orphaned_thumbs:
                        self._delete_paths_chunked(thumb_cursor, 'thumbnails', orphaned_thumbs)
                        thumb_conn.commit()
                        cleanup_stats['thumbnail_cache'] = len(orphaned_thumbs)
                        logger.info(f"Removed {len(orphaned_thumbs)} orphaned thumbnail records", module="Cleanup")
                except sqlite3.OperationalError:
                    # Table structure may not have file_path column
                    pass
                thumb_conn.close()
                thumb_conn = None
            except Exception as e:
                # thumbnail cleanup is non-critical; log instead of crashing
                logger.warning(f"Skipping thumbnail cleanup: {e}", module="Cleanup")

            # Log summary
            total_cleaned = sum(cleanup_stats.values())
            logger.info("=" * 60, module="Cleanup")
            logger.info("Database Cleanup Complete", module="Cleanup")
            logger.info("=" * 60, module="Cleanup")
            logger.info(f"Total orphaned records removed: {total_cleaned}", module="Cleanup")
            for table, count in cleanup_stats.items():
                if count > 0:
                    logger.info(f"  - {table}: {count}", module="Cleanup")
            logger.info("=" * 60, module="Cleanup")
            return cleanup_stats

        except Exception as e:
            logger.error(f"Error during database cleanup: {e}", exc_info=True, module="Error")
            return cleanup_stats
        finally:
            # Ensure all database connections are closed
            for connection in (conn, meta_conn, thumb_conn):
                if connection:
                    try:
                        connection.close()
                    except Exception:
                        pass  # Best effort cleanup


def main():
    """Main entry point"""
    logger.info("Thumbnail Cache Builder starting...", module="Core")
    try:
        builder = ThumbnailCacheBuilder()

        # Run database cleanup first (before processing)
        logger.info("Phase 1: Database cleanup for orphaned records", module="Core")
        builder.cleanup_orphaned_records()

        # Then process thumbnails and metadata
        logger.info("Phase 2: Thumbnail and metadata cache building", module="Core")
        builder.scan_and_process()

        logger.info("Thumbnail Cache Builder completed successfully", module="Core")
        return 0
    except Exception as e:
        logger.error(f"Fatal error in Thumbnail Cache Builder: {e}", exc_info=True, module="Error")
        return 1


if __name__ == '__main__':
    sys.exit(main())