Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,639 @@
#!/usr/bin/env python3
"""
Background worker to pre-generate thumbnails and cache metadata for all media files.
This improves performance by generating thumbnails in advance rather than on-demand.
"""
import sys
import os
import time
import hashlib
from pathlib import Path
from datetime import datetime
from PIL import Image
import io
# Add parent directory to path so we can import modules
sys.path.insert(0, str(Path(__file__).parent.parent))
# Bootstrap database backend (must be before any database imports)
import modules.db_bootstrap # noqa: E402,F401
import sqlite3
from modules.universal_logger import get_logger
logger = get_logger('ThumbnailCacheBuilder')
class ThumbnailCacheBuilder:
    """Build and maintain thumbnail and metadata cache for media files.

    Thumbnails (JPEG, bounded to ``max_thumb_size``) go into thumbnails.db
    and image/video metadata into media_metadata.db, keyed by a file hash
    so callers can serve them without re-opening the original media.
    """

    def __init__(self):
        # Directories used only by the filesystem fallback scan; the normal
        # path reads file_inventory from the unified database instead.
        self.scan_dirs = [
            Path('/opt/immich/md'),
            Path('/opt/immich/review'),
            Path('/opt/immich/recycle')
        ]
        self.db_path = Path(__file__).parent.parent / 'database' / 'thumbnails.db'
        self.metadata_db_path = Path(__file__).parent.parent / 'database' / 'media_metadata.db'
        self.unified_db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
        self.max_thumb_size = (300, 300)
        # Recognized media extensions; always compared lowercased.
        self.image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp'}
        self.video_extensions = {'.mp4', '.mov', '.webm', '.avi', '.mkv', '.flv', '.m4v'}
        self.stats = {
            'processed': 0,
            'thumbnails_created': 0,
            'thumbnails_cached': 0,
            'metadata_cached': 0,
            'errors': 0,
            'skipped': 0
        }
        self._init_metadata_db()
        self._init_thumbnail_db()

    def _init_metadata_db(self):
        """Initialize metadata cache database (creates table/index if missing)."""
        self.metadata_db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute("""
            CREATE TABLE IF NOT EXISTS media_metadata (
                file_hash TEXT PRIMARY KEY,
                file_path TEXT NOT NULL,
                width INTEGER,
                height INTEGER,
                file_size INTEGER,
                duration REAL,
                format TEXT,
                created_at TEXT,
                file_mtime DOUBLE PRECISION
            )
        """)
        conn.execute("CREATE INDEX IF NOT EXISTS idx_meta_file_path ON media_metadata(file_path)")
        conn.commit()
        conn.close()
        logger.info(f"Metadata database initialized at {self.metadata_db_path}", module="Database")

    def _init_thumbnail_db(self):
        """Ensure the thumbnails table exists.

        Fix: _cache_thumbnail() INSERTs into `thumbnails`, but this script
        never created that table, so a fresh deployment failed on the first
        insert. Schema matches the columns used by _cache_thumbnail();
        CREATE TABLE IF NOT EXISTS is a no-op when another component has
        already created the table (possibly with extra columns).
        """
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(str(self.db_path), timeout=30.0)
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute("""
            CREATE TABLE IF NOT EXISTS thumbnails (
                file_hash TEXT PRIMARY KEY,
                file_path TEXT NOT NULL,
                thumbnail_data BLOB,
                created_at TEXT,
                file_mtime DOUBLE PRECISION
            )
        """)
        conn.execute("CREATE INDEX IF NOT EXISTS idx_thumb_file_path ON thumbnails(file_path)")
        conn.commit()
        conn.close()

    def _get_file_hash(self, file_path: Path, content_hash: str = None) -> str:
        """Generate cache key: content hash when available, else path hash.

        Args:
            file_path: Path to the file
            content_hash: Optional SHA256 content hash from database
                (preferred for recycle bin: key survives file moves)
        """
        if content_hash:
            # Use first 64 chars of content hash (full SHA256 for cache key)
            return content_hash[:64]
        # Fall back to path-based hash
        return hashlib.sha256(str(file_path).encode()).hexdigest()

    def _generate_image_thumbnail(self, file_path: Path) -> tuple:
        """Generate thumbnail and extract metadata for image.

        Returns: (thumbnail_data, width, height, format) — all None on error.
        """
        try:
            with Image.open(file_path) as img:
                # Original dimensions/format, before thumbnail() shrinks in place
                width, height = img.size
                img_format = img.format
                # JPEG output has no alpha: composite RGBA over white
                if img.mode == 'RGBA':
                    background = Image.new('RGB', img.size, (255, 255, 255))
                    background.paste(img, mask=img.split()[3])
                    img = background
                elif img.mode != 'RGB':
                    img = img.convert('RGB')
                # thumbnail() preserves aspect ratio within max_thumb_size
                img.thumbnail(self.max_thumb_size, Image.Resampling.LANCZOS)
                buffer = io.BytesIO()
                img.save(buffer, format='JPEG', quality=85, optimize=True)
                thumbnail_data = buffer.getvalue()
                return thumbnail_data, width, height, img_format
        except Exception as e:
            logger.error(f"Error generating image thumbnail for {file_path}: {e}", module="Error")
            return None, None, None, None

    def _generate_video_thumbnail(self, file_path: Path) -> tuple:
        """Generate thumbnail and extract metadata for video using ffmpeg.

        Returns: (thumbnail_data, width, height, duration); thumbnail_data is
        None if ffmpeg fails even when dimensions/duration were probed.
        """
        try:
            import subprocess
            import json
            import tempfile
            # Probe container + streams for dimensions and duration
            probe_cmd = [
                'ffprobe',
                '-v', 'quiet',
                '-print_format', 'json',
                '-show_format',
                '-show_streams',
                str(file_path)
            ]
            result = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=30)
            if result.returncode != 0:
                logger.error(f"ffprobe failed for {file_path}", module="Error")
                return None, None, None, None
            metadata = json.loads(result.stdout)
            video_stream = next((s for s in metadata.get('streams', []) if s.get('codec_type') == 'video'), None)
            if not video_stream:
                return None, None, None, None
            width = video_stream.get('width')
            height = video_stream.get('height')
            duration = float(metadata.get('format', {}).get('duration', 0))
            # Fix: the old fixed /tmp/thumb_<pid>.jpg name could collide
            # between runs and leaked the file on the ffmpeg-failure path;
            # mkstemp gives a unique name and the finally guarantees cleanup.
            fd, temp_output = tempfile.mkstemp(prefix='thumb_', suffix='.jpg')
            os.close(fd)
            thumb_file = Path(temp_output)
            try:
                # Seek to 1s for normal videos, 0s for very short clips
                seek_time = '00:00:01' if duration > 1.5 else '00:00:00'
                thumb_cmd = [
                    'ffmpeg',
                    '-ss', seek_time,
                    '-i', str(file_path),
                    '-vframes', '1',
                    '-vf', f'scale={self.max_thumb_size[0]}:{self.max_thumb_size[1]}:force_original_aspect_ratio=decrease',
                    '-y',
                    temp_output
                ]
                result = subprocess.run(thumb_cmd, capture_output=True, timeout=30)
                # mkstemp pre-creates the file, so existence is meaningless;
                # an empty file means ffmpeg produced no frame.
                if result.returncode != 0 or thumb_file.stat().st_size == 0:
                    logger.error(f"ffmpeg thumbnail generation failed for {file_path}", module="Error")
                    return None, width, height, duration
                thumbnail_data = thumb_file.read_bytes()
            finally:
                thumb_file.unlink(missing_ok=True)
            return thumbnail_data, width, height, duration
        except Exception as e:
            logger.error(f"Error generating video thumbnail for {file_path}: {e}", module="Error")
            return None, None, None, None

    def _cache_thumbnail(self, file_path: Path, thumbnail_data: bytes, content_hash: str = None):
        """Store thumbnail in cache database.

        Args:
            file_path: Path to the file
            thumbnail_data: JPEG thumbnail data
            content_hash: Optional SHA256 content hash from database

        Returns: True on success, False on error (logged, not raised).
        """
        try:
            file_hash = self._get_file_hash(file_path, content_hash)
            file_mtime = file_path.stat().st_mtime
            conn = sqlite3.connect(str(self.db_path), timeout=30.0)
            conn.execute('PRAGMA journal_mode=WAL')
            conn.execute("""
                INSERT OR REPLACE INTO thumbnails
                (file_hash, file_path, thumbnail_data, created_at, file_mtime)
                VALUES (?, ?, ?, ?, ?)
            """, (file_hash, str(file_path), thumbnail_data, datetime.now().isoformat(), file_mtime))
            conn.commit()
            conn.close()
            return True
        except Exception as e:
            logger.error(f"Error caching thumbnail for {file_path}: {e}", module="Error")
            return False

    def _cache_metadata(self, file_path: Path, width: int, height: int, duration: float = None, format_type: str = None, content_hash: str = None):
        """Store metadata in cache database.

        Args:
            file_path: Path to the file
            width: Image/video width
            height: Image/video height
            duration: Video duration (seconds)
            format_type: Media format
            content_hash: Optional SHA256 content hash from database

        Returns: True on success, False on error (logged, not raised).
        """
        try:
            file_hash = self._get_file_hash(file_path, content_hash)
            file_mtime = file_path.stat().st_mtime
            file_size = file_path.stat().st_size
            conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
            conn.execute('PRAGMA journal_mode=WAL')
            conn.execute("""
                INSERT OR REPLACE INTO media_metadata
                (file_hash, file_path, width, height, file_size, duration, format, created_at, file_mtime)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (file_hash, str(file_path), width, height, file_size, duration, format_type,
                  datetime.now().isoformat(), file_mtime))
            conn.commit()
            conn.close()
            return True
        except Exception as e:
            logger.error(f"Error caching metadata for {file_path}: {e}", module="Error")
            return False

    def _is_cached_valid(self, file_path: Path, content_hash: str = None) -> bool:
        """Check if file already has valid cached thumbnail AND metadata.

        A cache entry is valid when its stored mtime is within 1 second of
        the file's current mtime (tolerates filesystem timestamp rounding).

        Args:
            file_path: Path to the file
            content_hash: Optional SHA256 content hash from database
        """
        try:
            file_hash = self._get_file_hash(file_path, content_hash)
            file_mtime = file_path.stat().st_mtime
            # Check thumbnail cache
            conn = sqlite3.connect(str(self.db_path), timeout=30.0)
            conn.execute('PRAGMA journal_mode=WAL')
            cursor = conn.execute(
                "SELECT file_mtime FROM thumbnails WHERE file_hash = ?",
                (file_hash,)
            )
            thumb_result = cursor.fetchone()
            conn.close()
            if not thumb_result or abs(thumb_result[0] - file_mtime) > 1:
                return False
            # Check metadata cache
            conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
            conn.execute('PRAGMA journal_mode=WAL')
            cursor = conn.execute(
                "SELECT file_mtime FROM media_metadata WHERE file_hash = ?",
                (file_hash,)
            )
            meta_result = cursor.fetchone()
            conn.close()
            if not meta_result or abs(meta_result[0] - file_mtime) > 1:
                return False
            return True
        except Exception as e:
            logger.error(f"Error checking cache for {file_path}: {e}", module="Error")
            return False

    def process_file(self, file_path: Path, content_hash: str = None) -> bool:
        """Process a single file: generate thumbnail and cache metadata.

        Args:
            file_path: Path to the file
            content_hash: Optional SHA256 content hash (preferred cache key)

        Returns: True on success or skip, False on error. Updates self.stats.
        """
        try:
            # Vanished or already-current files count as skipped, not errors
            if not file_path.exists():
                self.stats['skipped'] += 1
                return True
            if self._is_cached_valid(file_path, content_hash):
                self.stats['skipped'] += 1
                return True
            file_ext = file_path.suffix.lower()
            if file_ext in self.image_extensions:
                thumbnail_data, width, height, format_type = self._generate_image_thumbnail(file_path)
                if thumbnail_data and width and height:
                    if self._cache_thumbnail(file_path, thumbnail_data, content_hash):
                        self.stats['thumbnails_created'] += 1
                    if self._cache_metadata(file_path, width, height, format_type=format_type, content_hash=content_hash):
                        self.stats['metadata_cached'] += 1
                    return True
                else:
                    self.stats['errors'] += 1
                    return False
            elif file_ext in self.video_extensions:
                thumbnail_data, width, height, duration = self._generate_video_thumbnail(file_path)
                if thumbnail_data:
                    if self._cache_thumbnail(file_path, thumbnail_data, content_hash):
                        self.stats['thumbnails_created'] += 1
                if width and height:
                    if self._cache_metadata(file_path, width, height, duration=duration, format_type='video', content_hash=content_hash):
                        self.stats['metadata_cached'] += 1
                # Successful if we at least got dimensions, even when the
                # thumbnail frame extraction failed.
                if width and height:
                    return True
                else:
                    self.stats['errors'] += 1
                    return False
            # Non-media extension: nothing to do (callers pre-filter anyway)
            return True
        except Exception as e:
            logger.error(f"Error processing file {file_path}: {e}", module="Error")
            self.stats['errors'] += 1
            return False

    def _get_files_from_inventory(self) -> list:
        """Query file_inventory table for all media files (database-first).

        Returns: List of tuples (file_path, content_hash or None). Falls back
        to a filesystem scan if the database query fails.
        """
        try:
            conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            # All files regardless of location (final, review, recycle);
            # recycled files carry a content hash so their cache survives moves.
            cursor.execute("""
                SELECT
                    fi.file_path,
                    fi.content_type,
                    fi.location,
                    rb.file_hash as content_hash
                FROM file_inventory fi
                LEFT JOIN recycle_bin rb ON fi.file_path = rb.recycle_path
                ORDER BY fi.created_date DESC
            """)
            rows = cursor.fetchall()
            conn.close()
            # Set membership: O(1) per file instead of a list scan
            media_extensions = self.image_extensions | self.video_extensions
            files = []
            for row in rows:
                file_path = Path(row['file_path'])
                if file_path.suffix.lower() in media_extensions and file_path.exists():
                    content_hash = row['content_hash'] if row['content_hash'] else None
                    files.append((file_path, content_hash))
            return files
        except Exception as e:
            logger.error(f"Error querying file_inventory: {e}", module="Error")
            logger.warning("Falling back to filesystem scan...", module="Warning")
            return self._fallback_filesystem_scan()

    def _fallback_filesystem_scan(self) -> list:
        """Fallback: Scan filesystem if database query fails.

        Returns: List of tuples (file_path, None) — no content_hash available.

        Fix: the old per-extension rglob matched literal lowercase patterns
        only, silently skipping files like IMG_0001.JPG even though the
        inventory path compares suffixes lowercased. One rglob('*') pass with
        a lowercased suffix check finds them (and walks each tree once).
        """
        media_extensions = self.image_extensions | self.video_extensions
        all_files = []
        for scan_dir in self.scan_dirs:
            if not scan_dir.exists():
                continue
            for candidate in scan_dir.rglob('*'):
                if candidate.is_file() and candidate.suffix.lower() in media_extensions:
                    all_files.append((candidate, None))
        return all_files

    def scan_and_process(self):
        """Query file_inventory and process all files (database-first)."""
        logger.info("Starting thumbnail and metadata cache build...", module="Core")
        logger.info("Querying file_inventory table (database-first architecture)...", module="Core")
        start_time = time.time()
        # Returns list of tuples: (file_path, content_hash or None)
        all_files = self._get_files_from_inventory()
        total_files = len(all_files)
        logger.info(f"Found {total_files} media files to process from file_inventory", module="Core")
        # Count how many have content hashes (from recycle bin)
        files_with_hash = sum(1 for _, content_hash in all_files if content_hash)
        if files_with_hash > 0:
            logger.info(f" - {files_with_hash} files have content hash (from recycle bin - cache survives moves)", module="Core")
        # Process files with progress updates every 100 files
        for i, (file_path, content_hash) in enumerate(all_files, 1):
            self.process_file(file_path, content_hash)
            self.stats['processed'] += 1
            if i % 100 == 0 or i == total_files:
                elapsed = time.time() - start_time
                rate = i / elapsed if elapsed > 0 else 0
                eta = (total_files - i) / rate if rate > 0 else 0
                logger.info(f"Progress: {i}/{total_files} ({i/total_files*100:.1f}%) - "
                            f"Rate: {rate:.1f} files/sec - ETA: {eta/60:.1f} min", module="Core")
        # Final statistics (guard elapsed: a zero-file run can finish in ~0s)
        elapsed = time.time() - start_time
        avg_rate = self.stats['processed'] / elapsed if elapsed > 0 else 0.0
        logger.info("=" * 60, module="Core")
        logger.info("Thumbnail and Metadata Cache Build Complete", module="Core")
        logger.info("=" * 60, module="Core")
        logger.info(f"Total files processed: {self.stats['processed']}", module="Core")
        logger.info(f"Thumbnails created: {self.stats['thumbnails_created']}", module="Core")
        logger.info(f"Metadata cached: {self.stats['metadata_cached']}", module="Core")
        logger.info(f"Files skipped (already cached): {self.stats['skipped']}", module="Core")
        logger.info(f"Errors: {self.stats['errors']}", module="Core")
        logger.info(f"Total time: {elapsed/60:.1f} minutes", module="Core")
        logger.info(f"Average rate: {avg_rate:.1f} files/sec", module="Core")
        logger.info("=" * 60, module="Core")

    def cleanup_orphaned_records(self):
        """Clean up orphaned database records for files that no longer exist.

        Removes rows from face_recognition_scans, downloads, media_metadata,
        and thumbnails whose file_path is no longer present in file_inventory.
        Cache cleanups are best-effort: failures are logged, never fatal.

        Returns: dict of per-table removal counts.
        """
        logger.info("Starting database cleanup for orphaned records...", module="Cleanup")
        cleanup_stats = {
            'face_recognition_scans': 0,
            'downloads': 0,
            'media_metadata': 0,
            'thumbnail_cache': 0
        }
        conn = None
        meta_conn = None
        thumb_conn = None
        main_conn = None
        try:
            # Clean up face_recognition_scans for files not in file_inventory
            conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
            cursor = conn.cursor()
            cursor.execute("""
                SELECT COUNT(*) FROM face_recognition_scans frs
                WHERE NOT EXISTS (
                    SELECT 1 FROM file_inventory fi WHERE fi.file_path = frs.file_path
                )
            """)
            orphaned_count = cursor.fetchone()[0]
            if orphaned_count > 0:
                cursor.execute("""
                    DELETE FROM face_recognition_scans
                    WHERE NOT EXISTS (
                        SELECT 1 FROM file_inventory fi WHERE fi.file_path = face_recognition_scans.file_path
                    )
                """)
                conn.commit()
                cleanup_stats['face_recognition_scans'] = orphaned_count
                logger.info(f"Removed {orphaned_count} orphaned face_recognition_scans records", module="Cleanup")
            # Clean up downloads for files not in file_inventory
            cursor.execute("""
                SELECT COUNT(*) FROM downloads d
                WHERE d.file_path IS NOT NULL AND d.file_path != ''
                AND NOT EXISTS (
                    SELECT 1 FROM file_inventory fi WHERE fi.file_path = d.file_path
                )
            """)
            orphaned_downloads = cursor.fetchone()[0]
            if orphaned_downloads > 0:
                cursor.execute("""
                    DELETE FROM downloads
                    WHERE file_path IS NOT NULL AND file_path != ''
                    AND NOT EXISTS (
                        SELECT 1 FROM file_inventory fi WHERE fi.file_path = downloads.file_path
                    )
                """)
                conn.commit()
                cleanup_stats['downloads'] = orphaned_downloads
                logger.info(f"Removed {orphaned_downloads} orphaned downloads records", module="Cleanup")
            conn.close()
            # Clean up media_metadata cache for files not in file_inventory
            # (cross-database, so compare path sets in Python)
            try:
                meta_conn = sqlite3.connect(str(self.metadata_db_path), timeout=30.0)
                main_conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
                main_cursor = main_conn.cursor()
                main_cursor.execute("SELECT file_path FROM file_inventory")
                valid_paths = set(row[0] for row in main_cursor.fetchall())
                main_conn.close()
                meta_cursor = meta_conn.cursor()
                meta_cursor.execute("SELECT file_path FROM media_metadata")
                all_meta_paths = [row[0] for row in meta_cursor.fetchall()]
                orphaned_meta = [p for p in all_meta_paths if p not in valid_paths]
                if orphaned_meta:
                    placeholders = ','.join(['?' for _ in orphaned_meta])
                    meta_cursor.execute(f"DELETE FROM media_metadata WHERE file_path IN ({placeholders})", orphaned_meta)
                    meta_conn.commit()
                    cleanup_stats['media_metadata'] = len(orphaned_meta)
                    logger.info(f"Removed {len(orphaned_meta)} orphaned media_metadata records", module="Cleanup")
                meta_conn.close()
            except Exception as e:
                # Non-critical, but don't hide it completely (was a bare pass)
                logger.warning(f"media_metadata cleanup skipped: {e}", module="Cleanup")
            # Clean up thumbnail cache for files not in file_inventory
            # (use self.db_path instead of rebuilding the same path)
            try:
                thumb_conn = sqlite3.connect(str(self.db_path), timeout=30.0)
                main_conn = sqlite3.connect(str(self.unified_db_path), timeout=30.0)
                main_cursor = main_conn.cursor()
                main_cursor.execute("SELECT file_path FROM file_inventory")
                valid_paths = set(row[0] for row in main_cursor.fetchall())
                main_conn.close()
                thumb_cursor = thumb_conn.cursor()
                # Thumbnails are keyed by file_hash; match orphans via the
                # stored file_path column when it exists.
                try:
                    thumb_cursor.execute("SELECT file_path FROM thumbnails WHERE file_path IS NOT NULL")
                    all_thumb_paths = [row[0] for row in thumb_cursor.fetchall()]
                    orphaned_thumbs = [p for p in all_thumb_paths if p and p not in valid_paths]
                    if orphaned_thumbs:
                        placeholders = ','.join(['?' for _ in orphaned_thumbs])
                        thumb_cursor.execute(f"DELETE FROM thumbnails WHERE file_path IN ({placeholders})", orphaned_thumbs)
                        thumb_conn.commit()
                        cleanup_stats['thumbnail_cache'] = len(orphaned_thumbs)
                        logger.info(f"Removed {len(orphaned_thumbs)} orphaned thumbnail records", module="Cleanup")
                except sqlite3.OperationalError:
                    # Table structure may not have file_path column
                    pass
                thumb_conn.close()
            except Exception as e:
                # Non-critical, but don't hide it completely (was a bare pass)
                logger.warning(f"thumbnail cleanup skipped: {e}", module="Cleanup")
            # Log summary
            total_cleaned = sum(cleanup_stats.values())
            logger.info("=" * 60, module="Cleanup")
            logger.info("Database Cleanup Complete", module="Cleanup")
            logger.info("=" * 60, module="Cleanup")
            logger.info(f"Total orphaned records removed: {total_cleaned}", module="Cleanup")
            for table, count in cleanup_stats.items():
                if count > 0:
                    logger.info(f" - {table}: {count}", module="Cleanup")
            logger.info("=" * 60, module="Cleanup")
            return cleanup_stats
        except Exception as e:
            logger.error(f"Error during database cleanup: {e}", exc_info=True, module="Error")
            return cleanup_stats
        finally:
            # Ensure all database connections are closed (double-close is a
            # harmless no-op for connections closed on the happy path)
            for connection in [conn, meta_conn, thumb_conn, main_conn]:
                if connection:
                    try:
                        connection.close()
                    except Exception:
                        pass  # Best effort cleanup
def main():
    """Entry point: clean orphaned DB records, then build caches.

    Returns a process exit code: 0 on success, 1 on any fatal error.
    """
    logger.info("Thumbnail Cache Builder starting...", module="Core")
    try:
        builder = ThumbnailCacheBuilder()
        # Phase 1 removes stale rows so phase 2 doesn't process dead paths
        logger.info("Phase 1: Database cleanup for orphaned records", module="Core")
        builder.cleanup_orphaned_records()
        # Phase 2 generates thumbnails and metadata for everything current
        logger.info("Phase 2: Thumbnail and metadata cache building", module="Core")
        builder.scan_and_process()
    except Exception as e:
        logger.error(f"Fatal error in Thumbnail Cache Builder: {e}", exc_info=True, module="Error")
        return 1
    logger.info("Thumbnail Cache Builder completed successfully", module="Core")
    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == '__main__':
    sys.exit(main())