Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

582
web/backend/core/utils.py Normal file
View File

@@ -0,0 +1,582 @@
"""
Shared Utility Functions
Common helper functions used across multiple routers.
"""
import io
import sqlite3
import hashlib
import subprocess
from collections import OrderedDict
from contextlib import closing
from pathlib import Path
from threading import Lock
from typing import Dict, List, Optional, Tuple, Union
from fastapi import HTTPException
from PIL import Image
from .config import settings
# ============================================================================
# THUMBNAIL LRU CACHE
# ============================================================================
class ThumbnailLRUCache:
    """Thread-safe LRU cache holding raw thumbnail bytes in memory.

    Keeps hot thumbnails out of SQLite; bounded both by entry count and by
    total cached bytes. Shared by the media.py and recycle.py routers.
    """

    def __init__(self, max_size: int = 500, max_memory_mb: int = 100):
        self._store: OrderedDict[str, bytes] = OrderedDict()
        self._guard = Lock()
        self._capacity = max_size
        self._byte_limit = max_memory_mb * 1024 * 1024  # MB -> bytes
        self._bytes_used = 0

    def get(self, key: str) -> Optional[bytes]:
        """Return cached bytes for *key* (marking it most-recently-used), else None."""
        with self._guard:
            data = self._store.get(key)
            if data is None:
                return None
            self._store.move_to_end(key)
            return data

    def put(self, key: str, data: bytes) -> None:
        """Insert *key* -> *data*, evicting LRU entries to stay within bounds."""
        size = len(data)
        # Oversized single items (>1MB) would dominate the budget: never cache them.
        if size > 1024 * 1024:
            return
        with self._guard:
            # Replace semantics: drop any previous value and its byte count first.
            previous = self._store.pop(key, None)
            if previous is not None:
                self._bytes_used -= len(previous)
            # Evict from the LRU end until both count and byte budgets fit.
            while self._store and (
                len(self._store) >= self._capacity
                or self._bytes_used + size > self._byte_limit
            ):
                _, evicted = self._store.popitem(last=False)
                self._bytes_used -= len(evicted)
            self._store[key] = data
            self._bytes_used += size

    def clear(self) -> None:
        """Drop every cached entry and reset the byte counter."""
        with self._guard:
            self._store.clear()
            self._bytes_used = 0
# ============================================================================
# SQL FILTER CONSTANTS
# ============================================================================
# Valid media file filters (excluding phrase checks, must have valid extension)
# Used by downloads, health, and analytics endpoints
MEDIA_FILTERS = """
(filename NOT LIKE '%_phrase_checked_%' OR filename IS NULL)
AND (file_path IS NOT NULL AND file_path != '' OR platform = 'forums')
AND (LENGTH(filename) > 20 OR filename LIKE '%_%_%')
AND (
filename LIKE '%.jpg' OR filename LIKE '%.jpeg' OR
filename LIKE '%.png' OR filename LIKE '%.gif' OR
filename LIKE '%.heic' OR filename LIKE '%.heif' OR
filename LIKE '%.mp4' OR filename LIKE '%.mov' OR
filename LIKE '%.webm' OR filename LIKE '%.m4a' OR
filename LIKE '%.mp3' OR filename LIKE '%.avi' OR
filename LIKE '%.mkv' OR filename LIKE '%.flv'
)
"""
# ============================================================================
# PATH VALIDATION
# ============================================================================
# Allowed base paths for file operations
# Allow-list of base directories for file operations; validate_file_path()
# resolves each entry, so both Path objects and settings-provided paths work.
ALLOWED_PATHS: List[Path] = [
    settings.MEDIA_BASE_PATH,
    settings.REVIEW_PATH,
    settings.RECYCLE_PATH,
    Path('/opt/media-downloader/temp/manual_import'),  # staging area for manual imports
    Path('/opt/immich/paid'),
    Path('/opt/immich/el'),
    Path('/opt/immich/elv'),
]
def validate_file_path(
    file_path: str,
    allowed_bases: Optional[List[Path]] = None,
    require_exists: bool = False
) -> Path:
    """
    Resolve *file_path* and ensure it lives under one of the allowed base
    directories, guarding against path-traversal attacks.

    Args:
        file_path: Candidate path string to validate
        allowed_bases: Permitted base directories (defaults to ALLOWED_PATHS)
        require_exists: When True, additionally require the file to exist

    Returns:
        The fully resolved Path object

    Raises:
        HTTPException: 403 if outside every allowed base, 404 if missing
            (with require_exists), 400 if the path cannot be resolved
    """
    bases = ALLOWED_PATHS if allowed_bases is None else allowed_bases
    try:
        resolved = Path(file_path).resolve()

        def _under(base: Path) -> bool:
            # relative_to raises ValueError when resolved is outside base.
            try:
                resolved.relative_to(base.resolve())
                return True
            except ValueError:
                return False

        if not any(_under(base) for base in bases):
            raise HTTPException(status_code=403, detail="Access denied")
        if require_exists and not resolved.exists():
            raise HTTPException(status_code=404, detail="File not found")
    except HTTPException:
        # Re-raise our own deliberate errors untouched.
        raise
    except Exception:
        # Anything else (bad path syntax, OS errors during resolve) → 400.
        raise HTTPException(status_code=400, detail="Invalid file path")
    return resolved
# ============================================================================
# QUERY FILTER BUILDER
# ============================================================================
def build_media_filter_query(
    platform: Optional[str] = None,
    source: Optional[str] = None,
    media_type: Optional[str] = None,
    location: Optional[str] = None,
    date_from: Optional[str] = None,
    date_to: Optional[str] = None,
    table_alias: str = "fi"
) -> Tuple[str, List]:
    """
    Build a parameterized SQL filter clause for common media queries.

    Centralizes the filter-building logic previously duplicated across the
    media.py, downloads.py, and review.py routers.

    Args:
        platform: Filter by platform (e.g., 'instagram', 'tiktok')
        source: Filter by source (e.g., 'stories', 'posts')
        media_type: Filter by media type ('image', 'video', 'all')
        location: Filter by location ('final', 'review', 'recycle')
        date_from: Start date filter (ISO format, inclusive)
        date_to: End date filter (ISO format, inclusive)
        table_alias: SQL table alias (default 'fi' for file_inventory)

    Returns:
        Tuple of (SQL WHERE-clause conditions, list of bind parameters);
        ("1=1", []) when no filters are active.

    Example:
        conditions, params = build_media_filter_query(platform="instagram", media_type="video")
        query = f"SELECT * FROM file_inventory fi WHERE {conditions}"
        cursor.execute(query, params)
    """
    # The alias is interpolated into SQL text, so reject anything that is
    # not a plain identifier and fall back to the default.
    alias = table_alias if table_alias.isidentifier() else "fi"
    clauses: List[str] = []
    args: List = []

    # Simple equality filters; 'all' means "no media_type restriction".
    equality_filters = (
        (platform, f"{alias}.platform = ?"),
        (source, f"{alias}.source = ?"),
        (media_type if media_type != 'all' else None, f"{alias}.media_type = ?"),
        (location, f"{alias}.location = ?"),
    )
    for value, clause in equality_filters:
        if value:
            clauses.append(clause)
            args.append(value)

    # Date bounds: COALESCE prefers the latest downloads.post_date for the
    # file, falling back to file_inventory.created_date.
    for operator, bound in ((">=", date_from), ("<=", date_to)):
        if bound:
            clauses.append(f"""
            DATE(COALESCE(
                (SELECT MAX(d.post_date) FROM downloads d WHERE d.file_path = {alias}.file_path),
                {alias}.created_date
            )) {operator} ?
            """)
            args.append(bound)

    return (" AND ".join(clauses) if clauses else "1=1", args)
def build_platform_list_filter(
    platforms: Optional[List[str]] = None,
    table_alias: str = "fi"
) -> Tuple[str, List[str]]:
    """
    Build a parameterized SQL IN clause for filtering by multiple platforms.

    Args:
        platforms: List of platform names to filter by
        table_alias: SQL table alias (must be a plain identifier; falls
            back to 'fi' otherwise)

    Returns:
        Tuple of (SQL condition string, list of bind parameters);
        ("1=1", []) when no platforms are given.
    """
    if not platforms:
        return "1=1", []
    # Consistency/safety fix: validate the alias before interpolating it into
    # SQL text, exactly as build_media_filter_query already does — an
    # arbitrary alias string would otherwise be injected verbatim.
    if not table_alias.isidentifier():
        table_alias = "fi"
    placeholders = ",".join(["?"] * len(platforms))
    return f"{table_alias}.platform IN ({placeholders})", platforms
# ============================================================================
# THUMBNAIL GENERATION
# ============================================================================
def generate_image_thumbnail(file_path: Path, max_size: Tuple[int, int] = (300, 300)) -> Optional[bytes]:
    """
    Generate a JPEG thumbnail for an image file.

    Args:
        file_path: Path to image file
        max_size: Maximum thumbnail dimensions (aspect ratio preserved)

    Returns:
        JPEG bytes or None if generation fails
    """
    try:
        # Fix: use Image.open as a context manager so the underlying file
        # handle is released promptly (the original left it open until GC).
        with Image.open(file_path) as src:
            src.thumbnail(max_size, Image.Resampling.LANCZOS)
            img = src
            # JPEG has no alpha channel: flatten transparent modes onto white.
            if img.mode in ('RGBA', 'LA', 'P'):
                background = Image.new('RGB', img.size, (255, 255, 255))
                if img.mode == 'P':
                    img = img.convert('RGBA')
                background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
                img = background
            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
    except Exception:
        # Best-effort: callers treat None as "no thumbnail available".
        return None
def generate_video_thumbnail(file_path: Path, max_size: Tuple[int, int] = (300, 300)) -> Optional[bytes]:
    """
    Extract one frame from a video with ffmpeg and return it as a JPEG
    thumbnail.

    Tries a frame at the 1-second mark first; if that yields nothing
    (e.g. the clip is shorter than a second), retries from the first frame.

    Args:
        file_path: Path to video file
        max_size: Maximum thumbnail dimensions (aspect ratio preserved)

    Returns:
        JPEG bytes or None if generation fails
    """
    for seek in ('00:00:01.000', '00:00:00.000'):
        cmd = [
            'ffmpeg',
            '-ss', seek,
            '-i', str(file_path),
            '-vframes', '1',
            '-f', 'image2pipe',
            '-vcodec', 'mjpeg',
            '-',
        ]
        try:
            proc = subprocess.run(cmd, capture_output=True, timeout=30)
            if proc.returncode != 0 or not proc.stdout:
                continue
            # Shrink the raw MJPEG frame down to thumbnail size.
            frame = Image.open(io.BytesIO(proc.stdout))
            frame.thumbnail(max_size, Image.Resampling.LANCZOS)
            out = io.BytesIO()
            frame.save(out, format='JPEG', quality=85)
            return out.getvalue()
        except Exception:
            # Includes subprocess timeouts and undecodable output; try the
            # next seek point (or give up after the last one).
            continue
    return None
def get_or_create_thumbnail(
    file_path: Union[str, Path],
    media_type: str,
    content_hash: Optional[str] = None,
    max_size: Tuple[int, int] = (300, 300)
) -> Optional[bytes]:
    """
    Get thumbnail from cache or generate and cache it.

    Uses the thumbnails.db schema: file_hash (PK), file_path, thumbnail_data,
    created_at, file_mtime.

    Lookup strategy:
        1. Try content_hash against file_hash column (survives file moves)
        2. Fall back to file_path lookup (legacy thumbnails)
        3. Generate and cache if not found

    Args:
        file_path: Path to media file
        media_type: 'image' or 'video'
        content_hash: Optional pre-computed hash (computed from path if not provided)
        max_size: Maximum thumbnail dimensions

    Returns:
        JPEG bytes or None if generation fails
    """
    file_path = Path(file_path)
    thumb_db_path = settings.PROJECT_ROOT / 'database' / 'thumbnails.db'
    # Compute hash if not provided.
    # NOTE(review): the fallback hashes the PATH string, not the file's
    # contents — so only caller-supplied content_hash values actually
    # survive file moves, despite the docstring's point 1.
    file_hash = content_hash if content_hash else hashlib.sha256(str(file_path).encode()).hexdigest()
    # Try to get from cache (skip mtime check — downloaded media files don't change)
    try:
        with sqlite3.connect(str(thumb_db_path), timeout=30.0) as conn:
            cursor = conn.cursor()
            # 1. Try file_hash lookup (primary key)
            cursor.execute(
                "SELECT thumbnail_data FROM thumbnails WHERE file_hash = ?",
                (file_hash,)
            )
            result = cursor.fetchone()
            if result and result[0]:
                return result[0]
            # 2. Fall back to file_path lookup (legacy thumbnails)
            cursor.execute(
                "SELECT thumbnail_data FROM thumbnails WHERE file_path = ?",
                (str(file_path),)
            )
            result = cursor.fetchone()
            if result and result[0]:
                return result[0]
    except Exception:
        # Cache lookup is best-effort; any DB error falls through to
        # regeneration below.
        pass
    # Only proceed with generation if the file actually exists
    if not file_path.exists():
        return None
    # Get mtime only when we need to generate and cache a new thumbnail
    try:
        file_mtime = file_path.stat().st_mtime
    except OSError:
        # Race: file disappeared/unstatable between exists() and stat().
        file_mtime = 0
    # Generate thumbnail
    thumbnail_data: Optional[bytes] = None
    if media_type == 'video':
        thumbnail_data = generate_video_thumbnail(file_path, max_size)
    else:
        # Anything that is not explicitly 'video' is treated as an image.
        thumbnail_data = generate_image_thumbnail(file_path, max_size)
    # Cache the thumbnail
    if thumbnail_data:
        try:
            from .responses import now_iso8601
            with sqlite3.connect(str(thumb_db_path), timeout=30.0) as conn:
                conn.execute("""
                    INSERT OR REPLACE INTO thumbnails
                    (file_hash, file_path, thumbnail_data, created_at, file_mtime)
                    VALUES (?, ?, ?, ?, ?)
                """, (file_hash, str(file_path), thumbnail_data, now_iso8601(), file_mtime))
                conn.commit()
        except Exception:
            # Caching failure must not block serving the freshly generated bytes.
            pass
    return thumbnail_data
def get_media_dimensions(
    file_path: str,
    width: Optional[int] = None,
    height: Optional[int] = None
) -> Tuple[Optional[int], Optional[int]]:
    """
    Get media dimensions, falling back to the metadata cache if not provided.

    Fix: the width/height parameters were annotated ``int = None``; they are
    genuinely optional, so they are now ``Optional[int]`` (no runtime change).

    Args:
        file_path: Path to media file (hashed to look up the metadata cache)
        width: Width from file_inventory (may be None)
        height: Height from file_inventory (may be None)

    Returns:
        Tuple of (width, height); falls back to the passed-in values
        (possibly None) when no cache entry is found or the lookup fails.
    """
    # Fast path: caller already has both dimensions.
    if width is not None and height is not None:
        return (width, height)
    try:
        metadata_db_path = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'
        # Cache rows are keyed by sha256 of the path string.
        file_hash = hashlib.sha256(file_path.encode()).hexdigest()
        with closing(sqlite3.connect(str(metadata_db_path))) as conn:
            cursor = conn.execute(
                "SELECT width, height FROM media_metadata WHERE file_hash = ?",
                (file_hash,)
            )
            result = cursor.fetchone()
            if result:
                return (result[0], result[1])
    except Exception:
        # Lookup is best-effort: missing DB/table degrades to the fallback.
        pass
    return (width, height)
def get_media_dimensions_batch(file_paths: List[str]) -> Dict[str, Tuple[int, int]]:
    """
    Fetch media dimensions for many files in one query (batch lookup).

    Avoids the N+1 query problem of calling get_media_dimensions per file.

    Args:
        file_paths: List of file paths to look up

    Returns:
        Dict mapping file_path -> (width, height); paths without a cache
        entry (or any lookup failure) are simply absent from the result.
    """
    if not file_paths:
        return {}
    dimensions: Dict[str, Tuple[int, int]] = {}
    try:
        db_file = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'
        # Cache rows are keyed by sha256 of the path string; remember which
        # digest belongs to which original path.
        path_by_hash = {
            hashlib.sha256(fp.encode()).hexdigest(): fp for fp in file_paths
        }
        with closing(sqlite3.connect(str(db_file))) as conn:
            marks = ','.join('?' * len(path_by_hash))
            rows = conn.execute(
                f"SELECT file_hash, width, height FROM media_metadata WHERE file_hash IN ({marks})",
                list(path_by_hash.keys())
            ).fetchall()
        for digest, width, height in rows:
            if digest in path_by_hash:
                dimensions[path_by_hash[digest]] = (width, height)
    except Exception:
        # Best-effort: any failure yields whatever was gathered so far.
        pass
    return dimensions
# ============================================================================
# DATABASE UTILITIES
# ============================================================================
def update_file_path_in_all_tables(db, old_path: str, new_path: str):
    """
    Propagate a file rename/move to every table that stores the path.

    Keeps downloads, perceptual-hash, face-scan, and (when present)
    semantic-embedding rows pointing at the file's new location, e.g.
    after moving a file from review to final storage.

    Args:
        db: UnifiedDatabase instance
        old_path: The old file path to replace
        new_path: The new file path to use
    """
    from modules.universal_logger import get_logger
    logger = get_logger('API')
    params = (new_path, old_path)
    try:
        with db.get_connection(for_write=True) as conn:
            cur = conn.cursor()
            cur.execute('UPDATE downloads SET file_path = ? WHERE file_path = ?', params)
            cur.execute('UPDATE instagram_perceptual_hashes SET file_path = ? WHERE file_path = ?', params)
            cur.execute('UPDATE face_recognition_scans SET file_path = ? WHERE file_path = ?', params)
            try:
                cur.execute('UPDATE semantic_embeddings SET file_path = ? WHERE file_path = ?', params)
            except sqlite3.OperationalError:
                pass  # Table may not exist
            conn.commit()
    except Exception as e:
        # Path bookkeeping is non-critical; log and carry on.
        logger.warning(f"Failed to update file paths in tables: {e}", module="Database")
# ============================================================================
# FACE RECOGNITION UTILITIES
# ============================================================================
# Cached FaceRecognitionModule singletons, keyed by id(db), so the heavy
# InsightFace models are not reloaded on every request.
# NOTE(review): entries live for the process lifetime — fine for a handful
# of long-lived DB objects, worth confirming if databases are ever transient.
_face_module_cache: Dict[int, 'FaceRecognitionModule'] = {}


def get_face_module(db, module_name: str = "FaceAPI"):
    """
    Return the FaceRecognitionModule bound to *db*, creating it on first use.

    Uses a module-level cache (singleton per database object) to avoid
    reloading heavy InsightFace models on each request.

    Args:
        db: UnifiedDatabase instance
        module_name: Name to use in log messages

    Returns:
        FaceRecognitionModule instance
    """
    from modules.face_recognition_module import FaceRecognitionModule
    from modules.universal_logger import get_logger
    logger = get_logger('API')
    cache_key = id(db)
    module = _face_module_cache.get(cache_key)
    if module is None:
        logger.info("Creating cached FaceRecognitionModule instance", module=module_name)
        module = FaceRecognitionModule(unified_db=db)
        _face_module_cache[cache_key] = module
    return module