582
web/backend/core/utils.py
Normal file
582
web/backend/core/utils.py
Normal file
@@ -0,0 +1,582 @@
|
||||
"""
|
||||
Shared Utility Functions
|
||||
|
||||
Common helper functions used across multiple routers.
|
||||
"""
|
||||
|
||||
import io
|
||||
import sqlite3
|
||||
import hashlib
|
||||
import subprocess
|
||||
from collections import OrderedDict
|
||||
from contextlib import closing
|
||||
from pathlib import Path
|
||||
from threading import Lock
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
from fastapi import HTTPException
|
||||
from PIL import Image
|
||||
|
||||
from .config import settings
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# THUMBNAIL LRU CACHE
|
||||
# ============================================================================
|
||||
|
||||
class ThumbnailLRUCache:
    """Thread-safe LRU cache for thumbnail binary data.

    Avoids SQLite lookups for frequently accessed thumbnails.
    Used by media.py and recycle.py routers.
    """

    def __init__(self, max_size: int = 500, max_memory_mb: int = 100):
        self._cache: OrderedDict[str, bytes] = OrderedDict()
        self._lock = Lock()
        self._max_size = max_size
        self._max_memory = max_memory_mb * 1024 * 1024  # limit in bytes
        self._current_memory = 0

    def get(self, key: str) -> Optional[bytes]:
        """Return cached bytes for *key* (promoting it to most-recently-used), or None."""
        with self._lock:
            if key not in self._cache:
                return None
            # Promote to the MRU end of the ordered dict.
            self._cache.move_to_end(key)
            return self._cache[key]

    def put(self, key: str, data: bytes) -> None:
        """Store *data* under *key*, evicting LRU entries to respect both limits."""
        size = len(data)
        # Oversized payloads (>1MB) are never cached.
        if size > 1024 * 1024:
            return
        with self._lock:
            # Replace semantics: drop any existing entry and its accounting first.
            previous = self._cache.pop(key, None)
            if previous is not None:
                self._current_memory -= len(previous)

            # Evict from the LRU end until both count and memory limits allow insertion.
            while self._cache and (
                len(self._cache) >= self._max_size
                or self._current_memory + size > self._max_memory
            ):
                _, evicted = self._cache.popitem(last=False)
                self._current_memory -= len(evicted)

            self._cache[key] = data
            self._current_memory += size

    def clear(self) -> None:
        """Drop every cached entry and reset the memory accounting."""
        with self._lock:
            self._cache.clear()
            self._current_memory = 0
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# SQL FILTER CONSTANTS
|
||||
# ============================================================================
|
||||
|
||||
# Valid media file filters (excluding phrase checks, must have valid extension)
# Used by downloads, health, and analytics endpoints
# NOTE: this is interpolated into queries as a raw WHERE fragment and binds
# no parameters — keep it free of any user-supplied values.
MEDIA_FILTERS = """
    (filename NOT LIKE '%_phrase_checked_%' OR filename IS NULL)
    AND (file_path IS NOT NULL AND file_path != '' OR platform = 'forums')
    AND (LENGTH(filename) > 20 OR filename LIKE '%_%_%')
    AND (
        filename LIKE '%.jpg' OR filename LIKE '%.jpeg' OR
        filename LIKE '%.png' OR filename LIKE '%.gif' OR
        filename LIKE '%.heic' OR filename LIKE '%.heif' OR
        filename LIKE '%.mp4' OR filename LIKE '%.mov' OR
        filename LIKE '%.webm' OR filename LIKE '%.m4a' OR
        filename LIKE '%.mp3' OR filename LIKE '%.avi' OR
        filename LIKE '%.mkv' OR filename LIKE '%.flv'
    )
"""
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# PATH VALIDATION
|
||||
# ============================================================================
|
||||
|
||||
# Allowed base paths for file operations.
# Any path accepted by validate_file_path() must resolve inside one of these.
ALLOWED_PATHS = [
    settings.MEDIA_BASE_PATH,
    settings.REVIEW_PATH,
    settings.RECYCLE_PATH,
    # Staging area for manually imported media.
    Path('/opt/media-downloader/temp/manual_import'),
    # Immich library roots.
    Path('/opt/immich/paid'),
    Path('/opt/immich/el'),
    Path('/opt/immich/elv'),
]
|
||||
|
||||
|
||||
def validate_file_path(
    file_path: str,
    allowed_bases: Optional[List[Path]] = None,
    require_exists: bool = False
) -> Path:
    """
    Validate that *file_path* resolves inside an allowed directory.

    Guards against path traversal attacks by resolving the path and
    checking containment against each allowed base.

    Args:
        file_path: Path to validate
        allowed_bases: List of allowed base paths (defaults to ALLOWED_PATHS)
        require_exists: If True, also verify the file exists

    Returns:
        Resolved Path object

    Raises:
        HTTPException: If path is invalid or outside allowed directories
    """
    bases = ALLOWED_PATHS if allowed_bases is None else allowed_bases

    try:
        resolved = Path(file_path).resolve()

        def _contained_in(base: Path) -> bool:
            # relative_to() raises ValueError when resolved lies outside base.
            try:
                resolved.relative_to(base.resolve())
                return True
            except ValueError:
                return False

        if not any(_contained_in(base) for base in bases):
            raise HTTPException(status_code=403, detail="Access denied")

        if require_exists and not resolved.exists():
            raise HTTPException(status_code=404, detail="File not found")

    except HTTPException:
        raise
    except Exception:
        # resolve() can fail on malformed paths — map anything unexpected to 400.
        raise HTTPException(status_code=400, detail="Invalid file path")

    return resolved
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# QUERY FILTER BUILDER
|
||||
# ============================================================================
|
||||
|
||||
def build_media_filter_query(
    platform: Optional[str] = None,
    source: Optional[str] = None,
    media_type: Optional[str] = None,
    location: Optional[str] = None,
    date_from: Optional[str] = None,
    date_to: Optional[str] = None,
    table_alias: str = "fi"
) -> Tuple[str, List]:
    """
    Build SQL filter clause for common media queries.

    This centralizes the filter building logic that was duplicated across
    media.py, downloads.py, and review.py routers.

    Args:
        platform: Filter by platform (e.g., 'instagram', 'tiktok')
        source: Filter by source (e.g., 'stories', 'posts')
        media_type: Filter by media type ('image', 'video', 'all')
        location: Filter by location ('final', 'review', 'recycle')
        date_from: Start date filter (ISO format)
        date_to: End date filter (ISO format)
        table_alias: SQL table alias (default 'fi' for file_inventory)

    Returns:
        Tuple of (SQL WHERE clause conditions, list of parameters)

    Example:
        conditions, params = build_media_filter_query(platform="instagram", media_type="video")
        query = f"SELECT * FROM file_inventory fi WHERE {conditions}"
        cursor.execute(query, params)
    """
    # The alias is interpolated (not bound), so reject anything that is not a
    # plain identifier to prevent SQL injection through this parameter.
    if not table_alias.isidentifier():
        table_alias = "fi"

    conditions = []
    params = []

    if platform:
        conditions.append(f"{table_alias}.platform = ?")
        params.append(platform)

    if source:
        conditions.append(f"{table_alias}.source = ?")
        params.append(source)

    if media_type and media_type != 'all':
        conditions.append(f"{table_alias}.media_type = ?")
        params.append(media_type)

    if location:
        conditions.append(f"{table_alias}.location = ?")
        params.append(location)

    if date_from or date_to:
        # Effective date: prefer post_date recorded in downloads, falling back
        # to created_date from file_inventory. Built once here instead of
        # duplicating the subquery literal for each bound (the original
        # repeated it verbatim for >= and <=).
        date_expr = (
            "DATE(COALESCE("
            f"(SELECT MAX(d.post_date) FROM downloads d WHERE d.file_path = {table_alias}.file_path), "
            f"{table_alias}.created_date))"
        )
        if date_from:
            conditions.append(f"{date_expr} >= ?")
            params.append(date_from)
        if date_to:
            conditions.append(f"{date_expr} <= ?")
            params.append(date_to)

    # No filters at all -> always-true clause so callers can append unconditionally.
    return " AND ".join(conditions) if conditions else "1=1", params
|
||||
|
||||
|
||||
def build_platform_list_filter(
    platforms: Optional[List[str]] = None,
    table_alias: str = "fi"
) -> Tuple[str, List[str]]:
    """
    Build SQL IN clause for filtering by multiple platforms.

    Args:
        platforms: List of platform names to filter by
        table_alias: SQL table alias

    Returns:
        Tuple of (SQL condition string, list of parameters)
    """
    # Sanitize the interpolated alias — same guard as build_media_filter_query
    # (the original skipped this check here, leaving an injection gap).
    if not table_alias.isidentifier():
        table_alias = "fi"

    # Empty/None platform list -> always-true clause with no parameters.
    if not platforms:
        return "1=1", []

    placeholders = ",".join(["?"] * len(platforms))
    return f"{table_alias}.platform IN ({placeholders})", platforms
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# THUMBNAIL GENERATION
|
||||
# ============================================================================
|
||||
|
||||
def generate_image_thumbnail(file_path: Path, max_size: Tuple[int, int] = (300, 300)) -> Optional[bytes]:
    """
    Generate thumbnail for image file.

    Args:
        file_path: Path to image file
        max_size: Maximum thumbnail dimensions

    Returns:
        JPEG bytes or None if generation fails
    """
    try:
        # Context manager ensures the underlying file handle is released;
        # the original left it open until garbage collection.
        with Image.open(file_path) as img:
            img.thumbnail(max_size, Image.Resampling.LANCZOS)

            # JPEG has no alpha channel: composite transparent/palette images
            # onto a white background before saving.
            if img.mode in ('RGBA', 'LA', 'P'):
                background = Image.new('RGB', img.size, (255, 255, 255))
                if img.mode == 'P':
                    img = img.convert('RGBA')
                background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
                img = background

            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
    except Exception:
        # Best-effort: callers treat None as "no thumbnail available".
        return None
|
||||
|
||||
|
||||
def generate_video_thumbnail(file_path: Path, max_size: Tuple[int, int] = (300, 300)) -> Optional[bytes]:
    """
    Generate thumbnail for video file using ffmpeg.

    Args:
        file_path: Path to video file
        max_size: Maximum thumbnail dimensions

    Returns:
        JPEG bytes or None if generation fails
    """
    # Prefer a frame at the 1-second mark; fall back to the very first frame.
    for seek_time in ('00:00:01.000', '00:00:00.000'):
        cmd = [
            'ffmpeg',
            '-ss', seek_time,
            '-i', str(file_path),
            '-vframes', '1',
            '-f', 'image2pipe',
            '-vcodec', 'mjpeg',
            '-',
        ]
        try:
            proc = subprocess.run(cmd, capture_output=True, timeout=30)

            if proc.returncode != 0 or not proc.stdout:
                continue

            # Decode the extracted frame and shrink it to thumbnail size.
            frame = Image.open(io.BytesIO(proc.stdout))
            frame.thumbnail(max_size, Image.Resampling.LANCZOS)

            out = io.BytesIO()
            frame.save(out, format='JPEG', quality=85)
            return out.getvalue()
        except Exception:
            # ffmpeg missing, timeout, or decode failure — try the next seek point.
            continue

    return None
|
||||
|
||||
|
||||
def get_or_create_thumbnail(
    file_path: Union[str, Path],
    media_type: str,
    content_hash: Optional[str] = None,
    max_size: Tuple[int, int] = (300, 300)
) -> Optional[bytes]:
    """
    Get thumbnail from cache or generate and cache it.

    Uses the thumbnails.db schema: file_hash (PK), file_path, thumbnail_data, created_at, file_mtime.

    Lookup strategy:
    1. Try content_hash against file_hash column (survives file moves)
    2. Fall back to file_path lookup (legacy thumbnails)
    3. Generate and cache if not found

    Args:
        file_path: Path to media file
        media_type: 'image' or 'video'
        content_hash: Optional pre-computed hash (computed from path if not provided)
        max_size: Maximum thumbnail dimensions

    Returns:
        JPEG bytes or None if generation fails
    """
    file_path = Path(file_path)
    thumb_db_path = settings.PROJECT_ROOT / 'database' / 'thumbnails.db'

    # NOTE: the fallback hash digests the *path string*, not the file contents,
    # so it does not survive file moves the way a true content hash would.
    file_hash = content_hash if content_hash else hashlib.sha256(str(file_path).encode()).hexdigest()

    # Try the cache first (skip mtime check — downloaded media files don't change).
    # closing() guarantees the connection is closed: sqlite3's own context
    # manager only manages the transaction, so the original leaked connections.
    try:
        with closing(sqlite3.connect(str(thumb_db_path), timeout=30.0)) as conn:
            cursor = conn.cursor()

            # 1. file_hash lookup (primary key)
            cursor.execute(
                "SELECT thumbnail_data FROM thumbnails WHERE file_hash = ?",
                (file_hash,)
            )
            row = cursor.fetchone()
            if row and row[0]:
                return row[0]

            # 2. Fall back to file_path lookup (legacy thumbnails)
            cursor.execute(
                "SELECT thumbnail_data FROM thumbnails WHERE file_path = ?",
                (str(file_path),)
            )
            row = cursor.fetchone()
            if row and row[0]:
                return row[0]
    except Exception:
        # Cache lookups are best-effort; fall through to generation.
        pass

    # Only proceed with generation if the file actually exists.
    if not file_path.exists():
        return None

    # Get mtime only when we need to generate and cache a new thumbnail.
    try:
        file_mtime = file_path.stat().st_mtime
    except OSError:
        file_mtime = 0

    # Generate thumbnail.
    if media_type == 'video':
        thumbnail_data = generate_video_thumbnail(file_path, max_size)
    else:
        thumbnail_data = generate_image_thumbnail(file_path, max_size)

    # Cache the thumbnail (best-effort; a failed insert just means regeneration later).
    if thumbnail_data:
        try:
            from .responses import now_iso8601
            with closing(sqlite3.connect(str(thumb_db_path), timeout=30.0)) as conn:
                conn.execute("""
                    INSERT OR REPLACE INTO thumbnails
                    (file_hash, file_path, thumbnail_data, created_at, file_mtime)
                    VALUES (?, ?, ?, ?, ?)
                """, (file_hash, str(file_path), thumbnail_data, now_iso8601(), file_mtime))
                conn.commit()
        except Exception:
            pass

    return thumbnail_data
|
||||
|
||||
|
||||
def get_media_dimensions(
    file_path: str,
    width: Optional[int] = None,
    height: Optional[int] = None
) -> Tuple[Optional[int], Optional[int]]:
    """
    Get media dimensions, falling back to the metadata cache if not provided.

    Args:
        file_path: Path to media file
        width: Width from file_inventory (may be None)
        height: Height from file_inventory (may be None)

    Returns:
        Tuple of (width, height), or (None, None) if not available

    Note:
        The original signature annotated the defaults as ``int = None``
        (implicit Optional, rejected by type checkers); fixed to
        ``Optional[int]`` — runtime behavior is unchanged.
    """
    # Fast path: both dimensions already known.
    if width is not None and height is not None:
        return (width, height)

    try:
        metadata_db_path = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'

        # The metadata DB keys rows by a sha256 digest of the path string.
        file_hash = hashlib.sha256(file_path.encode()).hexdigest()
        with closing(sqlite3.connect(str(metadata_db_path))) as conn:
            cursor = conn.execute(
                "SELECT width, height FROM media_metadata WHERE file_hash = ?",
                (file_hash,)
            )
            result = cursor.fetchone()

        if result:
            return (result[0], result[1])

    except Exception:
        # Best-effort cache: fall through to whatever the caller supplied.
        pass

    return (width, height)
|
||||
|
||||
|
||||
def get_media_dimensions_batch(file_paths: List[str]) -> Dict[str, Tuple[int, int]]:
    """
    Get media dimensions for multiple files in a single query (batch lookup).
    Avoids the N+1 query problem by fetching all dimensions at once.

    Args:
        file_paths: List of file paths to look up

    Returns:
        Dict mapping file_path -> (width, height)
    """
    if not file_paths:
        return {}

    dimensions: Dict[str, Tuple[int, int]] = {}

    try:
        metadata_db_path = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'

        # The metadata DB keys rows by a sha256 digest of the path string.
        hash_to_path = {
            hashlib.sha256(fp.encode()).hexdigest(): fp
            for fp in file_paths
        }

        # Single IN(...) query instead of one lookup per file.
        with closing(sqlite3.connect(str(metadata_db_path))) as conn:
            placeholders = ','.join('?' * len(hash_to_path))
            rows = conn.execute(
                f"SELECT file_hash, width, height FROM media_metadata WHERE file_hash IN ({placeholders})",
                list(hash_to_path.keys())
            ).fetchall()

        for file_hash, width, height in rows:
            path = hash_to_path.get(file_hash)
            if path is not None:
                dimensions[path] = (width, height)

    except Exception:
        # Best-effort: a missing/unreadable DB just yields fewer entries.
        pass

    return dimensions
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# DATABASE UTILITIES
|
||||
# ============================================================================
|
||||
|
||||
def update_file_path_in_all_tables(db, old_path: str, new_path: str):
    """
    Update file path in all relevant database tables.

    Used when moving files between locations (review -> final, etc.)
    to keep database references consistent.

    Args:
        db: UnifiedDatabase instance
        old_path: The old file path to replace
        new_path: The new file path to use
    """
    from modules.universal_logger import get_logger
    logger = get_logger('API')

    # Tables that are always expected to exist.
    required_tables = ('downloads', 'instagram_perceptual_hashes', 'face_recognition_scans')

    try:
        with db.get_connection(for_write=True) as conn:
            cursor = conn.cursor()

            for table in required_tables:
                cursor.execute(
                    f'UPDATE {table} SET file_path = ? WHERE file_path = ?',
                    (new_path, old_path),
                )

            # semantic_embeddings is optional; skip quietly when absent.
            try:
                cursor.execute(
                    'UPDATE semantic_embeddings SET file_path = ? WHERE file_path = ?',
                    (new_path, old_path),
                )
            except sqlite3.OperationalError:
                pass  # Table may not exist

            conn.commit()

    except Exception as e:
        logger.warning(f"Failed to update file paths in tables: {e}", module="Database")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# FACE RECOGNITION UTILITIES
|
||||
# ============================================================================
|
||||
|
||||
# Cached FaceRecognitionModule singletons, to avoid loading InsightFace models
# on every request. Keyed by id(db); each cached module is constructed with a
# reference to its db, which keeps that db (and hence its id) alive and stable.
_face_module_cache: Dict[int, 'FaceRecognitionModule'] = {}
|
||||
|
||||
|
||||
def get_face_module(db, module_name: str = "FaceAPI"):
    """
    Get or create a cached FaceRecognitionModule instance for the given database.

    Uses singleton pattern to avoid reloading heavy InsightFace models on each request.

    Args:
        db: UnifiedDatabase instance
        module_name: Name to use in log messages

    Returns:
        FaceRecognitionModule instance
    """
    from modules.face_recognition_module import FaceRecognitionModule
    from modules.universal_logger import get_logger
    logger = get_logger('API')

    cache_key = id(db)
    module = _face_module_cache.get(cache_key)
    if module is None:
        # First request for this db: build the module once and memoize it.
        logger.info("Creating cached FaceRecognitionModule instance", module=module_name)
        module = FaceRecognitionModule(unified_db=db)
        _face_module_cache[cache_key] = module
    return module
|
||||
Reference in New Issue
Block a user