"""
Shared Utility Functions

Common helper functions used across multiple routers.
"""
|
|
|
|
import io
|
|
import sqlite3
|
|
import hashlib
|
|
import subprocess
|
|
from collections import OrderedDict
|
|
from contextlib import closing
|
|
from pathlib import Path
|
|
from threading import Lock
|
|
from typing import Dict, List, Optional, Tuple, Union
|
|
|
|
from fastapi import HTTPException
|
|
from PIL import Image
|
|
|
|
from .config import settings
|
|
|
|
|
|
# ============================================================================
|
|
# THUMBNAIL LRU CACHE
|
|
# ============================================================================
|
|
|
|
class ThumbnailLRUCache:
    """Thread-safe LRU cache for thumbnail binary data.

    Keeps hot thumbnails in memory so frequent requests skip the SQLite
    lookup entirely. Bounded by both entry count and total bytes held.
    Used by the media.py and recycle.py routers.
    """

    def __init__(self, max_size: int = 500, max_memory_mb: int = 100):
        self._cache: OrderedDict[str, bytes] = OrderedDict()
        self._lock = Lock()
        self._max_size = max_size
        self._max_memory = max_memory_mb * 1024 * 1024  # budget in bytes
        self._current_memory = 0

    def get(self, key: str) -> Optional[bytes]:
        """Return cached bytes for *key* (promoting it to most-recent), or None."""
        with self._lock:
            data = self._cache.get(key)
            if data is not None:
                self._cache.move_to_end(key)
            return data

    def put(self, key: str, data: bytes) -> None:
        """Insert *key* -> *data*, evicting LRU entries to stay within budget."""
        with self._lock:
            size = len(data)

            # Oversized blobs (>1MB) would crowd out everything else; skip them.
            if size > 1024 * 1024:
                return

            # Replace any existing entry, reclaiming its memory first.
            previous = self._cache.pop(key, None)
            if previous is not None:
                self._current_memory -= len(previous)

            # Drop least-recently-used entries until both limits hold.
            while self._cache and (
                len(self._cache) >= self._max_size
                or self._current_memory + size > self._max_memory
            ):
                _, evicted = self._cache.popitem(last=False)
                self._current_memory -= len(evicted)

            self._cache[key] = data
            self._current_memory += size

    def clear(self) -> None:
        """Drop every cached entry and reset the memory counter."""
        with self._lock:
            self._cache.clear()
            self._current_memory = 0
|
|
|
|
|
|
# ============================================================================
|
|
# SQL FILTER CONSTANTS
|
|
# ============================================================================
|
|
|
|
# Valid media file filters (excluding phrase checks, must have valid extension).
# Shared by the downloads, health, and analytics endpoints; spliced into WHERE
# clauses, so it must remain a bare condition (no leading AND/WHERE).
# NOTE(review): in SQL LIKE, '_' is a single-character wildcard — patterns like
# '%_phrase_checked_%' and '%_%_%' match more broadly than a literal underscore
# would; confirm that is intended.
MEDIA_FILTERS = """
(filename NOT LIKE '%_phrase_checked_%' OR filename IS NULL)
AND (file_path IS NOT NULL AND file_path != '' OR platform = 'forums')
AND (LENGTH(filename) > 20 OR filename LIKE '%_%_%')
AND (
    filename LIKE '%.jpg' OR filename LIKE '%.jpeg' OR
    filename LIKE '%.png' OR filename LIKE '%.gif' OR
    filename LIKE '%.heic' OR filename LIKE '%.heif' OR
    filename LIKE '%.mp4' OR filename LIKE '%.mov' OR
    filename LIKE '%.webm' OR filename LIKE '%.m4a' OR
    filename LIKE '%.mp3' OR filename LIKE '%.avi' OR
    filename LIKE '%.mkv' OR filename LIKE '%.flv'
)
"""
|
|
|
|
|
|
# ============================================================================
|
|
# PATH VALIDATION
|
|
# ============================================================================
|
|
|
|
# Allowed base paths for file operations. validate_file_path() rejects any
# path that does not resolve to a location under one of these directories.
ALLOWED_PATHS = [
    settings.MEDIA_BASE_PATH,
    settings.REVIEW_PATH,
    settings.RECYCLE_PATH,
    Path('/opt/media-downloader/temp/manual_import'),
    Path('/opt/immich/paid'),
    Path('/opt/immich/el'),
    Path('/opt/immich/elv'),
]
|
|
|
|
|
|
def validate_file_path(
    file_path: str,
    allowed_bases: Optional[List[Path]] = None,
    require_exists: bool = False
) -> Path:
    """
    Validate that a file path lies within an allowed directory.

    Resolves the path (following symlinks and '..' components) before
    checking containment, which prevents path traversal attacks.

    Args:
        file_path: Path to validate
        allowed_bases: List of allowed base paths (defaults to ALLOWED_PATHS)
        require_exists: If True, also verify the file exists

    Returns:
        Resolved Path object

    Raises:
        HTTPException: 403 if outside allowed directories, 404 if missing
            (when require_exists), 400 if the path cannot be processed
    """
    bases = ALLOWED_PATHS if allowed_bases is None else allowed_bases

    try:
        resolved = Path(file_path).resolve()

        def _is_under(base: Path) -> bool:
            # relative_to raises ValueError when resolved is not inside base.
            try:
                resolved.relative_to(base.resolve())
            except ValueError:
                return False
            return True

        if not any(_is_under(base) for base in bases):
            raise HTTPException(status_code=403, detail="Access denied")

        if require_exists and not resolved.exists():
            raise HTTPException(status_code=404, detail="File not found")

    except HTTPException:
        raise
    except Exception:
        # Malformed paths (e.g. NUL bytes) surface as a generic 400.
        raise HTTPException(status_code=400, detail="Invalid file path")

    return resolved
|
|
|
|
|
|
# ============================================================================
|
|
# QUERY FILTER BUILDER
|
|
# ============================================================================
|
|
|
|
def build_media_filter_query(
    platform: Optional[str] = None,
    source: Optional[str] = None,
    media_type: Optional[str] = None,
    location: Optional[str] = None,
    date_from: Optional[str] = None,
    date_to: Optional[str] = None,
    table_alias: str = "fi"
) -> Tuple[str, List]:
    """
    Build SQL filter clause for common media queries.

    This centralizes the filter building logic that was duplicated across
    media.py, downloads.py, and review.py routers.

    Args:
        platform: Filter by platform (e.g., 'instagram', 'tiktok')
        source: Filter by source (e.g., 'stories', 'posts')
        media_type: Filter by media type ('image', 'video', 'all')
        location: Filter by location ('final', 'review', 'recycle')
        date_from: Start date filter (ISO format), inclusive
        date_to: End date filter (ISO format), inclusive
        table_alias: SQL table alias (default 'fi' for file_inventory)

    Returns:
        Tuple of (SQL WHERE clause conditions, list of parameters).
        Returns ("1=1", []) when no filters are given so callers can always
        interpolate the result into "WHERE {conditions}".

    Example:
        conditions, params = build_media_filter_query(platform="instagram", media_type="video")
        query = f"SELECT * FROM file_inventory fi WHERE {conditions}"
        cursor.execute(query, params)
    """
    # The alias is interpolated into SQL (it cannot be bound as a parameter),
    # so reject anything that is not a plain identifier to prevent injection.
    if not table_alias.isidentifier():
        table_alias = "fi"

    conditions: List[str] = []
    params: List = []

    if platform:
        conditions.append(f"{table_alias}.platform = ?")
        params.append(platform)

    if source:
        conditions.append(f"{table_alias}.source = ?")
        params.append(source)

    # 'all' is a sentinel meaning "no media_type filter".
    if media_type and media_type != 'all':
        conditions.append(f"{table_alias}.media_type = ?")
        params.append(media_type)

    if location:
        conditions.append(f"{table_alias}.location = ?")
        params.append(location)

    # Effective date for a row: the newest matching downloads.post_date,
    # falling back to file_inventory.created_date via COALESCE. Built once
    # here instead of duplicating the subquery for each bound.
    date_expr = (
        f"DATE(COALESCE("
        f"(SELECT MAX(d.post_date) FROM downloads d WHERE d.file_path = {table_alias}.file_path), "
        f"{table_alias}.created_date))"
    )

    if date_from:
        conditions.append(f"{date_expr} >= ?")
        params.append(date_from)

    if date_to:
        conditions.append(f"{date_expr} <= ?")
        params.append(date_to)

    return " AND ".join(conditions) if conditions else "1=1", params
|
|
|
|
|
|
def build_platform_list_filter(
    platforms: Optional[List[str]] = None,
    table_alias: str = "fi"
) -> Tuple[str, List[str]]:
    """
    Build SQL IN clause for filtering by multiple platforms.

    Args:
        platforms: List of platform names to filter by (None/empty = no filter)
        table_alias: SQL table alias (must be a plain identifier; anything
            else falls back to 'fi')

    Returns:
        Tuple of (SQL condition string, list of parameters). Returns
        ("1=1", []) for an empty filter so callers can always interpolate
        the result into "WHERE {condition}".
    """
    # Consistency fix: match build_media_filter_query's guard — the alias is
    # interpolated into SQL, so reject non-identifiers to prevent injection.
    if not table_alias.isidentifier():
        table_alias = "fi"

    if not platforms:
        return "1=1", []

    placeholders = ",".join(["?"] * len(platforms))
    return f"{table_alias}.platform IN ({placeholders})", platforms
|
|
|
|
|
|
# ============================================================================
|
|
# THUMBNAIL GENERATION
|
|
# ============================================================================
|
|
|
|
def generate_image_thumbnail(file_path: Path, max_size: Tuple[int, int] = (300, 300)) -> Optional[bytes]:
    """
    Generate thumbnail for image file.

    Args:
        file_path: Path to image file
        max_size: Maximum thumbnail dimensions (width, height)

    Returns:
        JPEG bytes or None if generation fails. Best-effort by design:
        any error (corrupt/unsupported file) is swallowed so callers can
        fall back gracefully.
    """
    try:
        # Fix: use the context manager so the underlying file handle is
        # closed even when thumbnailing raises (previously leaked on error).
        with Image.open(file_path) as img:
            img.thumbnail(max_size, Image.Resampling.LANCZOS)

            # JPEG has no alpha channel: flatten transparent modes onto a
            # white background before saving.
            if img.mode in ('RGBA', 'LA', 'P'):
                background = Image.new('RGB', img.size, (255, 255, 255))
                if img.mode == 'P':
                    # Palette images must be expanded before alpha compositing.
                    img = img.convert('RGBA')
                background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
                img = background

            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
    except Exception:
        return None
|
|
|
|
|
|
def generate_video_thumbnail(file_path: Path, max_size: Tuple[int, int] = (300, 300)) -> Optional[bytes]:
    """
    Generate thumbnail for video file using ffmpeg.

    Grabs a single frame (preferring t=1s, then the very first frame for
    clips shorter than a second), pipes it out as MJPEG, and resizes it
    with Pillow.

    Args:
        file_path: Path to video file
        max_size: Maximum thumbnail dimensions

    Returns:
        JPEG bytes or None if generation fails
    """
    for seek_time in ('00:00:01.000', '00:00:00.000'):
        cmd = [
            'ffmpeg',
            '-ss', seek_time,
            '-i', str(file_path),
            '-vframes', '1',
            '-f', 'image2pipe',
            '-vcodec', 'mjpeg',
            '-',
        ]
        try:
            proc = subprocess.run(cmd, capture_output=True, timeout=30)
            if proc.returncode != 0 or not proc.stdout:
                # Seek past end of clip (or other ffmpeg failure): try next seek.
                continue

            frame = Image.open(io.BytesIO(proc.stdout))
            frame.thumbnail(max_size, Image.Resampling.LANCZOS)

            out = io.BytesIO()
            frame.save(out, format='JPEG', quality=85)
            return out.getvalue()
        except Exception:
            # ffmpeg missing, timeout, or undecodable frame: try next seek.
            continue

    return None
|
|
|
|
|
|
def get_or_create_thumbnail(
    file_path: Union[str, Path],
    media_type: str,
    content_hash: Optional[str] = None,
    max_size: Tuple[int, int] = (300, 300)
) -> Optional[bytes]:
    """
    Get thumbnail from cache or generate and cache it.

    Uses the thumbnails.db schema: file_hash (PK), file_path, thumbnail_data, created_at, file_mtime.

    Lookup strategy:
    1. Try content_hash against file_hash column (survives file moves)
    2. Fall back to file_path lookup (legacy thumbnails)
    3. Generate and cache if not found

    Args:
        file_path: Path to media file
        media_type: 'image' or 'video' (anything other than 'video' is
            treated as an image)
        content_hash: Optional pre-computed hash. NOTE(review): when omitted,
            the fallback hashes the path STRING, not file contents — that key
            does NOT survive file moves; confirm callers pass a true content
            hash where move-resilience matters.
        max_size: Maximum thumbnail dimensions

    Returns:
        JPEG bytes or None if generation fails
    """
    file_path = Path(file_path)
    thumb_db_path = settings.PROJECT_ROOT / 'database' / 'thumbnails.db'

    # Compute hash if not provided (path-based fallback, see docstring note).
    file_hash = content_hash if content_hash else hashlib.sha256(str(file_path).encode()).hexdigest()

    # Try to get from cache (skip mtime check — downloaded media files don't change)
    try:
        with sqlite3.connect(str(thumb_db_path), timeout=30.0) as conn:
            cursor = conn.cursor()

            # 1. Try file_hash lookup (primary key)
            cursor.execute(
                "SELECT thumbnail_data FROM thumbnails WHERE file_hash = ?",
                (file_hash,)
            )
            result = cursor.fetchone()
            if result and result[0]:
                return result[0]

            # 2. Fall back to file_path lookup (legacy thumbnails)
            cursor.execute(
                "SELECT thumbnail_data FROM thumbnails WHERE file_path = ?",
                (str(file_path),)
            )
            result = cursor.fetchone()
            if result and result[0]:
                return result[0]
    except Exception:
        # Cache lookup is best-effort; any DB error falls through to regeneration.
        pass

    # Only proceed with generation if the file actually exists
    if not file_path.exists():
        return None

    # Get mtime only when we need to generate and cache a new thumbnail
    try:
        file_mtime = file_path.stat().st_mtime
    except OSError:
        # Race: file vanished between exists() and stat(); store a sentinel.
        file_mtime = 0

    # Generate thumbnail (ffmpeg frame-grab for videos, Pillow for images).
    thumbnail_data = None
    if media_type == 'video':
        thumbnail_data = generate_video_thumbnail(file_path, max_size)
    else:
        thumbnail_data = generate_image_thumbnail(file_path, max_size)

    # Cache the thumbnail; a write failure is non-fatal (data is still returned).
    if thumbnail_data:
        try:
            # Local import avoids a circular import at module load time.
            from .responses import now_iso8601
            with sqlite3.connect(str(thumb_db_path), timeout=30.0) as conn:
                conn.execute("""
                    INSERT OR REPLACE INTO thumbnails
                    (file_hash, file_path, thumbnail_data, created_at, file_mtime)
                    VALUES (?, ?, ?, ?, ?)
                """, (file_hash, str(file_path), thumbnail_data, now_iso8601(), file_mtime))
                conn.commit()
        except Exception:
            pass

    return thumbnail_data
|
|
|
|
|
|
def get_media_dimensions(file_path: str, width: Optional[int] = None, height: Optional[int] = None) -> Tuple[Optional[int], Optional[int]]:
    """
    Get media dimensions, falling back to metadata cache if not provided.

    Args:
        file_path: Path to media file
        width: Width from file_inventory (may be None)
        height: Height from file_inventory (may be None)

    Returns:
        Tuple of (width, height), or (None, None) if not available. If the
        cache has no row, whatever was passed in is returned unchanged
        (a partially-known dimension is preserved).
    """
    # Both already known: no DB round-trip needed.
    if width is not None and height is not None:
        return (width, height)

    try:
        metadata_db_path = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'

        # Cache rows are keyed by sha256 of the path string — presumably
        # matching how media_metadata.db is populated; TODO confirm writer.
        file_hash = hashlib.sha256(file_path.encode()).hexdigest()
        with closing(sqlite3.connect(str(metadata_db_path))) as conn:
            cursor = conn.execute(
                "SELECT width, height FROM media_metadata WHERE file_hash = ?",
                (file_hash,)
            )
            result = cursor.fetchone()

            if result:
                return (result[0], result[1])

    except Exception:
        # Best-effort: a missing/locked metadata DB degrades to the inputs.
        pass

    return (width, height)
|
|
|
|
|
|
def get_media_dimensions_batch(file_paths: List[str]) -> Dict[str, Tuple[int, int]]:
    """
    Get media dimensions for multiple files in a single query (batch lookup).

    Avoids the N+1 pattern of calling get_media_dimensions() once per file.

    Args:
        file_paths: List of file paths to look up

    Returns:
        Dict mapping file_path -> (width, height); paths with no cached
        metadata are simply absent from the result
    """
    if not file_paths:
        return {}

    dimensions: Dict[str, Tuple[int, int]] = {}

    try:
        metadata_db_path = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'

        # The metadata table is keyed by sha256 of the path string; build the
        # reverse map so rows can be attributed back to their paths.
        hash_to_path = {
            hashlib.sha256(fp.encode()).hexdigest(): fp for fp in file_paths
        }

        # One IN query for the whole batch.
        with closing(sqlite3.connect(str(metadata_db_path))) as conn:
            marks = ','.join('?' * len(hash_to_path))
            rows = conn.execute(
                f"SELECT file_hash, width, height FROM media_metadata WHERE file_hash IN ({marks})",
                list(hash_to_path.keys())
            ).fetchall()

        for file_hash, width, height in rows:
            path = hash_to_path.get(file_hash)
            if path is not None:
                dimensions[path] = (width, height)

    except Exception:
        # Best-effort: a missing/locked metadata DB just yields fewer entries.
        pass

    return dimensions
|
|
|
|
|
|
# ============================================================================
|
|
# DATABASE UTILITIES
|
|
# ============================================================================
|
|
|
|
def update_file_path_in_all_tables(db, old_path: str, new_path: str):
    """
    Update file path in all relevant database tables.

    Keeps download, perceptual-hash, face-scan, and (when present)
    semantic-embedding rows consistent after a file is moved between
    locations (review -> final, etc.).

    Args:
        db: UnifiedDatabase instance
        old_path: The old file path to replace
        new_path: The new file path to use
    """
    from modules.universal_logger import get_logger
    logger = get_logger('API')

    try:
        with db.get_connection(for_write=True) as conn:
            cursor = conn.cursor()

            # These tables always exist in the schema.
            for table in ('downloads', 'instagram_perceptual_hashes', 'face_recognition_scans'):
                cursor.execute(
                    f'UPDATE {table} SET file_path = ? WHERE file_path = ?',
                    (new_path, old_path),
                )

            # Optional table: older databases may not have it yet.
            try:
                cursor.execute('UPDATE semantic_embeddings SET file_path = ? WHERE file_path = ?',
                               (new_path, old_path))
            except sqlite3.OperationalError:
                pass

            conn.commit()

    except Exception as e:
        # Non-fatal: the file move itself succeeded; log and continue.
        logger.warning(f"Failed to update file paths in tables: {e}", module="Database")
|
|
|
|
|
|
# ============================================================================
# FACE RECOGNITION UTILITIES
# ============================================================================

# Cached FaceRecognitionModule singletons keyed by id(db) so the heavy
# InsightFace models are loaded at most once per database instance per process.
_face_module_cache: Dict[int, 'FaceRecognitionModule'] = {}
|
|
|
|
|
|
def get_face_module(db, module_name: str = "FaceAPI"):
    """
    Get or create a cached FaceRecognitionModule instance for the given database.

    Model loading is expensive, so instances are memoized per database object
    (keyed by id(db)) instead of being rebuilt on every request.

    Args:
        db: UnifiedDatabase instance
        module_name: Name to use in log messages

    Returns:
        FaceRecognitionModule instance
    """
    from modules.face_recognition_module import FaceRecognitionModule
    from modules.universal_logger import get_logger
    logger = get_logger('API')

    cache_key = id(db)
    module = _face_module_cache.get(cache_key)
    if module is None:
        logger.info("Creating cached FaceRecognitionModule instance", module=module_name)
        module = FaceRecognitionModule(unified_db=db)
        _face_module_cache[cache_key] = module
    return module
|