Files
media-downloader/web/backend/routers/media.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

1405 lines
52 KiB
Python

"""
Media Router
Handles all media file operations:
- Thumbnail generation and caching
- Media preview/serving
- Metadata retrieval
- Gallery listing
- Batch operations (delete, move, download)
- Cache management
"""
import hashlib
import json
import shutil
import sqlite3
import subprocess
import tempfile
import zipfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from fastapi import APIRouter, BackgroundTasks, Body, Depends, Query, Request
from fastapi.responses import FileResponse, Response
from PIL import Image
from pydantic import BaseModel
from slowapi import Limiter
from slowapi.util import get_remote_address
from ..core.dependencies import get_current_user, get_current_user_media, require_admin, get_app_state
from ..core.config import settings
from ..core.exceptions import (
handle_exceptions,
MediaFileNotFoundError as CustomFileNotFoundError,
FileOperationError,
ValidationError
)
from ..core.responses import now_iso8601
from modules.universal_logger import get_logger
from ..core.utils import (
get_media_dimensions,
get_media_dimensions_batch,
validate_file_path,
generate_image_thumbnail as shared_generate_image_thumbnail,
generate_video_thumbnail as shared_generate_video_thumbnail,
get_or_create_thumbnail as shared_get_or_create_thumbnail,
ThumbnailLRUCache,
ALLOWED_PATHS
)
# Module-scoped logger shared by all media endpoints.
logger = get_logger('API')
router = APIRouter(prefix="/api/media", tags=["Media"])
# Rate limiting is keyed by the client's remote address.
limiter = Limiter(key_func=get_remote_address)
# Use centralized paths from config
MEDIA_BASE = settings.MEDIA_BASE_PATH  # root of the served media library (path checks anchor here)
REVIEW_BASE = settings.REVIEW_PATH  # destination for files moved to the review queue
RECYCLE_BASE = settings.RECYCLE_PATH  # recycle-bin path from settings (not referenced directly in this chunk)
# Global thumbnail memory cache (500 items or 100MB max)
# Using shared ThumbnailLRUCache from core/utils.py
_thumbnail_cache = ThumbnailLRUCache(max_size=500, max_memory_mb=100)
# ============================================================================
# PYDANTIC MODELS
# ============================================================================
class BatchMoveRequest(BaseModel):
    """Request body for /batch-move: files to relocate and their destination."""
    # Paths of the files to move; each is validated against MEDIA_BASE by the endpoint.
    file_paths: List[str]
    # Target directory; relative values are resolved under MEDIA_BASE.
    destination: str
class BatchDeleteRequest(BaseModel):
    """Request body listing files to send to the recycle bin."""
    # NOTE(review): this model appears unused — /batch-delete takes a raw
    # Body(...) list of paths instead. Confirm before removing.
    file_paths: List[str]
class UpdateDateRequest(BaseModel):
    """Request body for /update-date: re-date media tracked in file_inventory."""
    ids: List[int]  # file_inventory IDs
    new_date: str  # ISO datetime "2024-06-15T14:30:00"
    update_file: bool = True  # Also update filesystem/EXIF timestamps
    date_type: str = "post_date"  # "post_date" or "download_date"
# ============================================================================
# HELPER FUNCTIONS (validate_file_path from core/utils.py)
# ============================================================================
# Thumbnail generation functions are now in core/utils.py (shared across routers).
# Local aliases for backward compatibility within this module:
# (kept so existing imports of media.generate_* keep working after the refactor)
generate_image_thumbnail = shared_generate_image_thumbnail
generate_video_thumbnail = shared_generate_video_thumbnail
get_or_create_thumbnail = shared_get_or_create_thumbnail
def update_file_path_in_all_tables(db, old_path: str, new_path: str):
    """
    Propagate a file move to every table that references the old path.

    Touches downloads, instagram_perceptual_hashes, face_recognition_scans
    and (when present) semantic_embeddings in the main database, then
    re-keys the thumbnails.db cache entry, whose key is a hash of the path.
    Best-effort: any failure is logged as a warning, never raised.
    """
    try:
        with db.get_connection(for_write=True) as conn:
            cur = conn.cursor()
            move = (new_path, old_path)
            counts = {}
            cur.execute('UPDATE downloads SET file_path = ? WHERE file_path = ?', move)
            counts['downloads'] = cur.rowcount
            cur.execute('UPDATE instagram_perceptual_hashes SET file_path = ? WHERE file_path = ?', move)
            counts['perceptual'] = cur.rowcount
            cur.execute('UPDATE face_recognition_scans SET file_path = ? WHERE file_path = ?', move)
            counts['face'] = cur.rowcount
            try:
                cur.execute('UPDATE semantic_embeddings SET file_path = ? WHERE file_path = ?', move)
                counts['embeddings'] = cur.rowcount
            except sqlite3.OperationalError:
                # Optional table — may not exist on installs without semantic search.
                counts['embeddings'] = 0
            conn.commit()
            if any(counts.values()):
                logger.debug(
                    f"Updated file paths: downloads={counts['downloads']}, "
                    f"perceptual={counts['perceptual']}, face={counts['face']}, "
                    f"embeddings={counts['embeddings']}",
                    module="Database"
                )
    except Exception as e:
        logger.warning(f"Failed to update file paths in tables: {e}", module="Database")
    # thumbnails.db keys rows by sha256 of the path string, so a move means
    # re-inserting the cached blob under the new hash and removing the old row.
    try:
        thumb_db_path = settings.PROJECT_ROOT / 'database' / 'thumbnails.db'
        old_hash = hashlib.sha256(old_path.encode()).hexdigest()
        new_hash = hashlib.sha256(new_path.encode()).hexdigest()
        with sqlite3.connect(str(thumb_db_path), timeout=10.0) as thumb_conn:
            tcur = thumb_conn.cursor()
            tcur.execute("SELECT thumbnail_data, file_mtime FROM thumbnails WHERE file_hash = ?", (old_hash,))
            cached = tcur.fetchone()
            if cached:
                thumbnail_data, file_mtime = cached
                tcur.execute("""
                    INSERT OR REPLACE INTO thumbnails
                    (file_hash, file_path, thumbnail_data, created_at, file_mtime)
                    VALUES (?, ?, ?, ?, ?)
                """, (new_hash, new_path, thumbnail_data, now_iso8601(), file_mtime))
                tcur.execute("DELETE FROM thumbnails WHERE file_hash = ?", (old_hash,))
                thumb_conn.commit()
                logger.debug("Migrated thumbnail cache for moved file", module="Database")
    except Exception as e:
        logger.warning(f"Failed to update thumbnail cache: {e}", module="Database")
# ============================================================================
# THUMBNAIL AND PREVIEW ENDPOINTS
# ============================================================================
def _jpeg_thumbnail_response(thumbnail_data: bytes) -> Response:
    """Build a JPEG response with aggressive browser caching (1 day, immutable)."""
    return Response(
        content=thumbnail_data,
        media_type="image/jpeg",
        headers={
            "Cache-Control": "public, max-age=86400, immutable",
            "Vary": "Accept-Encoding"
        }
    )


@router.get("/thumbnail")
@limiter.limit("5000/minute")
@handle_exceptions
async def get_media_thumbnail(
    request: Request,
    file_path: str = None,
    media_type: str = None,
    token: str = None,
    current_user: Dict = Depends(get_current_user_media)
):
    """
    Get or generate thumbnail for media file.

    Uses 3-tier caching:
      1. In-memory LRU cache (fastest, ~500 items)
      2. Thumbnail database (fast, persistent)
      3. Generate on-demand (slowest, for new files)

    The cache key is the resolved file path (NOT a content hash), so a
    moved file will regenerate — update_file_path_in_all_tables migrates
    the persistent cache on tracked moves.

    Args:
        file_path: Path to the media file.
        media_type: 'image' or 'video'. Videos first check the platform
            thumbnail stored in video_downloads.
        token: media auth token (consumed by get_current_user_media).

    Raises:
        FileOperationError: if no thumbnail could be produced.
    """
    resolved_path = validate_file_path(file_path)
    app_state = get_app_state()
    # Path-based key avoids a file_inventory DB lookup on every request.
    cache_key = str(resolved_path)

    # 1. In-memory LRU cache — no disk/DB access.
    thumbnail_data = _thumbnail_cache.get(cache_key)
    if thumbnail_data:
        return _jpeg_thumbnail_response(thumbnail_data)

    # For videos, prefer a platform-provided thumbnail cached at download time.
    if media_type == 'video':
        try:
            with app_state.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('SELECT thumbnail_data FROM video_downloads WHERE file_path = ?',
                               (str(resolved_path),))
                row = cursor.fetchone()
                if row and row['thumbnail_data']:
                    thumbnail_data = row['thumbnail_data']
                    _thumbnail_cache.put(cache_key, thumbnail_data)
                    return _jpeg_thumbnail_response(thumbnail_data)
        except Exception as e:
            logger.debug(f"Error checking cached thumbnail: {e}", module="MediaThumbnail")

    # 2. Persistent thumbnail DB, or 3. generate on demand.
    thumbnail_data = get_or_create_thumbnail(resolved_path, media_type)
    if not thumbnail_data:
        raise FileOperationError("Failed to generate thumbnail")
    _thumbnail_cache.put(cache_key, thumbnail_data)
    return _jpeg_thumbnail_response(thumbnail_data)
@router.get("/preview")
@limiter.limit("5000/minute")
@handle_exceptions
async def get_media_preview(
    request: Request,
    file_path: str,
    token: str = None,
    current_user: Dict = Depends(get_current_user_media)
):
    """Serve a media file for inline preview (browser-cacheable for 1 hour)."""
    resolved_path = validate_file_path(file_path)
    if not (resolved_path.exists() and resolved_path.is_file()):
        raise CustomFileNotFoundError("File not found", {"path": str(file_path)})
    return FileResponse(
        str(resolved_path),
        headers={"Cache-Control": "public, max-age=3600"}
    )
@router.get("/metadata")
@limiter.limit("5000/minute")
@handle_exceptions
async def get_media_metadata(
    request: Request,
    file_path: str,
    current_user: Dict = Depends(get_current_user)
):
    """
    Get cached metadata for a media file (resolution, duration, etc.).

    Lookup order: media_metadata.db cache → file_inventory row →
    on-the-fly extraction (images only; video probing is skipped for speed).
    """
    resolved_path = validate_file_path(file_path)
    if not resolved_path.exists() or not resolved_path.is_file():
        raise CustomFileNotFoundError("File not found", {"path": str(file_path)})
    app_state = get_app_state()

    # Tier 1: dedicated metadata cache DB, keyed by sha256 of the path string.
    metadata_db_path = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'
    file_hash = hashlib.sha256(str(resolved_path).encode()).hexdigest()
    try:
        with sqlite3.connect(str(metadata_db_path)) as conn:
            cached_row = conn.execute(
                """SELECT width, height, file_size, duration, format, created_at
                FROM media_metadata WHERE file_hash = ?""",
                (file_hash,)
            ).fetchone()
    except Exception:
        cached_row = None
    if cached_row:
        width, height, file_size, duration, format_type, created_at = cached_row
        return {
            "file_path": str(resolved_path),
            "width": width,
            "height": height,
            "file_size": file_size,
            "duration": duration,
            "format": format_type,
            "cached": True,
            "cached_at": created_at
        }

    # Tier 2: file_inventory — usable only when both dimensions are recorded.
    width = height = duration = None
    try:
        with app_state.db.get_connection() as conn:
            inv_row = conn.execute(
                """SELECT width, height, file_size, platform, source FROM file_inventory WHERE file_path = ?""",
                (str(resolved_path),)
            ).fetchone()
            if inv_row and inv_row[0] and inv_row[1]:
                return {
                    "file_path": str(resolved_path),
                    "width": inv_row[0],
                    "height": inv_row[1],
                    "file_size": inv_row[2] or resolved_path.stat().st_size,
                    "platform": inv_row[3],
                    "source": inv_row[4],
                    "cached": True,
                    "source_table": "file_inventory"
                }
    except Exception as e:
        logger.debug(f"Error reading file_inventory cache: {e}", module="MediaInfo")

    # Tier 3: extract what we cheaply can from the file itself.
    file_ext = resolved_path.suffix.lower()
    try:
        if file_ext in ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.heic', '.heif'):
            try:
                with Image.open(str(resolved_path)) as img:
                    width, height = img.size
            except Exception as e:
                logger.debug(f"Error reading image dimensions: {e}", module="MediaInfo")
        elif file_ext in ('.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v'):
            # Skip ffprobe fallback for performance - rely on cached dimensions only
            # Videos without cached dimensions will show without width/height
            pass
    except Exception:
        pass
    return {
        "file_path": str(resolved_path),
        "width": width,
        "height": height,
        "duration": duration,
        "file_size": resolved_path.stat().st_size,
        "cached": False
    }
@router.get("/embedded-metadata")
@limiter.limit("1000/minute")
@handle_exceptions
async def get_embedded_metadata(
    request: Request,
    file_path: str,
    current_user: Dict = Depends(get_current_user)
):
    """
    Read descriptive metadata embedded in the actual file.

    Video/audio files are probed with ffprobe (title, artist, description,
    comment, date); images with exiftool (ImageDescription, Artist, etc.).
    Unlike /metadata this returns descriptive, not technical, information.
    Probe failures are logged and an all-None payload is returned.
    """
    resolved_path = validate_file_path(file_path)
    if not resolved_path.exists() or not resolved_path.is_file():
        raise CustomFileNotFoundError("File not found", {"path": str(file_path)})
    file_ext = resolved_path.suffix.lower()
    metadata = {"file_path": str(resolved_path)}
    metadata.update(dict.fromkeys(("title", "artist", "description", "comment", "date", "source")))
    try:
        if file_ext in ('.mp4', '.mov', '.avi', '.mkv', '.webm', '.m4v', '.m4a', '.mp3'):
            # ffprobe handles both video and audio containers.
            proc = subprocess.run(
                ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', str(resolved_path)],
                capture_output=True, text=True, timeout=10
            )
            if proc.returncode == 0:
                tags = json.loads(proc.stdout).get('format', {}).get('tags', {})
                # Tag case varies by container; normalize keys to lowercase.
                lower = {key.lower(): value for key, value in tags.items()}
                metadata['title'] = lower.get('title')
                metadata['artist'] = lower.get('artist') or lower.get('album_artist')
                metadata['description'] = lower.get('description') or lower.get('synopsis')
                metadata['comment'] = lower.get('comment')
                metadata['date'] = lower.get('date') or lower.get('creation_time')
                # A URL-shaped comment (or an explicit purl tag) becomes the source.
                if metadata['comment'] and metadata['comment'].startswith('http'):
                    metadata['source'] = metadata['comment']
                elif lower.get('purl'):
                    metadata['source'] = lower.get('purl')
        elif file_ext in ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.heic', '.heif'):
            proc = subprocess.run(
                ['exiftool', '-j', '-ImageDescription', '-XPComment', '-Artist',
                 '-DateTimeOriginal', '-UserComment', '-Caption-Abstract', str(resolved_path)],
                capture_output=True, text=True, timeout=10
            )
            if proc.returncode == 0:
                payload = json.loads(proc.stdout)
                if payload and len(payload) > 0:
                    exif = payload[0]
                    metadata['title'] = exif.get('ImageDescription') or exif.get('Caption-Abstract')
                    metadata['artist'] = exif.get('Artist')
                    metadata['description'] = exif.get('XPComment')
                    metadata['comment'] = exif.get('UserComment')
                    metadata['date'] = exif.get('DateTimeOriginal')
                    # UserComment may be non-string EXIF data, hence the str() guard.
                    if metadata['comment'] and str(metadata['comment']).startswith('http'):
                        metadata['source'] = metadata['comment']
    except subprocess.TimeoutExpired:
        logger.warning(f"Timeout reading embedded metadata: {file_path}", module="Media")
    except Exception as e:
        logger.warning(f"Error reading embedded metadata: {e}", module="Media")
    return metadata
# ============================================================================
# CACHE MANAGEMENT ENDPOINTS
# ============================================================================
@router.post("/cache/rebuild")
@limiter.limit("5/minute")
@handle_exceptions
async def rebuild_media_cache(
    request: Request,
    current_user: Dict = Depends(get_current_user)
):
    """Trigger thumbnail and metadata cache rebuild.

    Launches the cache builder as a detached background process and returns
    immediately; this endpoint does not track the rebuild's progress.

    Raises:
        CustomFileNotFoundError: if the builder script is missing.
    """
    import sys
    script_path = settings.PROJECT_ROOT / 'modules' / 'thumbnail_cache_builder.py'
    if not script_path.exists():
        raise CustomFileNotFoundError("Cache builder script not found")
    # Use the interpreter running this app (sys.executable) instead of a
    # hard-coded /usr/bin/python3 so the builder runs in the same
    # virtualenv with the same installed dependencies.
    subprocess.Popen(
        [sys.executable, str(script_path)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True
    )
    return {
        "success": True,
        "message": "Cache rebuild started in background"
    }
@router.get("/cache/stats")
@limiter.limit("5000/minute")
@handle_exceptions
async def get_cache_stats(
    request: Request,
    current_user: Dict = Depends(get_current_user)
):
    """Get statistics about the media cache.

    Returns, per cache DB (thumbnails / metadata): whether it is usable,
    its row count, and its on-disk size in bytes. Previously size_bytes
    was declared but never populated; it is now the DB file's size.
    """
    thumb_db_path = settings.PROJECT_ROOT / 'database' / 'thumbnails.db'
    metadata_db_path = settings.PROJECT_ROOT / 'database' / 'media_metadata.db'
    stats = {
        "thumbnails": {
            "exists": True,
            "count": 0,
            "size_bytes": 0
        },
        "metadata": {
            "exists": True,
            "count": 0,
            "size_bytes": 0
        }
    }
    for key, db_file, table in (
        ("thumbnails", thumb_db_path, "thumbnails"),
        ("metadata", metadata_db_path, "media_metadata"),
    ):
        # Check existence first: sqlite3.connect() would otherwise create an
        # empty DB file as a side effect of a stats request.
        if not db_file.exists():
            stats[key]["exists"] = False
            continue
        try:
            stats[key]["size_bytes"] = db_file.stat().st_size
            with sqlite3.connect(str(db_file)) as conn:
                # Table name comes from the static tuple above, never user input.
                cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
                stats[key]["count"] = cursor.fetchone()[0]
        except Exception:
            stats[key]["exists"] = False
    return stats
# ============================================================================
# BATCH OPERATIONS
# ============================================================================
# Caps the number of paths accepted by the batch endpoints (delete/move)
# so a single request cannot be used for DoS.
MAX_BATCH_SIZE = 500 # Maximum files per batch operation
@router.post("/batch-delete")
@limiter.limit("10/minute")
@handle_exceptions
async def batch_delete_media(
    request: Request,
    current_user: Dict = Depends(require_admin),
    file_paths: List[str] = Body(...)
):
    """
    Move multiple media files to recycle bin (admin only).
    Maximum 500 files per request.
    """
    # Security: refuse oversized batches up front.
    if len(file_paths) > MAX_BATCH_SIZE:
        raise ValidationError(f"Batch size exceeds maximum of {MAX_BATCH_SIZE} files")
    app_state = get_app_state()
    deleted: List[str] = []
    errors: List[Dict] = []
    media_root = MEDIA_BASE.resolve()
    for original in file_paths:
        try:
            target = Path(original).resolve()
            # relative_to() rejects anything outside MEDIA_BASE (symlink-safe).
            try:
                target.relative_to(media_root)
            except ValueError:
                errors.append({"file": original, "error": "Access denied"})
                continue
            if not (target.exists() and target.is_file()):
                errors.append({"file": original, "error": "File not found"})
                continue
            recycle_id = app_state.db.move_to_recycle_bin(
                file_path=str(target),
                deleted_from='media',
                deleted_by=current_user.get('sub'),
                metadata={}
            )
            if recycle_id:
                deleted.append(original)
            else:
                errors.append({"file": original, "error": "Failed to move to recycle bin"})
        except Exception as e:
            errors.append({"file": original, "error": str(e)})
    # Best-effort websocket notification; never fail the request over it.
    try:
        if hasattr(app_state, 'websocket_manager') and app_state.websocket_manager:
            await app_state.websocket_manager.broadcast({
                "type": "batch_delete_completed",
                "deleted_count": len(deleted),
                "error_count": len(errors),
                "timestamp": now_iso8601()
            })
    except Exception:
        pass
    return {
        "success": True,
        "deleted": deleted,
        "errors": errors,
        "deleted_count": len(deleted),
        "error_count": len(errors)
    }
@router.post("/update-date")
@limiter.limit("30/minute")
@handle_exceptions
async def update_media_date(
    request: Request,
    data: UpdateDateRequest,
    current_user: Dict = Depends(require_admin)
):
    """
    Update the date (post_date or download_date) for media files (admin only).

    For each file_inventory id:
      1. Update matching downloads rows (by file_path OR filename).
      2. If none matched, retry matching by filename alone.
      3. If still none, insert a synthetic downloads row so the date persists.
    Optionally also rewrites file timestamps (EXIF, video metadata, filesystem).

    Raises:
        ValidationError: on a malformed ISO date or unknown date_type.
    """
    from modules.date_utils import DateHandler
    app_state = get_app_state()
    results = []
    success_count = 0
    failed_count = 0
    # Validate all inputs before touching the database.
    try:
        new_date = datetime.fromisoformat(data.new_date.replace('Z', '+00:00'))
    except ValueError:
        raise ValidationError(f"Invalid date format: {data.new_date}. Use ISO format like 2024-06-15T14:30:00")
    if data.date_type not in ("post_date", "download_date"):
        raise ValidationError(f"Invalid date_type: {data.date_type}. Must be 'post_date' or 'download_date'")
    # Use connection pool for main database access
    with app_state.db.get_connection(for_write=True) as conn:
        cursor = conn.cursor()
        for file_id in data.ids:
            try:
                cursor.execute("""
                    SELECT id, file_path, filename, platform
                    FROM file_inventory
                    WHERE id = ?
                """, (file_id,))
                row = cursor.fetchone()
                if not row:
                    results.append({"id": file_id, "success": False, "error": "File not found in inventory"})
                    failed_count += 1
                    continue
                file_path = row['file_path']
                filename = row['filename']
                # date_type was validated above, so branching on it (instead of
                # interpolating it into SQL) keeps these statements injection-safe.
                date_value = new_date.strftime('%Y-%m-%d %H:%M:%S')
                if data.date_type == "post_date":
                    cursor.execute("""
                        UPDATE downloads
                        SET post_date = ?
                        WHERE file_path = ? OR filename = ?
                    """, (date_value, file_path, filename))
                else:  # download_date (already validated above)
                    cursor.execute("""
                        UPDATE downloads
                        SET download_date = ?
                        WHERE file_path = ? OR filename = ?
                    """, (date_value, file_path, filename))
                rows_updated = cursor.rowcount
                # Fallback: the stored path may be stale — match filename only.
                if rows_updated == 0:
                    if data.date_type == "post_date":
                        cursor.execute("""
                            UPDATE downloads
                            SET post_date = ?
                            WHERE filename = ?
                        """, (date_value, filename))
                    else:  # download_date
                        cursor.execute("""
                            UPDATE downloads
                            SET download_date = ?
                            WHERE filename = ?
                        """, (date_value, filename))
                    rows_updated = cursor.rowcount
                # Still no match: insert a downloads record so the date is persisted.
                # (hashlib is imported at module level; the redundant per-iteration
                # import that used to live here has been removed.)
                if rows_updated == 0:
                    platform = row['platform'] or 'unknown'
                    url_hash = hashlib.sha256(file_path.encode()).hexdigest()
                    url = f"file://{file_path}"
                    if data.date_type == "post_date":
                        cursor.execute("""
                            INSERT INTO downloads (url_hash, url, filename, file_path, platform, post_date, download_date, status)
                            VALUES (?, ?, ?, ?, ?, ?, ?, 'completed')
                        """, (url_hash, url, filename, file_path, platform, date_value, date_value))
                    else:  # download_date
                        cursor.execute("""
                            INSERT INTO downloads (url_hash, url, filename, file_path, platform, download_date, status)
                            VALUES (?, ?, ?, ?, ?, ?, 'completed')
                        """, (url_hash, url, filename, file_path, platform, date_value))
                    rows_updated = 1
                    logger.info(f"Created downloads record for {filename} to store {data.date_type}")
                # Update file timestamps if requested
                file_update_success = True
                if data.update_file and Path(file_path).exists():
                    try:
                        file_update_success = DateHandler.update_file_timestamps(file_path, new_date)
                        if file_update_success:
                            logger.info(f"Updated file timestamps for {file_path}")
                        else:
                            logger.warning(f"Failed to update file timestamps for {file_path}")
                    except Exception as e:
                        logger.error(f"Error updating file timestamps for {file_path}: {e}")
                        file_update_success = False
                results.append({
                    "id": file_id,
                    "success": True,
                    "db_rows_updated": rows_updated,
                    "file_updated": file_update_success if data.update_file else None
                })
                success_count += 1
            except Exception as e:
                logger.error(f"Error updating date for file {file_id}: {e}")
                results.append({"id": file_id, "success": False, "error": str(e)})
                failed_count += 1
    return {
        "success": True,
        "results": results,
        "success_count": success_count,
        "failed_count": failed_count
    }
def _broadcast_move_event(app_state, payload: dict):
    """Best-effort synchronous websocket broadcast for move progress; never raises."""
    try:
        if hasattr(app_state, 'websocket_manager') and app_state.websocket_manager:
            app_state.websocket_manager.broadcast_sync(payload)
    except Exception:
        pass


def process_batch_move_background(file_paths: List[str], destination: str, app_state):
    """Background task to process batch file moves.

    Moves each file into *destination* (relative destinations are rooted at
    MEDIA_BASE), updates all DB path references, and emits one websocket
    progress event per file plus a final completion event. Runs detached
    from the request, so all reporting goes over the websocket.
    """
    dest_path = Path(destination)
    if not dest_path.is_absolute():
        dest_path = MEDIA_BASE / destination
    dest_path.mkdir(parents=True, exist_ok=True)
    moved_count = 0
    error_count = 0
    for file_path in file_paths:
        filename = Path(file_path).name
        try:
            resolved_path = Path(file_path).resolve()
            # relative_to() rejects anything outside MEDIA_BASE (symlink-safe).
            try:
                resolved_path.relative_to(MEDIA_BASE.resolve())
            except ValueError:
                _broadcast_move_event(app_state, {
                    "type": "batch_move_progress",
                    "filename": filename,
                    "success": False,
                    "error": "Access denied"
                })
                error_count += 1
                continue
            if resolved_path.exists() and resolved_path.is_file():
                dest_file = dest_path / resolved_path.name
                shutil.move(str(resolved_path), str(dest_file))
                # Update file_inventory first, then all other path references.
                try:
                    app_state.db.update_file_inventory_location(
                        file_path=str(resolved_path),
                        new_location='final',
                        new_file_path=str(dest_file)
                    )
                except Exception as e:
                    logger.warning(f"Failed to update file_inventory for {filename}: {e}", module="API")
                update_file_path_in_all_tables(app_state.db, str(resolved_path), str(dest_file))
                _broadcast_move_event(app_state, {
                    "type": "batch_move_progress",
                    "filename": filename,
                    "success": True,
                    "destination": str(dest_file)
                })
                moved_count += 1
            else:
                _broadcast_move_event(app_state, {
                    "type": "batch_move_progress",
                    "filename": filename,
                    "success": False,
                    "error": "File not found"
                })
                error_count += 1
        except Exception as e:
            _broadcast_move_event(app_state, {
                "type": "batch_move_progress",
                "filename": filename,
                "success": False,
                "error": str(e)
            })
            error_count += 1
    # Final summary event so the UI can close its progress indicator.
    _broadcast_move_event(app_state, {
        "type": "batch_move_completed",
        "moved_count": moved_count,
        "error_count": error_count,
        "timestamp": now_iso8601()
    })
@router.post("/batch-move")
@limiter.limit("10/minute")
@handle_exceptions
async def batch_move_media(
    request: Request,
    background_tasks: BackgroundTasks,
    current_user: Dict = Depends(require_admin),
    move_data: BatchMoveRequest = Body(...)
):
    """
    Move multiple media files to a different directory (admin only).

    Validates the destination, then queues the actual moves as a background
    task; per-file progress is reported over the websocket.
    Maximum 500 files per request.
    """
    # Security: refuse oversized batches up front.
    if len(move_data.file_paths) > MAX_BATCH_SIZE:
        raise ValidationError(f"Batch size exceeds maximum of {MAX_BATCH_SIZE} files")
    app_state = get_app_state()
    target_dir = Path(move_data.destination)
    if not target_dir.is_absolute():
        target_dir = MEDIA_BASE / move_data.destination
    # relative_to() rejects destinations resolving outside MEDIA_BASE (symlink-safe).
    try:
        target_dir.resolve().relative_to(MEDIA_BASE.resolve())
    except ValueError:
        raise ValidationError(
            "Destination must be within media directory",
            {"destination": move_data.destination}
        )
    background_tasks.add_task(
        process_batch_move_background,
        move_data.file_paths,
        move_data.destination,
        app_state
    )
    return {
        "success": True,
        "processing": True,
        "file_count": len(move_data.file_paths),
        "message": "Batch move started, processing in background"
    }
@router.post("/batch-download")
@limiter.limit("10/minute")
@handle_exceptions
async def batch_download_media(
    request: Request,
    background_tasks: BackgroundTasks,
    current_user: Dict = Depends(get_current_user),
    file_paths: List[str] = Body(...)
):
    """Create a zip file of selected media files.

    Paths outside MEDIA_BASE and unreadable files are skipped silently.
    Same-named files from different folders get numbered archive names.
    The temporary zip is deleted after the response has been sent.
    """
    # Consistency with the other batch endpoints: cap the request size.
    if len(file_paths) > MAX_BATCH_SIZE:
        raise ValidationError(f"Batch size exceeds maximum of {MAX_BATCH_SIZE} files")
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.zip')
    temp_file.close()
    # Clean up the temp zip once the response has been streamed — previously
    # it was leaked on every request. FastAPI attaches injected background
    # tasks to a directly returned Response.
    background_tasks.add_task(Path(temp_file.name).unlink, missing_ok=True)
    used_names = set()
    with zipfile.ZipFile(temp_file.name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for file_path in file_paths:
            try:
                resolved_path = Path(file_path).resolve()
                # relative_to() rejects anything outside MEDIA_BASE (symlink-safe).
                try:
                    resolved_path.relative_to(MEDIA_BASE.resolve())
                except ValueError:
                    continue
                if resolved_path.exists() and resolved_path.is_file():
                    # De-duplicate archive names so same-named files don't
                    # produce duplicate zip entries.
                    arcname = resolved_path.name
                    counter = 1
                    while arcname in used_names:
                        arcname = f"{resolved_path.stem}_{counter}{resolved_path.suffix}"
                        counter += 1
                    used_names.add(arcname)
                    zipf.write(resolved_path, arcname)
            except Exception:
                continue
    return FileResponse(
        temp_file.name,
        media_type='application/zip',
        filename=f'media-{datetime.now().strftime("%Y%m%d-%H%M%S")}.zip'
    )
# ============================================================================
# FILE STATUS AND LOCATION ENDPOINTS
# ============================================================================
@router.post("/check-status")
@limiter.limit("1000/minute")
@handle_exceptions
async def check_file_statuses(
    request: Request,
    current_user: Dict = Depends(get_current_user),
    file_paths: List[str] = Body(..., embed=True)
):
    """
    Check the status of multiple files - returns location (media/review/recycle/deleted).
    """
    import os
    app_state = get_app_state()
    results = {}
    # file_inventory location values mapped to the status names clients expect.
    location_status = {'final': 'media', 'review': 'review'}
    with app_state.db.get_connection() as conn:
        cursor = conn.cursor()
        for requested in file_paths:
            cursor.execute('''
                SELECT location, file_path FROM file_inventory WHERE file_path = ?
            ''', (requested,))
            row = cursor.fetchone()
            if row:
                location, current_path = row[0], row[1]
                status = location_status.get(location)
                if status is not None:
                    # Tracked as media/review but verify it's still on disk.
                    if not os.path.exists(current_path):
                        status = 'missing'
                    results[requested] = {'status': status, 'current_path': current_path}
                else:
                    results[requested] = {'status': location, 'current_path': current_path}
                continue
            # Not tracked in file_inventory — check the recycle bin next.
            cursor.execute('''
                SELECT id, original_path FROM recycle_bin WHERE original_path = ?
            ''', (requested,))
            recycle_row = cursor.fetchone()
            if recycle_row:
                results[requested] = {'status': 'recycle', 'recycle_id': recycle_row[0]}
            elif os.path.exists(requested):
                # On disk but untracked — report it as regular media.
                results[requested] = {'status': 'media', 'current_path': requested}
            else:
                results[requested] = {'status': 'deleted'}
    return {"file_statuses": results}
@router.post("/move-to-review")
@limiter.limit("10/minute")
@handle_exceptions
async def move_to_review(
    request: Request,
    current_user: Dict = Depends(require_admin),
    file_paths: List[str] = Body(..., embed=True)
):
    """Move media files to review queue. Requires admin privileges."""
    app_state = get_app_state()
    moved: List[str] = []
    errors: List[Dict] = []
    media_root = MEDIA_BASE.resolve()
    for original in file_paths:
        try:
            resolved = Path(original).resolve()
            # relative_to() rejects anything outside MEDIA_BASE (symlink-safe).
            try:
                resolved.relative_to(media_root)
            except ValueError:
                errors.append({"file": original, "error": "Access denied"})
                continue
            if not (resolved.exists() and resolved.is_file()):
                errors.append({"file": original, "error": "File not found"})
                continue
            # Mirror the folder layout relative to MEDIA_BASE under REVIEW_BASE.
            destination = REVIEW_BASE / resolved.relative_to(MEDIA_BASE)
            destination.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(resolved), str(destination))
            try:
                app_state.db.update_file_inventory_location(
                    file_path=str(resolved),
                    new_location='review',
                    new_file_path=str(destination)
                )
            except Exception as e:
                logger.warning(f"Failed to update file_inventory: {e}", module="API")
            update_file_path_in_all_tables(app_state.db, str(resolved), str(destination))
            moved.append(original)
        except Exception as e:
            errors.append({"file": original, "error": str(e)})
    # Best-effort websocket notification; never fail the request over it.
    try:
        if hasattr(app_state, 'websocket_manager') and app_state.websocket_manager:
            await app_state.websocket_manager.broadcast({
                "type": "move_to_review_completed",
                "moved_count": len(moved),
                "error_count": len(errors),
                "timestamp": now_iso8601()
            })
    except Exception:
        pass
    return {
        "success": True,
        "moved": moved,
        "errors": errors,
        "moved_count": len(moved),
        "error_count": len(errors)
    }
# ============================================================================
# GALLERY ENDPOINT
# ============================================================================
def _build_gallery_filters(
    platform: Optional[str],
    source: Optional[str],
    media_type: str,
    date_from: Optional[str],
    date_to: Optional[str],
    size_min: Optional[int],
    size_max: Optional[int],
    search: Optional[str],
    face_recognition: Optional[str],
) -> tuple:
    """Build the WHERE-clause suffix shared by the gallery data and count queries.

    Returns a ``(sql_suffix, params)`` pair: ``sql_suffix`` is a (possibly
    empty) string of ``' AND ...'`` fragments to append after the base
    ``WHERE fi.location = 'final'`` clause, and ``params`` holds the bound
    values in matching order.

    NOTE: the face-recognition fragments reference the ``frs`` alias, which the
    caller only joins when ``face_recognition`` is set — the two must stay in
    sync. Previously this logic was duplicated between the data query and the
    count query, risking silent divergence between page contents and totals.
    """
    clauses = []
    params = []
    if platform:
        clauses.append('fi.platform = ?')
        params.append(platform)
    if source:
        clauses.append('fi.source = ?')
        params.append(source)
    if media_type != "all":
        clauses.append('fi.content_type = ?')
        params.append(media_type)
    if date_from:
        clauses.append('DATE(COALESCE(d_post.max_post_date, fi.created_date)) >= ?')
        params.append(date_from)
    if date_to:
        clauses.append('DATE(COALESCE(d_post.max_post_date, fi.created_date)) <= ?')
        params.append(date_to)
    if size_min is not None:
        clauses.append('fi.file_size >= ?')
        params.append(size_min)
    if size_max is not None:
        clauses.append('fi.file_size <= ?')
        params.append(size_max)
    if search:
        search_term = f'%{search}%'
        clauses.append('(fi.filename LIKE ? OR fi.platform LIKE ? OR fi.source LIKE ? OR fi.content_type LIKE ?)')
        params.extend([search_term] * 4)
    # Face-recognition filters carry no bound parameters; they rely on the
    # frs join that the caller adds only when face_recognition is set.
    if face_recognition == 'matched':
        clauses.append('frs.has_match = 1')
    elif face_recognition == 'no_match':
        clauses.append('frs.file_path IS NOT NULL AND frs.has_match = 0')
    elif face_recognition == 'not_scanned':
        clauses.append('frs.file_path IS NULL')
    return ''.join(f' AND {clause}' for clause in clauses), params


@router.get("/gallery")
@limiter.limit("5000/minute")
@handle_exceptions
async def get_media_gallery(
    request: Request,
    current_user: Dict = Depends(get_current_user),
    platform: Optional[str] = None,
    source: Optional[str] = None,
    media_type: str = Query("all", pattern="^(all|image|video)$"),
    limit: int = Query(50, ge=1, le=500, description="Max items to return (1-500)"),
    offset: int = Query(0, ge=0, description="Number of items to skip"),
    face_recognition: Optional[str] = None,
    sort_by: str = Query("post_date", pattern="^(post_date|download_date|file_size|filename)$"),
    sort_order: str = Query("desc", pattern="^(asc|desc)$"),
    date_from: Optional[str] = Query(None, pattern="^\\d{4}-\\d{2}-\\d{2}$", description="Filter by post date from (YYYY-MM-DD)"),
    date_to: Optional[str] = Query(None, pattern="^\\d{4}-\\d{2}-\\d{2}$", description="Filter by post date to (YYYY-MM-DD)"),
    size_min: Optional[int] = Query(None, ge=0, description="Minimum file size in bytes"),
    size_max: Optional[int] = Query(None, ge=0, description="Maximum file size in bytes"),
    search: Optional[str] = Query(None, max_length=200, description="Search filename"),
    shuffle: bool = Query(False, description="Shuffle results deterministically"),
    shuffle_seed: Optional[int] = Query(None, description="Seed for deterministic shuffle"),
):
    """Get media files for gallery view (database-first).

    Queries ``file_inventory`` rows with ``location = 'final'``, joining the
    ``downloads`` table for effective download/post dates and (only when the
    ``face_recognition`` filter is set) ``face_recognition_scans`` for match
    metadata. Supports filtering by platform/source/type/date/size/search,
    sorting, deterministic shuffling, and limit/offset pagination.

    Returns a dict with ``media`` (list of item dicts), ``total`` (count
    matching the filters), ``limit`` and ``offset``.
    """
    app_state = get_app_state()
    with app_state.db.get_connection() as conn:
        cursor = conn.cursor()
        # Build the filter suffix ONCE and share it between the data query and
        # the count query so the two can never drift apart.
        filter_sql, filter_params = _build_gallery_filters(
            platform, source, media_type, date_from, date_to,
            size_min, size_max, search, face_recognition
        )
        # Only join face_recognition_scans if filtering by face_recognition
        # (30-40% faster when not filtering)
        if face_recognition:
            query = '''
                SELECT
                    fi.id,
                    fi.file_path,
                    fi.filename,
                    fi.platform,
                    fi.source,
                    fi.content_type as media_type,
                    fi.file_size,
                    fi.width,
                    fi.height,
                    fi.video_id,
                    COALESCE(d_date.max_download_date, fi.created_date) as download_date,
                    COALESCE(d_post.max_post_date, fi.created_date) as post_date,
                    frs.has_match as face_has_match,
                    frs.matched_person as face_matched_person,
                    frs.confidence as face_confidence,
                    frs.scan_date as face_scan_date
                FROM file_inventory fi
                LEFT JOIN (
                    SELECT filename, MAX(download_date) as max_download_date
                    FROM downloads GROUP BY filename
                ) d_date ON d_date.filename = fi.filename
                LEFT JOIN (
                    SELECT file_path, MAX(post_date) as max_post_date
                    FROM downloads GROUP BY file_path
                ) d_post ON d_post.file_path = fi.file_path
                LEFT JOIN face_recognition_scans frs ON frs.file_path = fi.file_path
                WHERE fi.location = 'final'
            '''
        else:
            query = '''
                SELECT
                    fi.id,
                    fi.file_path,
                    fi.filename,
                    fi.platform,
                    fi.source,
                    fi.content_type as media_type,
                    fi.file_size,
                    fi.width,
                    fi.height,
                    fi.video_id,
                    COALESCE(d_date.max_download_date, fi.created_date) as download_date,
                    COALESCE(d_post.max_post_date, fi.created_date) as post_date,
                    NULL as face_has_match,
                    NULL as face_matched_person,
                    NULL as face_confidence,
                    NULL as face_scan_date
                FROM file_inventory fi
                LEFT JOIN (
                    SELECT filename, MAX(download_date) as max_download_date
                    FROM downloads GROUP BY filename
                ) d_date ON d_date.filename = fi.filename
                LEFT JOIN (
                    SELECT file_path, MAX(post_date) as max_post_date
                    FROM downloads GROUP BY file_path
                ) d_post ON d_post.file_path = fi.file_path
                WHERE fi.location = 'final'
            '''
        query += filter_sql
        params = list(filter_params)
        # Count query keeps only the joins the filters can reference
        # (d_post for date filters, frs only when face filtering).
        if face_recognition:
            count_query = '''
                SELECT COUNT(*)
                FROM file_inventory fi
                LEFT JOIN (
                    SELECT file_path, MAX(post_date) as max_post_date
                    FROM downloads GROUP BY file_path
                ) d_post ON d_post.file_path = fi.file_path
                LEFT JOIN face_recognition_scans frs ON frs.file_path = fi.file_path
                WHERE fi.location = 'final'
            '''
        else:
            count_query = '''
                SELECT COUNT(*)
                FROM file_inventory fi
                LEFT JOIN (
                    SELECT file_path, MAX(post_date) as max_post_date
                    FROM downloads GROUP BY file_path
                ) d_post ON d_post.file_path = fi.file_path
                WHERE fi.location = 'final'
            '''
        count_query += filter_sql
        cursor.execute(count_query, filter_params)
        total = cursor.fetchone()[0]
        # Add sorting
        if shuffle:
            # Deterministic shuffle using PostgreSQL md5 hash
            seed = shuffle_seed if shuffle_seed is not None else 42
            query += ' ORDER BY md5(fi.id::text || ?::text), fi.id'
            params.append(str(seed))
        else:
            # Whitelist mapping: only values from this dict are ever
            # interpolated into the ORDER BY, so the f-string below is safe.
            field_mapping = {
                'post_date': 'post_date',
                'download_date': 'download_date',
                'file_size': 'fi.file_size',
                'filename': 'fi.filename',
                'source': 'fi.source',
                'platform': 'fi.platform'
            }
            db_sort_field = field_mapping.get(sort_by, 'post_date')
            sort_direction = 'DESC' if sort_order.lower() == 'desc' else 'ASC'
            query += f' ORDER BY {db_sort_field} {sort_direction}'
        # Add pagination
        query += ' LIMIT ? OFFSET ?'
        params.extend([limit, offset])
        cursor.execute(query, params)
        rows = cursor.fetchall()
        # Batch fetch dimensions for items missing width/height (avoids N+1 queries)
        paths_needing_dimensions = [
            row['file_path'] for row in rows
            if row['width'] is None or row['height'] is None
        ]
        dimensions_cache = get_media_dimensions_batch(paths_needing_dimensions) if paths_needing_dimensions else {}
        # Convert to list of dicts
        media = []
        for row in rows:
            # Use cached dimensions or existing values
            if row['width'] is not None and row['height'] is not None:
                width, height = row['width'], row['height']
            else:
                width, height = dimensions_cache.get(row['file_path'], (None, None))
            # face_has_match is NULL both when no scan row exists and when the
            # frs join was skipped entirely (no face filter requested).
            has_face_data = row['face_has_match'] is not None
            face_recognition_data = {
                'scanned': has_face_data,
                'matched': bool(row['face_has_match']) if has_face_data else False,
                'person_name': row['face_matched_person'] if has_face_data else None,
                'confidence': row['face_confidence'] if has_face_data else None,
                'scan_date': row['face_scan_date'] if has_face_data else None
            }
            item = {
                "id": row['id'],
                "platform": row['platform'],
                "source": row['source'] or 'unknown',
                "filename": row['filename'],
                "file_path": row['file_path'],
                "file_size": row['file_size'] or 0,
                "media_type": row['media_type'] or 'image',
                "download_date": row['download_date'],
                "post_date": row['post_date'] if row['post_date'] else '',
                "width": width,
                "height": height,
                "video_id": row['video_id'],
                "face_recognition": face_recognition_data
            }
            media.append(item)
        return {
            "media": media,
            "total": total,
            "limit": limit,
            "offset": offset
        }
@router.get("/gallery/date-range")
@limiter.limit("60/minute")
@handle_exceptions
async def get_media_gallery_date_range(
    request: Request,
    current_user: Dict = Depends(get_current_user),
    media_type: Optional[str] = Query(None, pattern="^(image|video)$"),
):
    """Return the year/month distribution of final media for the timeline scrubber.

    Groups ``file_inventory`` rows with ``location = 'final'`` by year and
    month of their effective date (latest completed download post_date,
    falling back to created_date), optionally restricted to images or videos.
    Response shape: ``{"ranges": [{"year", "month", "count"}, ...]}``.
    """
    app_state = get_app_state()
    # media_type is pattern-validated by Query to be exactly 'image' or
    # 'video' when present, so interpolating it into the literal is safe.
    type_clause = f" AND fi.content_type = '{media_type}'" if media_type in ('image', 'video') else ""
    sql = (
        '''
            SELECT
                EXTRACT(YEAR FROM COALESCE(d.max_post_date, fi.created_date)::timestamp) as year,
                EXTRACT(MONTH FROM COALESCE(d.max_post_date, fi.created_date)::timestamp) as month,
                COUNT(*) as count
            FROM file_inventory fi
            LEFT JOIN (
                SELECT file_path, MAX(post_date) as max_post_date
                FROM downloads
                WHERE status = 'completed'
                GROUP BY file_path
            ) d ON fi.file_path = d.file_path
            WHERE fi.location = 'final'
        '''
        + type_clause
        + '''
            GROUP BY year, month
            ORDER BY year DESC, month DESC
        '''
    )
    with app_state.db.get_connection() as conn:
        cur = conn.cursor()
        cur.execute(sql, [])
        buckets = [
            {"year": int(rec["year"]), "month": int(rec["month"]), "count": rec["count"]}
            for rec in cur.fetchall()
        ]
    return {"ranges": buckets}