""" Maintenance Router Handles database maintenance and cleanup operations: - Scan and remove missing file references - Database integrity checks - Orphaned record cleanup """ import os from pathlib import Path from datetime import datetime from typing import Dict, List from fastapi import APIRouter, Depends, Request, BackgroundTasks from slowapi import Limiter from slowapi.util import get_remote_address from ..core.dependencies import get_current_user, require_admin, get_app_state from ..core.config import settings from ..core.responses import now_iso8601 from ..core.exceptions import handle_exceptions from modules.universal_logger import get_logger logger = get_logger('Maintenance') router = APIRouter(prefix="/api/maintenance", tags=["Maintenance"]) limiter = Limiter(key_func=get_remote_address) # Whitelist of allowed table/column combinations for cleanup operations # This prevents SQL injection by only allowing known-safe identifiers ALLOWED_CLEANUP_TABLES = { "file_inventory": "file_path", "downloads": "file_path", "youtube_downloads": "file_path", "video_downloads": "file_path", "face_recognition_scans": "file_path", "face_recognition_references": "reference_image_path", "discovery_scan_queue": "file_path", "recycle_bin": "recycle_path", } # Pre-built SQL queries for each allowed table (avoids any string interpolation) # Uses 'id' instead of 'rowid' (PostgreSQL does not have rowid) # Uses information_schema for table existence checks (PostgreSQL) _CLEANUP_QUERIES = { table: { "check_exists": "SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_name=?", "select": f"SELECT id, {col} FROM {table} WHERE {col} IS NOT NULL AND {col} != ''", "delete": f"DELETE FROM {table} WHERE id IN ", } for table, col in ALLOWED_CLEANUP_TABLES.items() } # Store last scan results last_scan_result = None @router.post("/cleanup/missing-files") @limiter.limit("5/hour") @handle_exceptions async def cleanup_missing_files( request: Request, background_tasks: BackgroundTasks, dry_run: bool = True, current_user: Dict = Depends(require_admin) ): """ Scan all database tables for file references and remove entries for missing files. Args: dry_run: If True, only report what would be deleted (default: True) Returns: Summary of files found and removed """ app_state = get_app_state() user_id = current_user.get('sub', 'unknown') logger.info(f"Database cleanup started by {user_id} (dry_run={dry_run})", module="Maintenance") # Run cleanup in background background_tasks.add_task( _cleanup_missing_files_task, app_state, dry_run, user_id ) return { "status": "started", "dry_run": dry_run, "message": "Cleanup scan started in background. Check /api/maintenance/cleanup/status for progress.", "timestamp": now_iso8601() } @router.get("/cleanup/status") @limiter.limit("60/minute") @handle_exceptions async def get_cleanup_status(request: Request, current_user: Dict = Depends(get_current_user)): """Get the status and results of the last cleanup scan""" global last_scan_result if last_scan_result is None: return { "status": "no_scan", "message": "No cleanup scan has been run yet" } return last_scan_result async def _cleanup_missing_files_task(app_state, dry_run: bool, user_id: str): """Background task to scan and cleanup missing files""" global last_scan_result start_time = datetime.now() # Initialize result tracking result = { "status": "running", "started_at": start_time.isoformat(), "dry_run": dry_run, "user": user_id, "tables_scanned": {}, "total_checked": 0, "total_missing": 0, "total_removed": 0 } try: with app_state.db.get_connection(for_write=True) as conn: cursor = conn.cursor() # Define tables and their file path columns # NOTE: instagram_perceptual_hashes is excluded because the hash data # is valuable for duplicate detection even if the original file is gone tables_to_scan = [ ("file_inventory", "file_path"), ("downloads", "file_path"), ("youtube_downloads", "file_path"), ("video_downloads", "file_path"), ("face_recognition_scans", "file_path"), ("face_recognition_references", "reference_image_path"), ("discovery_scan_queue", "file_path"), ("recycle_bin", "recycle_path"), ] for table_name, column_name in tables_to_scan: logger.info(f"Scanning {table_name}.{column_name}...", module="Maintenance") table_result = _scan_table(cursor, table_name, column_name, dry_run) result["tables_scanned"][table_name] = table_result result["total_checked"] += table_result["checked"] result["total_missing"] += table_result["missing"] result["total_removed"] += table_result["removed"] # Commit if not dry run if not dry_run: conn.commit() logger.info(f"Cleanup completed: removed {result['total_removed']} records", module="Maintenance") else: logger.info(f"Dry run completed: {result['total_missing']} records would be removed", module="Maintenance") # Update result end_time = datetime.now() duration = (end_time - start_time).total_seconds() result.update({ "status": "completed", "completed_at": end_time.isoformat(), "duration_seconds": round(duration, 2) }) except Exception as e: logger.error(f"Cleanup failed: {e}", module="Maintenance", exc_info=True) result.update({ "status": "failed", "error": str(e), "completed_at": datetime.now().isoformat() }) last_scan_result = result def _scan_table(cursor, table_name: str, column_name: str, dry_run: bool) -> Dict: """Scan a table for missing files and optionally remove them. Uses pre-built queries from _CLEANUP_QUERIES to prevent SQL injection. Only tables in ALLOWED_CLEANUP_TABLES whitelist are permitted. """ result = { "checked": 0, "missing": 0, "removed": 0, "missing_files": [] } # Validate table/column against whitelist to prevent SQL injection if table_name not in ALLOWED_CLEANUP_TABLES: logger.error(f"Table {table_name} not in allowed whitelist", module="Maintenance") result["error"] = f"Table {table_name} not allowed" return result if ALLOWED_CLEANUP_TABLES[table_name] != column_name: logger.error(f"Column {column_name} not allowed for table {table_name}", module="Maintenance") result["error"] = f"Column {column_name} not allowed for table {table_name}" return result # Get pre-built queries for this table (built at module load time, not from user input) queries = _CLEANUP_QUERIES[table_name] try: # Check if table exists using parameterized query cursor.execute(queries["check_exists"], (table_name,)) if not cursor.fetchone(): logger.warning(f"Table {table_name} does not exist", module="Maintenance") return result # Get all file paths from table using pre-built query cursor.execute(queries["select"]) rows = cursor.fetchall() result["checked"] = len(rows) missing_rowids = [] for rowid, file_path in rows: if file_path and not os.path.exists(file_path): result["missing"] += 1 missing_rowids.append(rowid) # Only keep first 100 examples if len(result["missing_files"]) < 100: result["missing_files"].append(file_path) # Remove missing entries if not dry run if not dry_run and missing_rowids: # Delete in batches of 100 using pre-built query base delete_base = queries["delete"] for i in range(0, len(missing_rowids), 100): batch = missing_rowids[i:i+100] placeholders = ','.join('?' * len(batch)) # The delete_base is pre-built from whitelist, placeholders are just ? cursor.execute(f"{delete_base}({placeholders})", batch) result["removed"] += len(batch) logger.info( f" {table_name}: checked={result['checked']}, missing={result['missing']}, " f"{'would_remove' if dry_run else 'removed'}={result['missing']}", module="Maintenance" ) except Exception as e: logger.error(f"Error scanning {table_name}: {e}", module="Maintenance", exc_info=True) result["error"] = str(e) return result