Files
media-downloader/web/backend/routers/maintenance.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

260 lines
8.9 KiB
Python

"""
Maintenance Router
Handles database maintenance and cleanup operations:
- Scan and remove missing file references
- Database integrity checks
- Orphaned record cleanup
"""
import os
from pathlib import Path
from datetime import datetime
from typing import Dict, List
from fastapi import APIRouter, Depends, Request, BackgroundTasks
from slowapi import Limiter
from slowapi.util import get_remote_address
from ..core.dependencies import get_current_user, get_app_state
from ..core.config import settings
from ..core.responses import now_iso8601
from ..core.exceptions import handle_exceptions
from modules.universal_logger import get_logger
logger = get_logger('Maintenance')
router = APIRouter(prefix="/api/maintenance", tags=["Maintenance"])
limiter = Limiter(key_func=get_remote_address)
# Whitelist of allowed table/column combinations for cleanup operations
# This prevents SQL injection by only allowing known-safe identifiers
ALLOWED_CLEANUP_TABLES = {
"file_inventory": "file_path",
"downloads": "file_path",
"youtube_downloads": "file_path",
"video_downloads": "file_path",
"face_recognition_scans": "file_path",
"face_recognition_references": "reference_image_path",
"discovery_scan_queue": "file_path",
"recycle_bin": "recycle_path",
}
# Pre-built SQL queries for each allowed table (avoids any string interpolation)
# Uses 'id' instead of 'rowid' (PostgreSQL does not have rowid)
# Uses information_schema for table existence checks (PostgreSQL)
_CLEANUP_QUERIES = {
table: {
"check_exists": "SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_name=?",
"select": f"SELECT id, {col} FROM {table} WHERE {col} IS NOT NULL AND {col} != ''",
"delete": f"DELETE FROM {table} WHERE id IN ",
}
for table, col in ALLOWED_CLEANUP_TABLES.items()
}
# Store last scan results
last_scan_result = None
@router.post("/cleanup/missing-files")
@limiter.limit("5/hour")
@handle_exceptions
async def cleanup_missing_files(
request: Request,
background_tasks: BackgroundTasks,
dry_run: bool = True,
current_user: Dict = Depends(get_current_user)
):
"""
Scan all database tables for file references and remove entries for missing files.
Args:
dry_run: If True, only report what would be deleted (default: True)
Returns:
Summary of files found and removed
"""
app_state = get_app_state()
user_id = current_user.get('sub', 'unknown')
logger.info(f"Database cleanup started by {user_id} (dry_run={dry_run})", module="Maintenance")
# Run cleanup in background
background_tasks.add_task(
_cleanup_missing_files_task,
app_state,
dry_run,
user_id
)
return {
"status": "started",
"dry_run": dry_run,
"message": "Cleanup scan started in background. Check /api/maintenance/cleanup/status for progress.",
"timestamp": now_iso8601()
}
@router.get("/cleanup/status")
@limiter.limit("60/minute")
@handle_exceptions
async def get_cleanup_status(request: Request, current_user: Dict = Depends(get_current_user)):
"""Get the status and results of the last cleanup scan"""
global last_scan_result
if last_scan_result is None:
return {
"status": "no_scan",
"message": "No cleanup scan has been run yet"
}
return last_scan_result
async def _cleanup_missing_files_task(app_state, dry_run: bool, user_id: str):
    """Background task: scan whitelisted tables and purge rows whose files are gone.

    Progress is published through the module-level ``last_scan_result`` so the
    /cleanup/status endpoint can report a running scan (the result dict is
    mutated in place as tables complete).

    Args:
        app_state: Application state providing ``db.get_connection``.
        dry_run: When True, count missing files but delete nothing.
        user_id: Identifier of the initiating user (for auditing).
    """
    global last_scan_result
    start_time = datetime.now()
    result = {
        "status": "running",
        "started_at": start_time.isoformat(),
        "dry_run": dry_run,
        "user": user_id,
        "tables_scanned": {},
        "total_checked": 0,
        "total_missing": 0,
        "total_removed": 0
    }
    # Fix: publish immediately. Previously this was assigned only after the
    # task finished, so the status endpoint could never show the "running"
    # state that the start endpoint promises.
    last_scan_result = result
    try:
        with app_state.db.get_connection(for_write=True) as conn:
            cursor = conn.cursor()
            # Fix: iterate the whitelist directly instead of a duplicated
            # hard-coded table list that could drift out of sync.
            # NOTE: instagram_perceptual_hashes is deliberately absent from
            # ALLOWED_CLEANUP_TABLES because its hash data remains useful for
            # duplicate detection even after the original file is gone.
            for table_name, column_name in ALLOWED_CLEANUP_TABLES.items():
                logger.info(f"Scanning {table_name}.{column_name}...", module="Maintenance")
                table_result = _scan_table(cursor, table_name, column_name, dry_run)
                result["tables_scanned"][table_name] = table_result
                result["total_checked"] += table_result["checked"]
                result["total_missing"] += table_result["missing"]
                result["total_removed"] += table_result["removed"]
            # Deletions only take effect outside dry-run mode.
            if not dry_run:
                conn.commit()
                logger.info(f"Cleanup completed: removed {result['total_removed']} records", module="Maintenance")
            else:
                logger.info(f"Dry run completed: {result['total_missing']} records would be removed", module="Maintenance")
        end_time = datetime.now()
        result.update({
            "status": "completed",
            "completed_at": end_time.isoformat(),
            "duration_seconds": round((end_time - start_time).total_seconds(), 2)
        })
    except Exception as e:
        logger.error(f"Cleanup failed: {e}", module="Maintenance", exc_info=True)
        result.update({
            "status": "failed",
            "error": str(e),
            "completed_at": datetime.now().isoformat()
        })
    # Re-assign for clarity; `result` was already published above.
    last_scan_result = result
def _scan_table(cursor, table_name: str, column_name: str, dry_run: bool) -> Dict:
    """Scan one whitelisted table for rows whose referenced file is missing.

    Uses only pre-built statements from _CLEANUP_QUERIES, so no runtime text
    ever reaches an SQL identifier position (injection-safe).

    Args:
        cursor: Open DB cursor; the caller owns the transaction/commit.
        table_name: Must be a key of ALLOWED_CLEANUP_TABLES.
        column_name: Must be the whitelisted column for ``table_name``.
        dry_run: When True, count and sample missing files but delete nothing.

    Returns:
        Dict with ``checked``/``missing``/``removed`` counts, up to 100
        sample paths in ``missing_files``, and an ``error`` key on failure.
    """
    result = {
        "checked": 0,
        "missing": 0,
        "removed": 0,
        "missing_files": []
    }
    # Defense in depth: re-validate against the whitelist even though callers
    # are expected to pass whitelisted pairs already.
    if table_name not in ALLOWED_CLEANUP_TABLES:
        logger.error(f"Table {table_name} not in allowed whitelist", module="Maintenance")
        result["error"] = f"Table {table_name} not allowed"
        return result
    if ALLOWED_CLEANUP_TABLES[table_name] != column_name:
        logger.error(f"Column {column_name} not allowed for table {table_name}", module="Maintenance")
        result["error"] = f"Column {column_name} not allowed for table {table_name}"
        return result
    # Pre-built at module load time; never derived from request input.
    queries = _CLEANUP_QUERIES[table_name]
    try:
        # Skip tables that don't exist in this deployment.
        # NOTE(review): '?' is qmark paramstyle, but the surrounding comments
        # say PostgreSQL, whose common drivers use '%s' — presumably the db
        # wrapper translates placeholders; confirm against app_state.db.
        cursor.execute(queries["check_exists"], (table_name,))
        if not cursor.fetchone():
            logger.warning(f"Table {table_name} does not exist", module="Maintenance")
            return result
        cursor.execute(queries["select"])
        rows = cursor.fetchall()
        result["checked"] = len(rows)
        missing_ids = []
        for row_id, file_path in rows:
            if file_path and not os.path.exists(file_path):
                result["missing"] += 1
                missing_ids.append(row_id)
                # Cap the sample list so huge scans don't bloat the response.
                if len(result["missing_files"]) < 100:
                    result["missing_files"].append(file_path)
        if not dry_run and missing_ids:
            # Delete in batches of 100. The statement prefix comes from the
            # pre-built whitelist query; only '?' placeholders are appended.
            delete_base = queries["delete"]
            for i in range(0, len(missing_ids), 100):
                batch = missing_ids[i:i + 100]
                placeholders = ','.join('?' * len(batch))
                cursor.execute(f"{delete_base}({placeholders})", batch)
                result["removed"] += len(batch)
        # Fix: previously logged result['missing'] for the removed count; use
        # the actual removed tally when not in dry-run mode.
        logger.info(
            f" {table_name}: checked={result['checked']}, missing={result['missing']}, "
            f"{'would_remove' if dry_run else 'removed'}="
            f"{result['missing'] if dry_run else result['removed']}",
            module="Maintenance"
        )
    except Exception as e:
        logger.error(f"Error scanning {table_name}: {e}", module="Maintenance", exc_info=True)
        result["error"] = str(e)
    return result