259
web/backend/routers/maintenance.py
Normal file
259
web/backend/routers/maintenance.py
Normal file
@@ -0,0 +1,259 @@
|
||||
"""
|
||||
Maintenance Router
|
||||
|
||||
Handles database maintenance and cleanup operations:
|
||||
- Scan and remove missing file references
|
||||
- Database integrity checks
|
||||
- Orphaned record cleanup
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List
|
||||
from fastapi import APIRouter, Depends, Request, BackgroundTasks
|
||||
from slowapi import Limiter
|
||||
from slowapi.util import get_remote_address
|
||||
|
||||
from ..core.dependencies import get_current_user, get_app_state
|
||||
from ..core.config import settings
|
||||
from ..core.responses import now_iso8601
|
||||
from ..core.exceptions import handle_exceptions
|
||||
from modules.universal_logger import get_logger
|
||||
|
||||
# Module-wide logger; all entries are tagged module="Maintenance" at call sites.
logger = get_logger('Maintenance')

# Router mounted at /api/maintenance; limiter throttles per remote address.
router = APIRouter(prefix="/api/maintenance", tags=["Maintenance"])
limiter = Limiter(key_func=get_remote_address)


# Whitelist of allowed table/column combinations for cleanup operations
# This prevents SQL injection by only allowing known-safe identifiers.
# _scan_table refuses any (table, column) pair not listed here.
ALLOWED_CLEANUP_TABLES = {
    "file_inventory": "file_path",
    "downloads": "file_path",
    "youtube_downloads": "file_path",
    "video_downloads": "file_path",
    "face_recognition_scans": "file_path",
    "face_recognition_references": "reference_image_path",
    "discovery_scan_queue": "file_path",
    "recycle_bin": "recycle_path",
}

# Pre-built SQL queries for each allowed table (avoids any string interpolation
# with user-controlled input; identifiers come only from the whitelist above).
# Uses 'id' instead of 'rowid' (PostgreSQL does not have rowid)
# Uses information_schema for table existence checks (PostgreSQL)
# NOTE(review): the placeholders are '?' (DB-API qmark style) while the comments
# claim PostgreSQL, whose common drivers expect '%s' — presumably the app's
# db layer translates paramstyle; confirm against app_state.db's driver.
_CLEANUP_QUERIES = {
    table: {
        "check_exists": "SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_name=?",
        "select": f"SELECT id, {col} FROM {table} WHERE {col} IS NOT NULL AND {col} != ''",
        "delete": f"DELETE FROM {table} WHERE id IN ",
    }
    for table, col in ALLOWED_CLEANUP_TABLES.items()
}

# Store last scan results — written by _cleanup_missing_files_task,
# read by the /cleanup/status endpoint. None until the first scan runs.
last_scan_result = None
|
||||
|
||||
|
||||
@router.post("/cleanup/missing-files")
@limiter.limit("5/hour")
@handle_exceptions
async def cleanup_missing_files(
    request: Request,
    background_tasks: BackgroundTasks,
    dry_run: bool = True,
    current_user: Dict = Depends(get_current_user)
):
    """Kick off a background scan that prunes DB rows pointing at missing files.

    Args:
        dry_run: If True, only report what would be deleted (default: True)

    Returns:
        Acknowledgement payload; poll /api/maintenance/cleanup/status
        for progress and results.
    """
    state = get_app_state()
    actor = current_user.get('sub', 'unknown')

    logger.info(f"Database cleanup started by {actor} (dry_run={dry_run})", module="Maintenance")

    # The scan walks the filesystem for every recorded path, so it runs
    # off the request path as a background task.
    background_tasks.add_task(_cleanup_missing_files_task, state, dry_run, actor)

    response = {
        "status": "started",
        "dry_run": dry_run,
        "message": "Cleanup scan started in background. Check /api/maintenance/cleanup/status for progress.",
        "timestamp": now_iso8601(),
    }
    return response
|
||||
|
||||
|
||||
@router.get("/cleanup/status")
@limiter.limit("60/minute")
@handle_exceptions
async def get_cleanup_status(request: Request, current_user: Dict = Depends(get_current_user)):
    """Get the status and results of the last cleanup scan"""
    # Module-level result written by _cleanup_missing_files_task; None until
    # the first scan has run. Read-only here, so no `global` needed.
    latest = last_scan_result
    if latest is not None:
        return latest
    return {
        "status": "no_scan",
        "message": "No cleanup scan has been run yet"
    }
|
||||
|
||||
|
||||
async def _cleanup_missing_files_task(app_state, dry_run: bool, user_id: str):
    """Background task to scan and cleanup missing files.

    Iterates every (table, column) pair in ALLOWED_CLEANUP_TABLES — the single
    source of truth for what may be cleaned — checks each stored path on disk,
    and (unless dry_run) deletes rows whose files are gone. Previously this
    function carried its own hand-copied table list, which could silently
    drift from the whitelist and make _scan_table reject entries.

    Publishes the outcome in the module-level ``last_scan_result`` for the
    /cleanup/status endpoint; never raises (failures are recorded there).

    Args:
        app_state: Application state providing db.get_connection().
        dry_run: When True, only count/report; nothing is deleted or committed.
        user_id: Who triggered the scan (recorded in the result for auditing).
    """
    global last_scan_result

    start_time = datetime.now()

    # Running result; mutated in place as tables are scanned so a failure
    # still exposes partial progress.
    result = {
        "status": "running",
        "started_at": start_time.isoformat(),
        "dry_run": dry_run,
        "user": user_id,
        "tables_scanned": {},
        "total_checked": 0,
        "total_missing": 0,
        "total_removed": 0
    }

    try:
        with app_state.db.get_connection(for_write=True) as conn:
            cursor = conn.cursor()

            # Scan exactly the whitelisted tables so this loop can never drift
            # from the SQL-injection whitelist enforced by _scan_table.
            # NOTE: instagram_perceptual_hashes is deliberately not in the
            # whitelist — the hash data is valuable for duplicate detection
            # even if the original file is gone.
            for table_name, column_name in ALLOWED_CLEANUP_TABLES.items():
                logger.info(f"Scanning {table_name}.{column_name}...", module="Maintenance")

                table_result = _scan_table(cursor, table_name, column_name, dry_run)
                result["tables_scanned"][table_name] = table_result
                result["total_checked"] += table_result["checked"]
                result["total_missing"] += table_result["missing"]
                result["total_removed"] += table_result["removed"]

            # Commit only on a real run; dry runs leave the DB untouched.
            if not dry_run:
                conn.commit()
                logger.info(f"Cleanup completed: removed {result['total_removed']} records", module="Maintenance")
            else:
                logger.info(f"Dry run completed: {result['total_missing']} records would be removed", module="Maintenance")

        # Finalize timing/status on success
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        result.update({
            "status": "completed",
            "completed_at": end_time.isoformat(),
            "duration_seconds": round(duration, 2)
        })

    except Exception as e:
        logger.error(f"Cleanup failed: {e}", module="Maintenance", exc_info=True)
        result.update({
            "status": "failed",
            "error": str(e),
            "completed_at": datetime.now().isoformat()
        })

    last_scan_result = result
|
||||
|
||||
|
||||
def _scan_table(cursor, table_name: str, column_name: str, dry_run: bool) -> Dict:
    """Scan a table for missing files and optionally remove them.

    Uses pre-built queries from _CLEANUP_QUERIES to prevent SQL injection.
    Only tables in ALLOWED_CLEANUP_TABLES whitelist are permitted.

    Args:
        cursor: Open DB cursor; the caller owns commit/rollback.
        table_name: Must be a key of ALLOWED_CLEANUP_TABLES.
        column_name: Must be the whitelisted column for that table.
        dry_run: When True, count missing files but delete nothing.

    Returns:
        Dict with 'checked', 'missing', 'removed', 'missing_files' (at most
        100 example paths) and, on rejection/failure, an 'error' message.
    """
    result = {
        "checked": 0,
        "missing": 0,
        "removed": 0,
        "missing_files": []
    }

    # Validate table/column against whitelist to prevent SQL injection
    if table_name not in ALLOWED_CLEANUP_TABLES:
        logger.error(f"Table {table_name} not in allowed whitelist", module="Maintenance")
        result["error"] = f"Table {table_name} not allowed"
        return result

    if ALLOWED_CLEANUP_TABLES[table_name] != column_name:
        logger.error(f"Column {column_name} not allowed for table {table_name}", module="Maintenance")
        result["error"] = f"Column {column_name} not allowed for table {table_name}"
        return result

    # Get pre-built queries for this table (built at module load time, not from user input)
    queries = _CLEANUP_QUERIES[table_name]

    # Deletes are chunked to keep each IN (...) list bounded.
    batch_size = 100

    try:
        # Check if table exists using parameterized query
        cursor.execute(queries["check_exists"], (table_name,))
        if not cursor.fetchone():
            logger.warning(f"Table {table_name} does not exist", module="Maintenance")
            return result

        # Get all (id, path) pairs from table using pre-built query
        cursor.execute(queries["select"])

        rows = cursor.fetchall()
        result["checked"] = len(rows)

        # Values come from the 'id' column (not rowid — PostgreSQL has none).
        missing_ids = []

        for row_id, file_path in rows:
            if file_path and not os.path.exists(file_path):
                result["missing"] += 1
                missing_ids.append(row_id)

                # Only keep first 100 examples
                if len(result["missing_files"]) < 100:
                    result["missing_files"].append(file_path)

        # Remove missing entries if not dry run
        if not dry_run and missing_ids:
            # The delete base is pre-built from the whitelist; only '?'
            # placeholders are appended, so no user data reaches the SQL text.
            delete_base = queries["delete"]
            for i in range(0, len(missing_ids), batch_size):
                batch = missing_ids[i:i + batch_size]
                placeholders = ','.join('?' * len(batch))
                cursor.execute(f"{delete_base}({placeholders})", batch)
                result["removed"] += len(batch)

        # Report the actual removed count on real runs (previously this
        # logged result['missing'] in both branches, which misreports when
        # a batch delete fails partway).
        logger.info(
            f"  {table_name}: checked={result['checked']}, missing={result['missing']}, "
            f"{'would_remove' if dry_run else 'removed'}={result['missing'] if dry_run else result['removed']}",
            module="Maintenance"
        )

    except Exception as e:
        logger.error(f"Error scanning {table_name}: {e}", module="Maintenance", exc_info=True)
        result["error"] = str(e)

    return result
|
||||
Reference in New Issue
Block a user