- scheduler.py: Use full path for scheduler_state.db instead of relative name
- recycle.py: Use full path for thumbnails.db instead of relative name
- cloud_backup.py, maintenance.py, stats.py: Require admin for config/cleanup/settings endpoints
- press.py: Add auth to press image serving endpoint
- private_gallery.py: Fix _create_pg_job call and add missing secrets import
- appearances.py: Use sync httpx instead of asyncio.run for background thread HTTP call

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

260 lines · 8.9 KiB · Python
"""
Maintenance Router

Handles database maintenance and cleanup operations:
- Scan and remove missing file references
- Database integrity checks
- Orphaned record cleanup
"""
import os
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Dict, List
|
|
from fastapi import APIRouter, Depends, Request, BackgroundTasks
|
|
from slowapi import Limiter
|
|
from slowapi.util import get_remote_address
|
|
|
|
from ..core.dependencies import get_current_user, require_admin, get_app_state
|
|
from ..core.config import settings
|
|
from ..core.responses import now_iso8601
|
|
from ..core.exceptions import handle_exceptions
|
|
from modules.universal_logger import get_logger
|
|
|
|
logger = get_logger('Maintenance')
|
|
|
|
router = APIRouter(prefix="/api/maintenance", tags=["Maintenance"])
|
|
limiter = Limiter(key_func=get_remote_address)
|
|
|
|
|
|
# Whitelist of allowed table/column combinations for cleanup operations
|
|
# This prevents SQL injection by only allowing known-safe identifiers
|
|
ALLOWED_CLEANUP_TABLES = {
|
|
"file_inventory": "file_path",
|
|
"downloads": "file_path",
|
|
"youtube_downloads": "file_path",
|
|
"video_downloads": "file_path",
|
|
"face_recognition_scans": "file_path",
|
|
"face_recognition_references": "reference_image_path",
|
|
"discovery_scan_queue": "file_path",
|
|
"recycle_bin": "recycle_path",
|
|
}
|
|
|
|
# Pre-built SQL queries for each allowed table (avoids any string interpolation)
|
|
# Uses 'id' instead of 'rowid' (PostgreSQL does not have rowid)
|
|
# Uses information_schema for table existence checks (PostgreSQL)
|
|
_CLEANUP_QUERIES = {
|
|
table: {
|
|
"check_exists": "SELECT table_name FROM information_schema.tables WHERE table_schema='public' AND table_name=?",
|
|
"select": f"SELECT id, {col} FROM {table} WHERE {col} IS NOT NULL AND {col} != ''",
|
|
"delete": f"DELETE FROM {table} WHERE id IN ",
|
|
}
|
|
for table, col in ALLOWED_CLEANUP_TABLES.items()
|
|
}
|
|
|
|
# Store last scan results
|
|
last_scan_result = None
|
|
|
|
|
|
@router.post("/cleanup/missing-files")
|
|
@limiter.limit("5/hour")
|
|
@handle_exceptions
|
|
async def cleanup_missing_files(
|
|
request: Request,
|
|
background_tasks: BackgroundTasks,
|
|
dry_run: bool = True,
|
|
current_user: Dict = Depends(require_admin)
|
|
):
|
|
"""
|
|
Scan all database tables for file references and remove entries for missing files.
|
|
|
|
Args:
|
|
dry_run: If True, only report what would be deleted (default: True)
|
|
|
|
Returns:
|
|
Summary of files found and removed
|
|
"""
|
|
app_state = get_app_state()
|
|
user_id = current_user.get('sub', 'unknown')
|
|
|
|
logger.info(f"Database cleanup started by {user_id} (dry_run={dry_run})", module="Maintenance")
|
|
|
|
# Run cleanup in background
|
|
background_tasks.add_task(
|
|
_cleanup_missing_files_task,
|
|
app_state,
|
|
dry_run,
|
|
user_id
|
|
)
|
|
|
|
return {
|
|
"status": "started",
|
|
"dry_run": dry_run,
|
|
"message": "Cleanup scan started in background. Check /api/maintenance/cleanup/status for progress.",
|
|
"timestamp": now_iso8601()
|
|
}
|
|
|
|
|
|
@router.get("/cleanup/status")
|
|
@limiter.limit("60/minute")
|
|
@handle_exceptions
|
|
async def get_cleanup_status(request: Request, current_user: Dict = Depends(get_current_user)):
|
|
"""Get the status and results of the last cleanup scan"""
|
|
global last_scan_result
|
|
|
|
if last_scan_result is None:
|
|
return {
|
|
"status": "no_scan",
|
|
"message": "No cleanup scan has been run yet"
|
|
}
|
|
|
|
return last_scan_result
|
|
|
|
|
|
async def _cleanup_missing_files_task(app_state, dry_run: bool, user_id: str):
|
|
"""Background task to scan and cleanup missing files"""
|
|
global last_scan_result
|
|
|
|
start_time = datetime.now()
|
|
|
|
# Initialize result tracking
|
|
result = {
|
|
"status": "running",
|
|
"started_at": start_time.isoformat(),
|
|
"dry_run": dry_run,
|
|
"user": user_id,
|
|
"tables_scanned": {},
|
|
"total_checked": 0,
|
|
"total_missing": 0,
|
|
"total_removed": 0
|
|
}
|
|
|
|
try:
|
|
with app_state.db.get_connection(for_write=True) as conn:
|
|
cursor = conn.cursor()
|
|
|
|
# Define tables and their file path columns
|
|
# NOTE: instagram_perceptual_hashes is excluded because the hash data
|
|
# is valuable for duplicate detection even if the original file is gone
|
|
tables_to_scan = [
|
|
("file_inventory", "file_path"),
|
|
("downloads", "file_path"),
|
|
("youtube_downloads", "file_path"),
|
|
("video_downloads", "file_path"),
|
|
("face_recognition_scans", "file_path"),
|
|
("face_recognition_references", "reference_image_path"),
|
|
("discovery_scan_queue", "file_path"),
|
|
("recycle_bin", "recycle_path"),
|
|
]
|
|
|
|
for table_name, column_name in tables_to_scan:
|
|
logger.info(f"Scanning {table_name}.{column_name}...", module="Maintenance")
|
|
|
|
table_result = _scan_table(cursor, table_name, column_name, dry_run)
|
|
result["tables_scanned"][table_name] = table_result
|
|
result["total_checked"] += table_result["checked"]
|
|
result["total_missing"] += table_result["missing"]
|
|
result["total_removed"] += table_result["removed"]
|
|
|
|
# Commit if not dry run
|
|
if not dry_run:
|
|
conn.commit()
|
|
logger.info(f"Cleanup completed: removed {result['total_removed']} records", module="Maintenance")
|
|
else:
|
|
logger.info(f"Dry run completed: {result['total_missing']} records would be removed", module="Maintenance")
|
|
|
|
# Update result
|
|
end_time = datetime.now()
|
|
duration = (end_time - start_time).total_seconds()
|
|
|
|
result.update({
|
|
"status": "completed",
|
|
"completed_at": end_time.isoformat(),
|
|
"duration_seconds": round(duration, 2)
|
|
})
|
|
|
|
except Exception as e:
|
|
logger.error(f"Cleanup failed: {e}", module="Maintenance", exc_info=True)
|
|
result.update({
|
|
"status": "failed",
|
|
"error": str(e),
|
|
"completed_at": datetime.now().isoformat()
|
|
})
|
|
|
|
last_scan_result = result
|
|
|
|
|
|
def _scan_table(cursor, table_name: str, column_name: str, dry_run: bool) -> Dict:
|
|
"""Scan a table for missing files and optionally remove them.
|
|
|
|
Uses pre-built queries from _CLEANUP_QUERIES to prevent SQL injection.
|
|
Only tables in ALLOWED_CLEANUP_TABLES whitelist are permitted.
|
|
"""
|
|
result = {
|
|
"checked": 0,
|
|
"missing": 0,
|
|
"removed": 0,
|
|
"missing_files": []
|
|
}
|
|
|
|
# Validate table/column against whitelist to prevent SQL injection
|
|
if table_name not in ALLOWED_CLEANUP_TABLES:
|
|
logger.error(f"Table {table_name} not in allowed whitelist", module="Maintenance")
|
|
result["error"] = f"Table {table_name} not allowed"
|
|
return result
|
|
|
|
if ALLOWED_CLEANUP_TABLES[table_name] != column_name:
|
|
logger.error(f"Column {column_name} not allowed for table {table_name}", module="Maintenance")
|
|
result["error"] = f"Column {column_name} not allowed for table {table_name}"
|
|
return result
|
|
|
|
# Get pre-built queries for this table (built at module load time, not from user input)
|
|
queries = _CLEANUP_QUERIES[table_name]
|
|
|
|
try:
|
|
# Check if table exists using parameterized query
|
|
cursor.execute(queries["check_exists"], (table_name,))
|
|
if not cursor.fetchone():
|
|
logger.warning(f"Table {table_name} does not exist", module="Maintenance")
|
|
return result
|
|
|
|
# Get all file paths from table using pre-built query
|
|
cursor.execute(queries["select"])
|
|
|
|
rows = cursor.fetchall()
|
|
result["checked"] = len(rows)
|
|
|
|
missing_rowids = []
|
|
|
|
for rowid, file_path in rows:
|
|
if file_path and not os.path.exists(file_path):
|
|
result["missing"] += 1
|
|
missing_rowids.append(rowid)
|
|
|
|
# Only keep first 100 examples
|
|
if len(result["missing_files"]) < 100:
|
|
result["missing_files"].append(file_path)
|
|
|
|
# Remove missing entries if not dry run
|
|
if not dry_run and missing_rowids:
|
|
# Delete in batches of 100 using pre-built query base
|
|
delete_base = queries["delete"]
|
|
for i in range(0, len(missing_rowids), 100):
|
|
batch = missing_rowids[i:i+100]
|
|
placeholders = ','.join('?' * len(batch))
|
|
# The delete_base is pre-built from whitelist, placeholders are just ?
|
|
cursor.execute(f"{delete_base}({placeholders})", batch)
|
|
result["removed"] += len(batch)
|
|
|
|
logger.info(
|
|
f" {table_name}: checked={result['checked']}, missing={result['missing']}, "
|
|
f"{'would_remove' if dry_run else 'removed'}={result['missing']}",
|
|
module="Maintenance"
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error scanning {table_name}: {e}", module="Maintenance", exc_info=True)
|
|
result["error"] = str(e)
|
|
|
|
return result
|