Files
media-downloader/web/backend/routers/cloud_backup.py
Todd 523f91788e Fix DB paths, add auth to sensitive endpoints, misc bug fixes
- scheduler.py: Use full path for scheduler_state.db instead of relative name
- recycle.py: Use full path for thumbnails.db instead of relative name
- cloud_backup.py, maintenance.py, stats.py: Require admin for config/cleanup/settings endpoints
- press.py: Add auth to press image serving endpoint
- private_gallery.py: Fix _create_pg_job call and add missing secrets import
- appearances.py: Use sync httpx instead of asyncio.run for background thread HTTP call

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-30 08:25:00 -04:00

1526 lines
59 KiB
Python

"""
Cloud Backup Router
Manages cloud backup configuration (rclone-based sync to B2/S3/etc.):
- Config CRUD with sensitive field masking
- rclone.conf generation (preserves existing sections)
- Sync control (trigger, pause, resume)
- Status and log retrieval
- Connection testing
"""
import json
import os
import re
import subprocess
import tempfile
import uuid
from datetime import datetime
from pathlib import Path
from threading import Lock
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, BackgroundTasks, Depends
from pydantic import BaseModel, Field
from slowapi import Limiter
from slowapi.util import get_remote_address
from ..core.dependencies import get_current_user, require_admin, get_app_state
from modules.universal_logger import get_logger
logger = get_logger('CloudBackup')
router = APIRouter(prefix="/api/cloud-backup", tags=["Cloud Backup"])
limiter = Limiter(key_func=get_remote_address)
# ============================================================================
# CONSTANTS
# ============================================================================
RCLONE_CONF_PATH = Path("/root/.config/rclone/rclone.conf")
RCLONE_REMOTE_NAME = "cloud-backup-remote"   # rclone.conf section managed by this module
RCLONE_CRYPT_NAME = "cloud-backup-crypt"     # crypt wrapper section managed by this module
STATUS_FILE = Path("/tmp/cloud_backup_status.json")    # written by the sync daemon
TRIGGER_FILE = Path("/tmp/cloud_backup_trigger.json")  # written by the API to request a sync
LOG_FILE = Path("/opt/media-downloader/logs/cloud_backup.log")
IMMICH_BASE = Path("/opt/immich")            # root of the media tree being backed up
SETTINGS_KEY = "cloud_backup"                # key under which config lives in the settings DB
SERVICE_NAME = "cloud-backup-sync.service"   # systemd unit running the sync daemon
# Config fields whose values are masked in API responses (see _mask_config).
SENSITIVE_FIELDS = {"key_id", "application_key", "encryption_password", "encryption_salt"}
MASK_VALUE = "****"
DEFAULT_INCLUDE_DIRS = ["paid", "private", "md", "el", "elv", "ela", "upload", "review", "recycle", "db_dumps", "app_backup"]
DEFAULT_EXCLUDE_DIRS = ["lost+found", "db.old"]
# ============================================================================
# JOB TRACKING FOR BACKGROUND SYNC
# ============================================================================
# In-memory registry of sync jobs keyed by job id; every access goes
# through _jobs_lock (see the helpers below).
_sync_jobs: Dict[str, Dict] = {}
_jobs_lock = Lock()
def _get_sync_job(job_id: str) -> Optional[Dict]:
    """Return the tracked sync job for *job_id*, or None if unknown."""
    with _jobs_lock:
        job = _sync_jobs.get(job_id)
    return job
def _update_sync_job(job_id: str, updates: Dict):
    """Merge *updates* into an existing job record; no-op for unknown ids."""
    with _jobs_lock:
        job = _sync_jobs.get(job_id)
        if job is not None:
            job.update(updates)
def _create_sync_job(job_id: str) -> Dict:
    """Register a fresh 'running' job record under *job_id* and return it."""
    record = {
        'id': job_id,
        'status': 'running',
        'started_at': datetime.now().isoformat(),
        'completed_at': None,
        'dirs_synced': 0,
        'dirs_total': 0,
        'current_dir': None,
        'current_file': None,
        'transfer_stats': None,
        'phase': 'preparing',
        'error': None,
    }
    with _jobs_lock:
        _sync_jobs[job_id] = record
        return _sync_jobs[job_id]
def _cleanup_old_sync_jobs():
    """Drop completed job records older than one hour from the registry."""
    max_age_seconds = 3600
    with _jobs_lock:
        now = datetime.now()
        stale = []
        for jid, record in _sync_jobs.items():
            finished = record.get('completed_at')
            if not finished:
                continue  # still running — never evict
            try:
                age = (now - datetime.fromisoformat(finished)).total_seconds()
            except (ValueError, TypeError):
                continue  # unparseable timestamp — keep the record
            if age > max_age_seconds:
                stale.append(jid)
        for jid in stale:
            del _sync_jobs[jid]
# ============================================================================
# PYDANTIC MODELS
# ============================================================================
class CloudBackupConfigModel(BaseModel):
    """Full cloud backup configuration as stored in the settings DB."""
    enabled: bool = False
    provider: str = "b2"  # 'b2' gets a native b2 remote; anything else → S3-compatible
    endpoint: str = ""
    bucket: str = ""
    key_id: str = ""
    application_key: str = ""
    encryption_enabled: bool = True
    encryption_password: str = ""
    encryption_salt: str = ""
    include_dirs: List[str] = Field(default_factory=lambda: list(DEFAULT_INCLUDE_DIRS))
    exclude_dirs: List[str] = Field(default_factory=lambda: list(DEFAULT_EXCLUDE_DIRS))
    cooldown_seconds: int = 300  # presumably the daemon's debounce between syncs — confirm in daemon
    bandwidth_limit: Optional[str] = None  # passed to rclone --bwlimit when set
class CloudBackupConfigUpdate(BaseModel):
    """Partial config update; None means 'leave unchanged'.

    Sensitive fields submitted as the mask value ("****") also keep their
    stored values (see _merge_config_update).
    """
    enabled: Optional[bool] = None
    provider: Optional[str] = None
    endpoint: Optional[str] = None
    bucket: Optional[str] = None
    key_id: Optional[str] = None
    application_key: Optional[str] = None
    encryption_enabled: Optional[bool] = None
    encryption_password: Optional[str] = None
    encryption_salt: Optional[str] = None
    include_dirs: Optional[List[str]] = None
    exclude_dirs: Optional[List[str]] = None
    cooldown_seconds: Optional[int] = None
    bandwidth_limit: Optional[str] = None
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def _get_settings_manager():
    """Return the SettingsManager attached to the global AppState."""
    return get_app_state().settings
def _load_config() -> dict:
    """Load the cloud backup config dict, falling back to model defaults."""
    stored = _get_settings_manager().get(SETTINGS_KEY)
    if isinstance(stored, dict) and stored:
        return stored
    # Nothing stored (or wrong shape) — use the pydantic defaults.
    return CloudBackupConfigModel().model_dump()
def _save_config(config: dict):
    """Persist the cloud backup config dict to the settings DB."""
    manager = _get_settings_manager()
    manager.set(
        SETTINGS_KEY,
        config,
        category='cloud_backup',
        description='Cloud backup configuration',
    )
def _mask_config(config: dict) -> dict:
    """Return a copy of *config* with sensitive values hidden.

    Empty/missing sensitive values become None so the frontend can tell
    'unset' apart from 'set but hidden' (the mask value).
    """
    masked = dict(config)
    for name in SENSITIVE_FIELDS:
        current = masked.get(name)
        if not current:
            masked[name] = None
        elif current != MASK_VALUE:
            masked[name] = MASK_VALUE
    return masked
def _merge_config_update(existing: dict, update: dict) -> dict:
    """Overlay *update* onto *existing* and return the merged dict.

    None values are skipped, as are sensitive fields submitted with the
    mask value ("****") — both mean 'keep what is already stored'.
    """
    merged = dict(existing)
    for key, value in update.items():
        if value is None:
            continue
        if value == MASK_VALUE and key in SENSITIVE_FIELDS:
            continue
        merged[key] = value
    return merged
def _rclone_obscure(password: str) -> str:
    """Run ``rclone obscure`` on *password*; empty input yields "".

    Raises:
        RuntimeError: if the rclone invocation exits non-zero.
    """
    if not password:
        return ""
    proc = subprocess.run(
        ["rclone", "obscure", password],
        capture_output=True,
        text=True,
        timeout=10,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"rclone obscure failed: {proc.stderr.strip()}")
    return proc.stdout.strip()
def _regenerate_rclone_config(config: dict):
    """
    Regenerate rclone.conf preserving all existing sections
    except cloud-backup-remote and cloud-backup-crypt.

    Credentials are written in plain text with 0600 permissions; the crypt
    passwords are first run through `rclone obscure`. The file is replaced
    atomically via a temp file + rename in the same directory.
    """
    RCLONE_CONF_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Read existing config
    existing_content = ""
    if RCLONE_CONF_PATH.exists():
        existing_content = RCLONE_CONF_PATH.read_text()
    # Parse existing sections (preserve all except our managed ones)
    sections = {}
    current_section = None
    current_lines = []
    for line in existing_content.splitlines():
        section_match = re.match(r'^\[(.+)\]$', line.strip())
        if section_match:
            # Close out the previous section before starting the new one.
            if current_section is not None:
                sections[current_section] = '\n'.join(current_lines)
            current_section = section_match.group(1)
            current_lines = [line]
        elif current_section is not None:
            current_lines.append(line)
        # Lines before any section are ignored (shouldn't exist in rclone.conf)
    if current_section is not None:
        sections[current_section] = '\n'.join(current_lines)
    # Remove our managed sections
    sections.pop(RCLONE_REMOTE_NAME, None)
    sections.pop(RCLONE_CRYPT_NAME, None)
    # Build new sections
    provider = config.get('provider', 'b2')
    new_sections = []
    if provider == 'b2':
        new_sections.append(f"""[{RCLONE_REMOTE_NAME}]
type = b2
account = {config.get('key_id', '')}
key = {config.get('application_key', '')}
endpoint = {config.get('endpoint', '')}""")
    else:
        # S3-compatible fallback
        new_sections.append(f"""[{RCLONE_REMOTE_NAME}]
type = s3
provider = Other
access_key_id = {config.get('key_id', '')}
secret_access_key = {config.get('application_key', '')}
endpoint = {config.get('endpoint', '')}""")
    if config.get('encryption_enabled', True):
        # Crypt remote wraps the base remote; bucket is baked into its path.
        enc_pass = _rclone_obscure(config.get('encryption_password', ''))
        enc_salt = _rclone_obscure(config.get('encryption_salt', ''))
        bucket = config.get('bucket', '')
        new_sections.append(f"""[{RCLONE_CRYPT_NAME}]
type = crypt
remote = {RCLONE_REMOTE_NAME}:{bucket}
password = {enc_pass}
password2 = {enc_salt}
filename_encryption = standard
directory_name_encryption = true""")
    # Write config atomically
    output_parts = []
    for section_name, section_content in sections.items():
        output_parts.append(section_content)
    for section in new_sections:
        output_parts.append(section)
    final_content = '\n\n'.join(output_parts) + '\n'
    # mkstemp in the same directory so os.rename stays on one filesystem.
    fd, tmp_path = tempfile.mkstemp(dir=str(RCLONE_CONF_PATH.parent), suffix='.tmp')
    try:
        with os.fdopen(fd, 'w') as f:
            f.write(final_content)
        os.chmod(tmp_path, 0o600)
        os.rename(tmp_path, str(RCLONE_CONF_PATH))
        logger.info("rclone.conf regenerated successfully")
    except Exception:
        # Best-effort cleanup of the temp file; re-raise the original error.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
def _read_status_file() -> dict:
    """Return the daemon's status JSON, or {} if missing/unreadable."""
    try:
        if not STATUS_FILE.exists():
            return {}
        return json.loads(STATUS_FILE.read_text())
    except (json.JSONDecodeError, OSError) as e:
        logger.warning(f"Failed to read status file: {e}")
        return {}
def _get_service_status() -> str:
    """Return systemd state of the sync service ('active', 'inactive', ...)."""
    try:
        proc = subprocess.run(
            ["systemctl", "is-active", SERVICE_NAME],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except Exception:
        return "unknown"
    return proc.stdout.strip()
def _find_ssd_mount() -> Optional[Path]:
    """Auto-detect the SSD underlying a mergerfs mount at IMMICH_BASE.
    Reads /proc/mounts to find the mergerfs entry, resolves drive names
    to mount paths, then checks each drive's rotational flag (ROTA=0 = SSD).

    Returns the mount path of the first non-rotational member drive found,
    or None when nothing matches (or on any error).
    """
    try:
        # Build a map of all mount points: mountpoint -> (device, ...)
        mounts = {}
        with open('/proc/mounts') as f:
            for line in f:
                parts = line.split()
                if len(parts) >= 2 and parts[0].startswith('/dev/'):
                    mounts[parts[1]] = parts[0]
        with open('/proc/mounts') as f:
            for line in f:
                parts = line.split()
                if len(parts) >= 3 and parts[1] == str(IMMICH_BASE) and 'mergerfs' in parts[2]:
                    # Source can be "samsung2tb:onetouch4tb" or "/mnt/a:/mnt/b"
                    drive_names = parts[0].split(':')
                    for name in drive_names:
                        name = name.strip()
                        if not name:
                            continue
                        # Try as-is first (absolute path), then /mnt/<name>
                        candidates = [name, f'/mnt/{name}']
                        for drive_path in candidates:
                            if drive_path in mounts:
                                dev = mounts[drive_path]
                                # Strip trailing partition digits to get the base
                                # device (/dev/sda1 -> sda). NOTE(review): this
                                # mangles NVMe names like nvme0n1p1 — confirm the
                                # member drives here are /dev/sdX style.
                                dev_base = dev.rstrip('0123456789')
                                dev_name = dev_base.split('/')[-1]
                                rotational = Path(f'/sys/block/{dev_name}/queue/rotational')
                                if rotational.exists() and rotational.read_text().strip() == '0':
                                    return Path(drive_path)
    except Exception:
        pass
    return None
def _get_fast_storage_base() -> Path:
    """Get the fastest storage path for dumps.

    Returns the SSD mount under mergerfs if detected, otherwise falls back
    to the mergerfs path itself. Dirs created here are visible via mergerfs
    without any cross-filesystem copies.
    """
    # Path objects are always truthy, so `or` falls through only on None.
    return _find_ssd_mount() or IMMICH_BASE
# Resolved once at import time: SSD-backed base for DB dumps / app backups.
_FAST_BASE = _get_fast_storage_base()
DB_DUMPS_DIR = _FAST_BASE / "db_dumps"
APP_BACKUP_DIR = _FAST_BASE / "app_backup"
# Systemd service files to back up
SYSTEMD_FILES = [
    "media-downloader.service",
    "media-downloader-api.service",
    "media-downloader-frontend.service",
    "media-downloader-db-cleanup.service",
    "media-downloader-db-cleanup.timer",
    "xvfb-media-downloader.service",
    "cloud-backup-sync.service",
]
def _run_pre_sync_dumps(job_id: str = None, include_immich_db: bool = True, include_app_archive: bool = True):
    """Dump databases and archive app + system configs before syncing.
    Args:
        job_id: Optional in-memory sync job to receive progress messages.
        include_immich_db: If False, skip Immich DB dump (only needed daily).
        include_app_archive: If False, skip app tar.gz + system configs (only needed daily).
    Returns:
        List of error strings (empty on full success); failures are
        collected, never raised.
    """
    DB_DUMPS_DIR.mkdir(parents=True, exist_ok=True)
    APP_BACKUP_DIR.mkdir(parents=True, exist_ok=True)
    errors = []
    import shutil
    import time as _time

    def _status(msg):
        # Surface progress text to the in-memory job tracker (frontend polls it).
        if job_id:
            _update_sync_job(job_id, {'current_dir': msg})

    def _get_dir_size(path):
        """Get total size of a directory in bytes."""
        try:
            return sum(f.stat().st_size for f in Path(path).rglob('*') if f.is_file())
        except OSError:
            return 0

    def _get_docker_dir_size(container, path):
        """Get size of a directory inside a docker container."""
        try:
            r = subprocess.run(
                ["docker", "exec", container, "du", "-sb", path],
                capture_output=True, text=True, timeout=5
            )
            if r.returncode == 0:
                return int(r.stdout.split()[0])
        except Exception:
            pass
        return 0

    def _run_with_progress(cmd, label, total_bytes, size_fn, env=None):
        """Run a subprocess while polling output size for progress updates."""
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
        while proc.poll() is None:
            _time.sleep(3)
            current = size_fn()
            if total_bytes > 0:
                # Cap at 99% — completion is only reported once the process exits.
                pct = min(99, int(current / total_bytes * 100))
                _status(f"{label} ({pct}%)")
        stdout, stderr = proc.communicate()
        return proc.returncode, stderr.decode() if stderr else ""
    # ── 1. Database dumps ──────────────────────────────────────────────
    # Immich PostgreSQL (Docker container) — only during daily/full backup
    if include_immich_db:
        try:
            _status("Dumping Immich database (0%)...")
            logger.info("Dumping Immich database...")
            # Get DB size for progress tracking
            sz_result = subprocess.run(
                ["docker", "exec", "immich_postgres",
                 "psql", "-U", "postgres", "-d", "immich", "-tAc",
                 "SELECT pg_database_size('immich')"],
                capture_output=True, text=True, timeout=10
            )
            immich_db_size = int(sz_result.stdout.strip()) if sz_result.returncode == 0 else 0
            # Clean previous dump dir
            subprocess.run(
                ["docker", "exec", "immich_postgres", "rm", "-rf", "/tmp/immich_dump"],
                capture_output=True, text=True, timeout=10
            )
            # Parallel directory-format dump; ML search tables excluded (rebuildable).
            rc, stderr = _run_with_progress(
                ["docker", "exec", "immich_postgres",
                 "pg_dump", "-U", "postgres", "-d", "immich",
                 "--no-owner", "--no-acl", "-Fd", "-j", "4",
                 "--exclude-table-data=face_search",
                 "--exclude-table-data=smart_search",
                 "-f", "/tmp/immich_dump"],
                "Dumping Immich DB",
                immich_db_size,
                lambda: _get_docker_dir_size("immich_postgres", "/tmp/immich_dump"),
            )
            if rc == 0:
                _status("Copying Immich dump...")
                subprocess.run(
                    ["docker", "exec", "immich_postgres",
                     "tar", "cf", "/tmp/immich_dump.tar", "-C", "/tmp", "immich_dump"],
                    capture_output=True, text=True, timeout=120
                )
                # docker cp directly to SSD-backed DB_DUMPS_DIR (no cross-fs move needed)
                final_dest = str(DB_DUMPS_DIR / "immich_dump.tar")
                subprocess.run(
                    ["docker", "cp", "immich_postgres:/tmp/immich_dump.tar", final_dest],
                    capture_output=True, text=True, timeout=120
                )
                subprocess.run(
                    ["docker", "exec", "immich_postgres",
                     "sh", "-c", "rm -rf /tmp/immich_dump /tmp/immich_dump.tar"],
                    capture_output=True, text=True, timeout=10
                )
                size_mb = Path(final_dest).stat().st_size / 1e6
                logger.info(f"Immich DB dump: {size_mb:.1f} MB")
            else:
                errors.append(f"Immich pg_dump failed: {stderr[:200]}")
                logger.error(f"Immich pg_dump failed: {stderr[:200]}")
        except Exception as e:
            errors.append(f"Immich pg_dump error: {e}")
            logger.error(f"Immich pg_dump error: {e}")
    # Media Downloader PostgreSQL (local) — parallel directory-format dump
    # Uses --lock-wait-timeout to avoid hanging when scheduler holds locks,
    # with a serial (-j 1) retry if parallel dump fails on lock contention.
    try:
        _status("Dumping Media Downloader DB (0%)...")
        logger.info("Dumping Media Downloader database...")
        # Get DB size
        env = os.environ.copy()
        # NOTE(review): hardcoded DB password — should be loaded from config/env;
        # it is also shipped inside the app archive below.
        env["PGPASSWORD"] = "PNsihOXvvuPwWiIvGlsc9Fh2YmMmB"
        sz_result = subprocess.run(
            ["psql", "-h", "localhost", "-U", "media_downloader",
             "-d", "media_downloader", "-tAc",
             "SELECT pg_database_size('media_downloader')"],
            capture_output=True, text=True, timeout=10, env=env
        )
        md_db_size = int(sz_result.stdout.strip()) if sz_result.returncode == 0 else 0
        # Dump directly to destination (cross-fs move is slower than direct write)
        final_dump_dir = str(DB_DUMPS_DIR / "media_downloader_dump")
        if Path(final_dump_dir).exists():
            shutil.rmtree(final_dump_dir)
        base_cmd = [
            "pg_dump", "-h", "localhost", "-U", "media_downloader",
            "-d", "media_downloader", "--no-owner", "--no-acl",
            "--exclude-table-data=thumbnails",
            "--lock-wait-timeout=30000",
        ]
        rc, stderr = _run_with_progress(
            base_cmd + ["-Fd", "-j", "4", "-f", final_dump_dir],
            "Dumping Media Downloader DB",
            md_db_size,
            lambda: _get_dir_size(final_dump_dir),
            env=env,
        )
        if rc != 0 and "could not obtain lock" in stderr:
            # Lock contention — retry serial after a short wait
            logger.warning("Parallel pg_dump hit lock contention, retrying serial in 15s...")
            _status("DB lock contention, retrying...")
            if Path(final_dump_dir).exists():
                shutil.rmtree(final_dump_dir)
            import time as _time2
            _time2.sleep(15)
            rc, stderr = _run_with_progress(
                base_cmd + ["-Fd", "-j", "1", "-f", final_dump_dir],
                "Dumping Media Downloader DB (retry)",
                md_db_size,
                lambda: _get_dir_size(final_dump_dir),
                env=env,
            )
        if rc == 0:
            total = _get_dir_size(final_dump_dir)
            logger.info(f"Media Downloader DB dump: {total / 1e6:.1f} MB")
        else:
            errors.append(f"Media Downloader pg_dump failed: {stderr[:200]}")
            logger.error(f"Media Downloader pg_dump failed: {stderr[:200]}")
    except Exception as e:
        errors.append(f"Media Downloader pg_dump error: {e}")
        logger.error(f"Media Downloader pg_dump error: {e}")
    # ── 2-6: App archive + system configs (daily only) ─────────────────
    if include_app_archive:
        try:
            _status("Archiving media-downloader app...")
            logger.info("Archiving media-downloader app...")
            # Write directly to SSD-backed APP_BACKUP_DIR (no cross-fs move)
            final_archive = APP_BACKUP_DIR / "media-downloader-app.tar.gz"
            result = subprocess.run(
                ["tar", "czf", str(final_archive),
                 "--exclude=./venv",
                 "--exclude=./web/frontend/node_modules",
                 "--exclude=./logs",
                 "--exclude=./cache/thumbnails",
                 "--exclude=./__pycache__",
                 "--exclude=./.git",
                 "--exclude=./temp",
                 "-C", "/opt", "media-downloader"],
                capture_output=True, text=True, timeout=600
            )
            if result.returncode == 0:
                size_mb = final_archive.stat().st_size / 1e6
                logger.info(f"App archive: {size_mb:.1f} MB")
            else:
                errors.append(f"App archive failed: {result.stderr[:200]}")
                logger.error(f"App archive failed: {result.stderr[:200]}")
        except Exception as e:
            errors.append(f"App archive error: {e}")
            logger.error(f"App archive error: {e}")
        try:
            # Copy systemd unit files so services can be re-installed on restore.
            services_dir = APP_BACKUP_DIR / "systemd"
            services_dir.mkdir(parents=True, exist_ok=True)
            copied = 0
            for svc in SYSTEMD_FILES:
                src = Path(f"/etc/systemd/system/{svc}")
                if src.exists():
                    (services_dir / svc).write_text(src.read_text())
                    copied += 1
            logger.info(f"Copied {copied} systemd service files")
        except Exception as e:
            errors.append(f"Systemd backup error: {e}")
            logger.error(f"Systemd backup error: {e}")
        try:
            if RCLONE_CONF_PATH.exists():
                (APP_BACKUP_DIR / "rclone.conf").write_text(RCLONE_CONF_PATH.read_text())
                logger.info("Copied rclone.conf")
        except Exception as e:
            errors.append(f"rclone config backup error: {e}")
            logger.error(f"rclone config backup error: {e}")
        try:
            compose_src = IMMICH_BASE / "docker-compose.yml"
            if compose_src.exists():
                (APP_BACKUP_DIR / "immich-docker-compose.yml").write_text(compose_src.read_text())
            env_src = IMMICH_BASE / ".env"
            if env_src.exists():
                (APP_BACKUP_DIR / "immich-env").write_text(env_src.read_text())
            logger.info("Copied Immich docker-compose + .env")
        except Exception as e:
            errors.append(f"Immich compose backup error: {e}")
            logger.error(f"Immich compose backup error: {e}")
        try:
            restore_src = Path("/opt/media-downloader/scripts/cloud_backup_restore.sh")
            if restore_src.exists():
                (APP_BACKUP_DIR / "RESTORE.sh").write_text(restore_src.read_text())
                logger.info("Copied restore script as RESTORE.sh")
        except Exception as e:
            errors.append(f"Restore script backup error: {e}")
            logger.error(f"Restore script backup error: {e}")
    return errors
def _run_full_backup(config: dict, job_id: str):
    """Run Media Downloader DB dump + rclone sync.
    Immich DB + app archive are daily-only (handled by daemon at 3 AM).
    """
    pre_errors = _run_pre_sync_dumps(
        job_id=job_id,
        include_immich_db=False,
        include_app_archive=False,
    )
    _run_file_sync(config, job_id, extra_errors=pre_errors)
def _parse_rclone_stats(line: str) -> Optional[Dict]:
"""Parse an rclone --stats-one-line output line into structured data.
Actual rclone format:
INFO : 1.424 GiB / 1.424 GiB, 100%, 0 B/s, ETA -
INFO : 1.424 GiB / 1.424 GiB, 100%, 10.5 MiB/s, ETA 7m30s, Checks: 100, Transferred: 15 / 45
"""
stats = {}
try:
# Extract: bytes_done / bytes_total, pct%, speed
m = re.search(r'([\d.]+\s*\S*?B)\s*/\s*([\d.]+\s*\S*?B),\s*(\d+)%,\s*([\d.]+\s*\S*?/s)', line)
if m:
stats['bytes_done'] = m.group(1).strip()
stats['bytes_total'] = m.group(2).strip()
stats['pct'] = int(m.group(3))
stats['speed'] = m.group(4).strip()
# Extract ETA
m_eta = re.search(r'ETA\s+([\w.]+)', line)
if m_eta and m_eta.group(1) != '-':
stats['eta'] = m_eta.group(1)
# Extract file count: "Transferred: 15 / 45" at end of one-line stats
m_files = re.search(r'Transferred:\s*(\d+)\s*/\s*(\d+)', line)
if m_files:
stats['files_done'] = int(m_files.group(1))
stats['files_total'] = int(m_files.group(2))
# Extract checks count — "Checks: 100" or "(chk#573/582)"
m_checks = re.search(r'Checks:\s*(\d+)', line)
if m_checks:
stats['checks_done'] = int(m_checks.group(1))
m_chk = re.search(r'chk#(\d+)/(\d+)', line)
if m_chk:
stats['checks_done'] = int(m_chk.group(1))
stats['checks_total'] = int(m_chk.group(2))
except (ValueError, AttributeError):
pass
return stats if stats else None
def _run_file_sync(config: dict, job_id: str, extra_errors: list = None):
    """Run rclone sync for all included directories with live transfer stats.

    Args:
        config: Cloud backup config dict (include_dirs, encryption_enabled,
            bucket, bandwidth_limit).
        job_id: In-memory job id that receives per-directory progress.
        extra_errors: Errors from the pre-sync dump phase, carried into the
            final job status.

    Side effects: appends to LOG_FILE, updates the job tracker, writes the
    manual-sync status file and triggers a cloud storage stats refresh.
    """
    include_dirs = config.get('include_dirs', DEFAULT_INCLUDE_DIRS)
    encryption_enabled = config.get('encryption_enabled', True)
    bandwidth_limit = config.get('bandwidth_limit')
    bucket = config.get('bucket', '')
    # Only sync directories that actually exist under the media root.
    dirs_to_sync = [d for d in include_dirs if (IMMICH_BASE / d).is_dir()]
    _update_sync_job(job_id, {'dirs_total': len(dirs_to_sync), 'transfer_stats': None})
    errors = list(extra_errors) if extra_errors else []
    for i, dir_name in enumerate(dirs_to_sync):
        _update_sync_job(job_id, {
            'current_dir': dir_name, 'dirs_synced': i,
            'transfer_stats': None, 'current_file': None,
            'phase': 'checking',
        })
        src = str(IMMICH_BASE / dir_name)
        # Encrypted syncs go through the crypt remote (bucket is baked into it).
        if encryption_enabled:
            dest = f"{RCLONE_CRYPT_NAME}:{dir_name}"
        else:
            dest = f"{RCLONE_REMOTE_NAME}:{bucket}/{dir_name}"
        cmd = [
            "rclone", "sync", src, dest,
            "--config", str(RCLONE_CONF_PATH),
            "--stats", "3s",
            "--stats-one-line",
            "-v",
            "--transfers", "4",
            "--checkers", "16",
            "--fast-list",
        ]
        if bandwidth_limit:
            cmd.extend(["--bwlimit", bandwidth_limit])
        try:
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            log_fh = open(str(LOG_FILE), 'a')
            ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            log_fh.write(f"\n[{ts}] Syncing {dir_name}/ ({i+1}/{len(dirs_to_sync)})\n")
            log_fh.flush()
            try:
                # rclone writes its log/stats lines to stderr; stream them live.
                for line in proc.stderr:
                    stripped = line.strip()
                    if not stripped:
                        continue
                    # Clean rclone output: strip syslog prefix "<6>INFO :" → clean message
                    clean = re.sub(r'^<\d+>', '', stripped)  # strip syslog priority
                    clean = re.sub(r'^(INFO|DEBUG|ERROR|NOTICE)\s*:\s*', '', clean).strip()
                    # Detect progress/stats lines (with % or chk# markers)
                    is_progress = ('%,' in stripped and '/s' in stripped) or 'chk#' in stripped
                    # Write cleaned output to log file
                    if is_progress:
                        # Progress line — only log the latest (overwrite style)
                        log_fh.write(f" [progress] {clean}\n")
                    elif ': Copied' in stripped or ': Moved' in stripped:
                        m = re.search(r'(.+?):\s*(Copied|Moved)\s*(.*)', clean)
                        if m:
                            fname = m.group(1).strip()
                            action = m.group(2)
                            detail = m.group(3).strip()
                            short = fname.split('/')[-1] if '/' in fname else fname
                            log_fh.write(f" {action}: {short} {detail}\n")
                        else:
                            log_fh.write(f" {clean}\n")
                    elif 'Checks:' in clean and 'Transferred:' in clean:
                        # Summary line at end of dir sync
                        log_fh.write(f" {clean}\n")
                    elif clean:
                        log_fh.write(f" {clean}\n")
                    log_fh.flush()
                    # Update live stats for frontend
                    if is_progress:
                        stats = _parse_rclone_stats(stripped)
                        if stats:
                            # Determine phase from stats
                            phase = 'checking'
                            if stats.get('files_done', 0) > 0 or stats.get('files_total', 0) > 0:
                                phase = 'transferring'
                            elif stats.get('pct', 0) > 0:
                                phase = 'transferring'
                            _update_sync_job(job_id, {'transfer_stats': stats, 'phase': phase})
                    if ': Copied' in stripped or ': Moved' in stripped:
                        m = re.search(r'INFO\s*:\s*(.+?):\s*(?:Copied|Moved)', stripped)
                        if m:
                            fname = m.group(1).strip()
                            short = fname.split('/')[-1] if '/' in fname else fname
                            _update_sync_job(job_id, {'current_file': short, 'phase': 'transferring'})
            finally:
                log_fh.close()
            proc.wait()
            ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            if proc.returncode != 0:
                err_msg = f"Sync failed for {dir_name}: exit code {proc.returncode}"
                errors.append(err_msg)
                logger.error(err_msg)
                with open(str(LOG_FILE), 'a') as f:
                    f.write(f"[{ts}] ERROR: {dir_name}/ failed (exit {proc.returncode})\n")
            else:
                with open(str(LOG_FILE), 'a') as f:
                    f.write(f"[{ts}] Completed {dir_name}/\n")
        except Exception as e:
            errors.append(f"Sync error for {dir_name}: {e}")
            logger.error(f"Sync error for {dir_name}: {e}")
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    status_msg = 'completed' if not errors else f'completed with {len(errors)} error(s)'
    with open(str(LOG_FILE), 'a') as f:
        # BUGFIX: the original f-string ran status and count together
        # ("Sync completed11 directories"); separate them with a colon.
        f.write(f"\n[{ts}] Sync {status_msg}: {len(dirs_to_sync)} directories\n{'='*60}\n")
    completed_status = 'completed' if not errors else 'completed_with_errors'
    _update_sync_job(job_id, {
        'status': completed_status,
        'completed_at': datetime.now().isoformat(),
        'dirs_synced': len(dirs_to_sync),
        'current_dir': None,
        'error': '; '.join(errors) if errors else None,
    })
    # Write status file for daemon/dashboard
    _write_manual_sync_status(completed_status, errors)
    _cleanup_old_sync_jobs()
    # Auto-refresh cloud storage stats after sync
    _trigger_cloud_status_refresh()
def _write_manual_sync_status(status: str, errors: list):
    """Write sync completion info to status file for dashboard consumption."""
    try:
        # Merge into the daemon's existing status rather than clobbering it.
        payload = _read_status_file()
        payload['last_manual_sync'] = datetime.now().isoformat()
        payload['last_manual_sync_status'] = status
        if errors:
            payload['last_manual_sync_errors'] = errors
        STATUS_FILE.write_text(json.dumps(payload))
    except OSError as e:
        logger.warning(f"Failed to write status file: {e}")
# ============================================================================
# API ENDPOINTS
# ============================================================================
@router.get("/config")
async def get_config(user=Depends(get_current_user)):
    """Get cloud backup configuration (sensitive fields masked)."""
    return _mask_config(_load_config())
@router.put("/config")
async def update_config(update: CloudBackupConfigUpdate, user=Depends(require_admin)):
    """Save cloud backup configuration and regenerate rclone.conf.

    Masked sensitive values ("****") keep their stored values. Enabling the
    backup requires all credential fields to be present (400 otherwise).
    """
    merged = _merge_config_update(_load_config(), update.model_dump(exclude_unset=True))
    # Validate required fields if enabling
    if merged.get('enabled'):
        required = ['endpoint', 'bucket', 'key_id', 'application_key']
        if merged.get('encryption_enabled'):
            required += ['encryption_password', 'encryption_salt']
        missing = [name for name in required if not merged.get(name)]
        if missing:
            from fastapi import HTTPException
            raise HTTPException(status_code=400, detail=f"Missing required fields: {', '.join(missing)}")
    _save_config(merged)
    # Regenerate rclone config if we have credentials
    if merged.get('key_id') and merged.get('application_key'):
        try:
            _regenerate_rclone_config(merged)
        except Exception as e:
            logger.error(f"Failed to regenerate rclone config: {e}")
            return {"status": "saved", "rclone_config": "failed", "error": str(e), "config": _mask_config(merged)}
    return {"status": "saved", "rclone_config": "updated", "config": _mask_config(merged)}
@router.get("/status")
async def get_status(user=Depends(get_current_user)):
    """Get sync status including service state, last sync time, storage info.
    Reads job progress from the daemon's status file. When the daemon is
    running a manual sync (has job_id), its progress fields take priority
    over in-memory job tracking.
    """
    config = _load_config()
    service_status = _get_service_status()
    daemon_status = _read_status_file()
    active_job = None
    daemon_job_id = daemon_status.get('job_id')
    # Priority 1: Daemon is actively syncing a job — build active_job from daemon progress
    if daemon_job_id and daemon_status.get('state') == 'syncing':
        active_job = {
            'id': daemon_job_id,
            'status': 'running',
            'started_at': daemon_status.get('started_at'),
            'completed_at': None,
            'dirs_synced': daemon_status.get('dirs_synced', 0),
            'dirs_total': daemon_status.get('dirs_total', 0),
            'current_dir': daemon_status.get('current_dir'),
            'current_file': daemon_status.get('current_file'),
            'transfer_stats': daemon_status.get('transfer_stats'),
            'phase': daemon_status.get('phase', 'syncing'),
            'error': None,
        }
        # Also update in-memory job to stay in sync
        if _get_sync_job(daemon_job_id):
            _update_sync_job(daemon_job_id, {
                'dirs_synced': daemon_status.get('dirs_synced', 0),
                'dirs_total': daemon_status.get('dirs_total', 0),
                'current_dir': daemon_status.get('current_dir'),
                'current_file': daemon_status.get('current_file'),
                'transfer_stats': daemon_status.get('transfer_stats'),
                'phase': daemon_status.get('phase', 'syncing'),
            })
    # Priority 2: Daemon has a completed job — update in-memory job
    elif daemon_job_id and daemon_status.get('completed_at'):
        in_mem = _get_sync_job(daemon_job_id)
        # Only transition once: the in-memory record still says 'running'.
        if in_mem and in_mem.get('status') == 'running':
            completed_status = 'completed' if not daemon_status.get('sync_error') else 'completed_with_errors'
            _update_sync_job(daemon_job_id, {
                'status': completed_status,
                'completed_at': daemon_status.get('completed_at'),
                'dirs_synced': daemon_status.get('dirs_synced', 0),
                'dirs_total': daemon_status.get('dirs_total', 0),
                'error': daemon_status.get('sync_error'),
                'phase': 'completed',
                'current_dir': None,
                'current_file': None,
                'transfer_stats': None,
            })
            # Sync just finished in daemon — refresh cloud storage stats
            _trigger_cloud_status_refresh()
    # Priority 3: Fall back to in-memory jobs (legacy / pending trigger)
    if active_job is None:
        with _jobs_lock:
            for job in _sync_jobs.values():
                if job['status'] == 'running':
                    active_job = dict(job)
                    break
    # Determine effective state
    is_syncing = active_job is not None or daemon_status.get('state') == 'syncing'
    if active_job is not None:
        effective_state = 'syncing'
    elif daemon_status.get('state'):
        effective_state = daemon_status['state']
    elif service_status == 'active':
        effective_state = 'idle'
    else:
        effective_state = 'idle'
    return {
        "configured": bool(config.get('key_id') and config.get('bucket')),
        "enabled": config.get('enabled', False),
        "service_status": service_status,
        "syncing": is_syncing,
        "last_sync": daemon_status.get('last_sync', daemon_status.get('last_manual_sync')),
        "last_sync_status": daemon_status.get('last_sync_status', daemon_status.get('last_manual_sync_status')),
        "files_watched": daemon_status.get('files_watched', 0),
        "storage_used": daemon_status.get('storage_used'),
        "error_count": daemon_status.get('error_count', 0),
        "last_error": daemon_status.get('last_error'),
        "cooldown_remaining": daemon_status.get('cooldown_remaining', 0),
        "active_job": active_job,
        "state": effective_state,
    }
@router.post("/sync")
async def trigger_sync(user=Depends(get_current_user)):
    """Trigger a manual sync by delegating to the daemon via trigger file.
    The sync runs inside the daemon process (cloud-backup-sync.service),
    so it survives API/scheduler restarts. Progress is tracked via the
    daemon's status file.

    Raises 400 when unconfigured, 503 when the daemon is not active, and
    409 when a sync or trigger is already pending.
    """
    from fastapi import HTTPException
    config = _load_config()
    if not config.get('key_id') or not config.get('bucket'):
        raise HTTPException(status_code=400, detail="Cloud backup not configured")
    # Check daemon is running
    service_status = _get_service_status()
    if service_status != "active":
        raise HTTPException(status_code=503, detail=f"Cloud backup daemon is not running (status: {service_status})")
    # Check daemon not already syncing (via status file)
    daemon_status = _read_status_file()
    if daemon_status.get('state') == 'syncing':
        raise HTTPException(status_code=409, detail="A sync is already in progress")
    # Check no trigger file already pending
    if TRIGGER_FILE.exists():
        raise HTTPException(status_code=409, detail="A sync trigger is already pending")
    # Check for in-memory running jobs (race condition guard)
    with _jobs_lock:
        for job in _sync_jobs.values():
            if job['status'] == 'running':
                raise HTTPException(status_code=409, detail="A sync is already in progress")
    job_id = str(uuid.uuid4())
    # Write trigger file for daemon to pick up
    trigger_data = {
        "type": "sync",
        "job_id": job_id,
        "requested_at": datetime.now().isoformat(),
        "requested_by": "api",
    }
    try:
        TRIGGER_FILE.write_text(json.dumps(trigger_data))
    except OSError as e:
        raise HTTPException(status_code=500, detail=f"Failed to write trigger file: {e}")
    # Create in-memory job tracker for immediate API response
    _create_sync_job(job_id)
    return {"status": "started", "job_id": job_id}
@router.post("/pause")
async def pause_sync(user=Depends(get_current_user)):
"""Stop the cloud backup sync service."""
try:
result = subprocess.run(
["systemctl", "stop", SERVICE_NAME],
capture_output=True, text=True, timeout=30
)
if result.returncode != 0:
return {"status": "error", "message": result.stderr.strip()}
return {"status": "stopped"}
except Exception as e:
logger.error(f"Failed to stop sync service: {e}")
return {"status": "error", "message": str(e)}
@router.post("/resume")
async def resume_sync(user=Depends(get_current_user)):
"""Start the cloud backup sync service."""
try:
result = subprocess.run(
["systemctl", "start", SERVICE_NAME],
capture_output=True, text=True, timeout=30
)
if result.returncode != 0:
return {"status": "error", "message": result.stderr.strip()}
return {"status": "started"}
except Exception as e:
logger.error(f"Failed to start sync service: {e}")
return {"status": "error", "message": str(e)}
def _format_progress_summary(progress_line: str) -> str:
"""Format a [progress] line into a clean summary like ' Checked 108,024 / 118,056 files — 100% — 46.985 KiB/s'."""
parts = []
chk_m = re.search(r'chk#(\d+)/(\d+)', progress_line)
if chk_m:
parts.append(f"Checked {int(chk_m.group(1)):,} / {int(chk_m.group(2)):,} files")
xfr_m = re.search(r'xfr#(\d+)/(\d+)', progress_line)
if xfr_m:
parts.append(f"Transferred {int(xfr_m.group(1)):,} / {int(xfr_m.group(2)):,} files")
pct_m = re.search(r'(\d+)%', progress_line)
if pct_m:
parts.append(f"{pct_m.group(1)}%")
speed_m = re.search(r'([\d.]+\s*\S*i?B/s)', progress_line)
if speed_m:
parts.append(speed_m.group(1))
return f" {''.join(parts)}" if parts else ""
@router.get("/logs")
async def get_logs(lines: int = 200, user=Depends(get_current_user)):
"""Get last N lines from the cloud backup sync log, cleaned and summarized."""
if not LOG_FILE.exists():
return {"logs": "", "lines": 0}
try:
result = subprocess.run(
["tail", "-n", str(min(lines * 10, 5000)), str(LOG_FILE)],
capture_output=True, text=True, timeout=10
)
# Whitelist-based parser: only emit lines matching known useful patterns.
# Everything else (daemon internals, tracebacks, config noise) is dropped.
output = []
last_progress = None
# Regex to strip daemon/universal-logger timestamp prefixes from any line
_prefix_re = re.compile(
r'^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}[.,]\d+\s+'
r'(?:\[MediaDownloader\.\w+\]\s+)?'
r'(?:\[(?:INFO|DEBUG|WARNING|ERROR)\]\s+)?'
r'(?:\[Core\]\s+\[(?:INFO|DEBUG|WARNING|ERROR)\]\s+)?'
)
for line in result.stdout.splitlines():
raw = line.strip()
if not raw:
continue
# Skip universal-logger formatted lines entirely (they duplicate
# the file_handler lines with a different format)
if '[MediaDownloader.' in raw:
continue
# Strip daemon log prefix to get the clean message
stripped = _prefix_re.sub('', raw).strip()
if not stripped:
continue
# ── Capture rclone progress lines (emitted by daemon Popen loop) ──
if re.match(r'^\s*\[progress\]', stripped):
last_progress = stripped
continue
# ── Section headers: "[2026-03-22 01:13:58] Syncing paid/ (1/11)" ──
if re.match(r'^\[\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\]\s+', stripped):
# Flush pending progress summary before new section
if last_progress:
output.append(_format_progress_summary(last_progress))
last_progress = None
output.append(stripped)
continue
# ── Separator lines: "===..." ──
if re.match(r'^={10,}$', stripped):
output.append(stripped)
continue
# ── File transfer lines: "fname: Copied (new, replaced existing)" ──
m_xfer = re.match(r'^(.+?):\s*(Copied|Moved|Deleted)\s*(.*)', stripped)
if m_xfer:
fname = m_xfer.group(1).strip()
action = m_xfer.group(2)
detail = m_xfer.group(3).strip()
short = fname.split('/')[-1] if '/' in fname else fname
detail_short = f" ({detail})" if detail and detail != '(new)' else ''
output.append(f" {action}: {short}{detail_short}")
continue
# ── Our cleaned format: " Copied: filename.jpg" ──
m_clean = re.match(r'^\s*(Copied|Moved|Deleted):\s*(.+)', stripped)
if m_clean:
output.append(f" {m_clean.group(1)}: {m_clean.group(2).strip()}")
continue
# ── Completion lines from manual log writes: "Completed dir/" ──
if re.match(r'^Completed\s+\S+/', stripped):
output.append(stripped)
continue
# ── rclone sync errors ──
if re.match(r'^rclone sync failed', stripped):
if not any(stripped in prev for prev in output[-3:]):
output.append(f" ERROR: {stripped}")
continue
if re.match(r'^ERROR:', stripped):
if not any(stripped in prev for prev in output[-3:]):
output.append(stripped)
continue
# ── Sync summary lines: "Sync completed — 11 directories" ──
if re.match(r'^Sync (completed|completed with \d+ error)', stripped):
output.append(stripped)
continue
# ── pg_dump error (deduplicated) ──
if 'pg_dump failed' in stripped:
if not any('pg_dump failed' in prev for prev in output[-5:]):
output.append(f" ERROR: {stripped}")
continue
# Everything else is dropped (daemon internals, tracebacks, config noise, etc.)
# Flush trailing progress for active directory
if last_progress:
output.append(_format_progress_summary(last_progress))
# Return last N lines
final = output[-lines:] if len(output) > lines else output
log_content = '\n'.join(final)
return {"logs": log_content, "lines": len(final)}
except Exception as e:
return {"logs": f"Error reading logs: {e}", "lines": 0}
@router.post("/test")
async def test_connection(user=Depends(get_current_user)):
"""Test the rclone connection by listing the remote."""
config = _load_config()
if not config.get('key_id') or not config.get('application_key'):
from fastapi import HTTPException
raise HTTPException(status_code=400, detail="Cloud backup credentials not configured")
# Ensure rclone config is up to date
try:
_regenerate_rclone_config(config)
except Exception as e:
return {"status": "error", "message": f"Failed to write rclone config: {e}"}
# Test with rclone lsd
bucket = config.get('bucket', '')
encryption_enabled = config.get('encryption_enabled', True)
if encryption_enabled:
test_remote = f"{RCLONE_CRYPT_NAME}:"
else:
test_remote = f"{RCLONE_REMOTE_NAME}:{bucket}"
try:
result = subprocess.run(
["rclone", "lsd", test_remote, "--config", str(RCLONE_CONF_PATH), "--max-depth", "1"],
capture_output=True, text=True, timeout=30
)
if result.returncode == 0:
dirs = [line.strip().split()[-1] for line in result.stdout.strip().splitlines() if line.strip()]
return {"status": "success", "message": "Connection successful", "remote_dirs": dirs}
else:
return {"status": "error", "message": result.stderr.strip() or "Connection failed"}
except subprocess.TimeoutExpired:
return {"status": "error", "message": "Connection timed out (30s)"}
except Exception as e:
return {"status": "error", "message": str(e)}
@router.get("/job/{job_id}")
async def get_job(job_id: str, user=Depends(get_current_user)):
"""Get the status of a specific sync job.
Checks both in-memory tracking and the daemon's status file.
Daemon status takes priority when the job is actively running there.
"""
# Check daemon status file for this job
daemon_status = _read_status_file()
daemon_job_id = daemon_status.get('job_id')
if daemon_job_id == job_id:
if daemon_status.get('state') == 'syncing':
# Daemon is actively running this job
return {
'id': job_id,
'status': 'running',
'started_at': daemon_status.get('started_at'),
'completed_at': None,
'dirs_synced': daemon_status.get('dirs_synced', 0),
'dirs_total': daemon_status.get('dirs_total', 0),
'current_dir': daemon_status.get('current_dir'),
'current_file': daemon_status.get('current_file'),
'transfer_stats': daemon_status.get('transfer_stats'),
'phase': daemon_status.get('phase', 'syncing'),
'error': None,
}
elif daemon_status.get('completed_at'):
# Daemon completed this job
has_error = bool(daemon_status.get('sync_error'))
return {
'id': job_id,
'status': 'completed_with_errors' if has_error else 'completed',
'started_at': daemon_status.get('started_at'),
'completed_at': daemon_status.get('completed_at'),
'dirs_synced': daemon_status.get('dirs_synced', 0),
'dirs_total': daemon_status.get('dirs_total', 0),
'current_dir': None,
'current_file': None,
'transfer_stats': None,
'phase': 'completed',
'error': daemon_status.get('sync_error'),
}
# Fall back to in-memory job tracking
job = _get_sync_job(job_id)
if not job:
from fastapi import HTTPException
raise HTTPException(status_code=404, detail="Job not found")
return job
@router.get("/available-dirs")
async def get_available_dirs(user=Depends(get_current_user)):
"""List available directories under /opt/immich for backup."""
dirs = []
if IMMICH_BASE.is_dir():
for entry in sorted(IMMICH_BASE.iterdir()):
if entry.is_dir() and not entry.name.startswith('.'):
dirs.append(entry.name)
return {"dirs": dirs}
# On-disk cache for cloud storage stats: written by _run_cloud_status_query,
# read by the /cloud-status endpoint so it can respond instantly.
CLOUD_STATUS_CACHE_FILE = Path("/tmp/cloud_storage_status.json")
CLOUD_STATUS_REFRESH_INTERVAL = 6 * 3600  # 6 hours
# "running" flag for the background stats worker; guarded by _cloud_status_lock.
_cloud_status_query: Dict[str, Any] = {"running": False}
_cloud_status_lock = Lock()
def _trigger_cloud_status_refresh():
    """Kick off a cloud storage stats refresh in a daemon thread.

    No-op when a refresh worker is already running; the "running" flag is
    claimed under the lock before the worker thread is spawned.
    """
    import threading
    with _cloud_status_lock:
        if not _cloud_status_query["running"]:
            _cloud_status_query["running"] = True
            threading.Thread(target=_run_cloud_status_query, daemon=True).start()
def _cloud_status_periodic_loop():
    """Background loop: refresh cloud storage stats every 6 hours.

    On startup it inspects the cache file's `queried_at` timestamp and, if
    the cached data is missing or older than the refresh interval, triggers
    an immediate refresh — otherwise process restarts would keep resetting
    the 6-hour countdown indefinitely.
    """
    import time as _time
    # Startup staleness check (after a short delay so the API fully starts).
    try:
        _time.sleep(30)
        needs_refresh = True
        if CLOUD_STATUS_CACHE_FILE.exists():
            ts = json.loads(CLOUD_STATUS_CACHE_FILE.read_text()).get('queried_at')
            if ts:
                from datetime import datetime as _dt
                elapsed = (_dt.now() - _dt.fromisoformat(ts)).total_seconds()
                needs_refresh = elapsed > CLOUD_STATUS_REFRESH_INTERVAL
        if needs_refresh:
            cfg = _load_config()
            if cfg.get('key_id') and cfg.get('bucket'):
                logger.info("Cloud status cache is stale on startup, refreshing...")
                _trigger_cloud_status_refresh()
    except Exception as e:
        logger.warning(f"Startup cloud status staleness check error: {e}")
    # Steady-state: one refresh per interval, only while backup is configured.
    while True:
        _time.sleep(CLOUD_STATUS_REFRESH_INTERVAL)
        try:
            cfg = _load_config()
            if cfg.get('key_id') and cfg.get('bucket'):
                logger.info("Periodic cloud status refresh starting...")
                _trigger_cloud_status_refresh()
        except Exception as e:
            logger.warning(f"Periodic cloud status refresh error: {e}")
# Start the periodic refresh thread.
# NOTE: this runs at import time; the thread is daemon=True so it never
# blocks interpreter shutdown.
import threading as _threading
_cloud_status_timer = _threading.Thread(target=_cloud_status_periodic_loop, daemon=True)
_cloud_status_timer.start()
def _run_cloud_status_query():
    """Background worker: query rclone for cloud storage stats and cache to disk.

    Writes JSON snapshots to CLOUD_STATUS_CACHE_FILE: first an initial
    payload with ``query_status: "running"`` (so the frontend can show
    progress), then — in the ``finally`` block — the final payload marked
    ``"complete"`` or ``"error"``. Always clears the module-level
    ``_cloud_status_query["running"]`` flag on exit, under
    ``_cloud_status_lock``.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    config = _load_config()
    bucket = config.get('bucket', '')
    encryption_enabled = config.get('encryption_enabled', True)
    provider = config.get('provider', 'b2')
    endpoint = config.get('endpoint', '')
    # The crypt remote already targets the bucket; plain remote needs it appended.
    if encryption_enabled:
        remote = f"{RCLONE_CRYPT_NAME}:"
    else:
        remote = f"{RCLONE_REMOTE_NAME}:{bucket}"
    # Result skeleton; totals/directories are filled in on success.
    result = {
        "provider": provider,
        "bucket": bucket,
        "endpoint": endpoint,
        "encryption": encryption_enabled,
        "total_files": None,
        "total_size": None,
        "total_size_human": None,
        "directories": [],
        "error": None,
        "queried_at": datetime.now().isoformat(),
        "query_status": "running",
    }
    def _rclone_size(target: str, timeout: int = 120) -> dict:
        # Run `rclone size --json` for one remote path; on failure return a
        # dict with a truncated "error" message instead of raising.
        r = subprocess.run(
            ["rclone", "size", target, "--config", str(RCLONE_CONF_PATH), "--json"],
            capture_output=True, text=True, timeout=timeout
        )
        if r.returncode == 0:
            return json.loads(r.stdout)
        return {"error": r.stderr.strip()[:100]}
    def _get_dir_size(d: str) -> dict:
        # Size one top-level directory; never raises (errors/timeouts are
        # encoded in the returned dict so one bad dir doesn't sink the query).
        dir_remote = f"{remote}{d}" if remote.endswith(":") else f"{remote}/{d}"
        try:
            dd = _rclone_size(dir_remote, timeout=90)
            if "error" in dd:
                return {"name": d, "files": 0, "size": 0, "size_human": "error", "error": dd["error"]}
            return {
                "name": d,
                "files": dd.get("count", 0),
                "size": dd.get("bytes", 0),
                "size_human": _format_bytes(dd.get("bytes", 0)),
            }
        except subprocess.TimeoutExpired:
            return {"name": d, "files": 0, "size": 0, "size_human": "timeout"}
        except Exception as e:
            return {"name": d, "files": 0, "size": 0, "size_human": "error", "error": str(e)[:100]}
    try:
        # Save initial "running" state so frontend can show progress
        CLOUD_STATUS_CACHE_FILE.write_text(json.dumps(result))
        # Enumerate top-level directories on the remote.
        lsd_result = subprocess.run(
            ["rclone", "lsd", remote, "--config", str(RCLONE_CONF_PATH), "--max-depth", "1"],
            capture_output=True, text=True, timeout=30
        )
        if lsd_result.returncode != 0:
            result["error"] = lsd_result.stderr.strip()[:200]
            result["query_status"] = "error"
            CLOUD_STATUS_CACHE_FILE.write_text(json.dumps(result))
            return
        # `rclone lsd` output: last whitespace-separated token is the dir name.
        dir_names = []
        for line in lsd_result.stdout.strip().splitlines():
            parts = line.strip().split()
            if parts:
                dir_names.append(parts[-1])
        # Size directories concurrently (up to 6 rclone processes at once).
        with ThreadPoolExecutor(max_workers=6) as executor:
            dir_futures = {executor.submit(_get_dir_size, d): d for d in sorted(dir_names)}
            dir_infos = []
            for future in as_completed(dir_futures, timeout=300):
                dir_infos.append(future.result())
        # Compute totals from per-directory results (avoids slow rclone size on entire remote)
        total_files = sum(d.get("files", 0) for d in dir_infos)
        total_bytes = sum(d.get("size", 0) for d in dir_infos)
        result["total_files"] = total_files
        result["total_size"] = total_bytes
        result["total_size_human"] = _format_bytes(total_bytes)
        result["directories"] = dir_infos
        result["query_status"] = "complete"
    except Exception as e:
        result["error"] = str(e)[:200]
        result["query_status"] = "error"
    finally:
        # Always persist the final snapshot and release the "running" flag.
        result["queried_at"] = datetime.now().isoformat()
        CLOUD_STATUS_CACHE_FILE.write_text(json.dumps(result))
        with _cloud_status_lock:
            _cloud_status_query["running"] = False
@router.get("/cloud-status")
async def get_cloud_status(user=Depends(get_current_user)):
"""Return cached cloud storage stats. Returns instantly."""
config = _load_config()
if not config.get('key_id') or not config.get('bucket'):
from fastapi import HTTPException
raise HTTPException(status_code=400, detail="Cloud backup not configured")
# Return cached results if available
cached = {}
try:
if CLOUD_STATUS_CACHE_FILE.exists():
cached = json.loads(CLOUD_STATUS_CACHE_FILE.read_text())
except (json.JSONDecodeError, OSError):
pass
with _cloud_status_lock:
is_running = _cloud_status_query["running"]
if cached:
cached["query_running"] = is_running
return cached
return {
"provider": config.get('provider', 'b2'),
"bucket": config.get('bucket', ''),
"endpoint": config.get('endpoint', ''),
"encryption": config.get('encryption_enabled', True),
"total_files": None,
"total_size": None,
"total_size_human": None,
"directories": [],
"error": None,
"queried_at": None,
"query_status": "never",
"query_running": is_running,
}
@router.post("/cloud-status/refresh")
async def refresh_cloud_status(background_tasks: BackgroundTasks, user=Depends(get_current_user)):
"""Trigger a background refresh of cloud storage stats."""
config = _load_config()
if not config.get('key_id') or not config.get('bucket'):
from fastapi import HTTPException
raise HTTPException(status_code=400, detail="Cloud backup not configured")
with _cloud_status_lock:
if _cloud_status_query["running"]:
return {"status": "already_running"}
_cloud_status_query["running"] = True
background_tasks.add_task(_run_cloud_status_query)
return {"status": "started"}
def _format_bytes(b: int) -> str:
"""Format bytes into human-readable string."""
if b is None or b == 0:
return "0 B"
for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
if abs(b) < 1024.0:
return f"{b:.1f} {unit}" if unit != 'B' else f"{b} B"
b /= 1024.0
return f"{b:.1f} EB"