434 lines
14 KiB
Python
Executable File
434 lines
14 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Migrate Immich assets into file_inventory and face_recognition_scans.
|
|
|
|
Connects to Immich PostgreSQL (via docker exec) and app PostgreSQL directly.
|
|
Idempotent — safe to re-run. Uses ON CONFLICT DO NOTHING for file_inventory
|
|
and checks for existing immich_import scans before inserting face data.
|
|
|
|
Path mapping:
|
|
/mnt/media/evalongoria/ → /opt/immich/el/
|
|
/mnt/media/elvideo/ → /opt/immich/elv/
|
|
/mnt/media/md/ → SKIPPED (already in file_inventory)
|
|
|
|
Platform inference from subdirectories:
|
|
evalongoria: IG→instagram, TT→tiktok, X→twitter, Discord→discord,
|
|
Flickr→flickr, rest→unknown
|
|
elvideo: YT→youtube, rest→unknown
|
|
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
|
|
# ── Configuration ──────────────────────────────────────────────────────────

# DSN for the app's own PostgreSQL (connected directly via psycopg2).
# NOTE(review): credentials are hard-coded — consider moving to an env var.
APP_DB_DSN = "postgresql://media_downloader:PNsihOXvvuPwWiIvGlsc9Fh2YmMmB@localhost/media_downloader"
# Immich's PostgreSQL is reached indirectly via `docker exec` into this container.
IMMICH_CONTAINER = "immich_postgres"
IMMICH_DB = "immich"
IMMICH_USER = "postgres"

# Rows accumulated per execute_values() call / commit in both migration phases.
BATCH_SIZE = 5000

# Immich person UUID used to select face detections in Phase 2.
EVA_PERSON_UUID = "0154270a-8c30-4fb7-b73b-3fb3acc49483"

# Path prefix replacements (Immich → local).  Paths with no matching prefix
# (e.g. /mnt/media/md/) are skipped by map_path().
PATH_MAP = {
    "/mnt/media/evalongoria/": "/opt/immich/el/",
    "/mnt/media/elvideo/": "/opt/immich/elv/",
}

# Subdirectory → platform mapping for evalongoria
EVALONGORIA_PLATFORM_MAP = {
    "IG": "instagram",
    "TT": "tiktok",
    "X": "twitter",
    "Discord": "discord",
    "Flickr": "flickr",
    # The remaining subdirectories carry no platform information.
    "SC": "unknown",
    "Caps": "unknown",
    "Clips": "unknown",
    "CT": "unknown",
    "HQ": "unknown",
    "Misc": "unknown",
}

# Subdirectory → platform mapping for elvideo
ELVIDEO_PLATFORM_MAP = {
    "YT": "youtube",
    "Misc": "unknown",
}
|
|
|
|
|
|
# ── Immich DB helper ───────────────────────────────────────────────────────
|
|
|
|
def immich_query(sql):
    """Execute *sql* against Immich PostgreSQL through `docker exec`.

    Runs psql inside the Immich container in tuples-only, unaligned mode
    with the unit-separator byte (0x1f) as the field delimiter, and returns
    the stripped stdout.  On a non-zero psql exit status the error is
    printed to stderr and the process exits with status 1.
    """
    command = [
        "docker", "exec", IMMICH_CONTAINER,
        "psql", "-U", IMMICH_USER, "-d", IMMICH_DB,
        # -t: tuples only, -A: unaligned, -F: field separator (0x1f)
        "-t", "-A", "-F", "\x1f",
        "-c", sql,
    ]
    proc = subprocess.run(command, capture_output=True, text=True, timeout=300)
    if proc.returncode != 0:
        print(f"ERROR running Immich query: {proc.stderr}", file=sys.stderr)
        sys.exit(1)
    return proc.stdout.strip()
|
|
|
|
|
|
def immich_query_rows(sql, columns):
    """Run *sql* via immich_query() and parse the output into dicts.

    Each non-blank output line is split on the 0x1f delimiter; lines whose
    field count does not match *columns* are silently dropped.  Returns a
    list of dicts keyed by the given column names (empty list on no output).
    """
    raw = immich_query(sql)
    if not raw:
        return []
    parsed = []
    for line in raw.split("\n"):
        if not line.strip():
            continue
        fields = line.split("\x1f")
        # Keep only well-formed rows (guards against stray psql output).
        if len(fields) == len(columns):
            parsed.append(dict(zip(columns, fields)))
    return parsed
|
|
|
|
|
|
# ── Path & platform helpers ────────────────────────────────────────────────
|
|
|
|
def map_path(immich_path):
    """Translate an Immich originalPath into its local equivalent.

    Applies the first matching prefix substitution from PATH_MAP.  Returns
    None when no prefix matches (e.g. /mnt/media/md/ paths, which are
    already present in file_inventory) so callers can skip the asset.
    """
    for remote_prefix, local_prefix in PATH_MAP.items():
        if immich_path.startswith(remote_prefix):
            suffix = immich_path[len(remote_prefix):]
            return local_prefix + suffix
    return None
|
|
|
|
|
|
def infer_platform(immich_path):
    """Infer the source platform from an Immich path's first subdirectory.

    evalongoria and elvideo trees each have their own subdirectory→platform
    table; anything else (unknown root, file directly under the root, or an
    unmapped subdirectory) yields "unknown".
    """
    roots = (
        ("/mnt/media/evalongoria/", EVALONGORIA_PLATFORM_MAP),
        ("/mnt/media/elvideo/", ELVIDEO_PLATFORM_MAP),
    )
    for root, subdir_map in roots:
        if not immich_path.startswith(root):
            continue
        rest = immich_path[len(root):]
        # Only a path that actually contains a subdirectory component can
        # carry platform information; a bare filename maps to "unknown".
        if "/" in rest:
            subdir = rest.split("/")[0]
            if subdir in subdir_map:
                return subdir_map[subdir]
        return "unknown"
    return "unknown"
|
|
|
|
|
|
def infer_content_type(asset_type):
    """Map an Immich asset type ("IMAGE"/"VIDEO") to a content_type string.

    Anything other than the two known types maps to "unknown".
    """
    return {"IMAGE": "image", "VIDEO": "video"}.get(asset_type, "unknown")
|
|
|
|
|
|
# ── Main migration ─────────────────────────────────────────────────────────
|
|
|
|
def migrate_assets(app_conn):
    """Fetch assets from Immich and insert into file_inventory.

    Phase 1 of the migration: pulls every asset under the evalongoria and
    elvideo trees from the Immich DB, remaps paths via map_path(), and
    batch-inserts with ON CONFLICT (file_path) DO NOTHING, so re-runs are
    idempotent.  Commits after every batch.  Returns the number of rows
    actually inserted (conflicting rows are not counted).
    """
    print("=" * 60)
    print("Phase 1: Migrating Immich assets → file_inventory")
    print("=" * 60)

    # Fetch all evalongoria + elvideo assets from Immich.  Everything is
    # cast to text because immich_query_rows() parses psql's text output.
    sql = """
        SELECT
            a.id::text,
            a."originalPath",
            a."originalFileName",
            a.type,
            a."fileCreatedAt"::text,
            a."deletedAt"::text,
            a.width::text,
            a.height::text,
            encode(a.checksum, 'hex') as file_hash,
            COALESCE(e."fileSizeInByte"::text, '') as file_size
        FROM asset a
        LEFT JOIN asset_exif e ON a.id = e."assetId"
        WHERE (a."originalPath" LIKE '/mnt/media/evalongoria/%'
            OR a."originalPath" LIKE '/mnt/media/elvideo/%')
        ORDER BY a."fileCreatedAt"
    """

    print("Fetching assets from Immich...")
    # Column order must match the SELECT list above.
    columns = [
        "id", "originalPath", "originalFileName", "type",
        "fileCreatedAt", "deletedAt", "width", "height",
        "file_hash", "file_size",
    ]
    rows = immich_query_rows(sql, columns)
    total = len(rows)
    print(f" Found {total:,} assets to process")

    # Prepare and batch-insert
    inserted = 0
    skipped = 0
    batch = []

    cur = app_conn.cursor()

    insert_sql = """
        INSERT INTO file_inventory
            (file_path, filename, platform, source, content_type,
             file_size, file_hash, width, height, location, created_date)
        VALUES %s
        ON CONFLICT (file_path) DO NOTHING
    """

    for i, row in enumerate(rows):
        local_path = map_path(row["originalPath"])
        if local_path is None:
            # md/ or unknown prefix — already in file_inventory, skip.
            skipped += 1
            continue

        platform = infer_platform(row["originalPath"])
        content_type = infer_content_type(row["type"])
        # Soft-deleted assets in Immich land in the 'recycle' location.
        location = "recycle" if row["deletedAt"] else "final"

        # psql emits empty strings for NULLs; convert to real None/int.
        width = int(row["width"]) if row["width"] else None
        height = int(row["height"]) if row["height"] else None
        file_size = int(row["file_size"]) if row["file_size"] else None

        # Parse timestamp — strip timezone info for timestamp without time zone column
        created_date = row["fileCreatedAt"]
        if created_date:
            # Remove timezone suffix like +00 or +00:00 for naive timestamp
            # NOTE(review): only UTC (+00 / +00:00) suffixes are stripped;
            # any other offset would pass through unchanged — confirm Immich
            # stores these timestamps in UTC.
            created_date = created_date.replace("+00:00", "").replace("+00", "").strip()

        batch.append((
            local_path,
            row["originalFileName"],
            platform,
            # NOTE(review): 'source' is hard-coded to "evalongoria" for BOTH
            # the el and elv trees — presumably intentional; verify.
            "evalongoria",
            content_type,
            file_size,
            row["file_hash"],
            width,
            height,
            location,
            created_date if created_date else None,
        ))

        if len(batch) >= BATCH_SIZE:
            psycopg2.extras.execute_values(
                cur, insert_sql, batch,
                template="(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            )
            # rowcount reflects rows actually inserted (conflicts excluded).
            inserted += cur.rowcount
            app_conn.commit()
            processed = i + 1
            print(f" Progress: {processed:,}/{total:,} processed, {inserted:,} inserted")
            batch = []

    # Final batch
    if batch:
        psycopg2.extras.execute_values(
            cur, insert_sql, batch,
            template="(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        )
        inserted += cur.rowcount
        app_conn.commit()

    cur.close()
    print(f"\n DONE: {inserted:,} rows inserted, {skipped:,} skipped (md/ paths)")
    return inserted
|
|
|
|
|
|
def migrate_face_detections(app_conn):
    """Migrate Eva Longoria face detections from Immich → face_recognition_scans.

    Phase 2 of the migration.  Idempotence is enforced by bailing out when
    any scan_type='immich_import' rows already exist (there is no unique
    constraint to lean on, unlike Phase 1).  Each qualifying asset gets one
    scan row with has_match=True, confidence=1.0, and its per-asset face
    count.  Returns the number of scan rows inserted.
    """
    print("\n" + "=" * 60)
    print("Phase 2: Migrating face detections → face_recognition_scans")
    print("=" * 60)

    # First, check if we already ran this migration
    cur = app_conn.cursor()
    cur.execute("SELECT COUNT(*) FROM face_recognition_scans WHERE scan_type = 'immich_import'")
    existing = cur.fetchone()[0]
    if existing > 0:
        print(f" Found {existing:,} existing immich_import scans — skipping face migration")
        print(" (Delete existing immich_import scans first if you want to re-run)")
        cur.close()
        return 0

    # Get distinct assets with Eva Longoria face + face count + path in one query
    print("Fetching face detection data with paths from Immich...")
    # NOTE: EVA_PERSON_UUID is a trusted module constant, so the f-string
    # interpolation below is not an injection vector in practice.
    sql = f"""
        SELECT
            a."originalPath",
            COUNT(*) as eva_faces
        FROM asset_face af
        JOIN asset a ON af."assetId" = a.id
        WHERE af."personId" = '{EVA_PERSON_UUID}'
          AND af."deletedAt" IS NULL
          AND (a."originalPath" LIKE '/mnt/media/evalongoria/%'
            OR a."originalPath" LIKE '/mnt/media/elvideo/%')
        GROUP BY a."originalPath"
    """
    # Second column is aliased eva_faces in SQL but consumed as face_count here.
    columns = ["originalPath", "face_count"]
    face_rows = immich_query_rows(sql, columns)
    print(f" Found {len(face_rows):,} assets with Eva Longoria face detections")

    # Build file_path lookup from file_inventory (for /opt/immich/el/ and /opt/immich/elv/ paths)
    print("Building file_inventory lookup...")
    cur.execute("""
        SELECT file_path FROM file_inventory
        WHERE file_path LIKE '/opt/immich/el/%' OR file_path LIKE '/opt/immich/elv/%'
    """)
    inventory_paths = set(row[0] for row in cur.fetchall())
    print(f" {len(inventory_paths):,} paths in file_inventory for el/elv")

    # Prepare face scan inserts
    insert_sql = """
        INSERT INTO face_recognition_scans
            (file_path, has_match, matched_person, confidence, face_count, scan_type)
        VALUES %s
    """

    batch = []
    inserted = 0
    skipped_not_in_inventory = 0
    total = len(face_rows)

    for i, row in enumerate(face_rows):
        local_path = map_path(row["originalPath"])
        if local_path is None:
            # md/ or unknown prefix — asset wasn't migrated, skip silently.
            continue

        # Only insert scans for files Phase 1 actually placed in inventory.
        if local_path not in inventory_paths:
            skipped_not_in_inventory += 1
            continue

        face_count = int(row["face_count"])

        batch.append((
            local_path,
            True,            # has_match
            "Eva Longoria",  # matched_person
            1.0,             # confidence — Immich assignment treated as certain
            face_count,
            "immich_import",
        ))

        if len(batch) >= BATCH_SIZE:
            psycopg2.extras.execute_values(
                cur, insert_sql, batch,
                template="(%s, %s, %s, %s, %s, %s)",
            )
            inserted += cur.rowcount
            app_conn.commit()
            print(f" Progress: {i + 1:,}/{total:,} processed, {inserted:,} inserted")
            batch = []

    # Flush the final partial batch.
    if batch:
        psycopg2.extras.execute_values(
            cur, insert_sql, batch,
            template="(%s, %s, %s, %s, %s, %s)",
        )
        inserted += cur.rowcount
        app_conn.commit()

    cur.close()
    print(f"\n DONE: {inserted:,} face scans inserted")
    print(f" Skipped: {skipped_not_in_inventory:,} (not in file_inventory)")
    return inserted
|
|
|
|
|
|
def verify(app_conn):
    """Print verification counts.

    Read-only summary of both migration phases: per-tree file_inventory
    counts, location and platform breakdowns, and face_recognition_scans
    totals.  Makes no writes and returns nothing.
    """
    print("\n" + "=" * 60)
    print("Verification")
    print("=" * 60)

    cur = app_conn.cursor()

    # file_inventory counts
    cur.execute("SELECT COUNT(*) FROM file_inventory WHERE file_path LIKE '/opt/immich/el/%'")
    el_count = cur.fetchone()[0]

    cur.execute("SELECT COUNT(*) FROM file_inventory WHERE file_path LIKE '/opt/immich/elv/%'")
    elv_count = cur.fetchone()[0]

    cur.execute("""
        SELECT location, COUNT(*)
        FROM file_inventory
        WHERE file_path LIKE '/opt/immich/el/%' OR file_path LIKE '/opt/immich/elv/%'
        GROUP BY location
    """)
    location_counts = dict(cur.fetchall())

    cur.execute("""
        SELECT platform, COUNT(*)
        FROM file_inventory
        WHERE file_path LIKE '/opt/immich/el/%' OR file_path LIKE '/opt/immich/elv/%'
        GROUP BY platform
        ORDER BY 2 DESC
    """)
    platform_counts = cur.fetchall()

    # face_recognition_scans counts
    cur.execute("SELECT COUNT(*) FROM face_recognition_scans WHERE scan_type = 'immich_import'")
    face_count = cur.fetchone()[0]

    cur.execute("SELECT COUNT(*) FROM face_recognition_scans")
    total_face_scans = cur.fetchone()[0]

    # Total file_inventory
    cur.execute("SELECT COUNT(*) FROM file_inventory")
    total_inventory = cur.fetchone()[0]

    cur.close()

    print(f"\n file_inventory:")
    print(f" /opt/immich/el/* (evalongoria): {el_count:,}")
    print(f" /opt/immich/elv/* (elvideo): {elv_count:,}")
    print(f" Total new: {el_count + elv_count:,}")
    # NOTE(review): location_counts is already a dict — dict() here is redundant.
    print(f" By location: {dict(location_counts)}")
    print(f" By platform:")
    for platform, count in platform_counts:
        print(f" {platform:12s}: {count:,}")

    print(f"\n face_recognition_scans:")
    print(f" immich_import: {face_count:,}")
    print(f" Total scans: {total_face_scans:,}")

    print(f"\n Total file_inventory rows: {total_inventory:,}")
|
|
|
|
|
|
def main():
    """Entry point: run both migration phases, then print verification counts.

    Fails fast if the Immich connection test fails (immich_query exits the
    process on psql errors).  The app DB connection is always closed, even
    when a phase raises.
    """
    start = time.time()
    print("Immich → file_inventory migration")
    print("=" * 60)

    # Test Immich connection
    print("Testing Immich database connection...")
    test = immich_query("SELECT COUNT(*) FROM asset")
    print(f" Immich has {int(test):,} assets")

    # Connect to app database
    print("Connecting to app database...")
    app_conn = psycopg2.connect(APP_DB_DSN)

    try:
        # Return values are reported inside each phase; nothing to keep here
        # (the originals bound them to unused locals).
        migrate_assets(app_conn)
        migrate_face_detections(app_conn)
        verify(app_conn)
    finally:
        # Guarantee the connection is released even if a phase raises.
        app_conn.close()

    elapsed = time.time() - start
    print(f"\nCompleted in {elapsed:.1f}s")
|
|
|
|
|
|
# Script entry point — run the migration only when executed directly.
if __name__ == "__main__":
    main()
|