653 lines
23 KiB
Python
653 lines
23 KiB
Python
"""
|
|
Scraper Gallery Bridge
|
|
|
|
Maps scraper accounts (Instagram, TikTok, Snapchat) to private gallery persons.
|
|
After each download session, auto-imports new media as gallery posts.
|
|
"""
|
|
|
|
import hashlib
import logging
import mimetypes
import sqlite3
import subprocess
import tempfile
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)
|
|
|
|
# Key file read by background jobs so imports can run even while the gallery UI is locked.
SCRAPER_BRIDGE_KEY_FILE = '/opt/immich/private/.scraper_bridge_key'

# Map scraper module names → platform
# Several scraper backends feed the same platform (e.g. fastdl/imginn/toolzu → instagram).
SCRAPER_TO_PLATFORM = {
    'fastdl': 'instagram',
    'imginn': 'instagram',
    'imginn_api': 'instagram',
    'instagram_client': 'instagram',
    'toolzu': 'instagram',
    'instagram': 'instagram',
    'instagram_unified': 'instagram',
    'tiktok': 'tiktok',
    'snapchat': 'snapchat',
    'snapchat_client': 'snapchat',
}

# Brand accent color per platform, used when auto-creating gallery tags.
PLATFORM_COLORS = {
    'instagram': '#E1306C',
    'tiktok': '#00f2ea',
    'snapchat': '#FFFC00',
}

# Human-readable platform names, used for tag names and post descriptions.
PLATFORM_LABELS = {
    'instagram': 'Instagram',
    'tiktok': 'TikTok',
    'snapchat': 'Snapchat',
}
|
|
|
|
|
|
def get_crypto():
    """Load crypto from key file for background access (works when gallery is locked).

    Returns the crypto helper, or None when the key file is missing/invalid.
    """
    from modules.private_gallery_crypto import load_key_from_file

    loaded = load_key_from_file(SCRAPER_BRIDGE_KEY_FILE)
    if loaded is None:
        logger.debug("Scraper bridge crypto unavailable - key file missing or invalid")
    return loaded
|
|
|
|
|
|
def get_available_accounts(platform: str, config: dict, db) -> List[Dict[str, Any]]:
    """
    Aggregate usernames from all scraper configs + paid_content_creators for a platform.

    Returns a de-duplicated, username-sorted list of dicts:
        username  - lowercased, stripped username
        sources   - sorted list of config sections / tables it came from
        is_mapped - True if already present in private_media_scraper_accounts

    Notes:
        Database lookups are best-effort: a missing table or query error is
        logged (or silently ignored for the mapping check) rather than raised.
    """
    accounts: Dict[str, set] = {}  # username -> set of sources

    def _add(raw_username: str, source: str) -> None:
        # Normalize once so the same account from multiple sources de-dupes.
        username = raw_username.strip().lower()
        if username:
            accounts.setdefault(username, set()).add(source)

    def _add_account_entries(cfg: dict, source: str) -> None:
        # Supports both the accounts[].username shape and a legacy flat
        # 'usernames' list in the same config section.
        entries = cfg.get('accounts', [])
        if not entries and 'usernames' in cfg:
            entries = [{'username': u} for u in cfg['usernames']]
        for entry in entries:
            _add(entry.get('username', ''), source)

    if platform == 'instagram':
        # instagram.accounts[].username
        ig_cfg = config.get('instagram', {})
        if ig_cfg.get('enabled', False):
            _add_account_entries(ig_cfg, 'instagram')

        # Collect usernames + phrase_search usernames from each scraper
        for scraper_id in ('fastdl', 'imginn', 'imginn_api', 'instagram_client', 'toolzu'):
            scraper_cfg = config.get(scraper_id, {})
            if not scraper_cfg.get('enabled', False):
                continue
            for u in scraper_cfg.get('usernames', []):
                _add(u, scraper_id)
            # phrase_search usernames are also downloadable accounts
            for u in scraper_cfg.get('phrase_search', {}).get('usernames', []):
                _add(u, scraper_id)

    elif platform == 'tiktok':
        tt_cfg = config.get('tiktok', {})
        if tt_cfg.get('enabled', False):
            _add_account_entries(tt_cfg, 'tiktok')

    elif platform == 'snapchat':
        # snapchat.usernames and snapchat_client.usernames
        for scraper_id in ('snapchat', 'snapchat_client'):
            sc_cfg = config.get(scraper_id, {})
            if sc_cfg.get('enabled', False):
                for u in sc_cfg.get('usernames', []):
                    _add(u, scraper_id)

    # Add from paid_content_creators table (table may not exist in older DBs)
    try:
        conn = sqlite3.connect(db.db_path, timeout=10)
        conn.row_factory = sqlite3.Row
        try:
            cursor = conn.cursor()
            cursor.execute(
                'SELECT username FROM paid_content_creators WHERE platform = ? AND enabled = 1',
                (platform,)
            )
            for row in cursor.fetchall():
                _add(row['username'], 'paid_content')
        finally:
            # Fix: original closed inside the try body, leaking the
            # connection when the query raised.
            conn.close()
    except Exception as e:
        logger.debug(f"Could not query paid_content_creators: {e}")

    # Check which are already mapped
    mapped_usernames = set()
    try:
        conn = sqlite3.connect(db.db_path, timeout=10)
        conn.row_factory = sqlite3.Row
        try:
            cursor = conn.cursor()
            cursor.execute(
                'SELECT username FROM private_media_scraper_accounts WHERE platform = ?',
                (platform,)
            )
            for row in cursor.fetchall():
                mapped_usernames.add(row['username'].lower())
        finally:
            conn.close()
    except Exception:
        pass

    return [
        {
            'username': username,
            'sources': sorted(sources),
            # keys in `accounts` are already lowercased by _add()
            'is_mapped': username in mapped_usernames,
        }
        for username, sources in sorted(accounts.items())
    ]
|
|
|
|
|
|
def _ensure_platform_tag(platform: str, db, crypto) -> int:
    """Find or create a tag for the platform in private_gallery_tags.

    Returns the tag's row id. Existing tags are matched case-insensitively
    by decrypted name.
    """
    label = PLATFORM_LABELS.get(platform, platform.title())
    conn = sqlite3.connect(db.db_path, timeout=10)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        # Tag names are stored encrypted, so scan all rows and decrypt to compare.
        cursor.execute("SELECT id, encrypted_name FROM private_gallery_tags")
        for existing in cursor.fetchall():
            try:
                decrypted = crypto.decrypt_field(existing['encrypted_name'])
            except Exception:
                continue  # undecryptable row - skip it
            if decrypted.lower() == label.lower():
                return existing['id']

        # No match found: create the tag with the platform's brand color.
        cursor.execute('''
            INSERT INTO private_gallery_tags (encrypted_name, color)
            VALUES (?, ?)
        ''', (crypto.encrypt_field(label), PLATFORM_COLORS.get(platform, '#888888')))
        conn.commit()
        tag_id = cursor.lastrowid
        logger.info(f"Created '{label}' tag with ID {tag_id}")
        return tag_id
    finally:
        conn.close()
|
|
|
|
|
|
def _get_file_info(file_path: Path) -> Dict[str, Any]:
|
|
"""Get file type, mime type, and dimensions."""
|
|
ext = file_path.suffix.lower().lstrip('.')
|
|
mime_type, _ = mimetypes.guess_type(str(file_path))
|
|
if not mime_type:
|
|
mime_type = 'application/octet-stream'
|
|
|
|
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
|
|
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
|
|
|
|
if ext in image_exts:
|
|
file_type = 'image'
|
|
elif ext in video_exts:
|
|
file_type = 'video'
|
|
else:
|
|
file_type = 'other'
|
|
|
|
width, height, duration = 0, 0, 0
|
|
|
|
if file_type == 'image':
|
|
try:
|
|
from PIL import Image
|
|
with Image.open(file_path) as img:
|
|
width, height = img.size
|
|
except Exception:
|
|
pass
|
|
elif file_type == 'video':
|
|
try:
|
|
result = subprocess.run(
|
|
['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_streams', str(file_path)],
|
|
capture_output=True, text=True, timeout=15
|
|
)
|
|
if result.returncode == 0:
|
|
import json
|
|
probe = json.loads(result.stdout)
|
|
for stream in probe.get('streams', []):
|
|
if stream.get('codec_type') == 'video':
|
|
width = int(stream.get('width', 0))
|
|
height = int(stream.get('height', 0))
|
|
dur = stream.get('duration')
|
|
if dur:
|
|
duration = int(float(dur))
|
|
break
|
|
except Exception:
|
|
pass
|
|
|
|
return {
|
|
'file_type': file_type,
|
|
'mime_type': mime_type,
|
|
'width': width,
|
|
'height': height,
|
|
'duration': duration,
|
|
}
|
|
|
|
|
|
def _compute_perceptual_hash(file_path: Path) -> Optional[str]:
|
|
"""Calculate perceptual hash for an image or video file."""
|
|
try:
|
|
import imagehash
|
|
from PIL import Image
|
|
except ImportError:
|
|
return None
|
|
|
|
ext = file_path.suffix.lower().lstrip('.')
|
|
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
|
|
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
|
|
|
|
pil_image = None
|
|
try:
|
|
if ext in video_exts:
|
|
try:
|
|
import cv2
|
|
except ImportError:
|
|
return None
|
|
cap = cv2.VideoCapture(str(file_path))
|
|
if not cap.isOpened():
|
|
return None
|
|
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
|
cap.set(cv2.CAP_PROP_POS_FRAMES, int(total_frames * 0.5))
|
|
ret, frame = cap.read()
|
|
cap.release()
|
|
if not ret or frame is None:
|
|
return None
|
|
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
|
pil_image = Image.fromarray(frame_rgb)
|
|
elif ext in image_exts:
|
|
pil_image = Image.open(file_path)
|
|
else:
|
|
return None
|
|
|
|
return str(imagehash.dhash(pil_image, hash_size=16))
|
|
except Exception:
|
|
return None
|
|
finally:
|
|
if pil_image:
|
|
try:
|
|
pil_image.close()
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
def _generate_thumbnail(file_path: Path, output_path: Path, file_type: str) -> bool:
|
|
"""Generate a thumbnail for an image or video."""
|
|
try:
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
if file_type == 'image':
|
|
from PIL import Image, ImageOps
|
|
with Image.open(file_path) as img:
|
|
img = ImageOps.exif_transpose(img)
|
|
img.thumbnail((400, 400))
|
|
if img.mode in ('RGBA', 'P'):
|
|
img = img.convert('RGB')
|
|
img.save(output_path, 'JPEG', quality=85)
|
|
return True
|
|
|
|
elif file_type == 'video':
|
|
result = subprocess.run([
|
|
'ffmpeg', '-y', '-i', str(file_path),
|
|
'-ss', '00:00:01', '-vframes', '1',
|
|
'-vf', 'scale=400:-1:force_original_aspect_ratio=decrease',
|
|
str(output_path)
|
|
], capture_output=True, timeout=30)
|
|
return result.returncode == 0 and output_path.exists()
|
|
except Exception:
|
|
pass
|
|
return False
|
|
|
|
|
|
def import_new_media(platform: str, username: str, person_id: int,
                     last_imported_at: Optional[str], db, crypto,
                     last_imported_file_id: int = 0) -> int:
    """
    Import new media files from file_inventory into the private gallery.

    Creates one gallery post for the batch, encrypts each new file and its
    thumbnail into the gallery storage area, tags the post with the platform
    tag, and advances the per-account high-water marks
    (last_imported_at / last_imported_file_id) in
    private_media_scraper_accounts.

    Args:
        platform: Platform key, e.g. 'instagram' / 'tiktok' / 'snapchat'.
        username: Account username (matched against file_inventory.source).
        person_id: Gallery person that owns the imported post and media.
        last_imported_at: ISO-format timestamp marker (legacy fallback filter).
        db: Object exposing `db_path` for sqlite3 connections.
        crypto: Helper providing encrypt_field() and encrypt_file().
        last_imported_file_id: file_inventory.id marker (preferred filter).

    Returns count of imported files.
    """
    conn = sqlite3.connect(db.db_path, timeout=30)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()

        # Use id-based filtering (reliable, monotonically increasing with insertion order).
        # Falls back to created_date only for legacy accounts without last_imported_file_id.
        if last_imported_file_id and last_imported_file_id > 0:
            cursor.execute('''
                SELECT id, file_path, filename, created_date FROM file_inventory
                WHERE platform = ? AND source = ? AND id > ?
                AND location IN ('final', 'review')
                ORDER BY id ASC
            ''', (platform, username, last_imported_file_id))
        elif last_imported_at:
            cursor.execute('''
                SELECT id, file_path, filename, created_date FROM file_inventory
                WHERE platform = ? AND source = ? AND created_date > ?
                AND location IN ('final', 'review')
                ORDER BY id ASC
            ''', (platform, username, last_imported_at))
        else:
            # First run: only import files from the last 1 hour
            from datetime import timedelta
            cutoff = (datetime.now() - timedelta(hours=1)).isoformat()
            cursor.execute('''
                SELECT id, file_path, filename, created_date FROM file_inventory
                WHERE platform = ? AND source = ? AND created_date > ?
                AND location IN ('final', 'review')
                ORDER BY id ASC
            ''', (platform, username, cutoff))

        files = cursor.fetchall()
    finally:
        conn.close()

    if not files:
        return 0

    # Filter to existing files, track max id for updating last_imported_file_id
    valid_files = []
    max_file_id = last_imported_file_id or 0
    for f in files:
        fp = Path(f['file_path'])
        file_id = f['id']
        if file_id > max_file_id:
            max_file_id = file_id
        # Skip rows whose file has vanished or is empty on disk.
        if fp.exists() and fp.stat().st_size > 0:
            valid_files.append({'path': fp, 'created_date': f['created_date'], 'id': file_id})

    if not valid_files:
        # NOTE(review): max_file_id advanced past missing/empty files is NOT
        # persisted on this early return, so those rows are re-checked next
        # run — confirm whether that is intended.
        return 0

    # Get storage path (falls back to the default install location when unconfigured)
    conn = sqlite3.connect(db.db_path, timeout=10)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT value FROM private_media_config WHERE key = 'storage_path'")
        row = cursor.fetchone()
        storage_path = Path(row['value']) if row else Path('/opt/immich/private')
    finally:
        conn.close()

    data_path = storage_path / 'data'
    thumbs_path = storage_path / 'thumbs'
    data_path.mkdir(parents=True, exist_ok=True)
    thumbs_path.mkdir(parents=True, exist_ok=True)

    # Get/create platform tag
    tag_id = _ensure_platform_tag(platform, db, crypto)

    # Create a post for this batch; deleted again at the end if nothing imports.
    now_iso = datetime.now().isoformat()
    encrypted_desc = crypto.encrypt_field(f"{PLATFORM_LABELS.get(platform, platform)} - @{username}")
    encrypted_date = crypto.encrypt_field(now_iso)

    conn = sqlite3.connect(db.db_path, timeout=10)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        cursor.execute('''
            INSERT INTO private_media_posts (person_id, encrypted_description, encrypted_media_date, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?)
        ''', (person_id, encrypted_desc, encrypted_date, now_iso, now_iso))
        conn.commit()
        post_id = cursor.lastrowid
    finally:
        conn.close()

    media_count = 0
    latest_date = last_imported_at  # timestamp high-water mark for this run

    for file_info_entry in valid_files:
        file_path = file_info_entry['path']
        created_date = file_info_entry['created_date']
        # Normalize to string for consistent comparison (PostgreSQL returns datetime objects)
        if hasattr(created_date, 'isoformat'):
            created_date = created_date.isoformat()
        try:
            # Calculate file hash (streamed in 64 KiB chunks)
            sha256 = hashlib.sha256()
            with open(file_path, 'rb') as f:
                for chunk in iter(lambda: f.read(65536), b''):
                    sha256.update(chunk)
            file_hash = sha256.hexdigest()

            # Check for duplicates (scoped by person)
            conn = sqlite3.connect(db.db_path, timeout=10)
            conn.row_factory = sqlite3.Row
            try:
                cursor = conn.cursor()
                cursor.execute(
                    'SELECT id FROM private_media WHERE file_hash = ? AND person_id = ?',
                    (file_hash, person_id)
                )
                if cursor.fetchone():
                    logger.debug(f"Duplicate file skipped: {file_path.name}")
                    # Still advance the timestamp marker so duplicates
                    # aren't re-scanned by the legacy date filter.
                    if created_date and (not latest_date or created_date > latest_date):
                        latest_date = created_date
                    continue
            finally:
                conn.close()

            # Get file info (type/mime/dimensions; best-effort probing)
            finfo = _get_file_info(file_path)
            file_size = file_path.stat().st_size

            # Compute perceptual hash (may be None when optional libs missing)
            perceptual_hash = _compute_perceptual_hash(file_path)

            # Generate storage ID used for both encrypted data and thumbnail names
            storage_id = str(uuid.uuid4())

            # Generate thumbnail into a temp file; encrypted copy made below
            temp_thumb = Path(tempfile.gettempdir()) / f"pg_thumb_{storage_id}.jpg"
            _generate_thumbnail(file_path, temp_thumb, finfo['file_type'])

            # Encrypt the file; skip this file entirely if encryption fails
            encrypted_file = data_path / f"{storage_id}.enc"
            if not crypto.encrypt_file(file_path, encrypted_file):
                logger.error(f"Encryption failed for {file_path.name}")
                continue

            # Encrypt thumbnail (optional - only if generation succeeded)
            if temp_thumb.exists():
                encrypted_thumb = thumbs_path / f"{storage_id}.enc"
                crypto.encrypt_file(temp_thumb, encrypted_thumb)
                try:
                    temp_thumb.unlink()
                except Exception:
                    pass

            # Insert media record
            encrypted_filename = crypto.encrypt_field(file_path.name)
            encrypted_source = crypto.encrypt_field(f"@{username}")

            conn = sqlite3.connect(db.db_path, timeout=10)
            conn.row_factory = sqlite3.Row
            try:
                cursor = conn.cursor()
                # NOTE(review): encrypted_media_date stores the batch
                # timestamp (now_iso), not the file's created_date —
                # confirm this is intended.
                cursor.execute('''
                    INSERT INTO private_media (
                        post_id, storage_id, encrypted_filename, encrypted_description,
                        file_hash, file_size, file_type, mime_type,
                        width, height, duration, person_id,
                        encrypted_media_date, source_type, encrypted_source_path,
                        perceptual_hash, created_at
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    post_id,
                    storage_id,
                    encrypted_filename,
                    None,
                    file_hash,
                    file_size,
                    finfo['file_type'],
                    finfo['mime_type'],
                    finfo['width'],
                    finfo['height'],
                    finfo['duration'],
                    person_id,
                    encrypted_date,
                    platform,
                    encrypted_source,
                    perceptual_hash,
                    now_iso,
                ))
                conn.commit()
            finally:
                conn.close()

            media_count += 1
            if created_date and (not latest_date or created_date > latest_date):
                latest_date = created_date

        except Exception as e:
            # Per-file failures are logged and skipped; the batch continues.
            logger.error(f"Failed to import {file_path.name}: {e}")

    # Apply platform tag to the post if we imported media
    if media_count > 0:
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR IGNORE INTO private_media_post_tags (post_id, tag_id)
                VALUES (?, ?)
            ''', (post_id, tag_id))
            conn.commit()
        finally:
            conn.close()

        # Update the mapping row with both timestamp and file id markers
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                UPDATE private_media_scraper_accounts
                SET last_imported_at = ?,
                    last_imported_file_id = ?,
                    total_media_imported = total_media_imported + ?,
                    updated_at = ?
                WHERE platform = ? AND username = ? AND person_id = ?
            ''', (latest_date or now_iso, max_file_id, media_count, now_iso, platform, username, person_id))
            conn.commit()
        finally:
            conn.close()

        logger.info(f"Imported {media_count} files from {platform}/@{username} to gallery (last_file_id={max_file_id})")
    else:
        # No media imported - still update the file id marker so we don't re-check these files
        if max_file_id > (last_imported_file_id or 0):
            conn = sqlite3.connect(db.db_path, timeout=10)
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE private_media_scraper_accounts
                    SET last_imported_file_id = ?
                    WHERE platform = ? AND username = ? AND person_id = ?
                ''', (max_file_id, platform, username, person_id))
                conn.commit()
            finally:
                conn.close()

        # Delete the empty post (created up-front for the batch)
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            cursor = conn.cursor()
            cursor.execute("DELETE FROM private_media_posts WHERE id = ?", (post_id,))
            conn.commit()
        finally:
            conn.close()

    return media_count
|
|
|
|
|
|
def on_download_complete(task_id: str, download_count: int, db, crypto) -> int:
    """
    Scheduler hook: after a task finishes, import new media for every mapped
    account on the task's platform.

    Checking ALL mapped accounts (not just the task's user) covers every case:
    batch tasks (fastdl:all, imginn_api:all), per-user tasks that also pull
    phrase_search users (instagram_client:evalongoria), and simple per-user
    tasks (toolzu:evalongoria). Id-based filtering keeps no-op account checks
    cheap.

    Returns the number of files imported.
    """
    if not task_id:
        return 0

    module_name, separator, _ = task_id.partition(':')
    if not separator:
        # Not a "<scraper>:<target>" task id - nothing to do.
        return 0

    platform = SCRAPER_TO_PLATFORM.get(module_name)
    if platform is None:
        return 0

    return _import_all_mapped_accounts(platform, db, crypto)
|
|
|
|
|
|
def _import_all_mapped_accounts(platform: str, db, crypto) -> int:
|
|
"""
|
|
After a batch task (e.g. fastdl:all), check ALL mapped accounts
|
|
for the platform and import any new media.
|
|
"""
|
|
conn = sqlite3.connect(db.db_path, timeout=10)
|
|
conn.row_factory = sqlite3.Row
|
|
try:
|
|
cursor = conn.cursor()
|
|
cursor.execute('''
|
|
SELECT id, username, person_id, last_imported_at, last_imported_file_id
|
|
FROM private_media_scraper_accounts
|
|
WHERE platform = ? AND enabled = 1
|
|
''', (platform,))
|
|
rows = cursor.fetchall()
|
|
finally:
|
|
conn.close()
|
|
|
|
if not rows:
|
|
return 0
|
|
|
|
total_imported = 0
|
|
for row in rows:
|
|
try:
|
|
count = import_new_media(
|
|
platform, row['username'], row['person_id'],
|
|
row['last_imported_at'], db, crypto,
|
|
last_imported_file_id=row['last_imported_file_id'] or 0
|
|
)
|
|
total_imported += count
|
|
except Exception as e:
|
|
logger.error(f"Gallery bridge batch import error for {platform}/@{row['username']}: {e}")
|
|
|
|
if total_imported > 0:
|
|
logger.info(f"Batch import for {platform}: {total_imported} files across {len(rows)} accounts")
|
|
|
|
return total_imported
|