Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,652 @@
"""
Scraper Gallery Bridge
Maps scraper accounts (Instagram, TikTok, Snapchat) to private gallery persons.
After each download session, auto-imports new media as gallery posts.
"""
import hashlib
import logging
import mimetypes
import sqlite3
import subprocess
import tempfile
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)

# Dedicated key file so background imports can decrypt/encrypt gallery data
# even while the interactive gallery session is locked (see get_crypto()).
SCRAPER_BRIDGE_KEY_FILE = '/opt/immich/private/.scraper_bridge_key'

# Map scraper module names → platform
# (task ids look like "<scraper_module>:<target>"; the module prefix decides
# which platform's mapped accounts get swept after a download completes)
SCRAPER_TO_PLATFORM = {
    'fastdl': 'instagram',
    'imginn': 'instagram',
    'imginn_api': 'instagram',
    'instagram_client': 'instagram',
    'toolzu': 'instagram',
    'instagram': 'instagram',
    'instagram_unified': 'instagram',
    'tiktok': 'tiktok',
    'snapchat': 'snapchat',
    'snapchat_client': 'snapchat',
}

# Brand accent color assigned when the platform tag is auto-created.
PLATFORM_COLORS = {
    'instagram': '#E1306C',
    'tiktok': '#00f2ea',
    'snapchat': '#FFFC00',
}

# Human-readable tag label per platform (used for tag lookup/creation and
# for the auto-generated post description).
PLATFORM_LABELS = {
    'instagram': 'Instagram',
    'tiktok': 'TikTok',
    'snapchat': 'Snapchat',
}
def get_crypto():
    """Return a crypto helper loaded from the bridge key file, or None.

    Reads the key from disk rather than the user session, so background
    import jobs keep working while the gallery UI is locked.
    """
    from modules.private_gallery_crypto import load_key_from_file

    loaded = load_key_from_file(SCRAPER_BRIDGE_KEY_FILE)
    if loaded is not None:
        return loaded
    logger.debug("Scraper bridge crypto unavailable - key file missing or invalid")
    return None
def _add_account(accounts: Dict[str, set], source: str, raw_usernames) -> None:
    """Normalize usernames (strip + lowercase) and record *source* for each non-empty one."""
    for raw in raw_usernames:
        username = (raw or '').strip().lower()
        if username:
            accounts.setdefault(username, set()).add(source)


def _config_usernames(cfg: dict) -> List[str]:
    """Extract usernames from an accounts-style config section.

    Prefers ``accounts[].username``; falls back to a plain ``usernames`` list
    for legacy configs.
    """
    account_entries = cfg.get('accounts', [])
    if account_entries:
        return [acc.get('username', '') for acc in account_entries]
    return list(cfg.get('usernames', []))


def get_available_accounts(platform: str, config: dict, db) -> List[Dict[str, Any]]:
    """
    Aggregate usernames from all scraper configs + paid_content_creators for a platform.

    Returns a de-duplicated, sorted list of dicts:
    ``{'username': str, 'sources': [str, ...], 'is_mapped': bool}`` where
    ``is_mapped`` means a row already exists in private_media_scraper_accounts.
    DB lookups are best-effort: a missing table just yields fewer entries.
    """
    accounts: Dict[str, set] = {}  # username -> set of config/source ids
    if platform == 'instagram':
        ig_cfg = config.get('instagram', {})
        if ig_cfg.get('enabled', False):
            _add_account(accounts, 'instagram', _config_usernames(ig_cfg))
        # Each Instagram scraper keeps its own username list.
        for scraper_id in ('fastdl', 'imginn', 'imginn_api', 'instagram_client', 'toolzu'):
            scraper_cfg = config.get(scraper_id, {})
            if not scraper_cfg.get('enabled', False):
                continue
            _add_account(accounts, scraper_id, scraper_cfg.get('usernames', []))
            # phrase_search usernames are also downloadable accounts
            _add_account(accounts, scraper_id,
                         scraper_cfg.get('phrase_search', {}).get('usernames', []))
    elif platform == 'tiktok':
        tt_cfg = config.get('tiktok', {})
        if tt_cfg.get('enabled', False):
            _add_account(accounts, 'tiktok', _config_usernames(tt_cfg))
    elif platform == 'snapchat':
        # Two independent snapchat scraper sections, each with a plain list.
        for section in ('snapchat', 'snapchat_client'):
            sc_cfg = config.get(section, {})
            if sc_cfg.get('enabled', False):
                _add_account(accounts, section, sc_cfg.get('usernames', []))
    # Merge in creators tracked in the paid_content_creators table (best effort).
    try:
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:  # close the connection even if the query raises (fixes a leak)
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            cursor.execute(
                'SELECT username FROM paid_content_creators WHERE platform = ? AND enabled = 1',
                (platform,)
            )
            _add_account(accounts, 'paid_content',
                         [row['username'] for row in cursor.fetchall()])
        finally:
            conn.close()
    except Exception as e:
        logger.debug(f"Could not query paid_content_creators: {e}")
    # Flag usernames that already have a gallery person mapping.
    mapped_usernames = set()
    try:
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            cursor.execute(
                'SELECT username FROM private_media_scraper_accounts WHERE platform = ?',
                (platform,)
            )
            mapped_usernames = {row['username'].lower() for row in cursor.fetchall()}
        finally:
            conn.close()
    except Exception:
        pass
    return [
        {
            'username': username,
            'sources': sorted(sources),
            # accounts keys are already lowercased by _add_account
            'is_mapped': username in mapped_usernames,
        }
        for username, sources in sorted(accounts.items())
    ]
def _ensure_platform_tag(platform: str, db, crypto) -> int:
    """Return the tag id for *platform* in private_gallery_tags, creating it if absent."""
    label = PLATFORM_LABELS.get(platform, platform.title())
    conn = sqlite3.connect(db.db_path, timeout=10)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        # Tag names are stored encrypted, so every row must be decrypted
        # and compared case-insensitively; no WHERE clause is possible.
        cursor.execute("SELECT id, encrypted_name FROM private_gallery_tags")
        for row in cursor.fetchall():
            try:
                if crypto.decrypt_field(row['encrypted_name']).lower() == label.lower():
                    return row['id']
            except Exception:
                # Undecryptable row (e.g. different key): skip, don't fail.
                continue
        # No existing tag — create one with the platform's brand color.
        cursor.execute('''
            INSERT INTO private_gallery_tags (encrypted_name, color)
            VALUES (?, ?)
        ''', (crypto.encrypt_field(label), PLATFORM_COLORS.get(platform, '#888888')))
        conn.commit()
        tag_id = cursor.lastrowid
        logger.info(f"Created '{label}' tag with ID {tag_id}")
        return tag_id
    finally:
        conn.close()
def _get_file_info(file_path: Path) -> Dict[str, Any]:
"""Get file type, mime type, and dimensions."""
ext = file_path.suffix.lower().lstrip('.')
mime_type, _ = mimetypes.guess_type(str(file_path))
if not mime_type:
mime_type = 'application/octet-stream'
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
if ext in image_exts:
file_type = 'image'
elif ext in video_exts:
file_type = 'video'
else:
file_type = 'other'
width, height, duration = 0, 0, 0
if file_type == 'image':
try:
from PIL import Image
with Image.open(file_path) as img:
width, height = img.size
except Exception:
pass
elif file_type == 'video':
try:
result = subprocess.run(
['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_streams', str(file_path)],
capture_output=True, text=True, timeout=15
)
if result.returncode == 0:
import json
probe = json.loads(result.stdout)
for stream in probe.get('streams', []):
if stream.get('codec_type') == 'video':
width = int(stream.get('width', 0))
height = int(stream.get('height', 0))
dur = stream.get('duration')
if dur:
duration = int(float(dur))
break
except Exception:
pass
return {
'file_type': file_type,
'mime_type': mime_type,
'width': width,
'height': height,
'duration': duration,
}
def _compute_perceptual_hash(file_path: Path) -> Optional[str]:
"""Calculate perceptual hash for an image or video file."""
try:
import imagehash
from PIL import Image
except ImportError:
return None
ext = file_path.suffix.lower().lstrip('.')
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
pil_image = None
try:
if ext in video_exts:
try:
import cv2
except ImportError:
return None
cap = cv2.VideoCapture(str(file_path))
if not cap.isOpened():
return None
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
cap.set(cv2.CAP_PROP_POS_FRAMES, int(total_frames * 0.5))
ret, frame = cap.read()
cap.release()
if not ret or frame is None:
return None
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(frame_rgb)
elif ext in image_exts:
pil_image = Image.open(file_path)
else:
return None
return str(imagehash.dhash(pil_image, hash_size=16))
except Exception:
return None
finally:
if pil_image:
try:
pil_image.close()
except Exception:
pass
def _generate_thumbnail(file_path: Path, output_path: Path, file_type: str) -> bool:
"""Generate a thumbnail for an image or video."""
try:
output_path.parent.mkdir(parents=True, exist_ok=True)
if file_type == 'image':
from PIL import Image, ImageOps
with Image.open(file_path) as img:
img = ImageOps.exif_transpose(img)
img.thumbnail((400, 400))
if img.mode in ('RGBA', 'P'):
img = img.convert('RGB')
img.save(output_path, 'JPEG', quality=85)
return True
elif file_type == 'video':
result = subprocess.run([
'ffmpeg', '-y', '-i', str(file_path),
'-ss', '00:00:01', '-vframes', '1',
'-vf', 'scale=400:-1:force_original_aspect_ratio=decrease',
str(output_path)
], capture_output=True, timeout=30)
return result.returncode == 0 and output_path.exists()
except Exception:
pass
return False
def import_new_media(platform: str, username: str, person_id: int,
                     last_imported_at: Optional[str], db, crypto,
                     last_imported_file_id: int = 0) -> int:
    """
    Import new media files from file_inventory into the private gallery.
    Returns count of imported files.

    Cursor selection (in order of preference):
      1. ``last_imported_file_id > 0`` — id-based cursor over file_inventory.
      2. ``last_imported_at`` set — created_date cursor (legacy accounts).
      3. Neither — first run; only files from the last hour are considered.

    Side effects: creates one private_media_posts row for the batch, encrypts
    each file (and its thumbnail) into the gallery storage tree, inserts
    private_media rows, tags the post with the platform tag, and advances the
    cursor columns on the account's private_media_scraper_accounts row.
    The empty post is deleted when nothing gets imported.
    """
    conn = sqlite3.connect(db.db_path, timeout=30)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        # Use id-based filtering (reliable, monotonically increasing with insertion order).
        # Falls back to created_date only for legacy accounts without last_imported_file_id.
        if last_imported_file_id and last_imported_file_id > 0:
            cursor.execute('''
                SELECT id, file_path, filename, created_date FROM file_inventory
                WHERE platform = ? AND source = ? AND id > ?
                AND location IN ('final', 'review')
                ORDER BY id ASC
            ''', (platform, username, last_imported_file_id))
        elif last_imported_at:
            cursor.execute('''
                SELECT id, file_path, filename, created_date FROM file_inventory
                WHERE platform = ? AND source = ? AND created_date > ?
                AND location IN ('final', 'review')
                ORDER BY id ASC
            ''', (platform, username, last_imported_at))
        else:
            # First run: only import files from the last 1 hour
            # NOTE(review): naive local time; assumes file_inventory.created_date
            # is also naive local — confirm against the writer of that table.
            from datetime import timedelta
            cutoff = (datetime.now() - timedelta(hours=1)).isoformat()
            cursor.execute('''
                SELECT id, file_path, filename, created_date FROM file_inventory
                WHERE platform = ? AND source = ? AND created_date > ?
                AND location IN ('final', 'review')
                ORDER BY id ASC
            ''', (platform, username, cutoff))
        files = cursor.fetchall()
    finally:
        conn.close()
    if not files:
        return 0
    # Filter to existing files, track max id for updating last_imported_file_id
    # (max id advances even over missing/empty files so they are never re-checked).
    valid_files = []
    max_file_id = last_imported_file_id or 0
    for f in files:
        fp = Path(f['file_path'])
        file_id = f['id']
        if file_id > max_file_id:
            max_file_id = file_id
        if fp.exists() and fp.stat().st_size > 0:
            valid_files.append({'path': fp, 'created_date': f['created_date'], 'id': file_id})
    if not valid_files:
        return 0
    # Get storage path (configurable; defaults to the stock install location)
    conn = sqlite3.connect(db.db_path, timeout=10)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT value FROM private_media_config WHERE key = 'storage_path'")
        row = cursor.fetchone()
        storage_path = Path(row['value']) if row else Path('/opt/immich/private')
    finally:
        conn.close()
    data_path = storage_path / 'data'
    thumbs_path = storage_path / 'thumbs'
    data_path.mkdir(parents=True, exist_ok=True)
    thumbs_path.mkdir(parents=True, exist_ok=True)
    # Get/create platform tag
    tag_id = _ensure_platform_tag(platform, db, crypto)
    # Create a post for this batch; deleted below if nothing imports.
    now_iso = datetime.now().isoformat()
    encrypted_desc = crypto.encrypt_field(f"{PLATFORM_LABELS.get(platform, platform)} - @{username}")
    encrypted_date = crypto.encrypt_field(now_iso)
    conn = sqlite3.connect(db.db_path, timeout=10)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        cursor.execute('''
            INSERT INTO private_media_posts (person_id, encrypted_description, encrypted_media_date, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?)
        ''', (person_id, encrypted_desc, encrypted_date, now_iso, now_iso))
        conn.commit()
        post_id = cursor.lastrowid
    finally:
        conn.close()
    media_count = 0
    # latest_date tracks the newest created_date seen (imported OR duplicate)
    # so the legacy timestamp cursor also advances.
    latest_date = last_imported_at
    for file_info_entry in valid_files:
        file_path = file_info_entry['path']
        created_date = file_info_entry['created_date']
        # Normalize to string for consistent comparison (PostgreSQL returns datetime objects)
        if hasattr(created_date, 'isoformat'):
            created_date = created_date.isoformat()
        try:
            # Calculate file hash (streamed in 64 KiB chunks to cap memory)
            sha256 = hashlib.sha256()
            with open(file_path, 'rb') as f:
                for chunk in iter(lambda: f.read(65536), b''):
                    sha256.update(chunk)
            file_hash = sha256.hexdigest()
            # Check for duplicates (scoped by person)
            conn = sqlite3.connect(db.db_path, timeout=10)
            conn.row_factory = sqlite3.Row
            try:
                cursor = conn.cursor()
                cursor.execute(
                    'SELECT id FROM private_media WHERE file_hash = ? AND person_id = ?',
                    (file_hash, person_id)
                )
                if cursor.fetchone():
                    logger.debug(f"Duplicate file skipped: {file_path.name}")
                    # Still advance the timestamp cursor past duplicates.
                    if created_date and (not latest_date or created_date > latest_date):
                        latest_date = created_date
                    continue
            finally:
                conn.close()
            # Get file info (type/mime/dimensions/duration)
            finfo = _get_file_info(file_path)
            file_size = file_path.stat().st_size
            # Compute perceptual hash (None if optional deps unavailable)
            perceptual_hash = _compute_perceptual_hash(file_path)
            # Generate storage ID (opaque name for the encrypted blobs)
            storage_id = str(uuid.uuid4())
            # Generate thumbnail (best-effort; missing thumb is tolerated below)
            temp_thumb = Path(tempfile.gettempdir()) / f"pg_thumb_{storage_id}.jpg"
            _generate_thumbnail(file_path, temp_thumb, finfo['file_type'])
            # Encrypt the file; skip the record entirely if encryption fails
            encrypted_file = data_path / f"{storage_id}.enc"
            if not crypto.encrypt_file(file_path, encrypted_file):
                logger.error(f"Encryption failed for {file_path.name}")
                continue
            # Encrypt thumbnail, then remove the plaintext temp copy
            if temp_thumb.exists():
                encrypted_thumb = thumbs_path / f"{storage_id}.enc"
                crypto.encrypt_file(temp_thumb, encrypted_thumb)
                try:
                    temp_thumb.unlink()
                except Exception:
                    pass
            # Insert media record
            encrypted_filename = crypto.encrypt_field(file_path.name)
            encrypted_source = crypto.encrypt_field(f"@{username}")
            conn = sqlite3.connect(db.db_path, timeout=10)
            conn.row_factory = sqlite3.Row
            try:
                cursor = conn.cursor()
                # NOTE(review): encrypted_media_date stores the batch timestamp
                # (encrypted_date), not this file's created_date — confirm intended.
                cursor.execute('''
                    INSERT INTO private_media (
                        post_id, storage_id, encrypted_filename, encrypted_description,
                        file_hash, file_size, file_type, mime_type,
                        width, height, duration, person_id,
                        encrypted_media_date, source_type, encrypted_source_path,
                        perceptual_hash, created_at
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    post_id,
                    storage_id,
                    encrypted_filename,
                    None,
                    file_hash,
                    file_size,
                    finfo['file_type'],
                    finfo['mime_type'],
                    finfo['width'],
                    finfo['height'],
                    finfo['duration'],
                    person_id,
                    encrypted_date,
                    platform,
                    encrypted_source,
                    perceptual_hash,
                    now_iso,
                ))
                conn.commit()
            finally:
                conn.close()
            media_count += 1
            if created_date and (not latest_date or created_date > latest_date):
                latest_date = created_date
        except Exception as e:
            # Per-file failures are logged and skipped; the batch continues.
            logger.error(f"Failed to import {file_path.name}: {e}")
    # Apply platform tag to the post if we imported media
    if media_count > 0:
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR IGNORE INTO private_media_post_tags (post_id, tag_id)
                VALUES (?, ?)
            ''', (post_id, tag_id))
            conn.commit()
        finally:
            conn.close()
        # Update the mapping row with both timestamp and file id markers
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                UPDATE private_media_scraper_accounts
                SET last_imported_at = ?,
                    last_imported_file_id = ?,
                    total_media_imported = total_media_imported + ?,
                    updated_at = ?
                WHERE platform = ? AND username = ? AND person_id = ?
            ''', (latest_date or now_iso, max_file_id, media_count, now_iso, platform, username, person_id))
            conn.commit()
        finally:
            conn.close()
        logger.info(f"Imported {media_count} files from {platform}/@{username} to gallery (last_file_id={max_file_id})")
    else:
        # No media imported - still update the file id marker so we don't re-check these files
        if max_file_id > (last_imported_file_id or 0):
            conn = sqlite3.connect(db.db_path, timeout=10)
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE private_media_scraper_accounts
                    SET last_imported_file_id = ?
                    WHERE platform = ? AND username = ? AND person_id = ?
                ''', (max_file_id, platform, username, person_id))
                conn.commit()
            finally:
                conn.close()
        # Delete the empty post so the gallery doesn't show a blank entry
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            cursor = conn.cursor()
            cursor.execute("DELETE FROM private_media_posts WHERE id = ?", (post_id,))
            conn.commit()
        finally:
            conn.close()
    return media_count
def on_download_complete(task_id: str, download_count: int, db, crypto) -> int:
    """Scheduler hook: after a scraper task finishes, sweep the platform's accounts.

    The task id is "<scraper_module>:<target>"; the module prefix selects the
    platform. Every mapped account on that platform is checked — a single task
    may download for many users (batch tasks like fastdl:all, phrase_search
    side-downloads) — and the id-based cursor makes no-op checks cheap.
    Returns the total number of files imported (0 for unmapped/invalid tasks).
    """
    if not task_id or ':' not in task_id:
        return 0
    module_name, _, _ = task_id.partition(':')
    platform = SCRAPER_TO_PLATFORM.get(module_name)
    if platform is None:
        return 0
    # Always sweep every mapped account for the platform rather than just the
    # task's nominal target.
    return _import_all_mapped_accounts(platform, db, crypto)
def _import_all_mapped_accounts(platform: str, db, crypto) -> int:
"""
After a batch task (e.g. fastdl:all), check ALL mapped accounts
for the platform and import any new media.
"""
conn = sqlite3.connect(db.db_path, timeout=10)
conn.row_factory = sqlite3.Row
try:
cursor = conn.cursor()
cursor.execute('''
SELECT id, username, person_id, last_imported_at, last_imported_file_id
FROM private_media_scraper_accounts
WHERE platform = ? AND enabled = 1
''', (platform,))
rows = cursor.fetchall()
finally:
conn.close()
if not rows:
return 0
total_imported = 0
for row in rows:
try:
count = import_new_media(
platform, row['username'], row['person_id'],
row['last_imported_at'], db, crypto,
last_imported_file_id=row['last_imported_file_id'] or 0
)
total_imported += count
except Exception as e:
logger.error(f"Gallery bridge batch import error for {platform}/@{row['username']}: {e}")
if total_imported > 0:
logger.info(f"Batch import for {platform}: {total_imported} files across {len(rows)} accounts")
return total_imported