""" Scraper Gallery Bridge Maps scraper accounts (Instagram, TikTok, Snapchat) to private gallery persons. After each download session, auto-imports new media as gallery posts. """ import hashlib import logging import mimetypes import sqlite3 import subprocess import tempfile import uuid from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) SCRAPER_BRIDGE_KEY_FILE = '/opt/immich/private/.scraper_bridge_key' # Map scraper module names → platform SCRAPER_TO_PLATFORM = { 'fastdl': 'instagram', 'imginn': 'instagram', 'imginn_api': 'instagram', 'instagram_client': 'instagram', 'toolzu': 'instagram', 'instagram': 'instagram', 'instagram_unified': 'instagram', 'tiktok': 'tiktok', 'snapchat': 'snapchat', 'snapchat_client': 'snapchat', } PLATFORM_COLORS = { 'instagram': '#E1306C', 'tiktok': '#00f2ea', 'snapchat': '#FFFC00', } PLATFORM_LABELS = { 'instagram': 'Instagram', 'tiktok': 'TikTok', 'snapchat': 'Snapchat', } def get_crypto(): """Load crypto from key file for background access (works when gallery is locked).""" from modules.private_gallery_crypto import load_key_from_file crypto = load_key_from_file(SCRAPER_BRIDGE_KEY_FILE) if crypto is None: logger.debug("Scraper bridge crypto unavailable - key file missing or invalid") return crypto def get_available_accounts(platform: str, config: dict, db) -> List[Dict[str, Any]]: """ Aggregate usernames from all scraper configs + paid_content_creators for a platform. Returns de-duplicated list with source annotations. """ accounts = {} # username -> set of sources if platform == 'instagram': # instagram.accounts[].username ig_cfg = config.get('instagram', {}) if ig_cfg.get('enabled', False): ig_accounts = ig_cfg.get('accounts', []) if not ig_accounts and 'usernames' in ig_cfg: ig_accounts = [{'username': u} for u in ig_cfg['usernames']] for acc in ig_accounts: u = acc.get('username', '').strip().lower() if u: accounts.setdefault(u, set()).add('instagram') # Collect usernames + phrase_search usernames from each scraper for scraper_id in ('fastdl', 'imginn', 'imginn_api', 'instagram_client', 'toolzu'): scraper_cfg = config.get(scraper_id, {}) if not scraper_cfg.get('enabled', False): continue for u in scraper_cfg.get('usernames', []): u = u.strip().lower() if u: accounts.setdefault(u, set()).add(scraper_id) # phrase_search usernames are also downloadable accounts for u in scraper_cfg.get('phrase_search', {}).get('usernames', []): u = u.strip().lower() if u: accounts.setdefault(u, set()).add(scraper_id) elif platform == 'tiktok': tt_cfg = config.get('tiktok', {}) if tt_cfg.get('enabled', False): tt_accounts = tt_cfg.get('accounts', []) if not tt_accounts and 'usernames' in tt_cfg: tt_accounts = [{'username': u} for u in tt_cfg['usernames']] for acc in tt_accounts: u = acc.get('username', '').strip().lower() if u: accounts.setdefault(u, set()).add('tiktok') elif platform == 'snapchat': # snapchat.usernames sc_cfg = config.get('snapchat', {}) if sc_cfg.get('enabled', False): for u in sc_cfg.get('usernames', []): u = u.strip().lower() if u: accounts.setdefault(u, set()).add('snapchat') # snapchat_client.usernames sc_client_cfg = config.get('snapchat_client', {}) if sc_client_cfg.get('enabled', False): for u in sc_client_cfg.get('usernames', []): u = u.strip().lower() if u: accounts.setdefault(u, set()).add('snapchat_client') # Add from paid_content_creators table try: conn = sqlite3.connect(db.db_path, timeout=10) conn.row_factory = sqlite3.Row cursor = conn.cursor() cursor.execute( 'SELECT username FROM paid_content_creators WHERE platform = ? AND enabled = 1', (platform,) ) for row in cursor.fetchall(): u = row['username'].strip().lower() if u: accounts.setdefault(u, set()).add('paid_content') conn.close() except Exception as e: logger.debug(f"Could not query paid_content_creators: {e}") # Check which are already mapped mapped_usernames = set() try: conn = sqlite3.connect(db.db_path, timeout=10) conn.row_factory = sqlite3.Row cursor = conn.cursor() cursor.execute( 'SELECT username FROM private_media_scraper_accounts WHERE platform = ?', (platform,) ) for row in cursor.fetchall(): mapped_usernames.add(row['username'].lower()) conn.close() except Exception: pass result = [] for username, sources in sorted(accounts.items()): result.append({ 'username': username, 'sources': sorted(sources), 'is_mapped': username.lower() in mapped_usernames, }) return result def _ensure_platform_tag(platform: str, db, crypto) -> int: """Find or create a tag for the platform in private_gallery_tags.""" conn = sqlite3.connect(db.db_path, timeout=10) conn.row_factory = sqlite3.Row try: cursor = conn.cursor() cursor.execute("SELECT id, encrypted_name FROM private_gallery_tags") label = PLATFORM_LABELS.get(platform, platform.title()) for row in cursor.fetchall(): try: name = crypto.decrypt_field(row['encrypted_name']) if name.lower() == label.lower(): return row['id'] except Exception: continue # Create the tag encrypted_name = crypto.encrypt_field(label) color = PLATFORM_COLORS.get(platform, '#888888') cursor.execute(''' INSERT INTO private_gallery_tags (encrypted_name, color) VALUES (?, ?) ''', (encrypted_name, color)) conn.commit() tag_id = cursor.lastrowid logger.info(f"Created '{label}' tag with ID {tag_id}") return tag_id finally: conn.close() def _get_file_info(file_path: Path) -> Dict[str, Any]: """Get file type, mime type, and dimensions.""" ext = file_path.suffix.lower().lstrip('.') mime_type, _ = mimetypes.guess_type(str(file_path)) if not mime_type: mime_type = 'application/octet-stream' image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'} video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'} if ext in image_exts: file_type = 'image' elif ext in video_exts: file_type = 'video' else: file_type = 'other' width, height, duration = 0, 0, 0 if file_type == 'image': try: from PIL import Image with Image.open(file_path) as img: width, height = img.size except Exception: pass elif file_type == 'video': try: result = subprocess.run( ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_streams', str(file_path)], capture_output=True, text=True, timeout=15 ) if result.returncode == 0: import json probe = json.loads(result.stdout) for stream in probe.get('streams', []): if stream.get('codec_type') == 'video': width = int(stream.get('width', 0)) height = int(stream.get('height', 0)) dur = stream.get('duration') if dur: duration = int(float(dur)) break except Exception: pass return { 'file_type': file_type, 'mime_type': mime_type, 'width': width, 'height': height, 'duration': duration, } def _compute_perceptual_hash(file_path: Path) -> Optional[str]: """Calculate perceptual hash for an image or video file.""" try: import imagehash from PIL import Image except ImportError: return None ext = file_path.suffix.lower().lstrip('.') image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'} video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'} pil_image = None try: if ext in video_exts: try: import cv2 except ImportError: return None cap = cv2.VideoCapture(str(file_path)) if not cap.isOpened(): return None total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) cap.set(cv2.CAP_PROP_POS_FRAMES, int(total_frames * 0.5)) ret, frame = cap.read() cap.release() if not ret or frame is None: return None frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) pil_image = Image.fromarray(frame_rgb) elif ext in image_exts: pil_image = Image.open(file_path) else: return None return str(imagehash.dhash(pil_image, hash_size=16)) except Exception: return None finally: if pil_image: try: pil_image.close() except Exception: pass def _generate_thumbnail(file_path: Path, output_path: Path, file_type: str) -> bool: """Generate a thumbnail for an image or video.""" try: output_path.parent.mkdir(parents=True, exist_ok=True) if file_type == 'image': from PIL import Image, ImageOps with Image.open(file_path) as img: img = ImageOps.exif_transpose(img) img.thumbnail((400, 400)) if img.mode in ('RGBA', 'P'): img = img.convert('RGB') img.save(output_path, 'JPEG', quality=85) return True elif file_type == 'video': result = subprocess.run([ 'ffmpeg', '-y', '-i', str(file_path), '-ss', '00:00:01', '-vframes', '1', '-vf', 'scale=400:-1:force_original_aspect_ratio=decrease', str(output_path) ], capture_output=True, timeout=30) return result.returncode == 0 and output_path.exists() except Exception: pass return False def import_new_media(platform: str, username: str, person_id: int, last_imported_at: Optional[str], db, crypto, last_imported_file_id: int = 0) -> int: """ Import new media files from file_inventory into the private gallery. Returns count of imported files. """ conn = sqlite3.connect(db.db_path, timeout=30) conn.row_factory = sqlite3.Row try: cursor = conn.cursor() # Use id-based filtering (reliable, monotonically increasing with insertion order). # Falls back to created_date only for legacy accounts without last_imported_file_id. if last_imported_file_id and last_imported_file_id > 0: cursor.execute(''' SELECT id, file_path, filename, created_date FROM file_inventory WHERE platform = ? AND source = ? AND id > ? AND location IN ('final', 'review') ORDER BY id ASC ''', (platform, username, last_imported_file_id)) elif last_imported_at: cursor.execute(''' SELECT id, file_path, filename, created_date FROM file_inventory WHERE platform = ? AND source = ? AND created_date > ? AND location IN ('final', 'review') ORDER BY id ASC ''', (platform, username, last_imported_at)) else: # First run: only import files from the last 1 hour from datetime import timedelta cutoff = (datetime.now() - timedelta(hours=1)).isoformat() cursor.execute(''' SELECT id, file_path, filename, created_date FROM file_inventory WHERE platform = ? AND source = ? AND created_date > ? AND location IN ('final', 'review') ORDER BY id ASC ''', (platform, username, cutoff)) files = cursor.fetchall() finally: conn.close() if not files: return 0 # Filter to existing files, track max id for updating last_imported_file_id valid_files = [] max_file_id = last_imported_file_id or 0 for f in files: fp = Path(f['file_path']) file_id = f['id'] if file_id > max_file_id: max_file_id = file_id if fp.exists() and fp.stat().st_size > 0: valid_files.append({'path': fp, 'created_date': f['created_date'], 'id': file_id}) if not valid_files: return 0 # Get storage path conn = sqlite3.connect(db.db_path, timeout=10) conn.row_factory = sqlite3.Row try: cursor = conn.cursor() cursor.execute("SELECT value FROM private_media_config WHERE key = 'storage_path'") row = cursor.fetchone() storage_path = Path(row['value']) if row else Path('/opt/immich/private') finally: conn.close() data_path = storage_path / 'data' thumbs_path = storage_path / 'thumbs' data_path.mkdir(parents=True, exist_ok=True) thumbs_path.mkdir(parents=True, exist_ok=True) # Get/create platform tag tag_id = _ensure_platform_tag(platform, db, crypto) # Create a post for this batch now_iso = datetime.now().isoformat() encrypted_desc = crypto.encrypt_field(f"{PLATFORM_LABELS.get(platform, platform)} - @{username}") encrypted_date = crypto.encrypt_field(now_iso) conn = sqlite3.connect(db.db_path, timeout=10) conn.row_factory = sqlite3.Row try: cursor = conn.cursor() cursor.execute(''' INSERT INTO private_media_posts (person_id, encrypted_description, encrypted_media_date, created_at, updated_at) VALUES (?, ?, ?, ?, ?) ''', (person_id, encrypted_desc, encrypted_date, now_iso, now_iso)) conn.commit() post_id = cursor.lastrowid finally: conn.close() media_count = 0 latest_date = last_imported_at for file_info_entry in valid_files: file_path = file_info_entry['path'] created_date = file_info_entry['created_date'] # Normalize to string for consistent comparison (PostgreSQL returns datetime objects) if hasattr(created_date, 'isoformat'): created_date = created_date.isoformat() try: # Calculate file hash sha256 = hashlib.sha256() with open(file_path, 'rb') as f: for chunk in iter(lambda: f.read(65536), b''): sha256.update(chunk) file_hash = sha256.hexdigest() # Check for duplicates (scoped by person) conn = sqlite3.connect(db.db_path, timeout=10) conn.row_factory = sqlite3.Row try: cursor = conn.cursor() cursor.execute( 'SELECT id FROM private_media WHERE file_hash = ? AND person_id = ?', (file_hash, person_id) ) if cursor.fetchone(): logger.debug(f"Duplicate file skipped: {file_path.name}") if created_date and (not latest_date or created_date > latest_date): latest_date = created_date continue finally: conn.close() # Get file info finfo = _get_file_info(file_path) file_size = file_path.stat().st_size # Compute perceptual hash perceptual_hash = _compute_perceptual_hash(file_path) # Generate storage ID storage_id = str(uuid.uuid4()) # Generate thumbnail temp_thumb = Path(tempfile.gettempdir()) / f"pg_thumb_{storage_id}.jpg" _generate_thumbnail(file_path, temp_thumb, finfo['file_type']) # Encrypt the file encrypted_file = data_path / f"{storage_id}.enc" if not crypto.encrypt_file(file_path, encrypted_file): logger.error(f"Encryption failed for {file_path.name}") continue # Encrypt thumbnail if temp_thumb.exists(): encrypted_thumb = thumbs_path / f"{storage_id}.enc" crypto.encrypt_file(temp_thumb, encrypted_thumb) try: temp_thumb.unlink() except Exception: pass # Insert media record encrypted_filename = crypto.encrypt_field(file_path.name) encrypted_source = crypto.encrypt_field(f"@{username}") conn = sqlite3.connect(db.db_path, timeout=10) conn.row_factory = sqlite3.Row try: cursor = conn.cursor() cursor.execute(''' INSERT INTO private_media ( post_id, storage_id, encrypted_filename, encrypted_description, file_hash, file_size, file_type, mime_type, width, height, duration, person_id, encrypted_media_date, source_type, encrypted_source_path, perceptual_hash, created_at ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ''', ( post_id, storage_id, encrypted_filename, None, file_hash, file_size, finfo['file_type'], finfo['mime_type'], finfo['width'], finfo['height'], finfo['duration'], person_id, encrypted_date, platform, encrypted_source, perceptual_hash, now_iso, )) conn.commit() finally: conn.close() media_count += 1 if created_date and (not latest_date or created_date > latest_date): latest_date = created_date except Exception as e: logger.error(f"Failed to import {file_path.name}: {e}") # Apply platform tag to the post if we imported media if media_count > 0: conn = sqlite3.connect(db.db_path, timeout=10) try: cursor = conn.cursor() cursor.execute(''' INSERT OR IGNORE INTO private_media_post_tags (post_id, tag_id) VALUES (?, ?) ''', (post_id, tag_id)) conn.commit() finally: conn.close() # Update the mapping row with both timestamp and file id markers conn = sqlite3.connect(db.db_path, timeout=10) try: cursor = conn.cursor() cursor.execute(''' UPDATE private_media_scraper_accounts SET last_imported_at = ?, last_imported_file_id = ?, total_media_imported = total_media_imported + ?, updated_at = ? WHERE platform = ? AND username = ? AND person_id = ? ''', (latest_date or now_iso, max_file_id, media_count, now_iso, platform, username, person_id)) conn.commit() finally: conn.close() logger.info(f"Imported {media_count} files from {platform}/@{username} to gallery (last_file_id={max_file_id})") else: # No media imported - still update the file id marker so we don't re-check these files if max_file_id > (last_imported_file_id or 0): conn = sqlite3.connect(db.db_path, timeout=10) try: cursor = conn.cursor() cursor.execute(''' UPDATE private_media_scraper_accounts SET last_imported_file_id = ? WHERE platform = ? AND username = ? AND person_id = ? ''', (max_file_id, platform, username, person_id)) conn.commit() finally: conn.close() # Delete the empty post conn = sqlite3.connect(db.db_path, timeout=10) try: cursor = conn.cursor() cursor.execute("DELETE FROM private_media_posts WHERE id = ?", (post_id,)) conn.commit() finally: conn.close() return media_count def on_download_complete(task_id: str, download_count: int, db, crypto) -> int: """ Called from scheduler after a task completes. Checks ALL mapped accounts for the platform for new media. This handles all cases: - Batch tasks (fastdl:all, imginn_api:all) - Per-user tasks that also download phrase_search users (instagram_client:evalongoria) - Simple per-user tasks (toolzu:evalongoria) The id-based filtering is cheap — accounts with no new files return quickly. """ if not task_id or ':' not in task_id: return 0 scraper_module = task_id.split(':')[0] # Map scraper module to platform platform = SCRAPER_TO_PLATFORM.get(scraper_module) if not platform: return 0 # Always check ALL mapped accounts for the platform. # A single task can download for many users (batch tasks, phrase_search), # and id-based filtering makes per-account checks cheap. return _import_all_mapped_accounts(platform, db, crypto) def _import_all_mapped_accounts(platform: str, db, crypto) -> int: """ After a batch task (e.g. fastdl:all), check ALL mapped accounts for the platform and import any new media. """ conn = sqlite3.connect(db.db_path, timeout=10) conn.row_factory = sqlite3.Row try: cursor = conn.cursor() cursor.execute(''' SELECT id, username, person_id, last_imported_at, last_imported_file_id FROM private_media_scraper_accounts WHERE platform = ? AND enabled = 1 ''', (platform,)) rows = cursor.fetchall() finally: conn.close() if not rows: return 0 total_imported = 0 for row in rows: try: count = import_new_media( platform, row['username'], row['person_id'], row['last_imported_at'], db, crypto, last_imported_file_id=row['last_imported_file_id'] or 0 ) total_imported += count except Exception as e: logger.error(f"Gallery bridge batch import error for {platform}/@{row['username']}: {e}") if total_imported > 0: logger.info(f"Batch import for {platform}: {total_imported} files across {len(rows)} accounts") return total_imported