Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,652 @@
"""
Scraper Gallery Bridge
Maps scraper accounts (Instagram, TikTok, Snapchat) to private gallery persons.
After each download session, auto-imports new media as gallery posts.
"""
import hashlib
import logging
import mimetypes
import sqlite3
import subprocess
import tempfile
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)

# Dedicated key file so background imports can decrypt/encrypt gallery data
# even while the interactive gallery session is locked (see get_crypto()).
SCRAPER_BRIDGE_KEY_FILE = '/opt/immich/private/.scraper_bridge_key'

# Map scraper module names → platform
# (task ids look like "<scraper_module>:<target>"; the module prefix decides
# which platform's mapped accounts get swept after a download completes)
SCRAPER_TO_PLATFORM = {
    'fastdl': 'instagram',
    'imginn': 'instagram',
    'imginn_api': 'instagram',
    'instagram_client': 'instagram',
    'toolzu': 'instagram',
    'instagram': 'instagram',
    'instagram_unified': 'instagram',
    'tiktok': 'tiktok',
    'snapchat': 'snapchat',
    'snapchat_client': 'snapchat',
}

# Brand accent color assigned when the platform tag is auto-created.
PLATFORM_COLORS = {
    'instagram': '#E1306C',
    'tiktok': '#00f2ea',
    'snapchat': '#FFFC00',
}

# Human-readable tag label per platform (used for tag lookup/creation and
# for the auto-generated post description).
PLATFORM_LABELS = {
    'instagram': 'Instagram',
    'tiktok': 'TikTok',
    'snapchat': 'Snapchat',
}
def get_crypto():
    """Return a crypto helper loaded from the bridge key file, or None.

    Reads the key from disk rather than the user session, so background
    import jobs keep working while the gallery UI is locked.
    """
    from modules.private_gallery_crypto import load_key_from_file

    loaded = load_key_from_file(SCRAPER_BRIDGE_KEY_FILE)
    if loaded is not None:
        return loaded
    logger.debug("Scraper bridge crypto unavailable - key file missing or invalid")
    return None
def _add_account(accounts: Dict[str, set], source: str, raw_usernames) -> None:
    """Normalize usernames (strip + lowercase) and record *source* for each non-empty one."""
    for raw in raw_usernames:
        username = (raw or '').strip().lower()
        if username:
            accounts.setdefault(username, set()).add(source)


def _config_usernames(cfg: dict) -> List[str]:
    """Extract usernames from an accounts-style config section.

    Prefers ``accounts[].username``; falls back to a plain ``usernames`` list
    for legacy configs.
    """
    account_entries = cfg.get('accounts', [])
    if account_entries:
        return [acc.get('username', '') for acc in account_entries]
    return list(cfg.get('usernames', []))


def get_available_accounts(platform: str, config: dict, db) -> List[Dict[str, Any]]:
    """
    Aggregate usernames from all scraper configs + paid_content_creators for a platform.

    Returns a de-duplicated, sorted list of dicts:
    ``{'username': str, 'sources': [str, ...], 'is_mapped': bool}`` where
    ``is_mapped`` means a row already exists in private_media_scraper_accounts.
    DB lookups are best-effort: a missing table just yields fewer entries.
    """
    accounts: Dict[str, set] = {}  # username -> set of config/source ids
    if platform == 'instagram':
        ig_cfg = config.get('instagram', {})
        if ig_cfg.get('enabled', False):
            _add_account(accounts, 'instagram', _config_usernames(ig_cfg))
        # Each Instagram scraper keeps its own username list.
        for scraper_id in ('fastdl', 'imginn', 'imginn_api', 'instagram_client', 'toolzu'):
            scraper_cfg = config.get(scraper_id, {})
            if not scraper_cfg.get('enabled', False):
                continue
            _add_account(accounts, scraper_id, scraper_cfg.get('usernames', []))
            # phrase_search usernames are also downloadable accounts
            _add_account(accounts, scraper_id,
                         scraper_cfg.get('phrase_search', {}).get('usernames', []))
    elif platform == 'tiktok':
        tt_cfg = config.get('tiktok', {})
        if tt_cfg.get('enabled', False):
            _add_account(accounts, 'tiktok', _config_usernames(tt_cfg))
    elif platform == 'snapchat':
        # Two independent snapchat scraper sections, each with a plain list.
        for section in ('snapchat', 'snapchat_client'):
            sc_cfg = config.get(section, {})
            if sc_cfg.get('enabled', False):
                _add_account(accounts, section, sc_cfg.get('usernames', []))
    # Merge in creators tracked in the paid_content_creators table (best effort).
    try:
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:  # close the connection even if the query raises (fixes a leak)
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            cursor.execute(
                'SELECT username FROM paid_content_creators WHERE platform = ? AND enabled = 1',
                (platform,)
            )
            _add_account(accounts, 'paid_content',
                         [row['username'] for row in cursor.fetchall()])
        finally:
            conn.close()
    except Exception as e:
        logger.debug(f"Could not query paid_content_creators: {e}")
    # Flag usernames that already have a gallery person mapping.
    mapped_usernames = set()
    try:
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            cursor.execute(
                'SELECT username FROM private_media_scraper_accounts WHERE platform = ?',
                (platform,)
            )
            mapped_usernames = {row['username'].lower() for row in cursor.fetchall()}
        finally:
            conn.close()
    except Exception:
        pass
    return [
        {
            'username': username,
            'sources': sorted(sources),
            # accounts keys are already lowercased by _add_account
            'is_mapped': username in mapped_usernames,
        }
        for username, sources in sorted(accounts.items())
    ]
def _ensure_platform_tag(platform: str, db, crypto) -> int:
    """Return the tag id for *platform* in private_gallery_tags, creating it if absent."""
    label = PLATFORM_LABELS.get(platform, platform.title())
    conn = sqlite3.connect(db.db_path, timeout=10)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        # Tag names are stored encrypted, so every row must be decrypted
        # and compared case-insensitively; no WHERE clause is possible.
        cursor.execute("SELECT id, encrypted_name FROM private_gallery_tags")
        for row in cursor.fetchall():
            try:
                if crypto.decrypt_field(row['encrypted_name']).lower() == label.lower():
                    return row['id']
            except Exception:
                # Undecryptable row (e.g. different key): skip, don't fail.
                continue
        # No existing tag — create one with the platform's brand color.
        cursor.execute('''
            INSERT INTO private_gallery_tags (encrypted_name, color)
            VALUES (?, ?)
        ''', (crypto.encrypt_field(label), PLATFORM_COLORS.get(platform, '#888888')))
        conn.commit()
        tag_id = cursor.lastrowid
        logger.info(f"Created '{label}' tag with ID {tag_id}")
        return tag_id
    finally:
        conn.close()
def _get_file_info(file_path: Path) -> Dict[str, Any]:
"""Get file type, mime type, and dimensions."""
ext = file_path.suffix.lower().lstrip('.')
mime_type, _ = mimetypes.guess_type(str(file_path))
if not mime_type:
mime_type = 'application/octet-stream'
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
if ext in image_exts:
file_type = 'image'
elif ext in video_exts:
file_type = 'video'
else:
file_type = 'other'
width, height, duration = 0, 0, 0
if file_type == 'image':
try:
from PIL import Image
with Image.open(file_path) as img:
width, height = img.size
except Exception:
pass
elif file_type == 'video':
try:
result = subprocess.run(
['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_streams', str(file_path)],
capture_output=True, text=True, timeout=15
)
if result.returncode == 0:
import json
probe = json.loads(result.stdout)
for stream in probe.get('streams', []):
if stream.get('codec_type') == 'video':
width = int(stream.get('width', 0))
height = int(stream.get('height', 0))
dur = stream.get('duration')
if dur:
duration = int(float(dur))
break
except Exception:
pass
return {
'file_type': file_type,
'mime_type': mime_type,
'width': width,
'height': height,
'duration': duration,
}
def _compute_perceptual_hash(file_path: Path) -> Optional[str]:
"""Calculate perceptual hash for an image or video file."""
try:
import imagehash
from PIL import Image
except ImportError:
return None
ext = file_path.suffix.lower().lstrip('.')
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
pil_image = None
try:
if ext in video_exts:
try:
import cv2
except ImportError:
return None
cap = cv2.VideoCapture(str(file_path))
if not cap.isOpened():
return None
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
cap.set(cv2.CAP_PROP_POS_FRAMES, int(total_frames * 0.5))
ret, frame = cap.read()
cap.release()
if not ret or frame is None:
return None
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(frame_rgb)
elif ext in image_exts:
pil_image = Image.open(file_path)
else:
return None
return str(imagehash.dhash(pil_image, hash_size=16))
except Exception:
return None
finally:
if pil_image:
try:
pil_image.close()
except Exception:
pass
def _generate_thumbnail(file_path: Path, output_path: Path, file_type: str) -> bool:
"""Generate a thumbnail for an image or video."""
try:
output_path.parent.mkdir(parents=True, exist_ok=True)
if file_type == 'image':
from PIL import Image, ImageOps
with Image.open(file_path) as img:
img = ImageOps.exif_transpose(img)
img.thumbnail((400, 400))
if img.mode in ('RGBA', 'P'):
img = img.convert('RGB')
img.save(output_path, 'JPEG', quality=85)
return True
elif file_type == 'video':
result = subprocess.run([
'ffmpeg', '-y', '-i', str(file_path),
'-ss', '00:00:01', '-vframes', '1',
'-vf', 'scale=400:-1:force_original_aspect_ratio=decrease',
str(output_path)
], capture_output=True, timeout=30)
return result.returncode == 0 and output_path.exists()
except Exception:
pass
return False
def import_new_media(platform: str, username: str, person_id: int,
                     last_imported_at: Optional[str], db, crypto,
                     last_imported_file_id: int = 0) -> int:
    """
    Import new media files from file_inventory into the private gallery.
    Returns count of imported files.

    Cursor selection (in order of preference):
      1. ``last_imported_file_id > 0`` — id-based cursor over file_inventory.
      2. ``last_imported_at`` set — created_date cursor (legacy accounts).
      3. Neither — first run; only files from the last hour are considered.

    Side effects: creates one private_media_posts row for the batch, encrypts
    each file (and its thumbnail) into the gallery storage tree, inserts
    private_media rows, tags the post with the platform tag, and advances the
    cursor columns on the account's private_media_scraper_accounts row.
    The empty post is deleted when nothing gets imported.
    """
    conn = sqlite3.connect(db.db_path, timeout=30)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        # Use id-based filtering (reliable, monotonically increasing with insertion order).
        # Falls back to created_date only for legacy accounts without last_imported_file_id.
        if last_imported_file_id and last_imported_file_id > 0:
            cursor.execute('''
                SELECT id, file_path, filename, created_date FROM file_inventory
                WHERE platform = ? AND source = ? AND id > ?
                AND location IN ('final', 'review')
                ORDER BY id ASC
            ''', (platform, username, last_imported_file_id))
        elif last_imported_at:
            cursor.execute('''
                SELECT id, file_path, filename, created_date FROM file_inventory
                WHERE platform = ? AND source = ? AND created_date > ?
                AND location IN ('final', 'review')
                ORDER BY id ASC
            ''', (platform, username, last_imported_at))
        else:
            # First run: only import files from the last 1 hour
            # NOTE(review): naive local time; assumes file_inventory.created_date
            # is also naive local — confirm against the writer of that table.
            from datetime import timedelta
            cutoff = (datetime.now() - timedelta(hours=1)).isoformat()
            cursor.execute('''
                SELECT id, file_path, filename, created_date FROM file_inventory
                WHERE platform = ? AND source = ? AND created_date > ?
                AND location IN ('final', 'review')
                ORDER BY id ASC
            ''', (platform, username, cutoff))
        files = cursor.fetchall()
    finally:
        conn.close()
    if not files:
        return 0
    # Filter to existing files, track max id for updating last_imported_file_id
    # (max id advances even over missing/empty files so they are never re-checked).
    valid_files = []
    max_file_id = last_imported_file_id or 0
    for f in files:
        fp = Path(f['file_path'])
        file_id = f['id']
        if file_id > max_file_id:
            max_file_id = file_id
        if fp.exists() and fp.stat().st_size > 0:
            valid_files.append({'path': fp, 'created_date': f['created_date'], 'id': file_id})
    if not valid_files:
        return 0
    # Get storage path (configurable; defaults to the stock install location)
    conn = sqlite3.connect(db.db_path, timeout=10)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT value FROM private_media_config WHERE key = 'storage_path'")
        row = cursor.fetchone()
        storage_path = Path(row['value']) if row else Path('/opt/immich/private')
    finally:
        conn.close()
    data_path = storage_path / 'data'
    thumbs_path = storage_path / 'thumbs'
    data_path.mkdir(parents=True, exist_ok=True)
    thumbs_path.mkdir(parents=True, exist_ok=True)
    # Get/create platform tag
    tag_id = _ensure_platform_tag(platform, db, crypto)
    # Create a post for this batch; deleted below if nothing imports.
    now_iso = datetime.now().isoformat()
    encrypted_desc = crypto.encrypt_field(f"{PLATFORM_LABELS.get(platform, platform)} - @{username}")
    encrypted_date = crypto.encrypt_field(now_iso)
    conn = sqlite3.connect(db.db_path, timeout=10)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.cursor()
        cursor.execute('''
            INSERT INTO private_media_posts (person_id, encrypted_description, encrypted_media_date, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?)
        ''', (person_id, encrypted_desc, encrypted_date, now_iso, now_iso))
        conn.commit()
        post_id = cursor.lastrowid
    finally:
        conn.close()
    media_count = 0
    # latest_date tracks the newest created_date seen (imported OR duplicate)
    # so the legacy timestamp cursor also advances.
    latest_date = last_imported_at
    for file_info_entry in valid_files:
        file_path = file_info_entry['path']
        created_date = file_info_entry['created_date']
        # Normalize to string for consistent comparison (PostgreSQL returns datetime objects)
        if hasattr(created_date, 'isoformat'):
            created_date = created_date.isoformat()
        try:
            # Calculate file hash (streamed in 64 KiB chunks to cap memory)
            sha256 = hashlib.sha256()
            with open(file_path, 'rb') as f:
                for chunk in iter(lambda: f.read(65536), b''):
                    sha256.update(chunk)
            file_hash = sha256.hexdigest()
            # Check for duplicates (scoped by person)
            conn = sqlite3.connect(db.db_path, timeout=10)
            conn.row_factory = sqlite3.Row
            try:
                cursor = conn.cursor()
                cursor.execute(
                    'SELECT id FROM private_media WHERE file_hash = ? AND person_id = ?',
                    (file_hash, person_id)
                )
                if cursor.fetchone():
                    logger.debug(f"Duplicate file skipped: {file_path.name}")
                    # Still advance the timestamp cursor past duplicates.
                    if created_date and (not latest_date or created_date > latest_date):
                        latest_date = created_date
                    continue
            finally:
                conn.close()
            # Get file info (type/mime/dimensions/duration)
            finfo = _get_file_info(file_path)
            file_size = file_path.stat().st_size
            # Compute perceptual hash (None if optional deps unavailable)
            perceptual_hash = _compute_perceptual_hash(file_path)
            # Generate storage ID (opaque name for the encrypted blobs)
            storage_id = str(uuid.uuid4())
            # Generate thumbnail (best-effort; missing thumb is tolerated below)
            temp_thumb = Path(tempfile.gettempdir()) / f"pg_thumb_{storage_id}.jpg"
            _generate_thumbnail(file_path, temp_thumb, finfo['file_type'])
            # Encrypt the file; skip the record entirely if encryption fails
            encrypted_file = data_path / f"{storage_id}.enc"
            if not crypto.encrypt_file(file_path, encrypted_file):
                logger.error(f"Encryption failed for {file_path.name}")
                continue
            # Encrypt thumbnail, then remove the plaintext temp copy
            if temp_thumb.exists():
                encrypted_thumb = thumbs_path / f"{storage_id}.enc"
                crypto.encrypt_file(temp_thumb, encrypted_thumb)
                try:
                    temp_thumb.unlink()
                except Exception:
                    pass
            # Insert media record
            encrypted_filename = crypto.encrypt_field(file_path.name)
            encrypted_source = crypto.encrypt_field(f"@{username}")
            conn = sqlite3.connect(db.db_path, timeout=10)
            conn.row_factory = sqlite3.Row
            try:
                cursor = conn.cursor()
                # NOTE(review): encrypted_media_date stores the batch timestamp
                # (encrypted_date), not this file's created_date — confirm intended.
                cursor.execute('''
                    INSERT INTO private_media (
                        post_id, storage_id, encrypted_filename, encrypted_description,
                        file_hash, file_size, file_type, mime_type,
                        width, height, duration, person_id,
                        encrypted_media_date, source_type, encrypted_source_path,
                        perceptual_hash, created_at
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    post_id,
                    storage_id,
                    encrypted_filename,
                    None,
                    file_hash,
                    file_size,
                    finfo['file_type'],
                    finfo['mime_type'],
                    finfo['width'],
                    finfo['height'],
                    finfo['duration'],
                    person_id,
                    encrypted_date,
                    platform,
                    encrypted_source,
                    perceptual_hash,
                    now_iso,
                ))
                conn.commit()
            finally:
                conn.close()
            media_count += 1
            if created_date and (not latest_date or created_date > latest_date):
                latest_date = created_date
        except Exception as e:
            # Per-file failures are logged and skipped; the batch continues.
            logger.error(f"Failed to import {file_path.name}: {e}")
    # Apply platform tag to the post if we imported media
    if media_count > 0:
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR IGNORE INTO private_media_post_tags (post_id, tag_id)
                VALUES (?, ?)
            ''', (post_id, tag_id))
            conn.commit()
        finally:
            conn.close()
        # Update the mapping row with both timestamp and file id markers
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                UPDATE private_media_scraper_accounts
                SET last_imported_at = ?,
                    last_imported_file_id = ?,
                    total_media_imported = total_media_imported + ?,
                    updated_at = ?
                WHERE platform = ? AND username = ? AND person_id = ?
            ''', (latest_date or now_iso, max_file_id, media_count, now_iso, platform, username, person_id))
            conn.commit()
        finally:
            conn.close()
        logger.info(f"Imported {media_count} files from {platform}/@{username} to gallery (last_file_id={max_file_id})")
    else:
        # No media imported - still update the file id marker so we don't re-check these files
        if max_file_id > (last_imported_file_id or 0):
            conn = sqlite3.connect(db.db_path, timeout=10)
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE private_media_scraper_accounts
                    SET last_imported_file_id = ?
                    WHERE platform = ? AND username = ? AND person_id = ?
                ''', (max_file_id, platform, username, person_id))
                conn.commit()
            finally:
                conn.close()
        # Delete the empty post so the gallery doesn't show a blank entry
        conn = sqlite3.connect(db.db_path, timeout=10)
        try:
            cursor = conn.cursor()
            cursor.execute("DELETE FROM private_media_posts WHERE id = ?", (post_id,))
            conn.commit()
        finally:
            conn.close()
    return media_count
def on_download_complete(task_id: str, download_count: int, db, crypto) -> int:
    """Scheduler hook: after a scraper task finishes, sweep the platform's accounts.

    The task id is "<scraper_module>:<target>"; the module prefix selects the
    platform. Every mapped account on that platform is checked — a single task
    may download for many users (batch tasks like fastdl:all, phrase_search
    side-downloads) — and the id-based cursor makes no-op checks cheap.
    Returns the total number of files imported (0 for unmapped/invalid tasks).
    """
    if not task_id or ':' not in task_id:
        return 0
    module_name, _, _ = task_id.partition(':')
    platform = SCRAPER_TO_PLATFORM.get(module_name)
    if platform is None:
        return 0
    # Always sweep every mapped account for the platform rather than just the
    # task's nominal target.
    return _import_all_mapped_accounts(platform, db, crypto)
def _import_all_mapped_accounts(platform: str, db, crypto) -> int:
"""
After a batch task (e.g. fastdl:all), check ALL mapped accounts
for the platform and import any new media.
"""
conn = sqlite3.connect(db.db_path, timeout=10)
conn.row_factory = sqlite3.Row
try:
cursor = conn.cursor()
cursor.execute('''
SELECT id, username, person_id, last_imported_at, last_imported_file_id
FROM private_media_scraper_accounts
WHERE platform = ? AND enabled = 1
''', (platform,))
rows = cursor.fetchall()
finally:
conn.close()
if not rows:
return 0
total_imported = 0
for row in rows:
try:
count = import_new_media(
platform, row['username'], row['person_id'],
row['last_imported_at'], db, crypto,
last_imported_file_id=row['last_imported_file_id'] or 0
)
total_imported += count
except Exception as e:
logger.error(f"Gallery bridge batch import error for {platform}/@{row['username']}: {e}")
if total_imported > 0:
logger.info(f"Batch import for {platform}: {total_imported} files across {len(rows)} accounts")
return total_imported