#!/usr/bin/env python3
"""
Reddit Community Monitor Module

Monitors specified Reddit communities (subreddits) for new posts, downloads
all media (including imgur/redgifs attachments via gallery-dl), and
automatically creates private gallery posts for mapped persons.

Design:
- Each community (subreddit) is mapped to a person in the private gallery
- Uses gallery-dl for downloading with full Reddit/imgur/redgifs support
- All new posts are tagged with a "reddit" tag
- Configurable check intervals and lookback periods
- Supports cookie authentication for age-gated content
"""

import asyncio
import hashlib
import json
import mimetypes
import os
import re
import sqlite3
import subprocess
import tempfile
import uuid
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any, Set

from modules.universal_logger import get_logger

logger = get_logger('RedditMonitor')

# Key file path for background crypto access.
# Background jobs have no interactive user session, so the encryption key is
# read from this file instead (see _get_crypto below).
REDDIT_MONITOR_KEY_FILE = '/opt/immich/private/.reddit_monitor_key'


class RedditCommunityMonitor:
    """
    Background monitor for Reddit communities.
    Downloads media from subreddits and imports to private gallery.
    """

    def __init__(self, db_path: str, activity_manager=None):
        """
        Args:
            db_path: Path to the SQLite database holding the
                private_media_* tables used throughout this class.
            activity_manager: Optional object used to report background-task
                progress (start/update/stop). All calls to it are guarded, so
                None disables progress reporting entirely.
        """
        # SQLite database file path; every method opens a short-lived
        # connection via _get_connection() rather than sharing one.
        self.db_path = db_path
        self.activity_manager = activity_manager
        # Absolute path to the gallery-dl binary inside its own virtualenv.
        self.gallery_dl_path = '/opt/media-downloader/venv/bin/gallery-dl'

    def _get_connection(self) -> sqlite3.Connection:
        """
        Get a database connection with row factory.

        Rows are returned as sqlite3.Row (dict-like access by column name),
        and foreign-key enforcement is switched on for this connection
        (SQLite has it off by default).
        """
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys = ON")
        return conn

    # =========================================================================
    # SETTINGS METHODS
    # =========================================================================

    def get_settings(self) -> Dict:
        """
        Get Reddit monitor settings from private_media_config.

        Settings are stored as rows keyed 'reddit_monitor_<name>' with string
        values; this method coerces 'true'/'false' to bool and all-digit
        strings to int. Non-digit strings (e.g. the ISO 'last_checked'
        timestamp) pass through unchanged.

        Returns:
            Dict with keys 'enabled', 'check_interval_hours', 'lookback_days'
            and 'last_checked', falling back to defaults when a key is absent.
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT key, value FROM private_media_config WHERE key LIKE 'reddit_monitor_%'"
            )
            rows = cursor.fetchall()
            settings = {}
            for row in rows:
                # Strip the namespace prefix so callers see plain names.
                key = row['key'].replace('reddit_monitor_', '')
                value = row['value']
                if value == 'true':
                    value = True
                elif value == 'false':
                    value = False
                elif value and value.isdigit():
                    value = int(value)
                settings[key] = value
            # Defaults
            return {
                'enabled': settings.get('enabled', False),
                'check_interval_hours': settings.get('check_interval_hours', 4),
                'lookback_days': settings.get('lookback_days', 3),
                'last_checked': settings.get('last_checked', None),
            }
        finally:
            conn.close()

    def update_settings(self, **kwargs) -> bool:
        """
        Update Reddit monitor settings.

        Each keyword argument is upserted as a 'reddit_monitor_<name>' row.
        Booleans are stored as 'true'/'false' (matching get_settings'
        parsing); None is stored as the empty string; everything else is
        str()-ified.

        Returns:
            True on success, False if the write failed (error is logged).
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            for key, value in kwargs.items():
                db_key = f'reddit_monitor_{key}'
                if isinstance(value, bool):
                    db_value = 'true' if value else 'false'
                else:
                    db_value = str(value) if value is not None else ''
                cursor.execute('''
                    INSERT OR REPLACE INTO private_media_config (key, value, updated_at)
                    VALUES (?, ?, CURRENT_TIMESTAMP)
                ''', (db_key, db_value))
            conn.commit()
            return True
        except Exception as e:
            logger.error(f"Failed to update settings: {e}")
            return False
        finally:
            conn.close()
    # =========================================================================
    # COOKIE MANAGEMENT
    # =========================================================================

    def save_cookies(self, crypto, cookies_json: str) -> bool:
        """
        Save cookies encrypted in the config table.

        Args:
            crypto: Object providing encrypt_field(); presumably the private
                gallery crypto helper — see _get_crypto.
            cookies_json: Raw JSON string (expected: a browser-export array of
                cookie objects; validated later by _write_netscape_cookie_file).

        Returns:
            True on success, False on failure (error logged).
        """
        conn = self._get_connection()
        try:
            encrypted = crypto.encrypt_field(cookies_json)
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR REPLACE INTO private_media_config (key, value, updated_at)
                VALUES ('reddit_monitor_encrypted_cookies', ?, CURRENT_TIMESTAMP)
            ''', (encrypted,))
            conn.commit()
            logger.info("Reddit cookies saved (encrypted)")
            return True
        except Exception as e:
            logger.error(f"Failed to save cookies: {e}")
            return False
        finally:
            conn.close()

    def has_cookies(self, crypto) -> bool:
        """
        Check if encrypted cookies exist and are valid.

        "Valid" here means the stored blob decrypts successfully and the
        result is non-empty; the JSON structure itself is not checked.
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT value FROM private_media_config WHERE key = 'reddit_monitor_encrypted_cookies'"
            )
            row = cursor.fetchone()
            if not row or not row['value']:
                return False
            # Try to decrypt to verify they're valid
            try:
                decrypted = crypto.decrypt_field(row['value'])
                return bool(decrypted and decrypted.strip())
            except Exception:
                # Wrong key or corrupted blob — treat as "no cookies".
                return False
        finally:
            conn.close()

    def delete_cookies(self) -> bool:
        """Delete stored cookies. Returns True on success, False on DB error."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "DELETE FROM private_media_config WHERE key = 'reddit_monitor_encrypted_cookies'"
            )
            conn.commit()
            logger.info("Reddit cookies deleted")
            return True
        except Exception as e:
            logger.error(f"Failed to delete cookies: {e}")
            return False
        finally:
            conn.close()

    def _get_cookies_json(self, crypto) -> Optional[str]:
        """
        Load and decrypt stored cookies JSON.

        Returns:
            The decrypted JSON string, or None when no cookies are stored or
            decryption fails (failure is logged, not raised).
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT value FROM private_media_config WHERE key = 'reddit_monitor_encrypted_cookies'"
            )
            row = cursor.fetchone()
            if not row or not row['value']:
                return None
            try:
                return crypto.decrypt_field(row['value'])
            except Exception as e:
                logger.error(f"Failed to decrypt cookies: {e}")
                return None
        finally:
            conn.close()

    def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
        """
        Convert JSON cookies array to Netscape cookie file format for gallery-dl.

        Accepts the browser-extension export shape: a JSON array of objects
        with at least 'domain', 'name', 'value' and optionally 'path',
        'secure' and one of 'expirationDate'/'expiry'/'expires'.

        Returns:
            True if the file was written, False on bad input or I/O error.
        """
        try:
            cookies = json.loads(cookies_json)
            if not isinstance(cookies, list):
                logger.error("Cookies is not a JSON array")
                return False
            with open(output_path, 'w') as f:
                # Netscape format: header lines, then one TAB-separated
                # record per cookie (domain, subdomain flag, path, secure,
                # expiry, name, value).
                f.write("# Netscape HTTP Cookie File\n")
                f.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")
                for cookie in cookies:
                    domain = cookie.get('domain', '')
                    # Ensure domain starts with . for domain-wide cookies
                    include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                    path = cookie.get('path', '/')
                    secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                    # Convert expiry - use 0 for session cookies
                    expires = cookie.get('expirationDate', cookie.get('expiry', cookie.get('expires', 0)))
                    if expires is None:
                        expires = 0
                    # Exports may carry float timestamps; Netscape wants int.
                    expires = str(int(float(expires)))
                    name = cookie.get('name', '')
                    value = cookie.get('value', '')
                    f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n")
            return True
        except Exception as e:
            logger.error(f"Failed to write Netscape cookie file: {e}")
            return False

    # =========================================================================
    # COMMUNITY MAPPING METHODS
    # =========================================================================

    def get_all_communities(self) -> List[Dict]:
        """
        Get all community mappings with person info and live media count.

        The stored total_media_found counter is replaced by a live COUNT(*)
        of this person's reddit-sourced media, so the UI never shows a stale
        number after deletions/dedup.
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT c.*, p.encrypted_name as person_encrypted_name,
                       r.encrypted_name as relationship_encrypted_name,
                       r.color as relationship_color,
                       (SELECT COUNT(*) FROM private_media m
                        WHERE m.person_id = c.person_id
                          AND m.source_type = 'reddit') as actual_media_count
                FROM private_media_reddit_communities c
                LEFT JOIN private_media_persons p ON c.person_id = p.id
                LEFT JOIN private_media_relationships r ON p.relationship_id = r.id
                ORDER BY c.subreddit_name
            ''')
            communities = []
            for row in cursor.fetchall():
                d = dict(row)
                # Present the live count under the legacy key callers expect.
                d['total_media_found'] = d.pop('actual_media_count', 0)
                communities.append(d)
            return communities
        finally:
            conn.close()

    def get_community(self, community_id: int) -> Optional[Dict]:
        """Get a single community mapping (with joined person/relationship info), or None."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT c.*, p.encrypted_name as person_encrypted_name,
                       r.encrypted_name as relationship_encrypted_name,
                       r.color as relationship_color
                FROM private_media_reddit_communities c
                LEFT JOIN private_media_persons p ON c.person_id = p.id
                LEFT JOIN private_media_relationships r ON p.relationship_id = r.id
                WHERE c.id = ?
            ''', (community_id,))
            row = cursor.fetchone()
            return dict(row) if row else None
        finally:
            conn.close()

    def add_community(self, subreddit_name: str, person_id: int) -> int:
        """Add a new community mapping. Returns the new ID."""
        # Strip r/ prefix if present
        subreddit_name = re.sub(r'^r/', '', subreddit_name.strip())
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT INTO private_media_reddit_communities (subreddit_name, person_id)
                VALUES (?, ?)
            ''', (subreddit_name, person_id))
            conn.commit()
            return cursor.lastrowid
        finally:
            conn.close()

    def update_community(self, community_id: int, **kwargs) -> bool:
        """
        Update a community mapping.

        Keyword argument names are used verbatim as column names (callers are
        trusted; values are still bound as parameters). 'subreddit_name' is
        normalized (r/ prefix stripped) and 'enabled' coerced to 0/1.

        Returns:
            True if a row was updated, False otherwise (incl. empty kwargs).
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            updates = []
            values = []
            for key, value in kwargs.items():
                if key == 'subreddit_name' and value is not None:
                    value = re.sub(r'^r/', '', value.strip())
                if key == 'enabled':
                    value = 1 if value else 0
                updates.append(f'{key} = ?')
                values.append(value)
            if not updates:
                return False
            updates.append("updated_at = CURRENT_TIMESTAMP")
            values.append(community_id)
            cursor.execute(
                f"UPDATE private_media_reddit_communities SET {', '.join(updates)} WHERE id = ?",
                values
            )
            conn.commit()
            return cursor.rowcount > 0
        except Exception as e:
            logger.error(f"Failed to update community {community_id}: {e}")
            return False
        finally:
            conn.close()

    def delete_community(self, community_id: int) -> bool:
        """Delete a community mapping. Returns True if a row was removed."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "DELETE FROM private_media_reddit_communities WHERE id = ?",
                (community_id,)
            )
            conn.commit()
            return cursor.rowcount > 0
        finally:
            conn.close()

    def get_communities_for_person(self, person_id: int) -> List[Dict]:
        """Get all communities mapped to a person."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT * FROM private_media_reddit_communities WHERE person_id = ?",
                (person_id,)
            )
            return [dict(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    def get_history(self, community_id: int) -> List[Dict]:
        """Get download history for a community (most recent 100 entries)."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT * FROM private_media_reddit_history
                WHERE community_id = ?
                ORDER BY processed_at DESC LIMIT 100
            ''', (community_id,))
            return [dict(row) for row in cursor.fetchall()]
        finally:
            conn.close()

    # =========================================================================
    # CRYPTO ACCESS
    # =========================================================================

    def _get_crypto(self):
        """
        Load crypto from key file for background access.

        Returns the crypto helper, or None when the key file is missing or
        invalid (a warning is logged; callers must handle None).
        """
        from modules.private_gallery_crypto import load_key_from_file
        crypto = load_key_from_file(REDDIT_MONITOR_KEY_FILE)
        if crypto is None:
            logger.warning("Reddit monitor crypto unavailable - key file missing or invalid")
        return crypto

    # =========================================================================
    # DOWNLOAD METHODS
    # =========================================================================

    async def check_all_now(self, from_scheduler: bool = False) -> int:
        """
        Check all enabled communities for new posts.

        Args:
            from_scheduler: Whether this was triggered by the scheduler.
                When True and the monitor is disabled in settings, the run is
                skipped; manual invocations (False) run regardless.

        Returns:
            Total count of new media items imported
        """
        settings = self.get_settings()
        if from_scheduler and not settings.get('enabled'):
            logger.debug("Reddit monitor is disabled")
            return 0

        crypto = self._get_crypto()
        if crypto is None:
            logger.warning("Skipping Reddit check: encryption key not available")
            return 0

        # Get enabled communities
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT * FROM private_media_reddit_communities WHERE enabled = 1"
            )
            communities = [dict(row) for row in cursor.fetchall()]
        finally:
            conn.close()

        if not communities:
            logger.debug("No enabled Reddit communities to check")
            return 0

        # Start background task tracking
        if self.activity_manager:
            self.activity_manager.start_background_task(
                'reddit_monitor', 'reddit_community_monitor',
                'Reddit Community Monitor', 'Running',
                {'total_communities': len(communities), 'media_found': 0}
            )

        total_media = 0
        affected_person_ids: Set[int] = set()
        lookback_days = settings.get('lookback_days', 3)

        # Load cookies from encrypted storage
        cookies_json = self._get_cookies_json(crypto)

        # Crash recovery checkpoint: if a previous run died mid-way,
        # already-completed community IDs are skipped on this run.
        from modules.task_checkpoint import TaskCheckpoint
        checkpoint = TaskCheckpoint('reddit_monitor', 'background')
        checkpoint.start(total_items=len(communities))
        if checkpoint.is_recovering():
            logger.info("Reddit monitor: recovering — skipping already-checked communities")

        try:
            for idx, community in enumerate(communities):
                subreddit = community['subreddit_name']
                person_id = community['person_id']
                community_id = community['id']
                if checkpoint.is_completed(str(community_id)):
                    continue
                checkpoint.set_current(str(community_id))
                try:
                    # Use longer lookback for communities that have never imported anything
                    effective_lookback = lookback_days
                    if community.get('total_media_found', 0) == 0 and not community.get('last_checked'):
                        effective_lookback = 30
                        logger.info(f"First check for r/{subreddit}, using 30-day lookback")
                    media_count = await self._check_community(
                        community_id, subreddit, person_id, effective_lookback,
                        cookies_json, crypto,
                        community_idx=idx, total_communities=len(communities),
                        running_media_total=total_media
                    )
                    total_media += media_count
                    if media_count > 0:
                        affected_person_ids.add(person_id)
                        if self.activity_manager:
                            self.activity_manager.update_background_task(
                                'reddit_monitor',
                                f'Found {media_count} new media in r/{subreddit}',
                                idx + 1, len(communities),
                                {'total_communities': len(communities),
                                 'media_found': total_media,
                                 'current_community': f'r/{subreddit}',
                                 'phase': 'found', 'last_found': media_count}
                            )
                except Exception as e:
                    # One failing community must not abort the whole sweep.
                    logger.error(f"Error checking r/{subreddit}: {e}")
                    import traceback
                    logger.debug(f"Traceback: {traceback.format_exc()}")
                # NOTE(review): the community is marked completed even when it
                # errored, so a recovering run will not retry it — confirm
                # this is the intended crash-recovery semantics.
                checkpoint.mark_completed(str(community_id))

            # Checkpoint complete
            checkpoint.finish()

            # Update last_checked timestamp
            self.update_settings(last_checked=datetime.now().isoformat())

            # Auto-dedup for persons that received new media
            if affected_person_ids:
                if self.activity_manager:
                    self.activity_manager.update_background_task(
                        'reddit_monitor', 'Deduplicating...',
                        len(communities), len(communities),
                        {'phase': 'deduplicating', 'media_found': total_media}
                    )
                dedup_deleted = self._run_dedup_for_persons(affected_person_ids, crypto)
                if dedup_deleted > 0:
                    logger.info(f"Reddit monitor: auto-dedup removed {dedup_deleted} duplicates")
        finally:
            if self.activity_manager:
                self.activity_manager.stop_background_task('reddit_monitor')

        if total_media > 0:
            logger.info(f"Reddit monitor: imported {total_media} new media items")
        else:
            logger.debug("Reddit monitor: no new media found")
        return total_media

    async def download_full_community(self, community_id: int) -> int:
        """
        Download all available media from a community (no date filter).

        Args:
            community_id: ID of the community to download

        Returns:
            Count of new media items imported
        """
        community = self.get_community(community_id)
        if not community:
            logger.error(f"Community {community_id} not found")
            return 0

        crypto = self._get_crypto()
        if crypto is None:
            logger.warning("Cannot download: encryption key not available")
            return 0

        cookies_json = self._get_cookies_json(crypto)
        subreddit = community['subreddit_name']

        if self.activity_manager:
            self.activity_manager.start_background_task(
                'reddit_monitor', 'reddit_community_monitor',
                'Reddit Community Monitor', 'Running',
                {'total_communities': 1, 'media_found': 0,
                 'current_community': f'r/{subreddit}', 'full_download': True}
            )
        try:
            # lookback_days=None disables the date filter in _run_gallery_dl.
            media_count = await self._check_community(
                community_id, subreddit, community['person_id'], None,
                cookies_json, crypto,
                community_idx=0, total_communities=1, running_media_total=0
            )
            return media_count
        finally:
            if self.activity_manager:
                self.activity_manager.stop_background_task('reddit_monitor')

    async def check_single_community(self, community_id: int) -> int:
        """
        Check a single community for new posts (using lookback_days filter).

        Args:
            community_id: ID of the community to check

        Returns:
            Count of new media items imported
        """
        community = self.get_community(community_id)
        if not community:
            logger.error(f"Community {community_id} not found")
            return 0

        crypto = self._get_crypto()
        if crypto is None:
            logger.warning("Cannot check: encryption key not available")
            return 0

        settings = self.get_settings()
        lookback_days = settings.get('lookback_days', 3)
        cookies_json = self._get_cookies_json(crypto)
        subreddit = community['subreddit_name']

        if self.activity_manager:
            self.activity_manager.start_background_task(
                'reddit_monitor', 'reddit_community_monitor',
                'Reddit Community Monitor', 'Running',
                {'total_communities': 1, 'media_found': 0,
                 'current_community': f'r/{subreddit}'}
            )
        try:
            # Use longer lookback for communities that have never imported anything
            effective_lookback = lookback_days
            if community.get('total_media_found', 0) == 0 and not community.get('last_checked'):
                effective_lookback = 30
                logger.info(f"First check for r/{subreddit}, using 30-day lookback")
            media_count = await self._check_community(
                community_id, subreddit, community['person_id'], effective_lookback,
                cookies_json, crypto,
                community_idx=0, total_communities=1, running_media_total=0
            )
            return media_count
        finally:
            if self.activity_manager:
                self.activity_manager.stop_background_task('reddit_monitor')

    async def check_communities_by_person(self, person_id: int) -> int:
        """
        Check all enabled communities for a given person.

        Args:
            person_id: ID of the person whose communities to check

        Returns:
            Total count of new media items imported
        """
        crypto = self._get_crypto()
        if crypto is None:
            logger.warning("Cannot check: encryption key not available")
            return 0

        # Get enabled communities for this person
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT * FROM private_media_reddit_communities WHERE person_id = ? AND enabled = 1",
                (person_id,)
            )
            communities = [dict(row) for row in cursor.fetchall()]
        finally:
            conn.close()

        if not communities:
            logger.debug(f"No enabled communities for person {person_id}")
            return 0

        settings = self.get_settings()
        lookback_days = settings.get('lookback_days', 3)
        cookies_json = self._get_cookies_json(crypto)

        if self.activity_manager:
            self.activity_manager.start_background_task(
                'reddit_monitor', 'reddit_community_monitor',
                'Reddit Community Monitor', 'Running',
                {'total_communities': len(communities), 'media_found': 0}
            )

        total_media = 0
        try:
            for idx, community in enumerate(communities):
                subreddit = community['subreddit_name']
                community_id = community['id']
                try:
                    # Same first-check heuristic as check_all_now.
                    effective_lookback = lookback_days
                    if community.get('total_media_found', 0) == 0 and not community.get('last_checked'):
                        effective_lookback = 30
                        logger.info(f"First check for r/{subreddit}, using 30-day lookback")
                    media_count = await self._check_community(
                        community_id, subreddit, community['person_id'], effective_lookback,
                        cookies_json, crypto,
                        community_idx=idx, total_communities=len(communities),
                        running_media_total=total_media
                    )
                    total_media += media_count
                    if media_count > 0 and self.activity_manager:
                        self.activity_manager.update_background_task(
                            'reddit_monitor',
                            f'Found {media_count} new media in r/{subreddit}',
                            idx + 1, len(communities),
                            {'total_communities': len(communities),
                             'media_found': total_media,
                             'current_community': f'r/{subreddit}',
                             'phase': 'found', 'last_found': media_count}
                        )
                except Exception as e:
                    logger.error(f"Error checking r/{subreddit}: {e}")
                    import traceback
                    logger.debug(f"Traceback: {traceback.format_exc()}")

            # Auto-dedup for this person if new media was imported
            if total_media > 0:
                if self.activity_manager:
                    self.activity_manager.update_background_task(
                        'reddit_monitor', 'Deduplicating...',
                        len(communities), len(communities),
                        {'phase': 'deduplicating', 'media_found': total_media}
                    )
                dedup_deleted = self._run_dedup_for_persons({person_id}, crypto)
                if dedup_deleted > 0:
                    logger.info(f"Reddit person check: auto-dedup removed {dedup_deleted} duplicates")
        finally:
            if self.activity_manager:
                self.activity_manager.stop_background_task('reddit_monitor')

        if total_media > 0:
            logger.info(f"Reddit person check: imported {total_media} new media items")
        return total_media

    def _update_status(self, status_text: str, community_idx: int,
                       total_communities: int, extra: Optional[Dict] = None):
        """
        Helper to update background task status with detailed info.

        No-op when no activity_manager is attached. 'extra' entries override
        the base fields when keys collide.
        """
        if not self.activity_manager:
            return
        data = {
            'total_communities': total_communities,
            'media_found': extra.get('media_found', 0) if extra else 0,
        }
        if extra:
            data.update(extra)
        self.activity_manager.update_background_task(
            'reddit_monitor', status_text, community_idx, total_communities, data
        )

    async def _check_community(
        self, community_id: int, subreddit: str, person_id: int,
        lookback_days: Optional[int], cookies_json: Optional[str], crypto,
        community_idx: int = 0, total_communities: int = 1,
        running_media_total: int = 0
    ) -> int:
        """
        Check a single community and import new media.

        Pipeline: download via gallery-dl into a temp dir -> group files by
        Reddit post ID -> skip posts already in the history table -> import
        the rest -> update community stats. last_checked is bumped even when
        nothing new was found.

        Args:
            lookback_days: Days of history to fetch; None means no date filter.
            community_idx/total_communities/running_media_total: progress
                context for multi-community runs (status reporting only).

        Returns:
            Number of media items imported for this community.
        """
        with tempfile.TemporaryDirectory(prefix=f'reddit_{subreddit}_') as temp_dir:
            # Phase: Downloading
            self._update_status(
                f'Downloading from r/{subreddit}...',
                community_idx, total_communities,
                {'current_community': f'r/{subreddit}', 'phase': 'downloading',
                 'media_found': running_media_total}
            )
            # Run gallery-dl
            files = await self._run_gallery_dl(
                subreddit, temp_dir, lookback_days, cookies_json
            )
            if not files:
                logger.debug(f"No files downloaded from r/{subreddit}")
                self._update_status(
                    f'No new files in r/{subreddit}',
                    community_idx, total_communities,
                    {'current_community': f'r/{subreddit}', 'phase': 'done',
                     'media_found': running_media_total}
                )
                # Still update last_checked so we know we tried
                conn = self._get_connection()
                try:
                    cursor = conn.cursor()
                    cursor.execute('''
                        UPDATE private_media_reddit_communities
                        SET last_checked = CURRENT_TIMESTAMP WHERE id = ?
                    ''', (community_id,))
                    conn.commit()
                finally:
                    conn.close()
                return 0

            # Phase: Processing
            self._update_status(
                f'Downloaded {len(files)} files from r/{subreddit}, grouping by post...',
                community_idx, total_communities,
                {'current_community': f'r/{subreddit}', 'phase': 'processing',
                 'files_downloaded': len(files), 'media_found': running_media_total}
            )

            # Group files by Reddit post ID
            posts = self._group_files_by_post(files, temp_dir)

            # Get or create "reddit" tag
            reddit_tag_id = self._ensure_reddit_tag(crypto)

            # Filter out already-processed posts
            new_posts = {}
            for reddit_post_id, post_data in posts.items():
                if not self._is_post_processed(community_id, reddit_post_id):
                    new_posts[reddit_post_id] = post_data

            if not new_posts:
                self._update_status(
                    f'No new posts in r/{subreddit} ({len(posts)} already imported)',
                    community_idx, total_communities,
                    {'current_community': f'r/{subreddit}', 'phase': 'done',
                     'files_downloaded': len(files), 'media_found': running_media_total}
                )
                # Still need to update total_media = 0 path below
                posts_to_import = {}
            else:
                posts_to_import = new_posts

            # Import each post
            total_media = 0
            for post_num, (reddit_post_id, post_data) in enumerate(posts_to_import.items(), 1):
                num_files = len(post_data['files'])
                self._update_status(
                    f'Importing post {post_num}/{len(posts_to_import)} from r/{subreddit} ({num_files} files)',
                    community_idx, total_communities,
                    {'current_community': f'r/{subreddit}', 'phase': 'importing',
                     'files_downloaded': len(files), 'posts_imported': post_num,
                     'posts_total': len(posts_to_import),
                     'media_found': running_media_total + total_media}
                )
                media_count = self._import_post_to_gallery(
                    post_data, person_id, reddit_tag_id, crypto, subreddit,
                    community_id, reddit_post_id,
                    community_idx, total_communities,
                    running_media_total + total_media
                )
                total_media += media_count

            # Update community stats
            if total_media > 0:
                conn = self._get_connection()
                try:
                    cursor = conn.cursor()
                    cursor.execute('''
                        UPDATE private_media_reddit_communities
                        SET total_media_found = total_media_found + ?,
                            last_checked = CURRENT_TIMESTAMP,
                            updated_at = CURRENT_TIMESTAMP
                        WHERE id = ?
                    ''', (total_media, community_id))
                    conn.commit()
                finally:
                    conn.close()
            else:
                conn = self._get_connection()
                try:
                    cursor = conn.cursor()
                    cursor.execute('''
                        UPDATE private_media_reddit_communities
                        SET last_checked = CURRENT_TIMESTAMP WHERE id = ?
                    ''', (community_id,))
                    conn.commit()
                finally:
                    conn.close()
            return total_media

    # =========================================================================
    # HELPER METHODS
    # =========================================================================

    async def _run_gallery_dl(
        self, subreddit: str, temp_dir: str,
        lookback_days: Optional[int] = None,
        cookies_json: Optional[str] = None
    ) -> List[Path]:
        """
        Run gallery-dl to download media from a subreddit.

        The subprocess runs in a thread-pool executor so the event loop is
        not blocked; a persistent download archive makes re-runs cheap.

        Returns:
            List of downloaded file paths (metadata sidecars and the cookie
            file excluded). Empty list on timeout or failure.
        """
        # Use a persistent download archive so gallery-dl skips already-downloaded URLs
        archive_dir = os.path.join(
            os.path.dirname(self.db_path) if '/' in self.db_path else '/opt/media-downloader/data',
            'cache'
        )
        os.makedirs(archive_dir, exist_ok=True)
        archive_path = os.path.join(archive_dir, 'reddit_gallery_dl_archive.db')

        cmd = [
            self.gallery_dl_path,
            '--write-metadata',
            '--download-archive', archive_path,
            '-d', temp_dir,
        ]
        # Use REST API mode instead of OAuth API to avoid shared rate limits.
        # The default OAuth client-id is shared by all gallery-dl users globally,
        # causing 429 rate limits with many subreddits. REST mode uses www.reddit.com
        # directly with cookies for auth, bypassing OAuth rate limits entirely.
        cmd.extend(['-o', 'extractor.reddit.api=rest'])
        # Limit to 200 most recent posts per subreddit to avoid timeout from full history pagination
        cmd.extend(['--range', '1-200'])
        cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')

        if lookback_days:
            cutoff = (datetime.now() - timedelta(days=lookback_days)).strftime('%Y-%m-%d')
            # gallery-dl evaluates --filter as a Python expression per file.
            cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])

        # Write JSON cookies to a temp Netscape cookie file
        # Ensure temp_dir exists (can be cleaned by systemd-tmpfiles or race conditions)
        temp_cookie_file = None
        if cookies_json:
            os.makedirs(temp_dir, exist_ok=True)
            temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
            if self._write_netscape_cookie_file(cookies_json, temp_cookie_file):
                cmd.extend(['--cookies', temp_cookie_file])

        logger.info(f"Running gallery-dl for r/{subreddit}")
        logger.debug(f"Command: {' '.join(cmd)}")

        try:
            # NOTE(review): asyncio.get_event_loop() is deprecated inside
            # coroutines since 3.10 — asyncio.get_running_loop() is the
            # modern equivalent here.
            loop = asyncio.get_event_loop()
            result = await loop.run_in_executor(
                None,
                lambda: subprocess.run(
                    cmd, capture_output=True, text=True,
                    timeout=600  # 10 minute timeout
                )
            )
            # gallery-dl exit codes are bitflags: 1=some errors, 4=some skipped, 8=all skipped
            # Code 4 (skipped) and 5 (skipped+errors) are normal when files already exist
            if result.returncode not in (0, 1, 4, 5):
                logger.warning(f"gallery-dl returned code {result.returncode} for r/{subreddit}")
                if result.stderr:
                    logger.debug(f"gallery-dl stderr: {result.stderr[:500]}")
        except subprocess.TimeoutExpired:
            logger.error(f"gallery-dl timed out for r/{subreddit}")
            return []
        except Exception as e:
            logger.error(f"gallery-dl failed for r/{subreddit}: {e}")
            return []

        # Collect all non-JSON, non-cookie files from the temp directory
        downloaded = []
        # NOTE(review): skip_exts is currently unused — only dotfiles,
        # skip_names entries and '.json' files are actually filtered,
        # so plain '.txt' files would be collected.
        skip_exts = {'.json', '.txt'}
        skip_names = {'.cookies.txt'}
        for root, dirs, filenames in os.walk(temp_dir):
            for fname in filenames:
                if fname in skip_names or fname.startswith('.'):
                    continue
                if fname.endswith('.json'):
                    continue
                downloaded.append(Path(root) / fname)

        logger.info(f"Downloaded {len(downloaded)} files from r/{subreddit}")
        return downloaded

    def _group_files_by_post(
        self, files: List[Path], temp_dir: str
    ) -> Dict[str, Dict]:
        """
        Group downloaded files by their Reddit post ID using metadata JSON sidecars.

        Args:
            files: Media files collected by _run_gallery_dl.
            temp_dir: Download directory (currently unused here; kept for
                interface stability).

        Returns:
            Dict mapping reddit_post_id -> { 'files': [Path], 'title': str,
            'date': str, 'source_url': str }. Title/date/source come from the
            first file seen for each post.
        """
        posts: Dict[str, Dict] = {}
        for file_path in files:
            # Look for matching metadata JSON sidecar
            json_path = file_path.with_suffix(file_path.suffix + '.json')
            if not json_path.exists():
                # Try without double extension
                json_path = file_path.with_suffix('.json')
            metadata = {}
            if json_path.exists():
                try:
                    with open(json_path, 'r', encoding='utf-8') as f:
                        metadata = json.load(f)
                except (json.JSONDecodeError, Exception) as e:
                    # NOTE(review): the tuple is redundant — Exception alone
                    # already covers JSONDecodeError.
                    logger.debug(f"Failed to parse metadata for {file_path.name}: {e}")

            # Extract Reddit post ID - gallery-dl uses various field names
            reddit_post_id = None
            for key in ('id', 'reddit_id', 'parent_id'):
                if key in metadata:
                    reddit_post_id = str(metadata[key])
                    break
            if not reddit_post_id:
                # Use filename-based grouping as fallback
                # gallery-dl typically names files like: subreddit_postid_num.ext
                parts = file_path.stem.split('_')
                if len(parts) >= 2:
                    reddit_post_id = parts[-2] if len(parts) >= 3 else parts[-1]
                else:
                    reddit_post_id = file_path.stem

            # Extract post date (ensure ISO format in local time for frontend)
            # gallery-dl stores Reddit dates in UTC — convert to local time
            post_date = None
            if 'date' in metadata:
                date_val = metadata['date']
                if isinstance(date_val, str):
                    try:
                        from datetime import timezone as tz
                        for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
                            try:
                                utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=tz.utc)
                                post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
                                break
                            except ValueError:
                                continue
                        if not post_date:
                            post_date = date_val  # fallback to raw string
                    except Exception:
                        post_date = date_val
                elif isinstance(date_val, (int, float)):
                    try:
                        post_date = datetime.fromtimestamp(date_val).isoformat()
                    except (ValueError, OSError):
                        pass
            if not post_date and 'created_utc' in metadata:
                try:
                    post_date = datetime.fromtimestamp(metadata['created_utc']).isoformat()
                except (ValueError, OSError):
                    pass
            if not post_date:
                post_date = datetime.now().isoformat()

            # Extract title
            title = metadata.get('title', metadata.get('description', ''))

            # Build source URL
            subreddit = metadata.get('subreddit', '')
            source_url = f"https://www.reddit.com/r/{subreddit}/comments/{reddit_post_id}" if subreddit else ''

            if reddit_post_id not in posts:
                posts[reddit_post_id] = {
                    'files': [],
                    'title': title,
                    'date': post_date,
                    'source_url': source_url,
                }
            posts[reddit_post_id]['files'].append(file_path)
        return posts

    def _is_post_processed(self, community_id: int, reddit_post_id: str) -> bool:
        """Check if a Reddit post has already been processed (exists in history)."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT id FROM private_media_reddit_history WHERE community_id = ? AND reddit_post_id = ?",
                (community_id, reddit_post_id)
            )
            return cursor.fetchone() is not None
        finally:
            conn.close()

    def _ensure_reddit_tag(self, crypto) -> int:
        """
        Find or create a 'reddit' tag in private_gallery_tags.

        Tag names are stored encrypted, so every tag must be decrypted and
        compared case-insensitively; on a miss a new 'Reddit' tag is created
        with Reddit's orange-red brand color.
        """
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT id, encrypted_name FROM private_gallery_tags")
            for row in cursor.fetchall():
                try:
                    name = crypto.decrypt_field(row['encrypted_name'])
                    if name.lower() == 'reddit':
                        return row['id']
                except Exception:
                    # Undecryptable tag (e.g. different key) — skip it.
                    continue
            # Create the tag
            encrypted_name = crypto.encrypt_field('Reddit')
            cursor.execute('''
                INSERT INTO private_gallery_tags (encrypted_name, color)
                VALUES (?, '#ff4500')
            ''', (encrypted_name,))
            conn.commit()
            tag_id = cursor.lastrowid
            logger.info(f"Created 'Reddit' tag with ID {tag_id}")
            return tag_id
        finally:
            conn.close()

    def _import_post_to_gallery(
        self, post_data: Dict, person_id: int, reddit_tag_id: int, crypto,
        subreddit: str, community_id: int, reddit_post_id: str,
        community_idx: int = 0, total_communities: int = 1,
        running_media_total: int = 0
    ) -> int:
        """
        Import a Reddit post's media files into the private gallery.
Returns: Number of media files successfully imported """ files = post_data['files'] title = post_data['title'] post_date = post_data['date'] source_url = post_data.get('source_url', '') if not files: return 0 # Get storage path from config conn = self._get_connection() try: cursor = conn.cursor() cursor.execute("SELECT value FROM private_media_config WHERE key = 'storage_path'") row = cursor.fetchone() storage_path = Path(row['value']) if row else Path('/opt/immich/private') finally: conn.close() data_path = storage_path / 'data' thumbs_path = storage_path / 'thumbs' data_path.mkdir(parents=True, exist_ok=True) thumbs_path.mkdir(parents=True, exist_ok=True) # Create a post encrypted_desc = crypto.encrypt_field(title) if title else None encrypted_date = crypto.encrypt_field(post_date) if post_date else crypto.encrypt_field(datetime.now().isoformat()) now_iso = datetime.now().isoformat() conn = self._get_connection() try: cursor = conn.cursor() cursor.execute(''' INSERT INTO private_media_posts (person_id, encrypted_description, encrypted_media_date, created_at, updated_at) VALUES (?, ?, ?, ?, ?) 
''', (person_id, encrypted_desc, encrypted_date, now_iso, now_iso)) conn.commit() post_id = cursor.lastrowid finally: conn.close() media_count = 0 media_ids = [] total_files = len(files) for file_idx, file_path in enumerate(files, 1): try: if not file_path.exists() or file_path.stat().st_size == 0: continue # Update status: encrypting/importing file self._update_status( f'Encrypting file {file_idx}/{total_files} from r/{subreddit}', community_idx, total_communities, {'current_community': f'r/{subreddit}', 'phase': 'encrypting', 'current_file': file_idx, 'total_files': total_files, 'media_found': running_media_total + media_count} ) # Calculate file hash sha256 = hashlib.sha256() with open(file_path, 'rb') as f: for chunk in iter(lambda: f.read(65536), b''): sha256.update(chunk) file_hash = sha256.hexdigest() # Check for duplicates (scoped by person) conn = self._get_connection() try: cursor = conn.cursor() cursor.execute( 'SELECT id FROM private_media WHERE file_hash = ? AND person_id = ?', (file_hash, person_id) ) if cursor.fetchone(): logger.debug(f"Duplicate file skipped: {file_path.name}") continue finally: conn.close() # Get file info file_info = self._get_file_info(file_path) file_size = file_path.stat().st_size # Compute perceptual hash perceptual_hash = self._compute_perceptual_hash(file_path) # Generate storage ID storage_id = str(uuid.uuid4()) # Generate thumbnail temp_thumb = Path(tempfile.gettempdir()) / f"pg_thumb_{storage_id}.jpg" self._generate_thumbnail(file_path, temp_thumb, file_info['file_type']) # Encrypt the file encrypted_file = data_path / f"{storage_id}.enc" if not crypto.encrypt_file(file_path, encrypted_file): logger.error(f"Encryption failed for {file_path.name}") continue # Encrypt thumbnail if temp_thumb.exists(): encrypted_thumb = thumbs_path / f"{storage_id}.enc" crypto.encrypt_file(temp_thumb, encrypted_thumb) try: temp_thumb.unlink() except Exception: pass # Insert media record encrypted_filename = 
crypto.encrypt_field(file_path.name) encrypted_source = crypto.encrypt_field(source_url) conn = self._get_connection() try: cursor = conn.cursor() cursor.execute(''' INSERT INTO private_media ( post_id, storage_id, encrypted_filename, encrypted_description, file_hash, file_size, file_type, mime_type, width, height, duration, person_id, encrypted_media_date, source_type, encrypted_source_path, perceptual_hash, created_at ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ''', ( post_id, storage_id, encrypted_filename, None, file_hash, file_size, file_info['file_type'], file_info['mime_type'], file_info['width'], file_info['height'], file_info['duration'], person_id, encrypted_date, 'reddit', encrypted_source, perceptual_hash, now_iso )) media_id = cursor.lastrowid media_ids.append(media_id) conn.commit() finally: conn.close() media_count += 1 except Exception as e: logger.error(f"Failed to import {file_path.name}: {e}") import traceback logger.debug(f"Traceback: {traceback.format_exc()}") # Apply reddit tag to the post if media_count > 0: conn = self._get_connection() try: cursor = conn.cursor() cursor.execute(''' INSERT OR IGNORE INTO private_media_post_tags (post_id, tag_id) VALUES (?, ?) ''', (post_id, reddit_tag_id)) conn.commit() finally: conn.close() else: # Delete the empty post conn = self._get_connection() try: cursor = conn.cursor() cursor.execute("DELETE FROM private_media_posts WHERE id = ?", (post_id,)) conn.commit() finally: conn.close() # Record in history only if we successfully imported media if media_count > 0: conn = self._get_connection() try: cursor = conn.cursor() cursor.execute(''' INSERT OR IGNORE INTO private_media_reddit_history (community_id, reddit_post_id, media_count) VALUES (?, ?, ?) 
''', (community_id, reddit_post_id, media_count)) conn.commit() finally: conn.close() if media_count > 0: logger.info(f"Imported {media_count} files from r/{subreddit} post {reddit_post_id}") return media_count def _get_file_info(self, file_path: Path) -> Dict[str, Any]: """Get file type, mime type, and dimensions.""" ext = file_path.suffix.lower().lstrip('.') mime_type, _ = mimetypes.guess_type(str(file_path)) if not mime_type: mime_type = 'application/octet-stream' image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'} video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'} if ext in image_exts: file_type = 'image' elif ext in video_exts: file_type = 'video' else: file_type = 'other' info = { 'file_type': file_type, 'mime_type': mime_type, 'width': None, 'height': None, 'duration': None } if file_type == 'image': try: from PIL import Image with Image.open(file_path) as img: info['width'], info['height'] = img.size except Exception: pass if file_type == 'video': try: result = subprocess.run([ 'ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_streams', '-show_format', str(file_path) ], capture_output=True, text=True, timeout=30) if result.returncode == 0: data = json.loads(result.stdout) for stream in data.get('streams', []): if stream.get('codec_type') == 'video': info['width'] = stream.get('width') info['height'] = stream.get('height') break if 'format' in data: duration = data['format'].get('duration') if duration: info['duration'] = float(duration) except Exception: pass return info def _run_dedup_for_persons(self, person_ids: Set[int], crypto) -> int: """ Run perceptual dedup for the given person IDs, auto-deleting duplicates. Uses the same algorithm as the dashboard's dedup scanner. Returns total number of duplicates deleted. 
""" if not person_ids: return 0 # Read config for threshold and storage path conn = self._get_connection() try: cursor = conn.cursor() cursor.execute("SELECT key, value FROM private_media_config WHERE key IN ('duplicate_auto_select_distance', 'storage_path')") config = {row['key']: row['value'] for row in cursor.fetchall()} finally: conn.close() threshold = int(config.get('duplicate_auto_select_distance', '2')) storage_path = Path(config.get('storage_path', '/opt/immich/private')) data_path = storage_path / 'data' thumbs_path = storage_path / 'thumbs' total_deleted = 0 for person_id in person_ids: try: total_deleted += self._dedup_person(person_id, crypto, threshold, data_path, thumbs_path, storage_path) except Exception as e: logger.error(f"Dedup failed for person {person_id}: {e}") import traceback logger.debug(f"Dedup traceback: {traceback.format_exc()}") return total_deleted def _dedup_person(self, person_id: int, crypto, threshold: int, data_path: Path, thumbs_path: Path, storage_path: Path) -> int: """Run dedup for a single person. Returns number of duplicates deleted.""" # Fetch all media with perceptual hashes for this person conn = self._get_connection() try: cursor = conn.cursor() cursor.execute(''' SELECT id, post_id, storage_id, file_type, perceptual_hash, width, height FROM private_media WHERE post_id IN (SELECT id FROM private_media_posts WHERE person_id = ?) 
AND file_type IN ('image', 'video') AND perceptual_hash IS NOT NULL AND perceptual_hash != '' ''', (person_id,)) all_media = [dict(row) for row in cursor.fetchall()] finally: conn.close() if len(all_media) < 2: return 0 # Pre-compute integer values for fast XOR-based hamming distance hash_ints = {} for m in all_media: try: hash_ints[m['id']] = int(m['perceptual_hash'], 16) except (ValueError, TypeError): pass # Union-Find for grouping duplicates parent = {m['id']: m['id'] for m in all_media} def find(x): while parent[x] != x: parent[x] = parent[parent[x]] x = parent[x] return x def union(x, y): px, py = find(x), find(y) if px != py: parent[px] = py # Compare all pairs for i in range(len(all_media)): id_i = all_media[i]['id'] if id_i not in hash_ints: continue hi = hash_ints[id_i] for j in range(i + 1, len(all_media)): id_j = all_media[j]['id'] if id_j not in hash_ints: continue dist = bin(hi ^ hash_ints[id_j]).count('1') if dist <= threshold: union(id_i, id_j) # Group by root groups: Dict[int, list] = {} for m in all_media: root = find(m['id']) if root not in groups: groups[root] = [] groups[root].append(m) # Filter to actual duplicate groups (size > 1) duplicate_groups = [g for g in groups.values() if len(g) > 1] if not duplicate_groups: return 0 # In each group: keep highest resolution, mark rest for deletion to_delete = [] for group in duplicate_groups: # Sort by resolution (width * height) descending, keep first group.sort(key=lambda m: (m['width'] or 0) * (m['height'] or 0), reverse=True) to_delete.extend(group[1:]) # All except the highest resolution if not to_delete: return 0 # Delete duplicate files and DB records deleted = 0 conn = self._get_connection() try: cursor = conn.cursor() for media in to_delete: storage_id = media['storage_id'] # Delete encrypted data file data_file = data_path / f"{storage_id}.enc" if data_file.exists(): data_file.unlink() # Delete thumbnail file thumb_file = thumbs_path / f"{storage_id}.enc" if thumb_file.exists(): 
thumb_file.unlink() # Delete DB record cursor.execute('DELETE FROM private_media WHERE id = ?', (media['id'],)) deleted += 1 conn.commit() finally: conn.close() # Clean up empty reddit-tagged posts self._cleanup_empty_reddit_posts_after_dedup(crypto, storage_path) logger.info(f"Dedup: deleted {deleted} duplicates across {len(duplicate_groups)} groups for person {person_id}") return deleted def _cleanup_empty_reddit_posts_after_dedup(self, crypto, storage_path: Path): """Delete reddit-tagged posts that have no remaining media after dedup.""" try: conn = self._get_connection() try: # Find the reddit tag ID cursor = conn.cursor() cursor.execute("SELECT id, encrypted_name FROM private_gallery_tags") reddit_tag_id = None for row in cursor.fetchall(): try: name = crypto.decrypt_field(row['encrypted_name']) if name and name.lower() == 'reddit': reddit_tag_id = row['id'] break except Exception: continue finally: conn.close() if reddit_tag_id is None: return # Find empty reddit-tagged posts conn = self._get_connection() try: cursor = conn.cursor() cursor.execute(''' SELECT p.id FROM private_media_posts p JOIN private_media_post_tags pt ON pt.post_id = p.id WHERE pt.tag_id = ? 
AND NOT EXISTS (SELECT 1 FROM private_media m WHERE m.post_id = p.id) ''', (reddit_tag_id,)) empty_posts = [row['id'] for row in cursor.fetchall()] finally: conn.close() if not empty_posts: return # Delete empty posts conn = self._get_connection() try: cursor = conn.cursor() for post_id in empty_posts: cursor.execute('DELETE FROM private_media_post_tags WHERE post_id = ?', (post_id,)) cursor.execute('DELETE FROM private_media_posts WHERE id = ?', (post_id,)) conn.commit() finally: conn.close() logger.info(f"Dedup cleanup: removed {len(empty_posts)} empty reddit-tagged posts") except Exception as e: logger.error(f"Failed to cleanup empty reddit posts after dedup: {e}") def _compute_perceptual_hash(self, file_path: Path) -> Optional[str]: """Calculate perceptual hash for an image or video file.""" try: import imagehash from PIL import Image except ImportError: return None ext = file_path.suffix.lower().lstrip('.') image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'} video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'} pil_image = None frame = None frame_rgb = None try: if ext in video_exts: try: import cv2 except ImportError: return None cap = cv2.VideoCapture(str(file_path)) if not cap.isOpened(): return None total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) cap.set(cv2.CAP_PROP_POS_FRAMES, int(total_frames * 0.5)) ret, frame = cap.read() cap.release() if not ret or frame is None: return None frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) pil_image = Image.fromarray(frame_rgb) elif ext in image_exts: pil_image = Image.open(file_path) else: return None phash = str(imagehash.dhash(pil_image, hash_size=16)) return phash except Exception: return None finally: if pil_image is not None: pil_image.close() del pil_image if frame_rgb is not None: del frame_rgb if frame is not None: del frame def _generate_thumbnail(self, file_path: Path, output_path: Path, file_type: str) -> bool: """Generate a thumbnail for an 
image or video.""" try: output_path.parent.mkdir(parents=True, exist_ok=True) if file_type == 'image': from PIL import Image, ImageOps with Image.open(file_path) as img: img = ImageOps.exif_transpose(img) img.thumbnail((400, 400)) if img.mode in ('RGBA', 'P'): img = img.convert('RGB') img.save(output_path, 'JPEG', quality=85) return True elif file_type == 'video': result = subprocess.run([ 'ffmpeg', '-y', '-i', str(file_path), '-ss', '00:00:01', '-vframes', '1', '-vf', 'scale=400:-1:force_original_aspect_ratio=decrease', str(output_path) ], capture_output=True, timeout=30) return result.returncode == 0 and output_path.exists() except Exception as e: logger.error(f"Thumbnail generation failed: {e}") return False