Files
media-downloader/modules/reddit_community_monitor.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

1613 lines
63 KiB
Python

#!/usr/bin/env python3
"""
Reddit Community Monitor Module
Monitors specified Reddit communities (subreddits) for new posts,
downloads all media (including imgur/redgifs attachments via gallery-dl),
and automatically creates private gallery posts for mapped persons.
Design:
- Each community (subreddit) is mapped to a person in the private gallery
- Uses gallery-dl for downloading with full Reddit/imgur/redgifs support
- All new posts are tagged with a "reddit" tag
- Configurable check intervals and lookback periods
- Supports cookie authentication for age-gated content
"""
import asyncio
import hashlib
import json
import mimetypes
import os
import re
import sqlite3
import subprocess
import tempfile
import uuid
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any, Set
from modules.universal_logger import get_logger
logger = get_logger('RedditMonitor')
# Key file path for background crypto access
REDDIT_MONITOR_KEY_FILE = '/opt/immich/private/.reddit_monitor_key'
class RedditCommunityMonitor:
"""
Background monitor for Reddit communities.
Downloads media from subreddits and imports to private gallery.
"""
def __init__(self, db_path: str, activity_manager=None):
    """
    Args:
        db_path: Path to the SQLite database file used by all methods.
        activity_manager: Optional progress reporter with
            start/update/stop_background_task methods; may be None.
    """
    self.db_path = db_path
    self.activity_manager = activity_manager
    # Fixed path to the gallery-dl executable inside the service venv
    self.gallery_dl_path = '/opt/media-downloader/venv/bin/gallery-dl'
def _get_connection(self) -> sqlite3.Connection:
"""Get a database connection with row factory."""
conn = sqlite3.connect(self.db_path)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA foreign_keys = ON")
return conn
# =========================================================================
# SETTINGS METHODS
# =========================================================================
def get_settings(self) -> Dict:
"""Get Reddit monitor settings from private_media_config."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute(
"SELECT key, value FROM private_media_config WHERE key LIKE 'reddit_monitor_%'"
)
rows = cursor.fetchall()
settings = {}
for row in rows:
key = row['key'].replace('reddit_monitor_', '')
value = row['value']
if value == 'true':
value = True
elif value == 'false':
value = False
elif value and value.isdigit():
value = int(value)
settings[key] = value
# Defaults
return {
'enabled': settings.get('enabled', False),
'check_interval_hours': settings.get('check_interval_hours', 4),
'lookback_days': settings.get('lookback_days', 3),
'last_checked': settings.get('last_checked', None),
}
finally:
conn.close()
def update_settings(self, **kwargs) -> bool:
"""Update Reddit monitor settings."""
conn = self._get_connection()
try:
cursor = conn.cursor()
for key, value in kwargs.items():
db_key = f'reddit_monitor_{key}'
if isinstance(value, bool):
db_value = 'true' if value else 'false'
else:
db_value = str(value) if value is not None else ''
cursor.execute('''
INSERT OR REPLACE INTO private_media_config (key, value, updated_at)
VALUES (?, ?, CURRENT_TIMESTAMP)
''', (db_key, db_value))
conn.commit()
return True
except Exception as e:
logger.error(f"Failed to update settings: {e}")
return False
finally:
conn.close()
# =========================================================================
# COOKIE MANAGEMENT
# =========================================================================
def save_cookies(self, crypto, cookies_json: str) -> bool:
    """Encrypt and persist the raw cookies JSON in private_media_config.

    Returns True on success, False if encryption or the DB write fails.
    """
    conn = self._get_connection()
    try:
        ciphertext = crypto.encrypt_field(cookies_json)
        conn.execute('''
            INSERT OR REPLACE INTO private_media_config (key, value, updated_at)
            VALUES ('reddit_monitor_encrypted_cookies', ?, CURRENT_TIMESTAMP)
        ''', (ciphertext,))
        conn.commit()
        logger.info("Reddit cookies saved (encrypted)")
        return True
    except Exception as e:
        logger.error(f"Failed to save cookies: {e}")
        return False
    finally:
        conn.close()
def has_cookies(self, crypto) -> bool:
"""Check if encrypted cookies exist and are valid."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute(
"SELECT value FROM private_media_config WHERE key = 'reddit_monitor_encrypted_cookies'"
)
row = cursor.fetchone()
if not row or not row['value']:
return False
# Try to decrypt to verify they're valid
try:
decrypted = crypto.decrypt_field(row['value'])
return bool(decrypted and decrypted.strip())
except Exception:
return False
finally:
conn.close()
def delete_cookies(self) -> bool:
    """Remove the stored (encrypted) Reddit cookies, if any.

    Returns True on success even when no row existed; False on DB error.
    """
    conn = self._get_connection()
    try:
        conn.execute(
            "DELETE FROM private_media_config WHERE key = 'reddit_monitor_encrypted_cookies'"
        )
        conn.commit()
        logger.info("Reddit cookies deleted")
        return True
    except Exception as e:
        logger.error(f"Failed to delete cookies: {e}")
        return False
    finally:
        conn.close()
def _get_cookies_json(self, crypto) -> Optional[str]:
"""Load and decrypt stored cookies JSON."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute(
"SELECT value FROM private_media_config WHERE key = 'reddit_monitor_encrypted_cookies'"
)
row = cursor.fetchone()
if not row or not row['value']:
return None
try:
return crypto.decrypt_field(row['value'])
except Exception as e:
logger.error(f"Failed to decrypt cookies: {e}")
return None
finally:
conn.close()
def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
"""Convert JSON cookies array to Netscape cookie file format for gallery-dl."""
try:
cookies = json.loads(cookies_json)
if not isinstance(cookies, list):
logger.error("Cookies is not a JSON array")
return False
with open(output_path, 'w') as f:
f.write("# Netscape HTTP Cookie File\n")
f.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")
for cookie in cookies:
domain = cookie.get('domain', '')
# Ensure domain starts with . for domain-wide cookies
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
path = cookie.get('path', '/')
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
# Convert expiry - use 0 for session cookies
expires = cookie.get('expirationDate', cookie.get('expiry', cookie.get('expires', 0)))
if expires is None:
expires = 0
expires = str(int(float(expires)))
name = cookie.get('name', '')
value = cookie.get('value', '')
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n")
return True
except Exception as e:
logger.error(f"Failed to write Netscape cookie file: {e}")
return False
# =========================================================================
# COMMUNITY MAPPING METHODS
# =========================================================================
def get_all_communities(self) -> List[Dict]:
"""Get all community mappings with person info and live media count."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute('''
SELECT c.*, p.encrypted_name as person_encrypted_name,
r.encrypted_name as relationship_encrypted_name,
r.color as relationship_color,
(SELECT COUNT(*) FROM private_media m
WHERE m.person_id = c.person_id AND m.source_type = 'reddit') as actual_media_count
FROM private_media_reddit_communities c
LEFT JOIN private_media_persons p ON c.person_id = p.id
LEFT JOIN private_media_relationships r ON p.relationship_id = r.id
ORDER BY c.subreddit_name
''')
communities = []
for row in cursor.fetchall():
d = dict(row)
d['total_media_found'] = d.pop('actual_media_count', 0)
communities.append(d)
return communities
finally:
conn.close()
def get_community(self, community_id: int) -> Optional[Dict]:
"""Get a single community mapping."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute('''
SELECT c.*, p.encrypted_name as person_encrypted_name,
r.encrypted_name as relationship_encrypted_name,
r.color as relationship_color
FROM private_media_reddit_communities c
LEFT JOIN private_media_persons p ON c.person_id = p.id
LEFT JOIN private_media_relationships r ON p.relationship_id = r.id
WHERE c.id = ?
''', (community_id,))
row = cursor.fetchone()
return dict(row) if row else None
finally:
conn.close()
def add_community(self, subreddit_name: str, person_id: int) -> int:
"""Add a new community mapping. Returns the new ID."""
# Strip r/ prefix if present
subreddit_name = re.sub(r'^r/', '', subreddit_name.strip())
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute('''
INSERT INTO private_media_reddit_communities (subreddit_name, person_id)
VALUES (?, ?)
''', (subreddit_name, person_id))
conn.commit()
return cursor.lastrowid
finally:
conn.close()
def update_community(self, community_id: int, **kwargs) -> bool:
"""Update a community mapping."""
conn = self._get_connection()
try:
cursor = conn.cursor()
updates = []
values = []
for key, value in kwargs.items():
if key == 'subreddit_name' and value is not None:
value = re.sub(r'^r/', '', value.strip())
if key == 'enabled':
value = 1 if value else 0
updates.append(f'{key} = ?')
values.append(value)
if not updates:
return False
updates.append("updated_at = CURRENT_TIMESTAMP")
values.append(community_id)
cursor.execute(
f"UPDATE private_media_reddit_communities SET {', '.join(updates)} WHERE id = ?",
values
)
conn.commit()
return cursor.rowcount > 0
except Exception as e:
logger.error(f"Failed to update community {community_id}: {e}")
return False
finally:
conn.close()
def delete_community(self, community_id: int) -> bool:
"""Delete a community mapping."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute(
"DELETE FROM private_media_reddit_communities WHERE id = ?",
(community_id,)
)
conn.commit()
return cursor.rowcount > 0
finally:
conn.close()
def get_communities_for_person(self, person_id: int) -> List[Dict]:
"""Get all communities mapped to a person."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute(
"SELECT * FROM private_media_reddit_communities WHERE person_id = ?",
(person_id,)
)
return [dict(row) for row in cursor.fetchall()]
finally:
conn.close()
def get_history(self, community_id: int) -> List[Dict]:
"""Get download history for a community."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute('''
SELECT * FROM private_media_reddit_history
WHERE community_id = ?
ORDER BY processed_at DESC
LIMIT 100
''', (community_id,))
return [dict(row) for row in cursor.fetchall()]
finally:
conn.close()
# =========================================================================
# CRYPTO ACCESS
# =========================================================================
def _get_crypto(self):
    """Load the gallery field-encryption helper from the on-disk key file.

    Returns:
        The crypto object, or None when the key file is missing/invalid
        (callers then skip all background work for this run).
    """
    # Imported lazily so this module can load without the crypto package
    from modules.private_gallery_crypto import load_key_from_file
    crypto = load_key_from_file(REDDIT_MONITOR_KEY_FILE)
    if crypto is None:
        logger.warning("Reddit monitor crypto unavailable - key file missing or invalid")
    return crypto
# =========================================================================
# DOWNLOAD METHODS
# =========================================================================
async def check_all_now(self, from_scheduler: bool = False) -> int:
    """
    Check all enabled communities for new posts.

    Flow: load settings/crypto/cookies, iterate enabled communities with
    crash-recovery checkpointing, download + import each, then run a
    duplicate sweep for every person that received new media.

    Args:
        from_scheduler: Whether this was triggered by the scheduler.
            Scheduler runs honor the 'enabled' setting; manual runs do not.
    Returns:
        Total count of new media items imported
    """
    settings = self.get_settings()
    # Only scheduler-triggered runs respect the enabled flag
    if from_scheduler and not settings.get('enabled'):
        logger.debug("Reddit monitor is disabled")
        return 0
    crypto = self._get_crypto()
    if crypto is None:
        logger.warning("Skipping Reddit check: encryption key not available")
        return 0
    # Get enabled communities
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT * FROM private_media_reddit_communities WHERE enabled = 1"
        )
        communities = [dict(row) for row in cursor.fetchall()]
    finally:
        conn.close()
    if not communities:
        logger.debug("No enabled Reddit communities to check")
        return 0
    # Start background task tracking
    if self.activity_manager:
        self.activity_manager.start_background_task(
            'reddit_monitor',
            'reddit_community_monitor',
            'Reddit Community Monitor',
            'Running',
            {'total_communities': len(communities), 'media_found': 0}
        )
    total_media = 0
    affected_person_ids: Set[int] = set()  # persons to dedup afterwards
    lookback_days = settings.get('lookback_days', 3)
    # Load cookies from encrypted storage
    cookies_json = self._get_cookies_json(crypto)
    # Crash recovery checkpoint: if a previous run died mid-sweep, the
    # checkpoint lets us skip communities already handled by that run
    from modules.task_checkpoint import TaskCheckpoint
    checkpoint = TaskCheckpoint('reddit_monitor', 'background')
    checkpoint.start(total_items=len(communities))
    if checkpoint.is_recovering():
        logger.info("Reddit monitor: recovering — skipping already-checked communities")
    try:
        for idx, community in enumerate(communities):
            subreddit = community['subreddit_name']
            person_id = community['person_id']
            community_id = community['id']
            if checkpoint.is_completed(str(community_id)):
                continue
            checkpoint.set_current(str(community_id))
            try:
                # Use longer lookback for communities that have never imported anything
                effective_lookback = lookback_days
                if community.get('total_media_found', 0) == 0 and not community.get('last_checked'):
                    effective_lookback = 30
                    logger.info(f"First check for r/{subreddit}, using 30-day lookback")
                media_count = await self._check_community(
                    community_id, subreddit, person_id,
                    effective_lookback, cookies_json, crypto,
                    community_idx=idx, total_communities=len(communities),
                    running_media_total=total_media
                )
                total_media += media_count
                if media_count > 0:
                    affected_person_ids.add(person_id)
                    if self.activity_manager:
                        self.activity_manager.update_background_task(
                            'reddit_monitor',
                            f'Found {media_count} new media in r/{subreddit}',
                            idx + 1, len(communities),
                            {'total_communities': len(communities), 'media_found': total_media, 'current_community': f'r/{subreddit}', 'phase': 'found', 'last_found': media_count}
                        )
            except Exception as e:
                # A failure in one community must not abort the whole sweep
                logger.error(f"Error checking r/{subreddit}: {e}")
                import traceback
                logger.debug(f"Traceback: {traceback.format_exc()}")
            # Marked even on error so recovery does not retry a bad community
            checkpoint.mark_completed(str(community_id))
        # Checkpoint complete
        checkpoint.finish()
        # Update last_checked timestamp
        self.update_settings(last_checked=datetime.now().isoformat())
        # Auto-dedup for persons that received new media
        if affected_person_ids:
            if self.activity_manager:
                self.activity_manager.update_background_task(
                    'reddit_monitor', 'Deduplicating...',
                    len(communities), len(communities),
                    {'phase': 'deduplicating', 'media_found': total_media}
                )
            dedup_deleted = self._run_dedup_for_persons(affected_person_ids, crypto)
            if dedup_deleted > 0:
                logger.info(f"Reddit monitor: auto-dedup removed {dedup_deleted} duplicates")
    finally:
        # Always clear the background-task entry, even on error
        if self.activity_manager:
            self.activity_manager.stop_background_task('reddit_monitor')
    if total_media > 0:
        logger.info(f"Reddit monitor: imported {total_media} new media items")
    else:
        logger.debug("Reddit monitor: no new media found")
    return total_media
async def download_full_community(self, community_id: int) -> int:
    """
    Download all available media from a community (no date filter).

    Unlike the scheduled check, this passes lookback_days=None so no date
    cutoff is applied (the downloader's own post-range cap still applies).

    Args:
        community_id: ID of the community to download
    Returns:
        Count of new media items imported
    """
    community = self.get_community(community_id)
    if not community:
        logger.error(f"Community {community_id} not found")
        return 0
    crypto = self._get_crypto()
    if crypto is None:
        logger.warning("Cannot download: encryption key not available")
        return 0
    cookies_json = self._get_cookies_json(crypto)
    subreddit = community['subreddit_name']
    if self.activity_manager:
        self.activity_manager.start_background_task(
            'reddit_monitor',
            'reddit_community_monitor',
            'Reddit Community Monitor',
            'Running',
            {'total_communities': 1, 'media_found': 0, 'current_community': f'r/{subreddit}', 'full_download': True}
        )
    try:
        # lookback_days=None disables the date filter entirely
        media_count = await self._check_community(
            community_id, subreddit,
            community['person_id'], None, cookies_json, crypto,
            community_idx=0, total_communities=1, running_media_total=0
        )
        return media_count
    finally:
        # Always clear the background-task entry, even on error
        if self.activity_manager:
            self.activity_manager.stop_background_task('reddit_monitor')
async def check_single_community(self, community_id: int) -> int:
    """
    Check a single community for new posts (using lookback_days filter).

    First-time communities (no media, never checked) get a 30-day
    lookback instead of the configured one.

    Args:
        community_id: ID of the community to check
    Returns:
        Count of new media items imported
    """
    community = self.get_community(community_id)
    if not community:
        logger.error(f"Community {community_id} not found")
        return 0
    crypto = self._get_crypto()
    if crypto is None:
        logger.warning("Cannot check: encryption key not available")
        return 0
    settings = self.get_settings()
    lookback_days = settings.get('lookback_days', 3)
    cookies_json = self._get_cookies_json(crypto)
    subreddit = community['subreddit_name']
    if self.activity_manager:
        self.activity_manager.start_background_task(
            'reddit_monitor',
            'reddit_community_monitor',
            'Reddit Community Monitor',
            'Running',
            {'total_communities': 1, 'media_found': 0, 'current_community': f'r/{subreddit}'}
        )
    try:
        # Use longer lookback for communities that have never imported anything
        effective_lookback = lookback_days
        if community.get('total_media_found', 0) == 0 and not community.get('last_checked'):
            effective_lookback = 30
            logger.info(f"First check for r/{subreddit}, using 30-day lookback")
        media_count = await self._check_community(
            community_id, subreddit,
            community['person_id'], effective_lookback, cookies_json, crypto,
            community_idx=0, total_communities=1, running_media_total=0
        )
        return media_count
    finally:
        # Always clear the background-task entry, even on error
        if self.activity_manager:
            self.activity_manager.stop_background_task('reddit_monitor')
async def check_communities_by_person(self, person_id: int) -> int:
    """
    Check all enabled communities for a given person.

    Same per-community flow as check_all_now, but restricted to one
    person's communities and without crash-recovery checkpointing.

    Args:
        person_id: ID of the person whose communities to check
    Returns:
        Total count of new media items imported
    """
    crypto = self._get_crypto()
    if crypto is None:
        logger.warning("Cannot check: encryption key not available")
        return 0
    # Get enabled communities for this person
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT * FROM private_media_reddit_communities WHERE person_id = ? AND enabled = 1",
            (person_id,)
        )
        communities = [dict(row) for row in cursor.fetchall()]
    finally:
        conn.close()
    if not communities:
        logger.debug(f"No enabled communities for person {person_id}")
        return 0
    settings = self.get_settings()
    lookback_days = settings.get('lookback_days', 3)
    cookies_json = self._get_cookies_json(crypto)
    if self.activity_manager:
        self.activity_manager.start_background_task(
            'reddit_monitor',
            'reddit_community_monitor',
            'Reddit Community Monitor',
            'Running',
            {'total_communities': len(communities), 'media_found': 0}
        )
    total_media = 0
    try:
        for idx, community in enumerate(communities):
            subreddit = community['subreddit_name']
            community_id = community['id']
            try:
                # Use longer lookback for communities that have never imported anything
                effective_lookback = lookback_days
                if community.get('total_media_found', 0) == 0 and not community.get('last_checked'):
                    effective_lookback = 30
                    logger.info(f"First check for r/{subreddit}, using 30-day lookback")
                media_count = await self._check_community(
                    community_id, subreddit, community['person_id'],
                    effective_lookback, cookies_json, crypto,
                    community_idx=idx, total_communities=len(communities),
                    running_media_total=total_media
                )
                total_media += media_count
                if media_count > 0 and self.activity_manager:
                    self.activity_manager.update_background_task(
                        'reddit_monitor',
                        f'Found {media_count} new media in r/{subreddit}',
                        idx + 1, len(communities),
                        {'total_communities': len(communities), 'media_found': total_media,
                         'current_community': f'r/{subreddit}', 'phase': 'found', 'last_found': media_count}
                    )
            except Exception as e:
                # A failure in one community must not abort the rest
                logger.error(f"Error checking r/{subreddit}: {e}")
                import traceback
                logger.debug(f"Traceback: {traceback.format_exc()}")
        # Auto-dedup for this person if new media was imported
        if total_media > 0:
            if self.activity_manager:
                self.activity_manager.update_background_task(
                    'reddit_monitor', 'Deduplicating...',
                    len(communities), len(communities),
                    {'phase': 'deduplicating', 'media_found': total_media}
                )
            dedup_deleted = self._run_dedup_for_persons({person_id}, crypto)
            if dedup_deleted > 0:
                logger.info(f"Reddit person check: auto-dedup removed {dedup_deleted} duplicates")
    finally:
        # Always clear the background-task entry, even on error
        if self.activity_manager:
            self.activity_manager.stop_background_task('reddit_monitor')
    if total_media > 0:
        logger.info(f"Reddit person check: imported {total_media} new media items")
    return total_media
def _update_status(self, status_text: str, community_idx: int, total_communities: int, extra: Dict = None):
"""Helper to update background task status with detailed info."""
if not self.activity_manager:
return
data = {
'total_communities': total_communities,
'media_found': extra.get('media_found', 0) if extra else 0,
}
if extra:
data.update(extra)
self.activity_manager.update_background_task(
'reddit_monitor', status_text,
community_idx, total_communities, data
)
async def _check_community(
    self, community_id: int, subreddit: str, person_id: int,
    lookback_days: Optional[int], cookies_json: Optional[str], crypto,
    community_idx: int = 0, total_communities: int = 1, running_media_total: int = 0
) -> int:
    """Check a single community and import new media.

    Pipeline: download via gallery-dl into a temp dir -> group files by
    Reddit post -> skip posts already in history -> import the rest as
    gallery posts -> update the community row's counters/last_checked.

    Args:
        community_id: Row id of the community mapping.
        subreddit: Subreddit name without the 'r/' prefix.
        person_id: Gallery person the imported media belongs to.
        lookback_days: Date filter for the download; None means no filter.
        cookies_json: Optional cookies JSON for authenticated downloads.
        crypto: Field-encryption helper for gallery data.
        community_idx / total_communities / running_media_total:
            progress bookkeeping forwarded to the activity manager.

    Returns:
        Number of media files imported for this community.
    """
    # Temp dir is removed automatically, including all downloaded files
    with tempfile.TemporaryDirectory(prefix=f'reddit_{subreddit}_') as temp_dir:
        # Phase: Downloading
        self._update_status(
            f'Downloading from r/{subreddit}...', community_idx, total_communities,
            {'current_community': f'r/{subreddit}', 'phase': 'downloading',
             'media_found': running_media_total}
        )
        # Run gallery-dl
        files = await self._run_gallery_dl(
            subreddit, temp_dir, lookback_days, cookies_json
        )
        if not files:
            logger.debug(f"No files downloaded from r/{subreddit}")
            self._update_status(
                f'No new files in r/{subreddit}', community_idx, total_communities,
                {'current_community': f'r/{subreddit}', 'phase': 'done',
                 'media_found': running_media_total}
            )
            # Still update last_checked so we know we tried
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE private_media_reddit_communities
                    SET last_checked = CURRENT_TIMESTAMP
                    WHERE id = ?
                ''', (community_id,))
                conn.commit()
            finally:
                conn.close()
            return 0
        # Phase: Processing
        self._update_status(
            f'Downloaded {len(files)} files from r/{subreddit}, grouping by post...',
            community_idx, total_communities,
            {'current_community': f'r/{subreddit}', 'phase': 'processing',
             'files_downloaded': len(files), 'media_found': running_media_total}
        )
        # Group files by Reddit post ID
        posts = self._group_files_by_post(files, temp_dir)
        # Get or create "reddit" tag
        reddit_tag_id = self._ensure_reddit_tag(crypto)
        # Filter out already-processed posts
        new_posts = {}
        for reddit_post_id, post_data in posts.items():
            if not self._is_post_processed(community_id, reddit_post_id):
                new_posts[reddit_post_id] = post_data
        if not new_posts:
            self._update_status(
                f'No new posts in r/{subreddit} ({len(posts)} already imported)',
                community_idx, total_communities,
                {'current_community': f'r/{subreddit}', 'phase': 'done',
                 'files_downloaded': len(files), 'media_found': running_media_total}
            )
            # Still need to update total_media = 0 path below
            posts_to_import = {}
        else:
            posts_to_import = new_posts
        # Import each post
        total_media = 0
        for post_num, (reddit_post_id, post_data) in enumerate(posts_to_import.items(), 1):
            num_files = len(post_data['files'])
            self._update_status(
                f'Importing post {post_num}/{len(posts_to_import)} from r/{subreddit} ({num_files} files)',
                community_idx, total_communities,
                {'current_community': f'r/{subreddit}', 'phase': 'importing',
                 'files_downloaded': len(files), 'posts_imported': post_num,
                 'posts_total': len(posts_to_import),
                 'media_found': running_media_total + total_media}
            )
            media_count = self._import_post_to_gallery(
                post_data, person_id, reddit_tag_id, crypto,
                subreddit, community_id, reddit_post_id,
                community_idx, total_communities, running_media_total + total_media
            )
            total_media += media_count
        # Update community stats
        if total_media > 0:
            # New media: bump the cached counter and both timestamps
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE private_media_reddit_communities
                    SET total_media_found = total_media_found + ?,
                        last_checked = CURRENT_TIMESTAMP,
                        updated_at = CURRENT_TIMESTAMP
                    WHERE id = ?
                ''', (total_media, community_id))
                conn.commit()
            finally:
                conn.close()
        else:
            # Nothing imported: record the check time only
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE private_media_reddit_communities
                    SET last_checked = CURRENT_TIMESTAMP
                    WHERE id = ?
                ''', (community_id,))
                conn.commit()
            finally:
                conn.close()
        return total_media
# =========================================================================
# HELPER METHODS
# =========================================================================
async def _run_gallery_dl(
    self, subreddit: str, temp_dir: str,
    lookback_days: Optional[int] = None, cookies_json: Optional[str] = None
) -> List[Path]:
    """
    Run gallery-dl to download media from a subreddit.

    Args:
        subreddit: Subreddit name without the 'r/' prefix.
        temp_dir: Directory to download into (also holds the temp cookie file).
        lookback_days: If set, filter out posts older than this many days.
        cookies_json: Optional JSON cookie array for authenticated access.

    Returns:
        List of downloaded media file paths (metadata sidecars and hidden
        files excluded). Empty list on timeout or subprocess failure.
    """
    # Use a persistent download archive so gallery-dl skips already-downloaded URLs
    archive_dir = os.path.join(os.path.dirname(self.db_path) if '/' in self.db_path else '/opt/media-downloader/data', 'cache')
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_gallery_dl_archive.db')
    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
    ]
    # Use REST API mode instead of OAuth API to avoid shared rate limits.
    # The default OAuth client-id is shared by all gallery-dl users globally,
    # causing 429 rate limits with many subreddits. REST mode uses www.reddit.com
    # directly with cookies for auth, bypassing OAuth rate limits entirely.
    cmd.extend(['-o', 'extractor.reddit.api=rest'])
    # Limit to 200 most recent posts per subreddit to avoid timeout from full history pagination
    cmd.extend(['--range', '1-200'])
    if lookback_days:
        cutoff = (datetime.now() - timedelta(days=lookback_days)).strftime('%Y-%m-%d')
        cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])
    # Write JSON cookies to a temp Netscape cookie file.
    # Ensure temp_dir exists (can be cleaned by systemd-tmpfiles or race conditions)
    if cookies_json:
        os.makedirs(temp_dir, exist_ok=True)
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_json, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])
    # Keep the positional URL last so every option precedes it
    cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')
    logger.info(f"Running gallery-dl for r/{subreddit}")
    logger.debug(f"Command: {' '.join(cmd)}")
    try:
        # We are inside a coroutine, so a running loop is guaranteed
        # (get_event_loop() is deprecated in this context).
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            lambda: subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=600  # 10 minute timeout
            )
        )
        # gallery-dl exit codes are bitflags: 1=some errors, 4=some skipped, 8=all skipped
        # Code 4 (skipped) and 5 (skipped+errors) are normal when files already exist
        if result.returncode not in (0, 1, 4, 5):
            logger.warning(f"gallery-dl returned code {result.returncode} for r/{subreddit}")
            if result.stderr:
                logger.debug(f"gallery-dl stderr: {result.stderr[:500]}")
    except subprocess.TimeoutExpired:
        logger.error(f"gallery-dl timed out for r/{subreddit}")
        return []
    except Exception as e:
        logger.error(f"gallery-dl failed for r/{subreddit}: {e}")
        return []
    # Collect downloaded media, skipping metadata sidecars and hidden files
    # (the '.cookies.txt' temp file starts with '.', so it is excluded too)
    downloaded = []
    for root, _dirs, filenames in os.walk(temp_dir):
        for fname in filenames:
            if fname.startswith('.') or fname.endswith('.json'):
                continue
            downloaded.append(Path(root) / fname)
    logger.info(f"Downloaded {len(downloaded)} files from r/{subreddit}")
    return downloaded
def _group_files_by_post(
self, files: List[Path], temp_dir: str
) -> Dict[str, Dict]:
"""
Group downloaded files by their Reddit post ID using metadata JSON sidecars.
Returns:
Dict mapping reddit_post_id -> {
'files': [Path],
'title': str,
'date': str,
'source_url': str
}
"""
posts: Dict[str, Dict] = {}
for file_path in files:
# Look for matching metadata JSON sidecar
json_path = file_path.with_suffix(file_path.suffix + '.json')
if not json_path.exists():
# Try without double extension
json_path = file_path.with_suffix('.json')
metadata = {}
if json_path.exists():
try:
with open(json_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
except (json.JSONDecodeError, Exception) as e:
logger.debug(f"Failed to parse metadata for {file_path.name}: {e}")
# Extract Reddit post ID - gallery-dl uses various field names
reddit_post_id = None
for key in ('id', 'reddit_id', 'parent_id'):
if key in metadata:
reddit_post_id = str(metadata[key])
break
if not reddit_post_id:
# Use filename-based grouping as fallback
# gallery-dl typically names files like: subreddit_postid_num.ext
parts = file_path.stem.split('_')
if len(parts) >= 2:
reddit_post_id = parts[-2] if len(parts) >= 3 else parts[-1]
else:
reddit_post_id = file_path.stem
# Extract post date (ensure ISO format in local time for frontend)
# gallery-dl stores Reddit dates in UTC — convert to local time
post_date = None
if 'date' in metadata:
date_val = metadata['date']
if isinstance(date_val, str):
try:
from datetime import timezone as tz
for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
try:
utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=tz.utc)
post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
break
except ValueError:
continue
if not post_date:
post_date = date_val # fallback to raw string
except Exception:
post_date = date_val
elif isinstance(date_val, (int, float)):
try:
post_date = datetime.fromtimestamp(date_val).isoformat()
except (ValueError, OSError):
pass
if not post_date and 'created_utc' in metadata:
try:
post_date = datetime.fromtimestamp(metadata['created_utc']).isoformat()
except (ValueError, OSError):
pass
if not post_date:
post_date = datetime.now().isoformat()
# Extract title
title = metadata.get('title', metadata.get('description', ''))
# Build source URL
subreddit = metadata.get('subreddit', '')
source_url = f"https://www.reddit.com/r/{subreddit}/comments/{reddit_post_id}" if subreddit else ''
if reddit_post_id not in posts:
posts[reddit_post_id] = {
'files': [],
'title': title,
'date': post_date,
'source_url': source_url,
}
posts[reddit_post_id]['files'].append(file_path)
return posts
def _is_post_processed(self, community_id: int, reddit_post_id: str) -> bool:
"""Check if a Reddit post has already been processed."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute(
"SELECT id FROM private_media_reddit_history WHERE community_id = ? AND reddit_post_id = ?",
(community_id, reddit_post_id)
)
return cursor.fetchone() is not None
finally:
conn.close()
def _ensure_reddit_tag(self, crypto) -> int:
"""Find or create a 'reddit' tag in private_gallery_tags."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute("SELECT id, encrypted_name FROM private_gallery_tags")
for row in cursor.fetchall():
try:
name = crypto.decrypt_field(row['encrypted_name'])
if name.lower() == 'reddit':
return row['id']
except Exception:
continue
# Create the tag
encrypted_name = crypto.encrypt_field('Reddit')
cursor.execute('''
INSERT INTO private_gallery_tags (encrypted_name, color)
VALUES (?, '#ff4500')
''', (encrypted_name,))
conn.commit()
tag_id = cursor.lastrowid
logger.info(f"Created 'Reddit' tag with ID {tag_id}")
return tag_id
finally:
conn.close()
    def _import_post_to_gallery(
        self, post_data: Dict, person_id: int, reddit_tag_id: int,
        crypto, subreddit: str, community_id: int, reddit_post_id: str,
        community_idx: int = 0, total_communities: int = 1, running_media_total: int = 0
    ) -> int:
        """
        Import a Reddit post's media files into the private gallery.

        Creates one private_media_posts row, then encrypts and inserts each
        downloaded file as a private_media row (skipping per-person SHA-256
        duplicates). On success the post is tagged with the 'reddit' tag and
        recorded in private_media_reddit_history; if nothing imports, the
        empty post row is deleted again.

        Args:
            post_data: Dict with 'files' (list of Path), 'title', 'date', and
                optional 'source_url'.
            person_id: Gallery person the post belongs to.
            reddit_tag_id: ID of the 'reddit' tag to apply to the post.
            crypto: Crypto helper; assumed to provide encrypt_field() and
                encrypt_file() (encrypt_file returning truthy on success).
            subreddit: Subreddit name, used for status/log messages only.
            community_id: Monitored-community row ID, for history tracking.
            reddit_post_id: Reddit's post ID, for history tracking.
            community_idx: Index of the current community (status UI only).
            total_communities: Total communities this run (status UI only).
            running_media_total: Media imported so far this run (status UI only).

        Returns:
            Number of media files successfully imported
        """
        files = post_data['files']
        title = post_data['title']
        post_date = post_data['date']
        source_url = post_data.get('source_url', '')
        if not files:
            return 0
        # Get storage path from config
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT value FROM private_media_config WHERE key = 'storage_path'")
            row = cursor.fetchone()
            storage_path = Path(row['value']) if row else Path('/opt/immich/private')
        finally:
            conn.close()
        data_path = storage_path / 'data'
        thumbs_path = storage_path / 'thumbs'
        data_path.mkdir(parents=True, exist_ok=True)
        thumbs_path.mkdir(parents=True, exist_ok=True)
        # Create a post; description and date are stored encrypted at rest
        encrypted_desc = crypto.encrypt_field(title) if title else None
        encrypted_date = crypto.encrypt_field(post_date) if post_date else crypto.encrypt_field(datetime.now().isoformat())
        now_iso = datetime.now().isoformat()
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT INTO private_media_posts (person_id, encrypted_description, encrypted_media_date, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?)
            ''', (person_id, encrypted_desc, encrypted_date, now_iso, now_iso))
            conn.commit()
            post_id = cursor.lastrowid
        finally:
            conn.close()
        media_count = 0
        # NOTE(review): media_ids is appended to but never read afterwards —
        # confirm whether it can be removed.
        media_ids = []
        total_files = len(files)
        for file_idx, file_path in enumerate(files, 1):
            try:
                # Skip missing or zero-byte downloads (failed/partial files).
                if not file_path.exists() or file_path.stat().st_size == 0:
                    continue
                # Update status: encrypting/importing file
                self._update_status(
                    f'Encrypting file {file_idx}/{total_files} from r/{subreddit}',
                    community_idx, total_communities,
                    {'current_community': f'r/{subreddit}', 'phase': 'encrypting',
                     'current_file': file_idx, 'total_files': total_files,
                     'media_found': running_media_total + media_count}
                )
                # Calculate file hash, streamed in 64 KiB chunks
                sha256 = hashlib.sha256()
                with open(file_path, 'rb') as f:
                    for chunk in iter(lambda: f.read(65536), b''):
                        sha256.update(chunk)
                file_hash = sha256.hexdigest()
                # Check for duplicates (scoped by person)
                conn = self._get_connection()
                try:
                    cursor = conn.cursor()
                    cursor.execute(
                        'SELECT id FROM private_media WHERE file_hash = ? AND person_id = ?',
                        (file_hash, person_id)
                    )
                    if cursor.fetchone():
                        logger.debug(f"Duplicate file skipped: {file_path.name}")
                        continue
                finally:
                    conn.close()
                # Get file info (type, mime, dimensions, duration)
                file_info = self._get_file_info(file_path)
                file_size = file_path.stat().st_size
                # Compute perceptual hash (used later for near-duplicate dedup)
                perceptual_hash = self._compute_perceptual_hash(file_path)
                # Generate storage ID (used as the on-disk encrypted filename)
                storage_id = str(uuid.uuid4())
                # Generate thumbnail in a temp location before encrypting it
                temp_thumb = Path(tempfile.gettempdir()) / f"pg_thumb_{storage_id}.jpg"
                self._generate_thumbnail(file_path, temp_thumb, file_info['file_type'])
                # Encrypt the file
                encrypted_file = data_path / f"{storage_id}.enc"
                if not crypto.encrypt_file(file_path, encrypted_file):
                    logger.error(f"Encryption failed for {file_path.name}")
                    continue
                # Encrypt thumbnail (best-effort; media is usable without one)
                if temp_thumb.exists():
                    encrypted_thumb = thumbs_path / f"{storage_id}.enc"
                    crypto.encrypt_file(temp_thumb, encrypted_thumb)
                    try:
                        temp_thumb.unlink()
                    except Exception:
                        pass
                # Insert media record
                encrypted_filename = crypto.encrypt_field(file_path.name)
                encrypted_source = crypto.encrypt_field(source_url)
                conn = self._get_connection()
                try:
                    cursor = conn.cursor()
                    cursor.execute('''
                        INSERT INTO private_media (
                            post_id, storage_id, encrypted_filename, encrypted_description,
                            file_hash, file_size, file_type, mime_type,
                            width, height, duration, person_id,
                            encrypted_media_date, source_type, encrypted_source_path,
                            perceptual_hash, created_at
                        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (
                        post_id,
                        storage_id,
                        encrypted_filename,
                        None,
                        file_hash,
                        file_size,
                        file_info['file_type'],
                        file_info['mime_type'],
                        file_info['width'],
                        file_info['height'],
                        file_info['duration'],
                        person_id,
                        encrypted_date,
                        'reddit',
                        encrypted_source,
                        perceptual_hash,
                        now_iso
                    ))
                    media_id = cursor.lastrowid
                    media_ids.append(media_id)
                    conn.commit()
                finally:
                    conn.close()
                media_count += 1
            except Exception as e:
                # Per-file failures are logged and skipped so one bad file
                # doesn't abort the whole post import.
                logger.error(f"Failed to import {file_path.name}: {e}")
                import traceback
                logger.debug(f"Traceback: {traceback.format_exc()}")
        # Apply reddit tag to the post
        if media_count > 0:
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR IGNORE INTO private_media_post_tags (post_id, tag_id)
                    VALUES (?, ?)
                ''', (post_id, reddit_tag_id))
                conn.commit()
            finally:
                conn.close()
        else:
            # Delete the empty post
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("DELETE FROM private_media_posts WHERE id = ?", (post_id,))
                conn.commit()
            finally:
                conn.close()
        # Record in history only if we successfully imported media, so a
        # failed post gets retried on the next monitor pass.
        if media_count > 0:
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR IGNORE INTO private_media_reddit_history
                    (community_id, reddit_post_id, media_count)
                    VALUES (?, ?, ?)
                ''', (community_id, reddit_post_id, media_count))
                conn.commit()
            finally:
                conn.close()
        if media_count > 0:
            logger.info(f"Imported {media_count} files from r/{subreddit} post {reddit_post_id}")
        return media_count
def _get_file_info(self, file_path: Path) -> Dict[str, Any]:
"""Get file type, mime type, and dimensions."""
ext = file_path.suffix.lower().lstrip('.')
mime_type, _ = mimetypes.guess_type(str(file_path))
if not mime_type:
mime_type = 'application/octet-stream'
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
if ext in image_exts:
file_type = 'image'
elif ext in video_exts:
file_type = 'video'
else:
file_type = 'other'
info = {
'file_type': file_type,
'mime_type': mime_type,
'width': None,
'height': None,
'duration': None
}
if file_type == 'image':
try:
from PIL import Image
with Image.open(file_path) as img:
info['width'], info['height'] = img.size
except Exception:
pass
if file_type == 'video':
try:
result = subprocess.run([
'ffprobe', '-v', 'quiet', '-print_format', 'json',
'-show_streams', '-show_format', str(file_path)
], capture_output=True, text=True, timeout=30)
if result.returncode == 0:
data = json.loads(result.stdout)
for stream in data.get('streams', []):
if stream.get('codec_type') == 'video':
info['width'] = stream.get('width')
info['height'] = stream.get('height')
break
if 'format' in data:
duration = data['format'].get('duration')
if duration:
info['duration'] = float(duration)
except Exception:
pass
return info
def _run_dedup_for_persons(self, person_ids: Set[int], crypto) -> int:
"""
Run perceptual dedup for the given person IDs, auto-deleting duplicates.
Uses the same algorithm as the dashboard's dedup scanner.
Returns total number of duplicates deleted.
"""
if not person_ids:
return 0
# Read config for threshold and storage path
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute("SELECT key, value FROM private_media_config WHERE key IN ('duplicate_auto_select_distance', 'storage_path')")
config = {row['key']: row['value'] for row in cursor.fetchall()}
finally:
conn.close()
threshold = int(config.get('duplicate_auto_select_distance', '2'))
storage_path = Path(config.get('storage_path', '/opt/immich/private'))
data_path = storage_path / 'data'
thumbs_path = storage_path / 'thumbs'
total_deleted = 0
for person_id in person_ids:
try:
total_deleted += self._dedup_person(person_id, crypto, threshold, data_path, thumbs_path, storage_path)
except Exception as e:
logger.error(f"Dedup failed for person {person_id}: {e}")
import traceback
logger.debug(f"Dedup traceback: {traceback.format_exc()}")
return total_deleted
    def _dedup_person(self, person_id: int, crypto, threshold: int, data_path: Path, thumbs_path: Path, storage_path: Path) -> int:
        """Run dedup for a single person. Returns number of duplicates deleted.

        Groups media whose perceptual hashes are within `threshold` Hamming
        distance using union-find, keeps the highest-resolution item of each
        group, and deletes the rest (encrypted file, thumbnail, DB row).

        Args:
            person_id: Person whose media is scanned.
            crypto: Crypto helper (only used by the post-cleanup step).
            threshold: Max Hamming distance to treat two items as duplicates.
            data_path: Directory holding encrypted media files.
            thumbs_path: Directory holding encrypted thumbnails.
            storage_path: Gallery storage root (forwarded to cleanup).
        """
        # Fetch all media with perceptual hashes for this person
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT id, post_id, storage_id, file_type, perceptual_hash, width, height
                FROM private_media
                WHERE post_id IN (SELECT id FROM private_media_posts WHERE person_id = ?)
                AND file_type IN ('image', 'video')
                AND perceptual_hash IS NOT NULL
                AND perceptual_hash != ''
            ''', (person_id,))
            all_media = [dict(row) for row in cursor.fetchall()]
        finally:
            conn.close()
        if len(all_media) < 2:
            return 0
        # Pre-compute integer values for fast XOR-based hamming distance
        hash_ints = {}
        for m in all_media:
            try:
                hash_ints[m['id']] = int(m['perceptual_hash'], 16)
            except (ValueError, TypeError):
                # Malformed hash: item simply can't participate in matching.
                pass
        # Union-Find for grouping duplicates
        parent = {m['id']: m['id'] for m in all_media}
        def find(x):
            # Path-halving find: flattens the tree as it walks up.
            while parent[x] != x:
                parent[x] = parent[parent[x]]
                x = parent[x]
            return x
        def union(x, y):
            px, py = find(x), find(y)
            if px != py:
                parent[px] = py
        # Compare all pairs (O(n^2) over this person's hashed media)
        for i in range(len(all_media)):
            id_i = all_media[i]['id']
            if id_i not in hash_ints:
                continue
            hi = hash_ints[id_i]
            for j in range(i + 1, len(all_media)):
                id_j = all_media[j]['id']
                if id_j not in hash_ints:
                    continue
                # Hamming distance = population count of the XOR.
                dist = bin(hi ^ hash_ints[id_j]).count('1')
                if dist <= threshold:
                    union(id_i, id_j)
        # Group by root
        groups: Dict[int, list] = {}
        for m in all_media:
            root = find(m['id'])
            if root not in groups:
                groups[root] = []
            groups[root].append(m)
        # Filter to actual duplicate groups (size > 1)
        duplicate_groups = [g for g in groups.values() if len(g) > 1]
        if not duplicate_groups:
            return 0
        # In each group: keep highest resolution, mark rest for deletion
        to_delete = []
        for group in duplicate_groups:
            # Sort by resolution (width * height) descending, keep first
            group.sort(key=lambda m: (m['width'] or 0) * (m['height'] or 0), reverse=True)
            to_delete.extend(group[1:])  # All except the highest resolution
        if not to_delete:
            return 0
        # Delete duplicate files and DB records
        deleted = 0
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            for media in to_delete:
                storage_id = media['storage_id']
                # Delete encrypted data file
                data_file = data_path / f"{storage_id}.enc"
                if data_file.exists():
                    data_file.unlink()
                # Delete thumbnail file
                thumb_file = thumbs_path / f"{storage_id}.enc"
                if thumb_file.exists():
                    thumb_file.unlink()
                # Delete DB record
                cursor.execute('DELETE FROM private_media WHERE id = ?', (media['id'],))
                deleted += 1
            # Single commit covers all deletions for this person.
            conn.commit()
        finally:
            conn.close()
        # Clean up empty reddit-tagged posts
        self._cleanup_empty_reddit_posts_after_dedup(crypto, storage_path)
        logger.info(f"Dedup: deleted {deleted} duplicates across {len(duplicate_groups)} groups for person {person_id}")
        return deleted
    def _cleanup_empty_reddit_posts_after_dedup(self, crypto, storage_path: Path):
        """Delete reddit-tagged posts that have no remaining media after dedup.

        Best-effort: any failure is logged and swallowed so dedup itself
        still succeeds.

        Args:
            crypto: Crypto helper used to decrypt tag names when locating
                the 'reddit' tag.
            storage_path: NOTE(review): currently unused in this method —
                confirm with callers before removing the parameter.
        """
        try:
            conn = self._get_connection()
            try:
                # Find the reddit tag ID (tag names are stored encrypted,
                # so each row must be decrypted to compare).
                cursor = conn.cursor()
                cursor.execute("SELECT id, encrypted_name FROM private_gallery_tags")
                reddit_tag_id = None
                for row in cursor.fetchall():
                    try:
                        name = crypto.decrypt_field(row['encrypted_name'])
                        if name and name.lower() == 'reddit':
                            reddit_tag_id = row['id']
                            break
                    except Exception:
                        continue
            finally:
                conn.close()
            if reddit_tag_id is None:
                # No reddit tag exists yet, so there is nothing to clean up.
                return
            # Find empty reddit-tagged posts
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT p.id FROM private_media_posts p
                    JOIN private_media_post_tags pt ON pt.post_id = p.id
                    WHERE pt.tag_id = ?
                    AND NOT EXISTS (SELECT 1 FROM private_media m WHERE m.post_id = p.id)
                ''', (reddit_tag_id,))
                empty_posts = [row['id'] for row in cursor.fetchall()]
            finally:
                conn.close()
            if not empty_posts:
                return
            # Delete empty posts (tag links first, then the post rows)
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                for post_id in empty_posts:
                    cursor.execute('DELETE FROM private_media_post_tags WHERE post_id = ?', (post_id,))
                    cursor.execute('DELETE FROM private_media_posts WHERE id = ?', (post_id,))
                conn.commit()
            finally:
                conn.close()
            logger.info(f"Dedup cleanup: removed {len(empty_posts)} empty reddit-tagged posts")
        except Exception as e:
            logger.error(f"Failed to cleanup empty reddit posts after dedup: {e}")
def _compute_perceptual_hash(self, file_path: Path) -> Optional[str]:
"""Calculate perceptual hash for an image or video file."""
try:
import imagehash
from PIL import Image
except ImportError:
return None
ext = file_path.suffix.lower().lstrip('.')
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
pil_image = None
frame = None
frame_rgb = None
try:
if ext in video_exts:
try:
import cv2
except ImportError:
return None
cap = cv2.VideoCapture(str(file_path))
if not cap.isOpened():
return None
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
cap.set(cv2.CAP_PROP_POS_FRAMES, int(total_frames * 0.5))
ret, frame = cap.read()
cap.release()
if not ret or frame is None:
return None
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(frame_rgb)
elif ext in image_exts:
pil_image = Image.open(file_path)
else:
return None
phash = str(imagehash.dhash(pil_image, hash_size=16))
return phash
except Exception:
return None
finally:
if pil_image is not None:
pil_image.close()
del pil_image
if frame_rgb is not None:
del frame_rgb
if frame is not None:
del frame
def _generate_thumbnail(self, file_path: Path, output_path: Path, file_type: str) -> bool:
"""Generate a thumbnail for an image or video."""
try:
output_path.parent.mkdir(parents=True, exist_ok=True)
if file_type == 'image':
from PIL import Image, ImageOps
with Image.open(file_path) as img:
img = ImageOps.exif_transpose(img)
img.thumbnail((400, 400))
if img.mode in ('RGBA', 'P'):
img = img.convert('RGB')
img.save(output_path, 'JPEG', quality=85)
return True
elif file_type == 'video':
result = subprocess.run([
'ffmpeg', '-y', '-i', str(file_path),
'-ss', '00:00:01', '-vframes', '1',
'-vf', 'scale=400:-1:force_original_aspect_ratio=decrease',
str(output_path)
], capture_output=True, timeout=30)
return result.returncode == 0 and output_path.exists()
except Exception as e:
logger.error(f"Thumbnail generation failed: {e}")
return False