1613 lines
63 KiB
Python
1613 lines
63 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Reddit Community Monitor Module
|
|
|
|
Monitors specified Reddit communities (subreddits) for new posts,
|
|
downloads all media (including imgur/redgifs attachments via gallery-dl),
|
|
and automatically creates private gallery posts for mapped persons.
|
|
|
|
Design:
|
|
- Each community (subreddit) is mapped to a person in the private gallery
|
|
- Uses gallery-dl for downloading with full Reddit/imgur/redgifs support
|
|
- All new posts are tagged with a "reddit" tag
|
|
- Configurable check intervals and lookback periods
|
|
- Supports cookie authentication for age-gated content
|
|
"""
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
import mimetypes
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import subprocess
|
|
import tempfile
|
|
import uuid
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Any, Set
|
|
|
|
from modules.universal_logger import get_logger
|
|
|
|
logger = get_logger('RedditMonitor')
|
|
|
|
# Key file path for background crypto access
|
|
REDDIT_MONITOR_KEY_FILE = '/opt/immich/private/.reddit_monitor_key'
|
|
|
|
|
|
class RedditCommunityMonitor:
|
|
"""
|
|
Background monitor for Reddit communities.
|
|
Downloads media from subreddits and imports to private gallery.
|
|
"""
|
|
|
|
def __init__(self, db_path: str, activity_manager=None):
    """Initialize the monitor.

    Args:
        db_path: Path to the SQLite database holding the private_media_* tables.
        activity_manager: Optional background-task tracker; when None, all
            progress reporting throughout this class is silently skipped.
    """
    self.db_path = db_path
    self.activity_manager = activity_manager
    # Hard-coded path to the gallery-dl executable inside the downloader venv.
    self.gallery_dl_path = '/opt/media-downloader/venv/bin/gallery-dl'
|
|
def _get_connection(self) -> sqlite3.Connection:
|
|
"""Get a database connection with row factory."""
|
|
conn = sqlite3.connect(self.db_path)
|
|
conn.row_factory = sqlite3.Row
|
|
conn.execute("PRAGMA foreign_keys = ON")
|
|
return conn
|
|
|
|
# =========================================================================
|
|
# SETTINGS METHODS
|
|
# =========================================================================
|
|
|
|
def get_settings(self) -> Dict:
    """Read the Reddit monitor settings, filling in defaults for missing keys.

    Stored values are plain strings; 'true'/'false' become booleans and
    all-digit strings become ints, everything else stays a string.
    """
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT key, value FROM private_media_config WHERE key LIKE 'reddit_monitor_%'"
        )

        def _coerce(raw):
            # Reverse the string encoding used by update_settings().
            if raw == 'true':
                return True
            if raw == 'false':
                return False
            if raw and raw.isdigit():
                return int(raw)
            return raw

        stored = {
            row['key'].replace('reddit_monitor_', ''): _coerce(row['value'])
            for row in cursor.fetchall()
        }

        # Only the known settings are exposed; unknown keys are dropped.
        return {
            'enabled': stored.get('enabled', False),
            'check_interval_hours': stored.get('check_interval_hours', 4),
            'lookback_days': stored.get('lookback_days', 3),
            'last_checked': stored.get('last_checked', None),
        }
    finally:
        conn.close()
|
|
|
def update_settings(self, **kwargs) -> bool:
    """Persist Reddit monitor settings.

    Booleans are stored as 'true'/'false', None as the empty string, and
    everything else via str(). Returns True on success, False after
    logging any failure.
    """
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        for name, raw in kwargs.items():
            if isinstance(raw, bool):
                stored = 'true' if raw else 'false'
            elif raw is None:
                stored = ''
            else:
                stored = str(raw)
            cursor.execute('''
                INSERT OR REPLACE INTO private_media_config (key, value, updated_at)
                VALUES (?, ?, CURRENT_TIMESTAMP)
            ''', (f'reddit_monitor_{name}', stored))
        conn.commit()
        return True
    except Exception as e:
        logger.error(f"Failed to update settings: {e}")
        return False
    finally:
        conn.close()
|
|
|
# =========================================================================
|
|
# COOKIE MANAGEMENT
|
|
# =========================================================================
|
|
|
|
def save_cookies(self, crypto, cookies_json: str) -> bool:
    """Encrypt the cookies JSON and store it under a fixed config key.

    Returns True on success, False after logging any failure.
    """
    conn = self._get_connection()
    try:
        ciphertext = crypto.encrypt_field(cookies_json)
        conn.cursor().execute('''
            INSERT OR REPLACE INTO private_media_config (key, value, updated_at)
            VALUES ('reddit_monitor_encrypted_cookies', ?, CURRENT_TIMESTAMP)
        ''', (ciphertext,))
        conn.commit()
        logger.info("Reddit cookies saved (encrypted)")
        return True
    except Exception as e:
        logger.error(f"Failed to save cookies: {e}")
        return False
    finally:
        conn.close()
|
|
|
def has_cookies(self, crypto) -> bool:
    """Return True when stored cookies exist and decrypt to non-blank text."""
    conn = self._get_connection()
    try:
        row = conn.execute(
            "SELECT value FROM private_media_config WHERE key = 'reddit_monitor_encrypted_cookies'"
        ).fetchone()
        if row is None or not row['value']:
            return False
        # Decryption doubles as a validity check on the stored blob.
        try:
            plaintext = crypto.decrypt_field(row['value'])
        except Exception:
            return False
        return bool(plaintext and plaintext.strip())
    finally:
        conn.close()
|
|
|
def delete_cookies(self) -> bool:
    """Remove the stored encrypted cookies row.

    Returns True on success (even when no row existed), False after
    logging any failure.
    """
    conn = self._get_connection()
    try:
        conn.cursor().execute(
            "DELETE FROM private_media_config WHERE key = 'reddit_monitor_encrypted_cookies'"
        )
        conn.commit()
        logger.info("Reddit cookies deleted")
        return True
    except Exception as e:
        logger.error(f"Failed to delete cookies: {e}")
        return False
    finally:
        conn.close()
|
|
|
def _get_cookies_json(self, crypto) -> Optional[str]:
|
|
"""Load and decrypt stored cookies JSON."""
|
|
conn = self._get_connection()
|
|
try:
|
|
cursor = conn.cursor()
|
|
cursor.execute(
|
|
"SELECT value FROM private_media_config WHERE key = 'reddit_monitor_encrypted_cookies'"
|
|
)
|
|
row = cursor.fetchone()
|
|
if not row or not row['value']:
|
|
return None
|
|
try:
|
|
return crypto.decrypt_field(row['value'])
|
|
except Exception as e:
|
|
logger.error(f"Failed to decrypt cookies: {e}")
|
|
return None
|
|
finally:
|
|
conn.close()
|
|
|
|
def _write_netscape_cookie_file(self, cookies_json: str, output_path: str) -> bool:
|
|
"""Convert JSON cookies array to Netscape cookie file format for gallery-dl."""
|
|
try:
|
|
cookies = json.loads(cookies_json)
|
|
if not isinstance(cookies, list):
|
|
logger.error("Cookies is not a JSON array")
|
|
return False
|
|
|
|
with open(output_path, 'w') as f:
|
|
f.write("# Netscape HTTP Cookie File\n")
|
|
f.write("# https://curl.haxx.se/docs/http-cookies.html\n\n")
|
|
for cookie in cookies:
|
|
domain = cookie.get('domain', '')
|
|
# Ensure domain starts with . for domain-wide cookies
|
|
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
|
|
path = cookie.get('path', '/')
|
|
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
|
|
# Convert expiry - use 0 for session cookies
|
|
expires = cookie.get('expirationDate', cookie.get('expiry', cookie.get('expires', 0)))
|
|
if expires is None:
|
|
expires = 0
|
|
expires = str(int(float(expires)))
|
|
name = cookie.get('name', '')
|
|
value = cookie.get('value', '')
|
|
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n")
|
|
|
|
return True
|
|
except Exception as e:
|
|
logger.error(f"Failed to write Netscape cookie file: {e}")
|
|
return False
|
|
|
|
# =========================================================================
|
|
# COMMUNITY MAPPING METHODS
|
|
# =========================================================================
|
|
|
|
def get_all_communities(self) -> List[Dict]:
    """Get all community mappings with person info and live media count.

    The returned 'total_media_found' key is NOT the cached column of the
    same name on the communities table: it is recomputed live as the count
    of reddit-sourced rows in private_media for the mapped person. Note the
    subquery counts per *person*, so two communities mapped to the same
    person report the same figure.

    Returns:
        List of dicts, one per community, including joined person and
        relationship display fields (encrypted names + relationship color).
    """
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT c.*, p.encrypted_name as person_encrypted_name,
                   r.encrypted_name as relationship_encrypted_name,
                   r.color as relationship_color,
                   (SELECT COUNT(*) FROM private_media m
                    WHERE m.person_id = c.person_id AND m.source_type = 'reddit') as actual_media_count
            FROM private_media_reddit_communities c
            LEFT JOIN private_media_persons p ON c.person_id = p.id
            LEFT JOIN private_media_relationships r ON p.relationship_id = r.id
            ORDER BY c.subreddit_name
        ''')
        communities = []
        for row in cursor.fetchall():
            d = dict(row)
            # Expose the live count under the familiar key, shadowing the
            # cached total_media_found column from c.*.
            d['total_media_found'] = d.pop('actual_media_count', 0)
            communities.append(d)
        return communities
    finally:
        conn.close()
|
|
|
def get_community(self, community_id: int) -> Optional[Dict]:
    """Get a single community mapping, or None when the id is unknown.

    Unlike get_all_communities(), this returns the cached
    total_media_found column as-is (no live recount).
    """
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT c.*, p.encrypted_name as person_encrypted_name,
                   r.encrypted_name as relationship_encrypted_name,
                   r.color as relationship_color
            FROM private_media_reddit_communities c
            LEFT JOIN private_media_persons p ON c.person_id = p.id
            LEFT JOIN private_media_relationships r ON p.relationship_id = r.id
            WHERE c.id = ?
        ''', (community_id,))
        row = cursor.fetchone()
        return dict(row) if row else None
    finally:
        conn.close()
|
|
|
def add_community(self, subreddit_name: str, person_id: int) -> int:
    """Add a new community mapping and return the new row id.

    Args:
        subreddit_name: Subreddit name; an optional 'r/' or '/r/' prefix
            (as commonly pasted from Reddit URLs) is stripped.
        person_id: Person in the private gallery to map this subreddit to.

    Returns:
        The id of the inserted private_media_reddit_communities row.
    """
    # Strip 'r/' and the also-common '/r/' paste form; the old pattern
    # only handled 'r/' and would store '/r/name' verbatim.
    subreddit_name = re.sub(r'^/?r/', '', subreddit_name.strip())
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute('''
            INSERT INTO private_media_reddit_communities (subreddit_name, person_id)
            VALUES (?, ?)
        ''', (subreddit_name, person_id))
        conn.commit()
        return cursor.lastrowid
    finally:
        conn.close()
|
|
|
def update_community(self, community_id: int, **kwargs) -> bool:
    """Update columns of a community mapping.

    Column names come from the kwargs keys and are interpolated into the
    UPDATE statement (they cannot be bound as SQL parameters), so each key
    is first validated as a plain identifier — anything else is rejected
    to prevent SQL injection. 'subreddit_name' values have an 'r/' prefix
    stripped; 'enabled' is normalized to 0/1.

    Returns:
        True when a row was updated; False on no-op, invalid key, or error.
    """
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        updates = []
        values = []
        for key, value in kwargs.items():
            # Reject non-identifier keys before they reach the f-string SQL.
            if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', key):
                logger.error(f"Rejected invalid column name: {key!r}")
                return False
            if key == 'subreddit_name' and value is not None:
                value = re.sub(r'^r/', '', value.strip())
            if key == 'enabled':
                value = 1 if value else 0
            updates.append(f'{key} = ?')
            values.append(value)
        if not updates:
            return False
        updates.append("updated_at = CURRENT_TIMESTAMP")
        values.append(community_id)
        cursor.execute(
            f"UPDATE private_media_reddit_communities SET {', '.join(updates)} WHERE id = ?",
            values
        )
        conn.commit()
        return cursor.rowcount > 0
    except Exception as e:
        logger.error(f"Failed to update community {community_id}: {e}")
        return False
    finally:
        conn.close()
|
|
|
def delete_community(self, community_id: int) -> bool:
    """Delete a community mapping; True when a row was actually removed."""
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "DELETE FROM private_media_reddit_communities WHERE id = ?",
            (community_id,)
        )
        conn.commit()
        deleted = cursor.rowcount > 0
        return deleted
    finally:
        conn.close()
|
|
|
def get_communities_for_person(self, person_id: int) -> List[Dict]:
    """Return every community mapping that targets the given person."""
    conn = self._get_connection()
    try:
        rows = conn.execute(
            "SELECT * FROM private_media_reddit_communities WHERE person_id = ?",
            (person_id,)
        ).fetchall()
        return [dict(r) for r in rows]
    finally:
        conn.close()
|
|
|
def get_history(self, community_id: int) -> List[Dict]:
    """Return up to 100 most recent history rows for a community."""
    conn = self._get_connection()
    try:
        rows = conn.execute('''
            SELECT * FROM private_media_reddit_history
            WHERE community_id = ?
            ORDER BY processed_at DESC
            LIMIT 100
        ''', (community_id,)).fetchall()
        return [dict(r) for r in rows]
    finally:
        conn.close()
|
|
|
# =========================================================================
|
|
# CRYPTO ACCESS
|
|
# =========================================================================
|
|
|
|
def _get_crypto(self):
    """Load the crypto handler from the on-disk key file for background use.

    Background jobs run without a user session, so the key is read from
    REDDIT_MONITOR_KEY_FILE. Returns the crypto object, or None (after a
    warning) when the key file is missing or invalid.
    """
    # Local import — presumably deferred to avoid an import cycle or
    # module-load cost; confirm before hoisting to the top of the file.
    from modules.private_gallery_crypto import load_key_from_file
    crypto = load_key_from_file(REDDIT_MONITOR_KEY_FILE)
    if crypto is None:
        logger.warning("Reddit monitor crypto unavailable - key file missing or invalid")
    return crypto
|
|
|
# =========================================================================
|
|
# DOWNLOAD METHODS
|
|
# =========================================================================
|
|
|
|
async def check_all_now(self, from_scheduler: bool = False) -> int:
    """
    Check all enabled communities for new posts.

    Flow: load settings + crypto, fetch enabled communities, then for each
    community run _check_community under a crash-recovery checkpoint.
    Afterwards: record last_checked, run auto-dedup for persons that got
    new media, and always stop the background-task entry.

    Args:
        from_scheduler: Whether this was triggered by the scheduler. Only
            scheduler-triggered runs honor the 'enabled' setting; manual
            runs proceed regardless.

    Returns:
        Total count of new media items imported
    """
    settings = self.get_settings()
    if from_scheduler and not settings.get('enabled'):
        logger.debug("Reddit monitor is disabled")
        return 0

    crypto = self._get_crypto()
    if crypto is None:
        logger.warning("Skipping Reddit check: encryption key not available")
        return 0

    # Get enabled communities
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT * FROM private_media_reddit_communities WHERE enabled = 1"
        )
        communities = [dict(row) for row in cursor.fetchall()]
    finally:
        conn.close()

    if not communities:
        logger.debug("No enabled Reddit communities to check")
        return 0

    # Start background task tracking (skipped when no activity manager).
    if self.activity_manager:
        self.activity_manager.start_background_task(
            'reddit_monitor',
            'reddit_community_monitor',
            'Reddit Community Monitor',
            'Running',
            {'total_communities': len(communities), 'media_found': 0}
        )

    total_media = 0
    affected_person_ids: Set[int] = set()
    lookback_days = settings.get('lookback_days', 3)

    # Load cookies from encrypted storage (None when none are stored).
    cookies_json = self._get_cookies_json(crypto)

    # Crash recovery checkpoint: on restart after a crash, communities
    # already marked completed are skipped.
    from modules.task_checkpoint import TaskCheckpoint
    checkpoint = TaskCheckpoint('reddit_monitor', 'background')
    checkpoint.start(total_items=len(communities))
    if checkpoint.is_recovering():
        logger.info("Reddit monitor: recovering — skipping already-checked communities")

    try:
        for idx, community in enumerate(communities):
            subreddit = community['subreddit_name']
            person_id = community['person_id']
            community_id = community['id']

            if checkpoint.is_completed(str(community_id)):
                continue

            checkpoint.set_current(str(community_id))

            try:
                # Use longer lookback for communities that have never imported anything
                effective_lookback = lookback_days
                if community.get('total_media_found', 0) == 0 and not community.get('last_checked'):
                    effective_lookback = 30
                    logger.info(f"First check for r/{subreddit}, using 30-day lookback")

                media_count = await self._check_community(
                    community_id, subreddit, person_id,
                    effective_lookback, cookies_json, crypto,
                    community_idx=idx, total_communities=len(communities),
                    running_media_total=total_media
                )
                total_media += media_count

                if media_count > 0:
                    affected_person_ids.add(person_id)
                    if self.activity_manager:
                        self.activity_manager.update_background_task(
                            'reddit_monitor',
                            f'Found {media_count} new media in r/{subreddit}',
                            idx + 1, len(communities),
                            {'total_communities': len(communities), 'media_found': total_media, 'current_community': f'r/{subreddit}', 'phase': 'found', 'last_found': media_count}
                        )
            except Exception as e:
                # A failure in one community must not abort the others.
                logger.error(f"Error checking r/{subreddit}: {e}")
                import traceback
                logger.debug(f"Traceback: {traceback.format_exc()}")

            # Marked completed even on error so a recovery run does not
            # retry a persistently failing community in a loop.
            checkpoint.mark_completed(str(community_id))

        # Checkpoint complete
        checkpoint.finish()

        # Update last_checked timestamp
        self.update_settings(last_checked=datetime.now().isoformat())

        # Auto-dedup for persons that received new media
        if affected_person_ids:
            if self.activity_manager:
                self.activity_manager.update_background_task(
                    'reddit_monitor', 'Deduplicating...',
                    len(communities), len(communities),
                    {'phase': 'deduplicating', 'media_found': total_media}
                )
            dedup_deleted = self._run_dedup_for_persons(affected_person_ids, crypto)
            if dedup_deleted > 0:
                logger.info(f"Reddit monitor: auto-dedup removed {dedup_deleted} duplicates")

    finally:
        # Always clear the background-task entry, even on unexpected errors.
        if self.activity_manager:
            self.activity_manager.stop_background_task('reddit_monitor')

    if total_media > 0:
        logger.info(f"Reddit monitor: imported {total_media} new media items")
    else:
        logger.debug("Reddit monitor: no new media found")

    return total_media
|
|
|
async def download_full_community(self, community_id: int) -> int:
    """
    Download all available media from a community (no date filter).

    Passing lookback_days=None to _check_community disables the gallery-dl
    date cutoff, so everything within its post-range limit is fetched.

    Args:
        community_id: ID of the community to download

    Returns:
        Count of new media items imported
    """
    community = self.get_community(community_id)
    if not community:
        logger.error(f"Community {community_id} not found")
        return 0

    crypto = self._get_crypto()
    if crypto is None:
        logger.warning("Cannot download: encryption key not available")
        return 0

    cookies_json = self._get_cookies_json(crypto)

    subreddit = community['subreddit_name']
    if self.activity_manager:
        self.activity_manager.start_background_task(
            'reddit_monitor',
            'reddit_community_monitor',
            'Reddit Community Monitor',
            'Running',
            {'total_communities': 1, 'media_found': 0, 'current_community': f'r/{subreddit}', 'full_download': True}
        )

    try:
        # lookback_days=None → no date filter in _run_gallery_dl.
        media_count = await self._check_community(
            community_id, subreddit,
            community['person_id'], None, cookies_json, crypto,
            community_idx=0, total_communities=1, running_media_total=0
        )
        return media_count
    finally:
        if self.activity_manager:
            self.activity_manager.stop_background_task('reddit_monitor')
|
|
|
async def check_single_community(self, community_id: int) -> int:
    """
    Check a single community for new posts (using lookback_days filter).

    Mirrors one iteration of check_all_now(), including the extended
    30-day lookback for communities that have never imported anything.

    Args:
        community_id: ID of the community to check

    Returns:
        Count of new media items imported
    """
    community = self.get_community(community_id)
    if not community:
        logger.error(f"Community {community_id} not found")
        return 0

    crypto = self._get_crypto()
    if crypto is None:
        logger.warning("Cannot check: encryption key not available")
        return 0

    settings = self.get_settings()
    lookback_days = settings.get('lookback_days', 3)
    cookies_json = self._get_cookies_json(crypto)

    subreddit = community['subreddit_name']
    if self.activity_manager:
        self.activity_manager.start_background_task(
            'reddit_monitor',
            'reddit_community_monitor',
            'Reddit Community Monitor',
            'Running',
            {'total_communities': 1, 'media_found': 0, 'current_community': f'r/{subreddit}'}
        )

    try:
        # Use longer lookback for communities that have never imported anything
        effective_lookback = lookback_days
        if community.get('total_media_found', 0) == 0 and not community.get('last_checked'):
            effective_lookback = 30
            logger.info(f"First check for r/{subreddit}, using 30-day lookback")

        media_count = await self._check_community(
            community_id, subreddit,
            community['person_id'], effective_lookback, cookies_json, crypto,
            community_idx=0, total_communities=1, running_media_total=0
        )
        return media_count
    finally:
        if self.activity_manager:
            self.activity_manager.stop_background_task('reddit_monitor')
|
|
|
async def check_communities_by_person(self, person_id: int) -> int:
    """
    Check all enabled communities for a given person.

    Same per-community flow as check_all_now() but restricted to one
    person's mappings and without the crash-recovery checkpoint. Runs
    auto-dedup for the person when anything new was imported.

    Args:
        person_id: ID of the person whose communities to check

    Returns:
        Total count of new media items imported
    """
    crypto = self._get_crypto()
    if crypto is None:
        logger.warning("Cannot check: encryption key not available")
        return 0

    # Get enabled communities for this person
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT * FROM private_media_reddit_communities WHERE person_id = ? AND enabled = 1",
            (person_id,)
        )
        communities = [dict(row) for row in cursor.fetchall()]
    finally:
        conn.close()

    if not communities:
        logger.debug(f"No enabled communities for person {person_id}")
        return 0

    settings = self.get_settings()
    lookback_days = settings.get('lookback_days', 3)
    cookies_json = self._get_cookies_json(crypto)

    if self.activity_manager:
        self.activity_manager.start_background_task(
            'reddit_monitor',
            'reddit_community_monitor',
            'Reddit Community Monitor',
            'Running',
            {'total_communities': len(communities), 'media_found': 0}
        )

    total_media = 0
    try:
        for idx, community in enumerate(communities):
            subreddit = community['subreddit_name']
            community_id = community['id']

            try:
                # Use longer lookback for communities that have never
                # imported anything (same heuristic as check_all_now).
                effective_lookback = lookback_days
                if community.get('total_media_found', 0) == 0 and not community.get('last_checked'):
                    effective_lookback = 30
                    logger.info(f"First check for r/{subreddit}, using 30-day lookback")

                media_count = await self._check_community(
                    community_id, subreddit, community['person_id'],
                    effective_lookback, cookies_json, crypto,
                    community_idx=idx, total_communities=len(communities),
                    running_media_total=total_media
                )
                total_media += media_count

                if media_count > 0 and self.activity_manager:
                    self.activity_manager.update_background_task(
                        'reddit_monitor',
                        f'Found {media_count} new media in r/{subreddit}',
                        idx + 1, len(communities),
                        {'total_communities': len(communities), 'media_found': total_media,
                         'current_community': f'r/{subreddit}', 'phase': 'found', 'last_found': media_count}
                    )
            except Exception as e:
                # One failing community must not abort the rest.
                logger.error(f"Error checking r/{subreddit}: {e}")
                import traceback
                logger.debug(f"Traceback: {traceback.format_exc()}")

        # Auto-dedup for this person if new media was imported
        if total_media > 0:
            if self.activity_manager:
                self.activity_manager.update_background_task(
                    'reddit_monitor', 'Deduplicating...',
                    len(communities), len(communities),
                    {'phase': 'deduplicating', 'media_found': total_media}
                )
            dedup_deleted = self._run_dedup_for_persons({person_id}, crypto)
            if dedup_deleted > 0:
                logger.info(f"Reddit person check: auto-dedup removed {dedup_deleted} duplicates")

    finally:
        # Always clear the background-task entry.
        if self.activity_manager:
            self.activity_manager.stop_background_task('reddit_monitor')

    if total_media > 0:
        logger.info(f"Reddit person check: imported {total_media} new media items")

    return total_media
|
|
|
def _update_status(self, status_text: str, community_idx: int, total_communities: int, extra: Dict = None):
|
|
"""Helper to update background task status with detailed info."""
|
|
if not self.activity_manager:
|
|
return
|
|
data = {
|
|
'total_communities': total_communities,
|
|
'media_found': extra.get('media_found', 0) if extra else 0,
|
|
}
|
|
if extra:
|
|
data.update(extra)
|
|
self.activity_manager.update_background_task(
|
|
'reddit_monitor', status_text,
|
|
community_idx, total_communities, data
|
|
)
|
|
|
|
async def _check_community(
    self, community_id: int, subreddit: str, person_id: int,
    lookback_days: Optional[int], cookies_json: Optional[str], crypto,
    community_idx: int = 0, total_communities: int = 1, running_media_total: int = 0
) -> int:
    """Check a single community and import new media.

    Phases (each reported via _update_status): download with gallery-dl
    into a temp dir → group files by Reddit post id → skip posts already
    in the history table → import each remaining post to the gallery.
    last_checked is updated on every code path, even when nothing new was
    found, so the "first check" heuristic in the callers stops firing.

    Args:
        lookback_days: Date cutoff for gallery-dl; None disables it.
        running_media_total: Media imported so far across earlier
            communities in this run; only used for progress display.

    Returns:
        Number of media items imported for this community.
    """
    # Temp dir (and all downloads) are discarded when the block exits.
    with tempfile.TemporaryDirectory(prefix=f'reddit_{subreddit}_') as temp_dir:
        # Phase: Downloading
        self._update_status(
            f'Downloading from r/{subreddit}...', community_idx, total_communities,
            {'current_community': f'r/{subreddit}', 'phase': 'downloading',
             'media_found': running_media_total}
        )

        # Run gallery-dl
        files = await self._run_gallery_dl(
            subreddit, temp_dir, lookback_days, cookies_json
        )
        if not files:
            logger.debug(f"No files downloaded from r/{subreddit}")
            self._update_status(
                f'No new files in r/{subreddit}', community_idx, total_communities,
                {'current_community': f'r/{subreddit}', 'phase': 'done',
                 'media_found': running_media_total}
            )
            # Still update last_checked so we know we tried
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE private_media_reddit_communities
                    SET last_checked = CURRENT_TIMESTAMP
                    WHERE id = ?
                ''', (community_id,))
                conn.commit()
            finally:
                conn.close()
            return 0

        # Phase: Processing
        self._update_status(
            f'Downloaded {len(files)} files from r/{subreddit}, grouping by post...',
            community_idx, total_communities,
            {'current_community': f'r/{subreddit}', 'phase': 'processing',
             'files_downloaded': len(files), 'media_found': running_media_total}
        )

        # Group files by Reddit post ID
        posts = self._group_files_by_post(files, temp_dir)

        # Get or create "reddit" tag
        reddit_tag_id = self._ensure_reddit_tag(crypto)

        # Filter out already-processed posts
        new_posts = {}
        for reddit_post_id, post_data in posts.items():
            if not self._is_post_processed(community_id, reddit_post_id):
                new_posts[reddit_post_id] = post_data

        if not new_posts:
            self._update_status(
                f'No new posts in r/{subreddit} ({len(posts)} already imported)',
                community_idx, total_communities,
                {'current_community': f'r/{subreddit}', 'phase': 'done',
                 'files_downloaded': len(files), 'media_found': running_media_total}
            )
            # Fall through with an empty dict so the last_checked update
            # in the total_media == 0 branch below still runs.
            posts_to_import = {}
        else:
            posts_to_import = new_posts

        # Import each post
        total_media = 0
        for post_num, (reddit_post_id, post_data) in enumerate(posts_to_import.items(), 1):
            num_files = len(post_data['files'])
            self._update_status(
                f'Importing post {post_num}/{len(posts_to_import)} from r/{subreddit} ({num_files} files)',
                community_idx, total_communities,
                {'current_community': f'r/{subreddit}', 'phase': 'importing',
                 'files_downloaded': len(files), 'posts_imported': post_num,
                 'posts_total': len(posts_to_import),
                 'media_found': running_media_total + total_media}
            )

            media_count = self._import_post_to_gallery(
                post_data, person_id, reddit_tag_id, crypto,
                subreddit, community_id, reddit_post_id,
                community_idx, total_communities, running_media_total + total_media
            )
            total_media += media_count

        # Update community stats: bump the cached counter only when
        # something was imported; always refresh last_checked.
        if total_media > 0:
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE private_media_reddit_communities
                    SET total_media_found = total_media_found + ?,
                        last_checked = CURRENT_TIMESTAMP,
                        updated_at = CURRENT_TIMESTAMP
                    WHERE id = ?
                ''', (total_media, community_id))
                conn.commit()
            finally:
                conn.close()
        else:
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE private_media_reddit_communities
                    SET last_checked = CURRENT_TIMESTAMP
                    WHERE id = ?
                ''', (community_id,))
                conn.commit()
            finally:
                conn.close()

        return total_media
|
|
|
# =========================================================================
|
|
# HELPER METHODS
|
|
# =========================================================================
|
|
|
|
async def _run_gallery_dl(
    self, subreddit: str, temp_dir: str,
    lookback_days: Optional[int] = None, cookies_json: Optional[str] = None
) -> List[Path]:
    """
    Run gallery-dl to download media from a subreddit.

    Args:
        subreddit: Subreddit name without the 'r/' prefix.
        temp_dir: Directory gallery-dl downloads into.
        lookback_days: Only fetch posts newer than this many days
            (None disables the date filter).
        cookies_json: Optional JSON cookie array; converted to a Netscape
            cookie file so age-gated content is reachable.

    Returns:
        List of downloaded media file paths; JSON metadata sidecars, the
        cookie file, and other dotfiles are excluded. Empty list on
        timeout or failure.
    """
    # Use a persistent download archive so gallery-dl skips already-downloaded URLs
    archive_dir = os.path.join(os.path.dirname(self.db_path) if '/' in self.db_path else '/opt/media-downloader/data', 'cache')
    os.makedirs(archive_dir, exist_ok=True)
    archive_path = os.path.join(archive_dir, 'reddit_gallery_dl_archive.db')

    cmd = [
        self.gallery_dl_path,
        '--write-metadata',
        '--download-archive', archive_path,
        '-d', temp_dir,
    ]

    # Use REST API mode instead of OAuth API to avoid shared rate limits.
    # The default OAuth client-id is shared by all gallery-dl users globally,
    # causing 429 rate limits with many subreddits. REST mode uses www.reddit.com
    # directly with cookies for auth, bypassing OAuth rate limits entirely.
    cmd.extend(['-o', 'extractor.reddit.api=rest'])

    # Limit to 200 most recent posts per subreddit to avoid timeout from full history pagination
    cmd.extend(['--range', '1-200'])

    cmd.append(f'https://www.reddit.com/r/{subreddit}/new/')

    if lookback_days:
        cutoff = (datetime.now() - timedelta(days=lookback_days)).strftime('%Y-%m-%d')
        cmd.extend(['--filter', f"date >= datetime.strptime('{cutoff}', '%Y-%m-%d')"])

    # Write JSON cookies to a temp Netscape cookie file
    # Ensure temp_dir exists (can be cleaned by systemd-tmpfiles or race conditions)
    temp_cookie_file = None
    if cookies_json:
        os.makedirs(temp_dir, exist_ok=True)
        temp_cookie_file = os.path.join(temp_dir, '.cookies.txt')
        if self._write_netscape_cookie_file(cookies_json, temp_cookie_file):
            cmd.extend(['--cookies', temp_cookie_file])

    logger.info(f"Running gallery-dl for r/{subreddit}")
    logger.debug(f"Command: {' '.join(cmd)}")

    try:
        # get_running_loop() is the supported call inside a coroutine;
        # get_event_loop() here has been deprecated since Python 3.10.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            lambda: subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=600  # 10 minute timeout
            )
        )

        # gallery-dl exit codes are bitflags: 1=some errors, 4=some skipped, 8=all skipped
        # Code 4 (skipped) and 5 (skipped+errors) are normal when files already exist
        if result.returncode not in (0, 1, 4, 5):
            logger.warning(f"gallery-dl returned code {result.returncode} for r/{subreddit}")
            if result.stderr:
                logger.debug(f"gallery-dl stderr: {result.stderr[:500]}")

    except subprocess.TimeoutExpired:
        logger.error(f"gallery-dl timed out for r/{subreddit}")
        return []
    except Exception as e:
        logger.error(f"gallery-dl failed for r/{subreddit}: {e}")
        return []

    # Collect media files: the startswith('.') check drops the cookie file
    # and other dotfiles; '.json' sidecars are metadata, not media.
    # (An unused skip_exts set was removed here — .txt files other than
    # dotfiles were never actually skipped, and still are not.)
    downloaded = []
    skip_names = {'.cookies.txt'}
    for root, _dirs, filenames in os.walk(temp_dir):
        for fname in filenames:
            if fname in skip_names or fname.startswith('.'):
                continue
            if fname.endswith('.json'):
                continue
            downloaded.append(Path(root) / fname)

    logger.info(f"Downloaded {len(downloaded)} files from r/{subreddit}")
    return downloaded
|
|
|
def _group_files_by_post(
    self, files: List[Path], temp_dir: str
) -> Dict[str, Dict]:
    """
    Group downloaded files by their Reddit post ID using metadata JSON sidecars.

    For each file the matching `<name>.<ext>.json` (or `<name>.json`)
    sidecar is read; the post id, title, date and source URL come from it.
    Files without usable metadata fall back to filename-based grouping.
    Post title/date/URL for a group come from the FIRST file seen for that
    post id. `temp_dir` is currently unused.

    Returns:
        Dict mapping reddit_post_id -> {
            'files': [Path],
            'title': str,
            'date': str,
            'source_url': str
        }
    """
    posts: Dict[str, Dict] = {}

    for file_path in files:
        # Look for matching metadata JSON sidecar
        json_path = file_path.with_suffix(file_path.suffix + '.json')
        if not json_path.exists():
            # Try without double extension
            json_path = file_path.with_suffix('.json')

        metadata = {}
        if json_path.exists():
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)
            except (json.JSONDecodeError, Exception) as e:
                # Unparseable sidecar: fall back to filename heuristics below.
                logger.debug(f"Failed to parse metadata for {file_path.name}: {e}")

        # Extract Reddit post ID - gallery-dl uses various field names
        reddit_post_id = None
        for key in ('id', 'reddit_id', 'parent_id'):
            if key in metadata:
                reddit_post_id = str(metadata[key])
                break

        if not reddit_post_id:
            # Use filename-based grouping as fallback
            # gallery-dl typically names files like: subreddit_postid_num.ext
            parts = file_path.stem.split('_')
            if len(parts) >= 2:
                reddit_post_id = parts[-2] if len(parts) >= 3 else parts[-1]
            else:
                reddit_post_id = file_path.stem

        # Extract post date (ensure ISO format in local time for frontend)
        # gallery-dl stores Reddit dates in UTC — convert to local time
        post_date = None
        if 'date' in metadata:
            date_val = metadata['date']
            if isinstance(date_val, str):
                try:
                    from datetime import timezone as tz
                    # Try the known string formats; first match wins.
                    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
                        try:
                            utc_dt = datetime.strptime(date_val, fmt).replace(tzinfo=tz.utc)
                            post_date = utc_dt.astimezone().strftime('%Y-%m-%dT%H:%M:%S')
                            break
                        except ValueError:
                            continue
                    if not post_date:
                        post_date = date_val  # fallback to raw string
                except Exception:
                    post_date = date_val
            elif isinstance(date_val, (int, float)):
                try:
                    # Numeric date: treated as a local-time epoch timestamp.
                    post_date = datetime.fromtimestamp(date_val).isoformat()
                except (ValueError, OSError):
                    pass

        # Secondary source: Reddit's created_utc epoch field.
        if not post_date and 'created_utc' in metadata:
            try:
                post_date = datetime.fromtimestamp(metadata['created_utc']).isoformat()
            except (ValueError, OSError):
                pass

        # Last resort: stamp with "now" so the gallery always has a date.
        if not post_date:
            post_date = datetime.now().isoformat()

        # Extract title
        title = metadata.get('title', metadata.get('description', ''))

        # Build source URL
        subreddit = metadata.get('subreddit', '')
        source_url = f"https://www.reddit.com/r/{subreddit}/comments/{reddit_post_id}" if subreddit else ''

        # First file seen for a post id fixes its title/date/source_url.
        if reddit_post_id not in posts:
            posts[reddit_post_id] = {
                'files': [],
                'title': title,
                'date': post_date,
                'source_url': source_url,
            }

        posts[reddit_post_id]['files'].append(file_path)

    return posts
|
|
|
def _is_post_processed(self, community_id: int, reddit_post_id: str) -> bool:
|
|
"""Check if a Reddit post has already been processed."""
|
|
conn = self._get_connection()
|
|
try:
|
|
cursor = conn.cursor()
|
|
cursor.execute(
|
|
"SELECT id FROM private_media_reddit_history WHERE community_id = ? AND reddit_post_id = ?",
|
|
(community_id, reddit_post_id)
|
|
)
|
|
return cursor.fetchone() is not None
|
|
finally:
|
|
conn.close()
|
|
|
|
def _ensure_reddit_tag(self, crypto) -> int:
|
|
"""Find or create a 'reddit' tag in private_gallery_tags."""
|
|
conn = self._get_connection()
|
|
try:
|
|
cursor = conn.cursor()
|
|
cursor.execute("SELECT id, encrypted_name FROM private_gallery_tags")
|
|
for row in cursor.fetchall():
|
|
try:
|
|
name = crypto.decrypt_field(row['encrypted_name'])
|
|
if name.lower() == 'reddit':
|
|
return row['id']
|
|
except Exception:
|
|
continue
|
|
|
|
# Create the tag
|
|
encrypted_name = crypto.encrypt_field('Reddit')
|
|
cursor.execute('''
|
|
INSERT INTO private_gallery_tags (encrypted_name, color)
|
|
VALUES (?, '#ff4500')
|
|
''', (encrypted_name,))
|
|
conn.commit()
|
|
tag_id = cursor.lastrowid
|
|
logger.info(f"Created 'Reddit' tag with ID {tag_id}")
|
|
return tag_id
|
|
finally:
|
|
conn.close()
|
|
|
|
    def _import_post_to_gallery(
        self, post_data: Dict, person_id: int, reddit_tag_id: int,
        crypto, subreddit: str, community_id: int, reddit_post_id: str,
        community_idx: int = 0, total_communities: int = 1, running_media_total: int = 0
    ) -> int:
        """
        Import a Reddit post's media files into the private gallery.

        Pipeline per file: SHA-256 hash -> per-person duplicate check ->
        thumbnail generation -> encryption of file and thumbnail -> DB insert.
        Afterwards the post is tagged 'reddit' and recorded in the download
        history; if nothing was imported, the empty post row is deleted again.

        Args:
            post_data: {'files', 'title', 'date', 'source_url'} as produced
                by _group_files_by_post.
            person_id: Gallery person the subreddit is mapped to.
            reddit_tag_id: ID of the 'Reddit' tag to attach to the new post.
            crypto: Helper providing encrypt_field()/encrypt_file().
            subreddit: Subreddit name (status/log messages only).
            community_id: Monitored-community row ID (history bookkeeping).
            reddit_post_id: Reddit post ID (history bookkeeping).
            community_idx: Index of this community in the current run (status UI).
            total_communities: Number of communities in the current run (status UI).
            running_media_total: Media imported earlier in this run (status UI).

        Returns:
            Number of media files successfully imported
        """
        files = post_data['files']
        title = post_data['title']
        post_date = post_data['date']
        source_url = post_data.get('source_url', '')

        if not files:
            return 0

        # Get storage path from config (falls back to the default install path)
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT value FROM private_media_config WHERE key = 'storage_path'")
            row = cursor.fetchone()
            storage_path = Path(row['value']) if row else Path('/opt/immich/private')
        finally:
            conn.close()

        data_path = storage_path / 'data'
        thumbs_path = storage_path / 'thumbs'
        data_path.mkdir(parents=True, exist_ok=True)
        thumbs_path.mkdir(parents=True, exist_ok=True)

        # Create a post (description and date are stored encrypted at rest)
        encrypted_desc = crypto.encrypt_field(title) if title else None
        encrypted_date = crypto.encrypt_field(post_date) if post_date else crypto.encrypt_field(datetime.now().isoformat())
        now_iso = datetime.now().isoformat()

        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT INTO private_media_posts (person_id, encrypted_description, encrypted_media_date, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?)
            ''', (person_id, encrypted_desc, encrypted_date, now_iso, now_iso))
            conn.commit()
            post_id = cursor.lastrowid
        finally:
            conn.close()

        media_count = 0
        media_ids = []  # NOTE(review): collected but not used after the loop
        total_files = len(files)

        for file_idx, file_path in enumerate(files, 1):
            try:
                # Skip files that vanished or downloaded as zero bytes
                if not file_path.exists() or file_path.stat().st_size == 0:
                    continue

                # Update status: encrypting/importing file
                self._update_status(
                    f'Encrypting file {file_idx}/{total_files} from r/{subreddit}',
                    community_idx, total_communities,
                    {'current_community': f'r/{subreddit}', 'phase': 'encrypting',
                     'current_file': file_idx, 'total_files': total_files,
                     'media_found': running_media_total + media_count}
                )

                # Calculate file hash (streamed in 64 KiB chunks)
                sha256 = hashlib.sha256()
                with open(file_path, 'rb') as f:
                    for chunk in iter(lambda: f.read(65536), b''):
                        sha256.update(chunk)
                file_hash = sha256.hexdigest()

                # Check for duplicates (scoped by person)
                conn = self._get_connection()
                try:
                    cursor = conn.cursor()
                    cursor.execute(
                        'SELECT id FROM private_media WHERE file_hash = ? AND person_id = ?',
                        (file_hash, person_id)
                    )
                    if cursor.fetchone():
                        logger.debug(f"Duplicate file skipped: {file_path.name}")
                        continue
                finally:
                    conn.close()

                # Get file info (type/mime/dimensions/duration)
                file_info = self._get_file_info(file_path)
                file_size = file_path.stat().st_size

                # Compute perceptual hash (used later for near-duplicate dedup)
                perceptual_hash = self._compute_perceptual_hash(file_path)

                # Generate storage ID
                storage_id = str(uuid.uuid4())

                # Generate thumbnail (plaintext temp file, encrypted below)
                temp_thumb = Path(tempfile.gettempdir()) / f"pg_thumb_{storage_id}.jpg"
                self._generate_thumbnail(file_path, temp_thumb, file_info['file_type'])

                # Encrypt the file
                encrypted_file = data_path / f"{storage_id}.enc"
                if not crypto.encrypt_file(file_path, encrypted_file):
                    logger.error(f"Encryption failed for {file_path.name}")
                    continue

                # Encrypt thumbnail, then best-effort removal of the plaintext temp
                if temp_thumb.exists():
                    encrypted_thumb = thumbs_path / f"{storage_id}.enc"
                    crypto.encrypt_file(temp_thumb, encrypted_thumb)
                    try:
                        temp_thumb.unlink()
                    except Exception:
                        pass

                # Insert media record
                encrypted_filename = crypto.encrypt_field(file_path.name)
                encrypted_source = crypto.encrypt_field(source_url)

                conn = self._get_connection()
                try:
                    cursor = conn.cursor()
                    cursor.execute('''
                        INSERT INTO private_media (
                            post_id, storage_id, encrypted_filename, encrypted_description,
                            file_hash, file_size, file_type, mime_type,
                            width, height, duration, person_id,
                            encrypted_media_date, source_type, encrypted_source_path,
                            perceptual_hash, created_at
                        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (
                        post_id,
                        storage_id,
                        encrypted_filename,
                        None,
                        file_hash,
                        file_size,
                        file_info['file_type'],
                        file_info['mime_type'],
                        file_info['width'],
                        file_info['height'],
                        file_info['duration'],
                        person_id,
                        encrypted_date,
                        'reddit',
                        encrypted_source,
                        perceptual_hash,
                        now_iso
                    ))
                    media_id = cursor.lastrowid
                    media_ids.append(media_id)
                    conn.commit()
                finally:
                    conn.close()

                media_count += 1

            except Exception as e:
                # Per-file failures must not abort the rest of the post
                logger.error(f"Failed to import {file_path.name}: {e}")
                import traceback
                logger.debug(f"Traceback: {traceback.format_exc()}")

        # Apply reddit tag to the post
        if media_count > 0:
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR IGNORE INTO private_media_post_tags (post_id, tag_id)
                    VALUES (?, ?)
                ''', (post_id, reddit_tag_id))
                conn.commit()
            finally:
                conn.close()
        else:
            # Delete the empty post (every file was a duplicate or failed)
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("DELETE FROM private_media_posts WHERE id = ?", (post_id,))
                conn.commit()
            finally:
                conn.close()

        # Record in history only if we successfully imported media
        if media_count > 0:
            conn = self._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR IGNORE INTO private_media_reddit_history
                    (community_id, reddit_post_id, media_count)
                    VALUES (?, ?, ?)
                ''', (community_id, reddit_post_id, media_count))
                conn.commit()
            finally:
                conn.close()

        if media_count > 0:
            logger.info(f"Imported {media_count} files from r/{subreddit} post {reddit_post_id}")

        return media_count
|
|
|
|
def _get_file_info(self, file_path: Path) -> Dict[str, Any]:
|
|
"""Get file type, mime type, and dimensions."""
|
|
ext = file_path.suffix.lower().lstrip('.')
|
|
mime_type, _ = mimetypes.guess_type(str(file_path))
|
|
if not mime_type:
|
|
mime_type = 'application/octet-stream'
|
|
|
|
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
|
|
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
|
|
|
|
if ext in image_exts:
|
|
file_type = 'image'
|
|
elif ext in video_exts:
|
|
file_type = 'video'
|
|
else:
|
|
file_type = 'other'
|
|
|
|
info = {
|
|
'file_type': file_type,
|
|
'mime_type': mime_type,
|
|
'width': None,
|
|
'height': None,
|
|
'duration': None
|
|
}
|
|
|
|
if file_type == 'image':
|
|
try:
|
|
from PIL import Image
|
|
with Image.open(file_path) as img:
|
|
info['width'], info['height'] = img.size
|
|
except Exception:
|
|
pass
|
|
|
|
if file_type == 'video':
|
|
try:
|
|
result = subprocess.run([
|
|
'ffprobe', '-v', 'quiet', '-print_format', 'json',
|
|
'-show_streams', '-show_format', str(file_path)
|
|
], capture_output=True, text=True, timeout=30)
|
|
if result.returncode == 0:
|
|
data = json.loads(result.stdout)
|
|
for stream in data.get('streams', []):
|
|
if stream.get('codec_type') == 'video':
|
|
info['width'] = stream.get('width')
|
|
info['height'] = stream.get('height')
|
|
break
|
|
if 'format' in data:
|
|
duration = data['format'].get('duration')
|
|
if duration:
|
|
info['duration'] = float(duration)
|
|
except Exception:
|
|
pass
|
|
|
|
return info
|
|
|
|
def _run_dedup_for_persons(self, person_ids: Set[int], crypto) -> int:
|
|
"""
|
|
Run perceptual dedup for the given person IDs, auto-deleting duplicates.
|
|
Uses the same algorithm as the dashboard's dedup scanner.
|
|
|
|
Returns total number of duplicates deleted.
|
|
"""
|
|
if not person_ids:
|
|
return 0
|
|
|
|
# Read config for threshold and storage path
|
|
conn = self._get_connection()
|
|
try:
|
|
cursor = conn.cursor()
|
|
cursor.execute("SELECT key, value FROM private_media_config WHERE key IN ('duplicate_auto_select_distance', 'storage_path')")
|
|
config = {row['key']: row['value'] for row in cursor.fetchall()}
|
|
finally:
|
|
conn.close()
|
|
|
|
threshold = int(config.get('duplicate_auto_select_distance', '2'))
|
|
storage_path = Path(config.get('storage_path', '/opt/immich/private'))
|
|
data_path = storage_path / 'data'
|
|
thumbs_path = storage_path / 'thumbs'
|
|
|
|
total_deleted = 0
|
|
|
|
for person_id in person_ids:
|
|
try:
|
|
total_deleted += self._dedup_person(person_id, crypto, threshold, data_path, thumbs_path, storage_path)
|
|
except Exception as e:
|
|
logger.error(f"Dedup failed for person {person_id}: {e}")
|
|
import traceback
|
|
logger.debug(f"Dedup traceback: {traceback.format_exc()}")
|
|
|
|
return total_deleted
|
|
|
|
    def _dedup_person(self, person_id: int, crypto, threshold: int, data_path: Path, thumbs_path: Path, storage_path: Path) -> int:
        """Run dedup for a single person. Returns number of duplicates deleted.

        Algorithm: pairwise hamming distance over stored perceptual hashes,
        union-find to form duplicate groups, keep the highest-resolution
        member of each group, delete the rest (encrypted file, thumbnail,
        and DB row), then remove any reddit-tagged posts left empty.
        """
        # Fetch all media with perceptual hashes for this person
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT id, post_id, storage_id, file_type, perceptual_hash, width, height
                FROM private_media
                WHERE post_id IN (SELECT id FROM private_media_posts WHERE person_id = ?)
                AND file_type IN ('image', 'video')
                AND perceptual_hash IS NOT NULL
                AND perceptual_hash != ''
            ''', (person_id,))
            all_media = [dict(row) for row in cursor.fetchall()]
        finally:
            conn.close()

        # Nothing to compare against with fewer than two candidates
        if len(all_media) < 2:
            return 0

        # Pre-compute integer values for fast XOR-based hamming distance
        # (hashes that fail to parse as hex are simply excluded from pairing)
        hash_ints = {}
        for m in all_media:
            try:
                hash_ints[m['id']] = int(m['perceptual_hash'], 16)
            except (ValueError, TypeError):
                pass

        # Union-Find for grouping duplicates
        parent = {m['id']: m['id'] for m in all_media}

        def find(x):
            # Iterative root lookup with path halving
            while parent[x] != x:
                parent[x] = parent[parent[x]]
                x = parent[x]
            return x

        def union(x, y):
            px, py = find(x), find(y)
            if px != py:
                parent[px] = py

        # Compare all pairs (O(n^2) over this person's hashed media)
        for i in range(len(all_media)):
            id_i = all_media[i]['id']
            if id_i not in hash_ints:
                continue
            hi = hash_ints[id_i]
            for j in range(i + 1, len(all_media)):
                id_j = all_media[j]['id']
                if id_j not in hash_ints:
                    continue
                # Popcount of XOR = number of differing hash bits
                dist = bin(hi ^ hash_ints[id_j]).count('1')
                if dist <= threshold:
                    union(id_i, id_j)

        # Group by root
        groups: Dict[int, list] = {}
        for m in all_media:
            root = find(m['id'])
            if root not in groups:
                groups[root] = []
            groups[root].append(m)

        # Filter to actual duplicate groups (size > 1)
        duplicate_groups = [g for g in groups.values() if len(g) > 1]

        if not duplicate_groups:
            return 0

        # In each group: keep highest resolution, mark rest for deletion
        to_delete = []
        for group in duplicate_groups:
            # Sort by resolution (width * height) descending, keep first
            group.sort(key=lambda m: (m['width'] or 0) * (m['height'] or 0), reverse=True)
            to_delete.extend(group[1:])  # All except the highest resolution

        if not to_delete:
            return 0

        # Delete duplicate files and DB records
        deleted = 0
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            for media in to_delete:
                storage_id = media['storage_id']

                # Delete encrypted data file
                data_file = data_path / f"{storage_id}.enc"
                if data_file.exists():
                    data_file.unlink()

                # Delete thumbnail file
                thumb_file = thumbs_path / f"{storage_id}.enc"
                if thumb_file.exists():
                    thumb_file.unlink()

                # Delete DB record
                cursor.execute('DELETE FROM private_media WHERE id = ?', (media['id'],))
                deleted += 1

            # Single commit after all deletions for this person
            conn.commit()
        finally:
            conn.close()

        # Clean up empty reddit-tagged posts
        self._cleanup_empty_reddit_posts_after_dedup(crypto, storage_path)

        logger.info(f"Dedup: deleted {deleted} duplicates across {len(duplicate_groups)} groups for person {person_id}")
        return deleted
|
|
|
|
def _cleanup_empty_reddit_posts_after_dedup(self, crypto, storage_path: Path):
|
|
"""Delete reddit-tagged posts that have no remaining media after dedup."""
|
|
try:
|
|
conn = self._get_connection()
|
|
try:
|
|
# Find the reddit tag ID
|
|
cursor = conn.cursor()
|
|
cursor.execute("SELECT id, encrypted_name FROM private_gallery_tags")
|
|
reddit_tag_id = None
|
|
for row in cursor.fetchall():
|
|
try:
|
|
name = crypto.decrypt_field(row['encrypted_name'])
|
|
if name and name.lower() == 'reddit':
|
|
reddit_tag_id = row['id']
|
|
break
|
|
except Exception:
|
|
continue
|
|
finally:
|
|
conn.close()
|
|
|
|
if reddit_tag_id is None:
|
|
return
|
|
|
|
# Find empty reddit-tagged posts
|
|
conn = self._get_connection()
|
|
try:
|
|
cursor = conn.cursor()
|
|
cursor.execute('''
|
|
SELECT p.id FROM private_media_posts p
|
|
JOIN private_media_post_tags pt ON pt.post_id = p.id
|
|
WHERE pt.tag_id = ?
|
|
AND NOT EXISTS (SELECT 1 FROM private_media m WHERE m.post_id = p.id)
|
|
''', (reddit_tag_id,))
|
|
empty_posts = [row['id'] for row in cursor.fetchall()]
|
|
finally:
|
|
conn.close()
|
|
|
|
if not empty_posts:
|
|
return
|
|
|
|
# Delete empty posts
|
|
conn = self._get_connection()
|
|
try:
|
|
cursor = conn.cursor()
|
|
for post_id in empty_posts:
|
|
cursor.execute('DELETE FROM private_media_post_tags WHERE post_id = ?', (post_id,))
|
|
cursor.execute('DELETE FROM private_media_posts WHERE id = ?', (post_id,))
|
|
conn.commit()
|
|
finally:
|
|
conn.close()
|
|
|
|
logger.info(f"Dedup cleanup: removed {len(empty_posts)} empty reddit-tagged posts")
|
|
except Exception as e:
|
|
logger.error(f"Failed to cleanup empty reddit posts after dedup: {e}")
|
|
|
|
def _compute_perceptual_hash(self, file_path: Path) -> Optional[str]:
|
|
"""Calculate perceptual hash for an image or video file."""
|
|
try:
|
|
import imagehash
|
|
from PIL import Image
|
|
except ImportError:
|
|
return None
|
|
|
|
ext = file_path.suffix.lower().lstrip('.')
|
|
image_exts = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff', 'heic', 'heif', 'avif'}
|
|
video_exts = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
|
|
|
|
pil_image = None
|
|
frame = None
|
|
frame_rgb = None
|
|
|
|
try:
|
|
if ext in video_exts:
|
|
try:
|
|
import cv2
|
|
except ImportError:
|
|
return None
|
|
cap = cv2.VideoCapture(str(file_path))
|
|
if not cap.isOpened():
|
|
return None
|
|
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
|
cap.set(cv2.CAP_PROP_POS_FRAMES, int(total_frames * 0.5))
|
|
ret, frame = cap.read()
|
|
cap.release()
|
|
if not ret or frame is None:
|
|
return None
|
|
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
|
pil_image = Image.fromarray(frame_rgb)
|
|
elif ext in image_exts:
|
|
pil_image = Image.open(file_path)
|
|
else:
|
|
return None
|
|
|
|
phash = str(imagehash.dhash(pil_image, hash_size=16))
|
|
return phash
|
|
except Exception:
|
|
return None
|
|
finally:
|
|
if pil_image is not None:
|
|
pil_image.close()
|
|
del pil_image
|
|
if frame_rgb is not None:
|
|
del frame_rgb
|
|
if frame is not None:
|
|
del frame
|
|
|
|
def _generate_thumbnail(self, file_path: Path, output_path: Path, file_type: str) -> bool:
|
|
"""Generate a thumbnail for an image or video."""
|
|
try:
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
if file_type == 'image':
|
|
from PIL import Image, ImageOps
|
|
with Image.open(file_path) as img:
|
|
img = ImageOps.exif_transpose(img)
|
|
img.thumbnail((400, 400))
|
|
if img.mode in ('RGBA', 'P'):
|
|
img = img.convert('RGB')
|
|
img.save(output_path, 'JPEG', quality=85)
|
|
return True
|
|
|
|
elif file_type == 'video':
|
|
result = subprocess.run([
|
|
'ffmpeg', '-y', '-i', str(file_path),
|
|
'-ss', '00:00:01', '-vframes', '1',
|
|
'-vf', 'scale=400:-1:force_original_aspect_ratio=decrease',
|
|
str(output_path)
|
|
], capture_output=True, timeout=30)
|
|
return result.returncode == 0 and output_path.exists()
|
|
|
|
except Exception as e:
|
|
logger.error(f"Thumbnail generation failed: {e}")
|
|
return False
|