Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

100
scripts/add-backup-profile.sh Executable file
View File

@@ -0,0 +1,100 @@
#!/bin/bash
# Add Media Downloader backup profile to Backup Central
# Run this script to create or recreate the backup profile
#
# NOTE: `set -e` is active, so any failing command aborts the script at once.
# Checking `$?` after the insert could therefore never observe a failure
# (the script would already have exited), which made the failure branch
# unreachable. The insert is now run directly as the `if` condition, where
# set -e does not apply, so the failure branch actually works.
set -e

echo "Adding Media Downloader backup profile to Backup Central..."
echo ""

# Delete existing profile if it exists (`|| true` keeps set -e from aborting
# when the database or table does not exist yet)
sqlite3 /opt/backup-central/data/backup_cache.db "DELETE FROM backup_profiles WHERE id = 'profile-media-downloader';" 2>/dev/null || true

# Insert new profile; the sqlite3 command itself is the if-condition so a
# failed insert reports an error instead of silently killing the script.
if sqlite3 /opt/backup-central/data/backup_cache.db <<'SQL'
INSERT INTO backup_profiles (
    id,
    name,
    description,
    enabled,
    created_at,
    updated_at,
    destination_type,
    destination_path,
    sources,
    schedule_enabled,
    schedule_frequency,
    schedule_time,
    retention_daily,
    retention_weekly,
    retention_monthly,
    retention_yearly,
    notify_on_success,
    notify_on_warning,
    notify_on_failure,
    notify_channels,
    advanced_settings,
    total_runs,
    success_count,
    failure_count
) VALUES (
    'profile-media-downloader',
    'Media Downloader System',
    'Daily backup of media-downloader configuration, database, and code',
    1,
    datetime('now'),
    datetime('now'),
    'local',
    '/media/backups/Ubuntu/restic-repo',
    '{"include":["/opt/media-downloader/data","/opt/media-downloader/database","/opt/media-downloader/cookies","/opt/media-downloader/sessions","/opt/media-downloader/modules","/opt/media-downloader/wrappers","/opt/media-downloader/scripts","/opt/media-downloader/web/backend","/opt/media-downloader/web/frontend/src","/opt/media-downloader/*.py","/opt/media-downloader/VERSION","/opt/media-downloader/README.md","/opt/media-downloader/requirements.txt","/opt/media-downloader/docs"],"exclude":["*.log","*.log.*","*.pyc","__pycache__","/opt/media-downloader/temp/*","/opt/media-downloader/logs/*","/opt/media-downloader/venv/*","/opt/media-downloader/.playwright/*","/opt/media-downloader/debug/*","/opt/media-downloader/database/*.db-shm","/opt/media-downloader/database/*.db-wal","*.swp","*.swo","*~",".*.swp"]}',
    1,
    'daily',
    '00:00',
    7,
    4,
    12,
    2,
    0,
    1,
    1,
    '["pushover"]',
    '{"custom_name_template":"{{version}}-{{datetime}}","auto_lock_all_backups":true}',
    0,
    0,
    0
);
SQL
then
    echo "✓ Profile added successfully"
    echo ""
    # Restart backup-central to load the profile
    echo "Restarting Backup Central service..."
    sudo systemctl restart backup-central
    sleep 2
    echo "✓ Backup Central restarted"
    echo ""
    # Verify profile was created (grep failing here aborts via set -e,
    # which is the desired behavior: a missing profile is a hard error)
    echo "Verifying profile..."
    backup-central profiles list | grep "Media Downloader"
    echo ""
    echo "╔════════════════════════════════════════════════╗"
    echo "║   Profile Added Successfully                   ║"
    echo "╠════════════════════════════════════════════════╣"
    echo "║   ID:       profile-media-downloader           ║"
    echo "║   Name:     Media Downloader System            ║"
    echo "║   Schedule: Daily at 00:00 (midnight)          ║"
    echo "║   Status:   Enabled                            ║"
    echo "╚════════════════════════════════════════════════╝"
    echo ""
    echo "To view full details:"
    echo "  backup-central profiles --info profile-media-downloader"
    echo ""
else
    echo "✗ Failed to add profile"
    exit 1
fi

View File

@@ -0,0 +1,112 @@
-- Database Performance Indexes
-- Adds indexes to frequently queried columns for improved performance
--
-- Run with: sqlite3 /opt/media-downloader/database/downloads.db < scripts/add-database-indexes.sql
--
-- All statements use IF NOT EXISTS so this script is safe to re-run.
-- NOTE: the partial indexes (WHERE ... IS NOT NULL) require SQLite 3.8.0+.
-- ============================================================================
-- Downloads Table Indexes
-- ============================================================================
-- Index on platform for filtering downloads by platform
CREATE INDEX IF NOT EXISTS idx_downloads_platform
ON downloads(platform);
-- Index on source for filtering downloads by source/username
CREATE INDEX IF NOT EXISTS idx_downloads_source
ON downloads(source);
-- Index on download_date for time-based queries (DESC for most recent first)
CREATE INDEX IF NOT EXISTS idx_downloads_download_date
ON downloads(download_date DESC);
-- Index on status for filtering by download status
CREATE INDEX IF NOT EXISTS idx_downloads_status
ON downloads(status);
-- Compound index for platform + source queries (common filter combination)
CREATE INDEX IF NOT EXISTS idx_downloads_platform_source
ON downloads(platform, source);
-- Compound index for platform + download_date (common for analytics)
CREATE INDEX IF NOT EXISTS idx_downloads_platform_date
ON downloads(platform, download_date DESC);
-- Index on filename for search queries
CREATE INDEX IF NOT EXISTS idx_downloads_filename
ON downloads(filename);
-- Partial index on media_id for duplicate detection (skips NULL rows to
-- keep the index small)
CREATE INDEX IF NOT EXISTS idx_downloads_media_id
ON downloads(media_id)
WHERE media_id IS NOT NULL;
-- Partial index on file_hash for duplicate detection
CREATE INDEX IF NOT EXISTS idx_downloads_file_hash
ON downloads(file_hash)
WHERE file_hash IS NOT NULL;
-- ============================================================================
-- Notifications Table Indexes
-- ============================================================================
-- Index on sent_at for time-based queries
CREATE INDEX IF NOT EXISTS idx_notifications_sent_at
ON notifications(sent_at DESC);
-- Index on platform for filtering notifications
CREATE INDEX IF NOT EXISTS idx_notifications_platform
ON notifications(platform);
-- Index on status for filtering by notification status
CREATE INDEX IF NOT EXISTS idx_notifications_status
ON notifications(status);
-- Compound index for platform + sent_at (common query)
CREATE INDEX IF NOT EXISTS idx_notifications_platform_sent_at
ON notifications(platform, sent_at DESC);
-- ============================================================================
-- Scheduler State Table Indexes
-- ============================================================================
-- Index on status for active task queries
CREATE INDEX IF NOT EXISTS idx_scheduler_state_status
ON scheduler_state(status);
-- Index on next_run for finding next scheduled tasks
CREATE INDEX IF NOT EXISTS idx_scheduler_state_next_run
ON scheduler_state(next_run ASC);
-- Index on platform for platform-specific queries
CREATE INDEX IF NOT EXISTS idx_scheduler_state_platform
ON scheduler_state(platform);
-- ============================================================================
-- Users Table Indexes
-- ============================================================================
-- Index on username for login queries
CREATE INDEX IF NOT EXISTS idx_users_username
ON users(username);
-- Partial index on email for lookup queries (if email is used)
CREATE INDEX IF NOT EXISTS idx_users_email
ON users(email)
WHERE email IS NOT NULL;
-- ============================================================================
-- Performance Analysis
-- ============================================================================
-- Run ANALYZE to update query planner statistics so the new indexes are
-- actually chosen by the planner
ANALYZE;
-- Display index information (informational output only; lists every
-- idx_-prefixed index, including pre-existing ones)
SELECT
    'Index Information' as info,
    name as index_name,
    tbl_name as table_name
FROM sqlite_master
WHERE type = 'index'
AND name LIKE 'idx_%'
ORDER BY tbl_name, name;

71
scripts/add_reference_face.py Executable file
View File

@@ -0,0 +1,71 @@
#!/usr/bin/env python3
"""
Add Reference Face Script
Adds a reference face encoding to the database
"""
import sys
import os
import resource
import time

# Add parent directory to path so project modules resolve when run from scripts/
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from modules.unified_database import UnifiedDatabase
from modules.face_recognition_module import FaceRecognitionModule

# Limit CPU usage at Python level to prevent system freeze
try:
    os.nice(19)  # Lowest CPU priority
except OSError:
    pass
try:
    # Limit CPU time to 120 seconds max (face recognition can be slow);
    # exceeding the limit kills the process rather than hanging the box
    resource.setrlimit(resource.RLIMIT_CPU, (120, 120))
except (OSError, ValueError):
    pass

# Small delay to reduce CPU spike
time.sleep(0.2)
def main():
    """CLI entry point: register a reference face encoding for a person.

    Expects two positional arguments (person name, image path) and exits
    non-zero on usage errors, a missing image, or a module failure.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("Usage: python3 add_reference_face.py <person_name> <image_path>")
        print("Example: python3 add_reference_face.py 'Eva Longoria' /path/to/reference_image.jpg")
        sys.exit(1)
    person_name, image_path = cli_args[0], cli_args[1]
    if not os.path.exists(image_path):
        print(f"Error: Image not found: {image_path}")
        sys.exit(1)

    print(f"Adding reference face for '{person_name}' from {image_path}")

    # Wire the face recognition module to the shared database
    db = UnifiedDatabase()
    face_module = FaceRecognitionModule(unified_db=db)

    if face_module.add_reference_face(person_name, image_path):
        print(f"✓ Successfully added reference face for '{person_name}'")
        print(f"\nCurrent reference faces:")
        for ref in face_module.get_reference_faces():
            print(f"  - {ref['person_name']} (ID: {ref['id']}, added: {ref['created_at']})")
    else:
        print(f"✗ Failed to add reference face")
        sys.exit(1)

if __name__ == "__main__":
    main()

17
scripts/api-call.sh Executable file
View File

@@ -0,0 +1,17 @@
#!/bin/bash
# Make authenticated API calls using saved token
# Usage: /opt/media-downloader/scripts/api-call.sh "/api/endpoint?params" [extra curl args]
#
# First run get-api-token.sh to get a token, then use this script.
# Example: /opt/media-downloader/scripts/api-call.sh "/api/video-queue?limit=2"
ENDPOINT="$1"
# Fail fast when no endpoint was given: avoids `shift` erroring on an empty
# argument list and avoids a pointless request to the bare root URL.
if [ -z "$ENDPOINT" ]; then
  echo "Usage: $0 \"/api/endpoint?params\" [curl options...]"
  exit 1
fi
shift
if [ ! -f /tmp/api_token.txt ]; then
  echo "No token found. Run get-api-token.sh first."
  exit 1
fi
TOKEN=$(cat /tmp/api_token.txt)
# Remaining arguments pass straight through to curl
curl -s "http://localhost:8000${ENDPOINT}" -b "auth_token=$TOKEN" "$@"

145
scripts/backfill_dimensions.py Executable file
View File

@@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""
Backfill missing dimensions (width/height) for files in file_inventory.
Uses PIL for images and ffprobe for videos.
"""
import os
import sys
from pathlib import Path

# Bootstrap PostgreSQL adapter before any sqlite3 imports, so the sqlite3
# module imported below is transparently routed to PostgreSQL
sys.path.insert(0, '/opt/media-downloader')
from modules.db_bootstrap import bootstrap_database
bootstrap_database()

import sqlite3

# Database path (routed to PostgreSQL via pgadapter)
DB_PATH = "/opt/media-downloader/database/media_downloader.db"
def get_image_dimensions(file_path: str) -> tuple:
    """Return (width, height) for an image via PIL; (None, None) on any failure."""
    try:
        from PIL import Image
        with Image.open(file_path) as handle:
            return handle.size  # PIL reports (width, height)
    except Exception as exc:
        # Unreadable/corrupt file, unsupported format, or PIL missing
        print(f"  PIL error for {file_path}: {exc}")
        return None, None
def get_video_dimensions(file_path: str) -> tuple:
    """Return (width, height) for a video via ffprobe; (None, None) on failure."""
    try:
        import subprocess
        probe = subprocess.run(
            ['ffprobe', '-v', 'error', '-select_streams', 'v:0',
             '-show_entries', 'stream=width,height', '-of', 'csv=p=0',
             file_path],
            capture_output=True, text=True, timeout=30
        )
        output = probe.stdout.strip()
        if probe.returncode == 0 and output:
            fields = output.split(',')
            if len(fields) >= 2:
                return int(fields[0]), int(fields[1])
    except Exception as exc:
        # ffprobe missing, timeout, or unparsable output
        print(f"  ffprobe error for {file_path}: {exc}")
    return None, None
def main():
    """Backfill width/height for file_inventory rows that lack them.

    Selects rows in the 'final'/'review' locations with a NULL width or
    height, probes each file on disk (PIL for images, ffprobe for videos,
    extension heuristic otherwise), and writes results back in batches.
    """
    if not os.path.exists(DB_PATH):
        print(f"Database not found: {DB_PATH}")
        sys.exit(1)
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()
    # Get count of files missing dimensions (progress denominator only)
    cursor.execute('''
        SELECT COUNT(*) FROM file_inventory
        WHERE (width IS NULL OR height IS NULL)
        AND location IN ('final', 'review')
    ''')
    total_missing = cursor.fetchone()[0]
    print(f"Found {total_missing} files with missing dimensions")
    if total_missing == 0:
        print("No files need dimension backfill!")
        conn.close()
        return
    # Process in batches to bound transaction size
    batch_size = 100
    processed = 0
    updated = 0
    errors = 0
    cursor.execute('''
        SELECT id, file_path, content_type FROM file_inventory
        WHERE (width IS NULL OR height IS NULL)
        AND location IN ('final', 'review')
    ''')
    # Separate cursor for writes so the read cursor's iteration is not
    # disturbed. NOTE(review): committing while still iterating the read
    # cursor assumes the driver tolerates it — confirm with the pgadapter layer.
    update_cursor = conn.cursor()
    batch_updates = []
    for row in cursor:
        file_id = row['id']
        file_path = row['file_path']
        content_type = row['content_type']
        # Files that vanished from disk are counted but left untouched
        if not os.path.exists(file_path):
            processed += 1
            continue
        width, height = None, None
        if content_type == 'image':
            width, height = get_image_dimensions(file_path)
        elif content_type == 'video':
            width, height = get_video_dimensions(file_path)
        else:
            # Try to determine from extension when content_type is unknown
            ext = Path(file_path).suffix.lower()
            image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif', '.webp', '.bmp', '.tiff'}
            if ext in image_exts:
                width, height = get_image_dimensions(file_path)
            else:
                width, height = get_video_dimensions(file_path)
        # Note: a legitimate 0 dimension would also be counted as an error
        # here, since the check is truthiness rather than `is not None`
        if width and height:
            batch_updates.append((width, height, file_id))
            updated += 1
        else:
            errors += 1
        processed += 1
        # Commit in batches
        if len(batch_updates) >= batch_size:
            update_cursor.executemany(
                'UPDATE file_inventory SET width = ?, height = ? WHERE id = ?',
                batch_updates
            )
            conn.commit()
            print(f"  Processed {processed}/{total_missing} files, updated {updated}, errors {errors}")
            batch_updates = []
    # Final partial batch
    if batch_updates:
        update_cursor.executemany(
            'UPDATE file_inventory SET width = ?, height = ? WHERE id = ?',
            batch_updates
        )
        conn.commit()
    print(f"\nComplete! Processed {processed} files, updated {updated} with dimensions, {errors} errors")
    conn.close()

if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""
Backfill face recognition scan results for existing files
This script looks at existing downloads and infers face recognition results:
- status='completed' and file in final destination = matched
- status='review' and file in review queue = no match
"""
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from modules.unified_database import UnifiedDatabase
from pathlib import Path
def backfill_face_recognition():
    """Backfill face recognition results from existing downloads.

    No actual scan is run; results are inferred from each download's status:
    'review' rows are recorded as non-matches, and 'completed' rows whose
    path is outside a /review folder as 100%-confidence matches. Inserted
    rows carry scan_type='backfill' so they are distinguishable from real scans.
    """
    db = UnifiedDatabase()
    print("🔄 Backfilling face recognition scan results...")
    print("=" * 70)
    with db.get_connection() as conn:
        cursor = conn.cursor()
        # Get all downloads with file paths
        cursor.execute('''
            SELECT id, filename, file_path, status
            FROM downloads
            WHERE file_path IS NOT NULL AND file_path != ''
        ''')
        downloads = cursor.fetchall()
        matched_count = 0
        no_match_count = 0
        skipped_count = 0
        for download in downloads:
            download_id = download['id']
            filename = download['filename']  # not used below; fetched for parity with the SELECT
            file_path = download['file_path']
            status = download['status']
            # Skip files that already have a scan row — makes re-runs idempotent
            cursor.execute('''
                SELECT id FROM face_recognition_scans
                WHERE file_path = ?
            ''', (file_path,))
            if cursor.fetchone():
                skipped_count += 1
                continue
            # Only known media extensions get scan rows
            path = Path(file_path)
            ext = path.suffix.lower()
            image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.heic', '.heif'}
            video_exts = {'.mp4', '.mov', '.webm', '.avi', '.mkv', '.flv', '.m4v'}
            if ext not in (image_exts | video_exts):
                continue  # Skip non-media files
            # Infer face recognition result from status
            if status == 'review':
                # File in review = no match
                has_match = False
                matched_person = None
                confidence = 0.0  # No match = 0% confidence
                no_match_count += 1
            elif status == 'completed' and '/review' not in file_path:
                # File in final destination = matched (assume Eva Longoria for now)
                has_match = True
                matched_person = 'Eva Longoria'
                confidence = 1.0  # Backfill assumes 100% for approved files
                matched_count += 1
            else:
                continue  # Skip uncertain cases
            # Insert retroactive scan result; face_count=0 because no real
            # detection ran for these inferred rows
            cursor.execute('''
                INSERT INTO face_recognition_scans
                (download_id, file_path, has_match, matched_person, confidence, face_count, scan_type)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', (download_id, file_path, has_match, matched_person, confidence, 0, 'backfill'))
        # Single commit: all inferred rows land atomically
        conn.commit()
    print()
    print("✅ Backfill complete!")
    print(f"  Matched: {matched_count}")
    print(f"  No match: {no_match_count}")
    print(f"  Skipped (already scanned): {skipped_count}")
    print("=" * 70)
    db.close()

if __name__ == "__main__":
    backfill_face_recognition()

View File

@@ -0,0 +1,351 @@
#!/usr/bin/env python3
"""Backfill missing Instagram posts using authenticated browser cookies.
Paginates through the full timeline via /api/v1/feed/user/ and inserts
any posts missing from paid_content_posts. Uses Edge browser fingerprint
and the cookies stored in the instagram_browser scraper entry.
Usage:
cd /opt/media-downloader
./venv/bin/python3 -u scripts/backfill_ig_posts.py --creator-id 101
./venv/bin/python3 -u scripts/backfill_ig_posts.py --creator-id 110
"""
import argparse
import json
import sys
import time
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Bootstrap DB (pgadapter) — module-level import activates monkey-patching
import modules.db_bootstrap # noqa: F401
import sqlite3 # routed to PostgreSQL via pgadapter
from curl_cffi.requests import Session as CurlSession
from datetime import datetime
PER_PAGE = 33  # items requested per feed API page
SLEEP_BETWEEN = 2.0  # seconds between API calls
def load_cookies(conn):
    """Load the instagram_browser cookie list from the scrapers table.

    Exits the process when no cookies or no non-empty sessionid are present,
    since the backfill cannot proceed unauthenticated.
    """
    cur = conn.cursor()
    cur.execute("SELECT cookies_json FROM scrapers WHERE id = 'instagram_browser'")
    record = cur.fetchone()
    if record is None or not record[0]:
        print("ERROR: No cookies found in instagram_browser scraper")
        sys.exit(1)
    cookie_list = json.loads(record[0])
    if not any(c.get('name') == 'sessionid' and c.get('value') for c in cookie_list):
        print("ERROR: No sessionid in cookies")
        sys.exit(1)
    return cookie_list
def save_cookies(conn, session):
    """Persist the session's current cookie jar back to the scrapers row.

    No-op when the jar is empty, so a failed session never clobbers the
    stored cookies with nothing.
    """
    snapshot = [
        {'name': c.name, 'value': c.value, 'domain': c.domain or '.instagram.com'}
        for c in session.cookies.jar
    ]
    if not snapshot:
        return
    cur = conn.cursor()
    cur.execute(
        "UPDATE scrapers SET cookies_json = ?, cookies_updated_at = ? WHERE id = 'instagram_browser'",
        (json.dumps(snapshot), datetime.now().isoformat())
    )
    conn.commit()
def load_known_post_ids(conn, creator_id):
    """Return the set of post_ids already stored for this creator."""
    cur = conn.cursor()
    cur.execute("SELECT post_id FROM paid_content_posts WHERE creator_id = ?", (creator_id,))
    return {record[0] for record in cur.fetchall()}
def lookup_ig_user_id(session, username):
    """Look up Instagram user ID from username using authenticated session.

    Exits on a non-200 response since nothing downstream can work without
    the numeric user ID.
    """
    resp = session.get(
        f'https://www.instagram.com/api/v1/users/web_profile_info/?username={username}',
        timeout=10
    )
    if resp.status_code != 200:
        print(f"ERROR: Failed to look up user ID for @{username}: HTTP {resp.status_code}")
        sys.exit(1)
    profile = resp.json()['data']['user']
    ig_user_id = profile['id']
    ig_post_count = profile['edge_owner_to_timeline_media']['count']
    print(f"Instagram user ID for @{username}: {ig_user_id} ({ig_post_count} posts)")
    return ig_user_id
def best_media_url(node):
    """Pick the highest-resolution media URL from an IG API node.

    Videos (media_type 2) prefer video_versions; everything else falls back
    to image_versions2 candidates. Returns None when nothing is available.
    """
    def _area(entry):
        return entry.get('width', 0) * entry.get('height', 0)

    if node.get('media_type', 1) == 2 and node.get('video_versions'):
        return max(node['video_versions'], key=_area).get('url', '')
    images = node.get('image_versions2', {}).get('candidates', [])
    if images:
        return max(images, key=_area).get('url', '')
    return None
def node_to_post_row(node):
    """Convert an IG API node to DB row data; None when the node is unusable."""
    shortcode = node.get('code', '')
    if not shortcode:
        return None

    ts = node.get('taken_at', 0)
    published = datetime.fromtimestamp(ts).isoformat() if ts else None

    cap = node.get('caption')
    text = cap.get('text', '') if isinstance(cap, dict) else ''

    # Media URLs: carousels (type 8) expand into children, otherwise a
    # single best URL from the node itself.
    if node.get('media_type', 1) == 8 and node.get('carousel_media'):
        children = node['carousel_media']
    else:
        children = [node]
    urls = []
    for child in children:
        candidate = best_media_url(child)
        if candidate:
            urls.append(candidate)
    if not urls:
        return None

    # Tagged users, deduplicated in first-seen order: top-level node first,
    # then every carousel child.
    tagged = []
    for holder in [node, *(node.get('carousel_media') or [])]:
        for tag in (holder.get('usertags') or {}).get('in', []):
            handle = (tag.get('user') or {}).get('username')
            if handle and handle not in tagged:
                tagged.append(handle)

    return {
        'post_id': shortcode,
        'published_at': published,
        'content': text,
        'srcs': urls,
        'attachment_count': len(urls),
        'is_pinned': 1 if node.get('timeline_pinned_user_ids') else 0,
        'tagged_users': tagged,
    }
def insert_post(conn, creator_id, post_data):
    """Insert a post + attachments into the DB.

    Writes one paid_content_posts row, one paid_content_attachments row per
    media URL, and the post's tagged users, committing once at the end so
    the post and its children land together.
    """
    cursor = conn.cursor()
    now = datetime.now().isoformat()
    cursor.execute(
        """INSERT INTO paid_content_posts
        (creator_id, post_id, title, content, published_at, added_at,
        has_attachments, attachment_count, downloaded, is_pinned)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (creator_id, post_data['post_id'], None, post_data['content'],
         post_data['published_at'], now,
         1 if post_data['attachment_count'] > 0 else 0,
         post_data['attachment_count'], False, post_data['is_pinned'])
    )
    # Get the inserted post's ID (re-queried rather than using lastrowid;
    # the connection is routed to PostgreSQL via pgadapter)
    cursor.execute(
        "SELECT id FROM paid_content_posts WHERE creator_id = ? AND post_id = ?",
        (creator_id, post_data['post_id'])
    )
    row = cursor.fetchone()
    if not row:
        # Insert apparently didn't land; bail without attachments (uncommitted)
        return
    db_post_id = row[0]
    # Insert attachments; extension is inferred from the URL path with the
    # query string stripped first so signed-URL params don't confuse the check
    for idx, src_url in enumerate(post_data['srcs']):
        ext = '.mp4' if '.mp4' in src_url.split('?')[0] else '.jpg'
        file_type = 'video' if ext == '.mp4' else 'image'
        name = f"{post_data['post_id']}_{idx}{ext}"
        cursor.execute(
            """INSERT INTO paid_content_attachments
            (post_id, attachment_index, name, file_type, extension, server_path, download_url, status)
            VALUES (?, ?, ?, ?, ?, ?, ?, 'pending')""",
            (db_post_id, idx, name, file_type, ext,
             f"https://www.instagram.com/p/{post_data['post_id']}/?img_index={idx + 1}",
             src_url)
        )
    # Insert tagged users; ON CONFLICT ... DO NOTHING keeps re-runs idempotent
    for uname in post_data.get('tagged_users', []):
        cursor.execute(
            """INSERT INTO paid_content_post_tagged_users (post_id, username, created_at)
            VALUES (?, ?, ?)
            ON CONFLICT (post_id, username) DO NOTHING""",
            (db_post_id, uname, now)
        )
    conn.commit()
def create_session(cookie_list):
    """Build a curl_cffi session with an Edge fingerprint and the stored cookies."""
    edge_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
        'X-IG-App-ID': '936619743392459',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.instagram.com/',
        'Origin': 'https://www.instagram.com',
        'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'Sec-CH-UA-Mobile': '?0',
        'Sec-CH-UA-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    }
    sess = CurlSession(impersonate='edge101')
    sess.headers.update(edge_headers)
    # Replay every stored cookie that has both a name and a value
    for cookie in cookie_list:
        cname = cookie.get('name', '')
        cvalue = cookie.get('value', '')
        if cname and cvalue:
            sess.cookies.set(cname, cvalue, domain=cookie.get('domain', '.instagram.com'))
    return sess
def main():
    """Paginate a creator's Instagram feed and insert any posts missing from the DB.

    Resolves the creator row from --creator-id, authenticates with the
    stored instagram_browser cookies, then walks /api/v1/feed/user/ pages
    via the max_id cursor, inserting unknown posts until the feed is
    exhausted, the session expires, or three consecutive errors occur.
    """
    parser = argparse.ArgumentParser(description='Backfill missing Instagram posts')
    parser.add_argument('--creator-id', type=int, required=True, help='Paid content creator ID')
    args = parser.parse_args()
    # 'media_downloader' is a logical name routed to PostgreSQL via pgadapter
    conn = sqlite3.connect('media_downloader')
    # Look up creator
    cursor = conn.cursor()
    cursor.execute(
        "SELECT username FROM paid_content_creators WHERE id = ? AND platform = 'instagram'",
        (args.creator_id,)
    )
    row = cursor.fetchone()
    if not row:
        print(f"ERROR: Creator ID {args.creator_id} not found")
        sys.exit(1)
    username = row[0]
    print(f"Backfilling @{username} (creator_id={args.creator_id})")
    cookie_list = load_cookies(conn)
    session = create_session(cookie_list)
    # Look up Instagram user ID
    ig_user_id = lookup_ig_user_id(session, username)
    time.sleep(1)
    known = load_known_post_ids(conn, args.creator_id)
    print(f"Known posts in DB: {len(known)}")
    max_id = None            # pagination cursor returned by the API
    total_fetched = 0
    total_new = 0
    page = 0
    consecutive_errors = 0   # reset on any successful page; 3 strikes aborts
    while True:
        page += 1
        params = {'count': PER_PAGE}
        if max_id:
            params['max_id'] = max_id
        try:
            resp = session.get(
                f'https://www.instagram.com/api/v1/feed/user/{ig_user_id}/',
                params=params,
                timeout=15
            )
        except Exception as e:
            print(f"  Page {page}: request error: {e}")
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        if resp.status_code != 200:
            print(f"  Page {page}: HTTP {resp.status_code}")
            if resp.status_code == 401:
                # Auth is gone; retrying would only hammer the endpoint
                print("Session expired! Stopping.")
                break
            if resp.status_code == 429:
                # Rate limiting does not count against the error budget
                print("Rate limited. Waiting 60s...")
                time.sleep(60)
                continue
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        consecutive_errors = 0
        data = resp.json()
        items = data.get('items', [])
        more = data.get('more_available', False)
        next_max_id = data.get('next_max_id')
        if not items:
            print(f"  Page {page}: no items returned, done.")
            break
        total_fetched += len(items)
        page_new = 0
        for node in items:
            code = node.get('code', '')
            if not code:
                continue
            if code in known:
                continue
            post_data = node_to_post_row(node)
            if not post_data:
                continue
            insert_post(conn, args.creator_id, post_data)
            # Track locally so duplicates within this run are skipped too
            known.add(code)
            page_new += 1
            total_new += 1
        print(f"  Page {page}: {len(items)} items, {page_new} new (total: {total_fetched} fetched, {total_new} new)")
        if not more or not next_max_id:
            print("No more pages available.")
            break
        max_id = next_max_id
        time.sleep(SLEEP_BETWEEN)
    # Save updated cookies so the next run reuses a fresh session; failure
    # here is non-fatal since the backfill itself already completed
    try:
        save_cookies(conn, session)
    except Exception as e:
        print(f"Warning: failed to save cookies: {e}")
    conn.close()
    print(f"\nDone! Fetched {total_fetched} posts total, inserted {total_new} new posts for @{username}.")

if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,296 @@
#!/usr/bin/env python3
"""Backfill missing kyliejenner posts using authenticated browser cookies.
Paginates through the full timeline via /api/v1/feed/user/ and inserts
any posts missing from paid_content_posts. Uses Edge browser fingerprint
and the cookies stored in the instagram_browser scraper entry.
Usage:
cd /opt/media-downloader
./venv/bin/python3 -u scripts/backfill_kylie_posts.py
"""
import json
import sys
import time
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Bootstrap DB (pgadapter) — module-level import activates monkey-patching
import modules.db_bootstrap # noqa: F401
import sqlite3 # routed to PostgreSQL via pgadapter
from curl_cffi.requests import Session as CurlSession
from datetime import datetime
# Hard-coded target: the kyliejenner entry in paid_content_creators
CREATOR_ID = 101
USERNAME = 'kyliejenner'
IG_USER_ID = '12281817'  # Instagram numeric user ID used in the feed endpoint URL
PER_PAGE = 33  # items requested per feed API page
SLEEP_BETWEEN = 2.0  # seconds between API calls
def load_cookies(conn):
    """Read the instagram_browser cookie list out of the scrapers table.

    Exits when no cookies or no usable sessionid are stored, since the
    backfill cannot run unauthenticated.
    """
    cursor = conn.cursor()
    cursor.execute("SELECT cookies_json FROM scrapers WHERE id = 'instagram_browser'")
    row = cursor.fetchone()
    if not (row and row[0]):
        print("ERROR: No cookies found in instagram_browser scraper")
        sys.exit(1)
    jar = json.loads(row[0])
    session_ok = False
    for cookie in jar:
        if cookie.get('name') == 'sessionid' and cookie.get('value'):
            session_ok = True
            break
    if not session_ok:
        print("ERROR: No sessionid in cookies")
        sys.exit(1)
    return jar
def save_cookies(conn, session):
    """Write the session's current cookies back to the scrapers row.

    Does nothing when the session holds no cookies, so stored cookies are
    never overwritten with an empty list.
    """
    snapshot = [
        {'name': c.name, 'value': c.value, 'domain': c.domain or '.instagram.com'}
        for c in session.cookies
    ]
    if not snapshot:
        return
    cursor = conn.cursor()
    cursor.execute(
        "UPDATE scrapers SET cookies_json = ?, cookies_updated_at = ? WHERE id = 'instagram_browser'",
        (json.dumps(snapshot), datetime.now().isoformat())
    )
    conn.commit()
def load_known_post_ids(conn):
    """Return the set of post_ids already stored for the hard-coded creator."""
    cursor = conn.cursor()
    cursor.execute("SELECT post_id FROM paid_content_posts WHERE creator_id = ?", (CREATOR_ID,))
    return {record[0] for record in cursor.fetchall()}
def best_media_url(node):
    """Return the largest-area media URL on a node, or None if none exists.

    media_type 2 (video) prefers video_versions; everything else uses
    image_versions2 candidates.
    """
    area = lambda entry: entry.get('width', 0) * entry.get('height', 0)
    if node.get('media_type', 1) == 2 and node.get('video_versions'):
        return max(node['video_versions'], key=area).get('url', '')
    pool = node.get('image_versions2', {}).get('candidates', [])
    if not pool:
        return None
    return max(pool, key=area).get('url', '')
def node_to_post_row(node):
    """Convert an IG API node to DB row data, or None when it can't be stored."""
    post_code = node.get('code', '')
    if not post_code:
        return None

    ts = node.get('taken_at', 0)
    when = datetime.fromtimestamp(ts).isoformat() if ts else None

    caption = node.get('caption')
    body_text = caption.get('text', '') if isinstance(caption, dict) else ''

    # Media URLs: a type-8 carousel contributes one URL per child; any
    # other node contributes at most one URL.
    media_urls = []
    if node.get('media_type', 1) == 8 and node.get('carousel_media'):
        for item in node['carousel_media']:
            candidate = best_media_url(item)
            if candidate:
                media_urls.append(candidate)
    else:
        candidate = best_media_url(node)
        if candidate:
            media_urls.append(candidate)
    if not media_urls:
        return None

    # Tagged users deduplicated in first-seen order (top-level node first,
    # then each carousel child)
    handles = []
    for holder in [node, *(node.get('carousel_media') or [])]:
        for tag in (holder.get('usertags') or {}).get('in', []):
            handle = (tag.get('user') or {}).get('username')
            if handle and handle not in handles:
                handles.append(handle)

    return {
        'post_id': post_code,
        'published_at': when,
        'content': body_text,
        'srcs': media_urls,
        'attachment_count': len(media_urls),
        'is_pinned': 1 if node.get('timeline_pinned_user_ids') else 0,
        'tagged_users': handles,
    }
def insert_post(conn, post_data):
    """Insert a post + attachments into the DB.

    Writes one paid_content_posts row for the hard-coded CREATOR_ID plus
    one paid_content_attachments row per media URL, committing at the end
    so post and attachments land together.
    """
    cursor = conn.cursor()
    now = datetime.now().isoformat()
    cursor.execute(
        """INSERT INTO paid_content_posts
        (creator_id, post_id, title, content, published_at, added_at,
        has_attachments, attachment_count, downloaded, is_pinned)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (CREATOR_ID, post_data['post_id'], None, post_data['content'],
         post_data['published_at'], now,
         1 if post_data['attachment_count'] > 0 else 0,
         post_data['attachment_count'], False, post_data['is_pinned'])
    )
    # Get the inserted post's ID (re-queried rather than using lastrowid;
    # the connection is routed to PostgreSQL via pgadapter)
    cursor.execute(
        "SELECT id FROM paid_content_posts WHERE creator_id = ? AND post_id = ?",
        (CREATOR_ID, post_data['post_id'])
    )
    row = cursor.fetchone()
    if not row:
        # Insert apparently didn't land; bail without attachments (uncommitted)
        return
    db_post_id = row[0]
    # Insert attachments; extension inferred from the URL path with the
    # query string stripped so signed-URL params don't confuse the check
    for idx, src_url in enumerate(post_data['srcs']):
        ext = '.mp4' if '.mp4' in src_url.split('?')[0] else '.jpg'
        file_type = 'video' if ext == '.mp4' else 'image'
        name = f"{post_data['post_id']}_{idx}{ext}"
        cursor.execute(
            """INSERT INTO paid_content_attachments
            (post_id, attachment_index, name, file_type, extension, server_path, download_url, status)
            VALUES (?, ?, ?, ?, ?, ?, ?, 'pending')""",
            (db_post_id, idx, name, file_type, ext,
             f"https://www.instagram.com/p/{post_data['post_id']}/?img_index={idx + 1}",
             src_url)
        )
    conn.commit()
def main():
    """Paginate the hard-coded kyliejenner feed and insert missing posts.

    Authenticates with the stored instagram_browser cookies, walks
    /api/v1/feed/user/ pages via the max_id cursor, and inserts unknown
    posts until the feed is exhausted, the session expires, or three
    consecutive errors occur.
    """
    # 'media_downloader' is a logical name routed to PostgreSQL via pgadapter
    conn = sqlite3.connect('media_downloader')
    cookie_list = load_cookies(conn)
    known = load_known_post_ids(conn)
    print(f"Known posts in DB: {len(known)}")
    # Edge browser fingerprint: impersonated TLS plus matching headers
    session = CurlSession(impersonate='edge101')
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
        'X-IG-App-ID': '936619743392459',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.instagram.com/',
        'Origin': 'https://www.instagram.com',
        'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'Sec-CH-UA-Mobile': '?0',
        'Sec-CH-UA-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    })
    # Replay the stored cookies into the session
    for c in cookie_list:
        name = c.get('name', '')
        value = c.get('value', '')
        domain = c.get('domain', '.instagram.com')
        if name and value:
            session.cookies.set(name, value, domain=domain)
    max_id = None            # pagination cursor returned by the API
    total_fetched = 0
    total_new = 0
    page = 0
    consecutive_errors = 0   # reset on any successful page; 3 strikes aborts
    while True:
        page += 1
        params = {'count': PER_PAGE}
        if max_id:
            params['max_id'] = max_id
        try:
            resp = session.get(
                f'https://www.instagram.com/api/v1/feed/user/{IG_USER_ID}/',
                params=params,
                timeout=15
            )
        except Exception as e:
            print(f"  Page {page}: request error: {e}")
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        if resp.status_code != 200:
            print(f"  Page {page}: HTTP {resp.status_code}")
            if resp.status_code == 401:
                # Auth is gone; retrying would only hammer the endpoint
                print("Session expired! Stopping.")
                break
            if resp.status_code == 429:
                # Rate limiting does not count against the error budget
                print("Rate limited. Waiting 60s...")
                time.sleep(60)
                continue
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        consecutive_errors = 0
        data = resp.json()
        items = data.get('items', [])
        more = data.get('more_available', False)
        next_max_id = data.get('next_max_id')
        if not items:
            print(f"  Page {page}: no items returned, done.")
            break
        total_fetched += len(items)
        page_new = 0
        for node in items:
            code = node.get('code', '')
            if not code:
                continue
            if code in known:
                continue
            post_data = node_to_post_row(node)
            if not post_data:
                continue
            insert_post(conn, post_data)
            # Track locally so duplicates within this run are skipped too
            known.add(code)
            page_new += 1
            total_new += 1
        print(f"  Page {page}: {len(items)} items, {page_new} new (total: {total_fetched} fetched, {total_new} new)")
        if not more or not next_max_id:
            print("No more pages available.")
            break
        max_id = next_max_id
        time.sleep(SLEEP_BETWEEN)
    # Save updated cookies so the next run reuses a fresh session
    save_cookies(conn, session)
    conn.close()
    print(f"\nDone! Fetched {total_fetched} posts total, inserted {total_new} new posts.")

if __name__ == '__main__':
    main()

358
scripts/backfill_paid_content.py Executable file
View File

@@ -0,0 +1,358 @@
#!/usr/bin/env python3
"""
Backfill Paid Content from existing downloaded files.
This script:
1. Scans a source directory containing previously downloaded content
2. Matches files to posts/attachments in the database by ID
3. Copies files to the proper download location
4. Generates thumbnails
5. Updates database records as if they were freshly downloaded
Usage:
python3 backfill_paid_content.py /path/to/source/files --creator puffinasmr --platform fansly
"""
import argparse
import hashlib
import os
import re
import shutil
import sqlite3
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from io import BytesIO
# Add project root to path
sys.path.insert(0, '/opt/media-downloader')
from modules.unified_database import UnifiedDatabase
def get_file_hash(file_path: Path) -> str:
    """Return the SHA256 hex digest of the file at *file_path*.

    Reads in 8 KiB chunks so arbitrarily large media files can be
    hashed without loading them into memory.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as fh:
        while True:
            block = fh.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
def generate_thumbnail(file_path: Path, file_type: str) -> bytes:
    """Produce a JPEG thumbnail (max 400px) for an image or video file.

    Images go through Pillow; videos have a frame extracted at the
    1-second mark via ffmpeg. Returns the JPEG bytes, or None when the
    file type is unsupported or thumbnailing fails.
    """
    try:
        if file_type == 'image':
            from PIL import Image
            image = Image.open(file_path)
            image.thumbnail((400, 400), Image.Resampling.LANCZOS)
            # JPEG cannot store alpha/palette modes; flatten to RGB first.
            if image.mode in ('RGBA', 'P'):
                image = image.convert('RGB')
            out = BytesIO()
            image.save(out, format='JPEG', quality=85)
            return out.getvalue()
        if file_type == 'video':
            # Grab a single frame one second in, scaled to 400px wide,
            # streamed back as MJPEG on stdout.
            command = [
                'ffmpeg', '-i', str(file_path),
                '-ss', '00:00:01',
                '-vframes', '1',
                '-vf', 'scale=400:-1',
                '-f', 'image2pipe',
                '-vcodec', 'mjpeg',
                '-',
            ]
            proc = subprocess.run(command, capture_output=True, timeout=30)
            if proc.returncode == 0 and proc.stdout:
                return proc.stdout
    except Exception as e:
        print(f"    Warning: Failed to generate thumbnail: {e}")
    return None
def get_file_type(filename: str) -> str:
    """Classify *filename* as 'image', 'video', or 'other' by extension."""
    suffix = Path(filename).suffix.lower()
    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}
    video_exts = {'.mp4', '.webm', '.mov', '.avi', '.mkv', '.m4v'}
    if suffix in image_exts:
        return 'image'
    if suffix in video_exts:
        return 'video'
    return 'other'
def sanitize_filename(name: str) -> str:
    """Make *name* safe for use as a filename or directory component.

    Strips characters invalid on common filesystems, collapses runs of
    whitespace into single hyphens, and falls back to 'unnamed' when
    nothing is left.
    """
    cleaned = re.sub(r'[<>:"/\\|?*]', '', name)
    cleaned = re.sub(r'\s+', '-', cleaned.strip())
    return cleaned if cleaned else 'unnamed'
def main():
    """Backfill paid content from an existing download directory.

    Scans source post folders (named by numeric post ID), matches them to
    posts/attachments in the unified database, copies files into the
    scraper's canonical layout, generates thumbnails, and updates DB
    records as if freshly downloaded.
    """
    parser = argparse.ArgumentParser(description='Backfill paid content from existing files')
    parser.add_argument('source_dir', help='Source directory containing downloaded files')
    parser.add_argument('--creator', required=True, help='Creator username')
    parser.add_argument('--platform', required=True, help='Platform (fansly, onlyfans, etc.)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of posts to process')
    args = parser.parse_args()
    source_dir = Path(args.source_dir)
    if not source_dir.exists():
        print(f"Error: Source directory does not exist: {source_dir}")
        sys.exit(1)
    # Initialize database
    db = UnifiedDatabase()
    # Get config for base download path
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT base_download_path FROM paid_content_config WHERE id = 1")
        row = cursor.fetchone()
        base_path = Path(row[0] if row else '/opt/immich/paid')
    # Find the creator in database (case-insensitive on both fields)
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT id, username, platform, service_id
            FROM paid_content_creators
            WHERE LOWER(username) = LOWER(?) AND LOWER(platform) = LOWER(?)
        """, (args.creator, args.platform))
        creator = cursor.fetchone()
    if not creator:
        print(f"Error: Creator '{args.creator}' on platform '{args.platform}' not found in database")
        sys.exit(1)
    creator_id, username, platform, service_id = creator
    print(f"Found creator: {username} ({platform}) - ID: {creator_id}")
    # Scan source directory for post folders (folders named by numeric post ID)
    post_folders = [d for d in source_dir.iterdir() if d.is_dir() and d.name.isdigit()]
    print(f"Found {len(post_folders)} post folders in source directory")
    if args.limit:
        post_folders = post_folders[:args.limit]
        print(f"Limited to {args.limit} posts")
    # Stats accumulated across the whole run, printed in the summary below.
    stats = {
        'posts_found': 0,
        'posts_matched': 0,
        'files_copied': 0,
        'files_skipped': 0,
        'thumbnails_generated': 0,
        'errors': 0
    }
    for post_folder in post_folders:
        post_id = post_folder.name
        # Find post in database by exact post_id first
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, title, published_at
                FROM paid_content_posts
                WHERE creator_id = ? AND post_id = ?
            """, (creator_id, post_id))
            post = cursor.fetchone()
        if not post:
            # Try partial match (post_id might be truncated in DB)
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT id, title, published_at, post_id
                    FROM paid_content_posts
                    WHERE creator_id = ? AND post_id LIKE ?
                """, (creator_id, f"{post_id[:12]}%"))
                post = cursor.fetchone()
                if post:
                    post_id = post[3]  # Use the full post_id from DB
        if not post:
            print(f"  Post {post_id}: Not found in database, skipping")
            continue
        post_db_id, post_title, published_at = post[0], post[1], post[2]
        stats['posts_matched'] += 1
        # Build destination directory - matches scraper's _build_file_path structure
        # Format: /base/platform/username/date/post_id/
        post_date = published_at[:10] if published_at else 'unknown-date'
        post_dir_name = post_id  # Just post_id, no prefix
        dest_dir = base_path / platform / sanitize_filename(username) / post_date / post_dir_name
        print(f"  Post {post_id}: {post_title or '(no title)'}")
        print(f"    -> {dest_dir}")
        # Get attachments for this post
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, name, server_path, status, local_path, attachment_index
                FROM paid_content_attachments
                WHERE post_id = ?
                ORDER BY attachment_index
            """, (post_db_id,))
            attachments = cursor.fetchall()
        # Scan files in source folder
        source_files = list(post_folder.iterdir())
        source_files = [f for f in source_files if f.is_file()]
        print(f"    Found {len(source_files)} files, {len(attachments)} attachments in DB")
        for att in attachments:
            att_id, att_name, server_path, status, local_path, att_index = att
            # Skip if already completed with valid local_path
            if status == 'completed' and local_path and Path(local_path).exists():
                print(f"    [{att_index}] Already downloaded: {att_name}")
                stats['files_skipped'] += 1
                continue
            # Try to find matching file in source
            # Files might be named with attachment ID or just the filename
            matching_file = None
            # Extract potential file ID from server_path or name
            # NOTE(review): server_filename is computed but never used below
            # — either dead code or a matching strategy that was removed.
            if server_path:
                # Server path like /27/37/2737100bd05f040ae0a0b10c452be9efdf54816577e53775b96b035eac200cde.jpg
                server_filename = Path(server_path).stem  # Get hash without extension
            for src_file in source_files:
                src_stem = src_file.stem
                src_name = src_file.name
                # Match by various patterns
                if att_name and src_name == att_name:
                    matching_file = src_file
                    break
                if att_name and src_stem == Path(att_name).stem:
                    matching_file = src_file
                    break
                # Match by attachment ID in filename (Fansly style: 286246551964098560.png)
                if src_stem.isdigit():
                    # Could be attachment ID
                    if att_name and src_stem in att_name:
                        matching_file = src_file
                        break
            if not matching_file:
                # Try to match by index
                if att_index < len(source_files):
                    # Sort source files and pick by index
                    sorted_files = sorted(source_files, key=lambda f: f.name)
                    matching_file = sorted_files[att_index]
                    print(f"    [{att_index}] Matched by index: {matching_file.name}")
            if not matching_file:
                print(f"    [{att_index}] No matching file found for: {att_name}")
                stats['errors'] += 1
                continue
            # Determine file type and extension
            file_type = get_file_type(matching_file.name)
            # NOTE(review): operator precedence makes this parse as
            # (matching_file.suffix or Path(att_name).suffix) if att_name else '.bin'
            # — when att_name is empty the source file's own suffix is ignored;
            # confirm this is intended.
            ext = matching_file.suffix or Path(att_name).suffix if att_name else '.bin'
            # Build destination filename - matches scraper's _build_file_path
            # Fansly uses just media ID (unique), other platforms use index prefix
            if att_name:
                sanitized_name = sanitize_filename(att_name)
                # Ensure extension is preserved
                if not sanitized_name.lower().endswith(ext.lower()):
                    sanitized_name = Path(att_name).stem + ext
                dest_filename = sanitized_name  # Fansly: no index prefix needed
            else:
                # Fallback to source filename
                dest_filename = matching_file.name
            dest_path = dest_dir / dest_filename
            print(f"    [{att_index}] {matching_file.name} -> {dest_filename}")
            if args.dry_run:
                stats['files_copied'] += 1
                continue
            # Create destination directory
            dest_dir.mkdir(parents=True, exist_ok=True)
            # Copy file (copy2 preserves timestamps/metadata)
            try:
                shutil.copy2(matching_file, dest_path)
                stats['files_copied'] += 1
            except Exception as e:
                print(f"    Error copying file: {e}")
                stats['errors'] += 1
                continue
            # Compute file hash
            file_hash = get_file_hash(dest_path)
            file_size = dest_path.stat().st_size
            # Generate thumbnail
            thumbnail_data = generate_thumbnail(dest_path, file_type)
            if thumbnail_data:
                stats['thumbnails_generated'] += 1
            # Update database: mark the attachment as a completed download
            now = datetime.now().isoformat()
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    UPDATE paid_content_attachments
                    SET status = 'completed',
                        local_path = ?,
                        local_filename = ?,
                        file_hash = ?,
                        file_size = ?,
                        file_type = ?,
                        downloaded_at = ?,
                        thumbnail_data = ?
                    WHERE id = ?
                """, (str(dest_path), dest_filename, file_hash, file_size, file_type, now, thumbnail_data, att_id))
                conn.commit()
        # Update post downloaded status once every attachment is completed
        if not args.dry_run:
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # Check if all attachments are now completed
                cursor.execute("""
                    SELECT COUNT(*) FROM paid_content_attachments
                    WHERE post_id = ? AND status != 'completed'
                """, (post_db_id,))
                pending = cursor.fetchone()[0]
                if pending == 0:
                    cursor.execute("""
                        UPDATE paid_content_posts
                        SET downloaded = 1, download_date = ?
                        WHERE id = ?
                    """, (datetime.now().isoformat(), post_db_id))
                    conn.commit()
        stats['posts_found'] += 1
    # Print summary
    print("\n" + "=" * 50)
    print("BACKFILL SUMMARY")
    print("=" * 50)
    print(f"Posts found in source: {len(post_folders)}")
    print(f"Posts matched in DB: {stats['posts_matched']}")
    print(f"Files copied: {stats['files_copied']}")
    print(f"Files skipped (existing): {stats['files_skipped']}")
    print(f"Thumbnails generated: {stats['thumbnails_generated']}")
    print(f"Errors: {stats['errors']}")
    if args.dry_run:
        print("\n(Dry run - no changes made)")
# Script entry point.
if __name__ == '__main__':
    main()

594
scripts/backfill_press.py Normal file
View File

@@ -0,0 +1,594 @@
#!/usr/bin/env python3
"""
Backfill press articles from Google News RSS for the last year.
Google News RSS:
- 100 articles per query (cap)
- No rate limiting, no API key needed
- ~12 months of history
- Strategy: 1-week windows to stay under the 100 cap
"""
import hashlib
import json
import os
import subprocess
import sys
import time
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from urllib.parse import urlparse
# Bootstrap database
sys.path.insert(0, '/opt/media-downloader')
import modules.db_bootstrap # noqa: E402,F401
from modules.universal_logger import get_logger
logger = get_logger('PressBackfill')
# Database password for the media_downloader PostgreSQL role.
# SECURITY: a hard-coded fallback is kept for backward compatibility with
# existing deployments, but prefer setting MEDIA_DOWNLOADER_DB_PASSWORD in
# the environment rather than committing the secret to source control.
DB_PASSWORD = os.environ.get('MEDIA_DOWNLOADER_DB_PASSWORD',
                             "PNsihOXvvuPwWiIvGlsc9Fh2YmMmB")

# Number of 1-week query windows to walk back through Google News history.
WEEKS_BACK = 52

# Domains that return no content even with FlareSolverr
SKIP_DOMAINS = {
    'msn.com',
    'news.google.com',
    'imdb.com',
    'st-aug.edu',
}
def fetch_google_news_window(name: str, start_date: str, end_date: str) -> list:
    """Fetch Google News RSS articles for a specific time window.
    Returns list of dicts with: title, url, published_date, source."""
    # %22 is a URL-encoded double quote: exact-phrase search for *name*,
    # restricted to the window via Google's after:/before: operators.
    # Google News RSS caps results at ~100 items per query.
    query = f'%22{name.replace(" ", "+")}%22+after:{start_date}+before:{end_date}'
    url = f'https://news.google.com/rss/search?q={query}&hl=en&gl=US&ceid=US:en'
    # Up to 3 attempts with a 5s pause between retries.
    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            with urllib.request.urlopen(req, timeout=30) as response:
                data = response.read().decode('utf-8')
            root = ET.fromstring(data)
            articles = []
            for item in root.findall('.//item'):
                title_el = item.find('title')
                link_el = item.find('link')
                pub_el = item.find('pubDate')
                source_el = item.find('source')
                # Title and link are mandatory; skip malformed items.
                if title_el is None or link_el is None:
                    continue
                title = title_el.text or ''
                # Google News titles often end with " - Source Name", strip it
                source_name = source_el.text if source_el is not None else ''
                if source_name and title.endswith(f' - {source_name}'):
                    title = title[:-len(f' - {source_name}')].strip()
                # Parse pubDate (RFC 2822 format) into ISO 8601
                published_date = ''
                if pub_el is not None and pub_el.text:
                    try:
                        from email.utils import parsedate_to_datetime
                        dt = parsedate_to_datetime(pub_el.text)
                        published_date = dt.isoformat()
                    except Exception:
                        # Fall back to the raw header value if unparseable.
                        published_date = pub_el.text
                articles.append({
                    'title': title,
                    'url': link_el.text or '',
                    'published_date': published_date,
                    'source': source_name,
                })
            return articles
        except Exception as e:
            if attempt < 2:
                time.sleep(5)
                continue
            # Final attempt failed; report and give up on this window.
            print(f"    Error fetching Google News: {e}")
            return []
    return []
# Directory where press images are cached; files stored here are served
# back to clients through the /api/press/images/ proxy route.
PRESS_IMAGE_CACHE = '/opt/media-downloader/data/press_images'
os.makedirs(PRESS_IMAGE_CACHE, exist_ok=True)
def cache_press_image(image_url: str) -> str | None:
"""Download and cache an image locally. Returns API path."""
if not image_url:
return None
url_hash = hashlib.sha256(image_url.encode('utf-8')).hexdigest()[:16]
# Check if already cached
for ext in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
cached = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
if os.path.exists(cached) and os.path.getsize(cached) > 0:
return f"/api/press/images/{url_hash}{ext}"
# Download
try:
req = urllib.request.Request(image_url, headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Accept': 'image/*,*/*',
})
with urllib.request.urlopen(req, timeout=15) as resp:
image_data = resp.read()
if len(image_data) < 1000:
return None
except Exception:
# Try via FlareSolverr — but it can't fetch binary, so try fetching
# the page and extracting the image URL that works
return None
ext = '.jpg'
url_lower = image_url.lower()
if '.png' in url_lower:
ext = '.png'
elif '.webp' in url_lower:
ext = '.webp'
elif '.gif' in url_lower:
ext = '.gif'
cached_path = os.path.join(PRESS_IMAGE_CACHE, f"{url_hash}{ext}")
with open(cached_path, 'wb') as f:
f.write(image_data)
return f"/api/press/images/{url_hash}{ext}"
def cache_content_images(html_content: str) -> str:
    """Find all <img ...> tags in *html_content*, cache each image locally,
    and rewrite src to the /api/press/images/... proxy path.

    Img tags whose image cannot be cached are removed entirely (a missing
    image is better than a broken one). Empty/None input is returned as-is.
    """
    if not html_content:
        return html_content
    import re as _re

    def _replace_img(match):
        full_tag = match.group(0)
        src = match.group(1)
        # Already-proxied (or empty) sources are left untouched.
        if not src or src.startswith('/api/press/images/'):
            return full_tag
        cached = cache_press_image(src)
        if cached:
            return full_tag.replace(src, cached)
        return ''  # Remove img if caching failed
    # BUGFIX: the previous pattern (<img\s+src="...") only matched when src
    # was the FIRST attribute, silently skipping tags like
    # <img class="x" src="...">. Match src anywhere in the tag; the \s
    # before src avoids capturing data-src/lazy-src variants.
    return _re.sub(r'<img\b[^>]*?\ssrc="([^"]+)"[^>]*>', _replace_img, html_content)
def decode_google_news_url(google_url: str) -> str | None:
"""Decode a Google News redirect URL to the real article URL."""
if 'news.google.com' not in google_url:
return google_url
try:
from googlenewsdecoder import gnewsdecoder
result = gnewsdecoder(google_url, interval=1)
if result.get('status'):
return result['decoded_url']
except Exception:
pass
return None
def extract_content(article_url: str) -> tuple[str | None, str | None]:
    """Extract article content and og:image from the real article URL.

    Attempts a plain HTTP fetch first; when that yields no content, retries
    through FlareSolverr for bot-protected sites. Returns
    (content_html, image_url).
    """
    body, thumb = _extract_content_direct(article_url)
    if body:
        return (body, thumb)
    # Direct fetch failed or was blocked — retry via the headless browser,
    # keeping any og:image the direct attempt managed to find.
    fs_body, fs_thumb = _extract_content_flaresolverr(article_url)
    return (fs_body, fs_thumb or thumb)
def _fetch_html_flaresolverr(url: str) -> str | None:
    """Fetch page HTML through a local FlareSolverr instance.

    Returns the rendered HTML, or None on any failure or when the response
    is too short to be a real page.
    """
    try:
        import requests
        reply = requests.post('http://localhost:8191/v1', json={
            'cmd': 'request.get',
            'url': url,
            'maxTimeout': 30000
        }, timeout=45)
        body = reply.json()
        if body.get('status') == 'ok':
            page = body.get('solution', {}).get('response', '')
            # Tiny responses are challenge/error pages, not articles.
            if len(page) > 500:
                return page
    except Exception:
        pass
    return None
def _extract_content_flaresolverr(url: str) -> tuple[str | None, str | None]:
    """Extract article content using FlareSolverr as the HTML fetcher."""
    page = _fetch_html_flaresolverr(url)
    if not page:
        return (None, None)
    return _parse_article_html(page, url)
def _extract_content_direct(url: str) -> tuple[str | None, str | None]:
    """Fetch *url* directly and parse it into (content_html, image_url).

    Any network or decode failure yields (None, None) so the caller can
    fall back to FlareSolverr.
    """
    import urllib.request
    try:
        request = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        with urllib.request.urlopen(request, timeout=20) as reply:
            page = reply.read().decode('utf-8', errors='replace')
        return _parse_article_html(page, url)
    except Exception:
        return (None, None)
def _parse_article_html(raw_html: str, url: str) -> tuple[str | None, str | None]:
    """Parse raw HTML into article content and og:image. Returns (content_html, image_url).

    Pipeline: (1) pull og:image / twitter:image for the thumbnail,
    (2) run readability to isolate the article body, (3) keep only
    substantial paragraphs/headings/lists sanitized through bleach,
    (4) interleave article images, (5) reject low-quality or boilerplate
    output. Any exception yields (None, None).
    """
    import re
    # NOTE(review): urljoin is imported but not used in this function.
    from urllib.parse import urljoin
    try:
        from readability import Document
        from bs4 import BeautifulSoup
        # Extract og:image for thumbnail
        og_soup = BeautifulSoup(raw_html, 'html.parser')
        og_image = None
        og_tag = og_soup.find('meta', property='og:image')
        if og_tag and og_tag.get('content'):
            og_image = og_tag['content']
        if not og_image:
            # Fall back to the Twitter card image.
            tw_tag = og_soup.find('meta', attrs={'name': 'twitter:image'})
            if tw_tag and tw_tag.get('content'):
                og_image = tw_tag['content']
        import bleach
        doc = Document(raw_html, url=url)
        content_html = doc.summary()
        # Readability produced effectively nothing — bail with just the image.
        if not content_html or len(content_html.strip()) < 50:
            return (None, og_image)
        reader_soup = BeautifulSoup(content_html, 'html.parser')
        # Boilerplate lines (bylines, share prompts, subscription nags).
        junk_text_re = re.compile(
            r'(^published|^updated|^modified|^posted|^by\s|^written by|^photo:|^image:|^credit:|'
            r'share or comment|share this article|comment on this|follow us on|'
            r'sign up for|subscribe to|have you got a story|tips@|email us)',
            re.I
        )
        # Only these inline tags/attributes survive bleach sanitization.
        inline_tags = ['b', 'i', 'em', 'strong', 'a', 'br']
        inline_attrs = {'a': ['href']}
        html_parts = []
        for el in reader_soup.find_all(['p', 'h2', 'h3', 'h4', 'blockquote', 'ul', 'ol']):
            text = el.get_text(strip=True)
            # Drop trivial fragments and known boilerplate.
            if len(text) < 30:
                continue
            if junk_text_re.search(text):
                continue
            tag = el.name
            inner = bleach.clean(
                el.decode_contents(), tags=inline_tags,
                attributes=inline_attrs, strip=True, protocols=['http', 'https']
            ).strip()
            if not inner:
                continue
            if tag == 'p':
                html_parts.append(f'<p>{inner}</p>')
            elif tag in ('h2', 'h3', 'h4'):
                html_parts.append(f'<{tag}>{inner}</{tag}>')
            elif tag == 'blockquote':
                html_parts.append(f'<blockquote><p>{inner}</p></blockquote>')
            elif tag in ('ul', 'ol'):
                items = []
                for li in el.find_all('li', recursive=False):
                    li_inner = bleach.clean(
                        li.decode_contents(), tags=inline_tags,
                        attributes=inline_attrs, strip=True, protocols=['http', 'https']
                    ).strip()
                    if li_inner and len(li.get_text(strip=True)) > 10:
                        items.append(f'<li>{li_inner}</li>')
                if items:
                    html_parts.append(f'<{tag}>{"".join(items)}</{tag}>')
        # Images from readability (absolute URLs only, de-duplicated,
        # skipping obvious chrome like logos/pixels/avatars)
        junk_img_re = re.compile(r'(logo|icon|pixel|spacer|blank|1x1|svg|avatar|spinner|/ct/|cts\.businesswire)', re.I)
        seen_srcs = set()
        article_images = []
        for img in reader_soup.find_all('img'):
            src = img.get('src', '')
            if src and src.startswith(('http://', 'https://')) and src not in seen_srcs:
                if junk_img_re.search(src):
                    continue
                seen_srcs.add(src)
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')
        # If readability found no images, grab first real image from original HTML
        if not article_images:
            orig_soup = BeautifulSoup(raw_html, 'html.parser')
            for noise in orig_soup.find_all(['script', 'style', 'nav', 'footer', 'header',
                                             'aside', 'form', 'noscript', 'svg']):
                noise.decompose()
            for img in orig_soup.find_all('img'):
                # Lazy-load attributes take precedence over plain src.
                src = (img.get('data-src') or img.get('data-lazy-src') or
                       img.get('data-original') or img.get('src') or '')
                if not src or not src.startswith(('http://', 'https://')):
                    continue
                src_lower = src.lower()
                if any(x in src_lower for x in ('logo', 'icon', 'pixel', 'spacer', 'blank',
                                                '1x1', 'svg', 'avatar', 'spinner', '/ct/')):
                    continue
                alt = (img.get('alt', '') or '').strip()
                article_images.append(f'<img src="{bleach.clean(src)}" alt="{bleach.clean(alt)}">')
                break  # Only first real image
        # Merge text + images: distribute images evenly between paragraphs.
        if article_images and html_parts:
            text_count = len(html_parts)
            img_count = len(article_images)
            interval = max(1, text_count // (img_count + 1))
            merged = []
            img_idx = 0
            for i, part in enumerate(html_parts):
                merged.append(part)
                if img_idx < img_count and (i + 1) % interval == 0:
                    merged.append(article_images[img_idx])
                    img_idx += 1
            # Any images left over get appended at the end.
            while img_idx < img_count:
                merged.append(article_images[img_idx])
                img_idx += 1
            html_parts = merged
        elif article_images and not html_parts:
            html_parts = article_images
        # Last resort: fall back to bare paragraphs from the text content.
        if not html_parts:
            text = reader_soup.get_text(separator='\n\n', strip=True)
            if text:
                for para in text.split('\n\n'):
                    para = para.strip()
                    if len(para) > 30:
                        html_parts.append(f'<p>{bleach.clean(para)}</p>')
        if not html_parts:
            return (None, og_image)
        # Quality check: drop parts whose average "word" length is absurd
        # (a heuristic for minified JS/CSS or tracking blobs leaking through).
        from bs4 import BeautifulSoup as BS
        clean_parts = []
        for part in html_parts:
            part_soup = BS(part, 'html.parser')
            part_text = part_soup.get_text(strip=True)
            if len(part_text) > 100:
                words = part_text.split()
                avg_word_len = len(part_text) / max(len(words), 1)
                if avg_word_len > 12:
                    continue
            clean_parts.append(part)
        if not clean_parts:
            return (None, og_image)
        result = '\n'.join(clean_parts)
        plain_text = BS(result, 'html.parser').get_text(separator=' ', strip=True)
        # Phrases that indicate site chrome (carousels, cookie banners,
        # state-picker widgets) rather than article prose.
        garbage_re = re.compile(
            r'(use (left|right|escape)|arrow keys|navigate between|'
            r'sign (in|up) with|we won.t post|social account|'
            r'accept cookies|cookie policy|privacy policy|terms of (use|service)|'
            r'AlabamaAlaska|CaliforniaColorado|United States of America)',
            re.I
        )
        if len(plain_text) < 200 or garbage_re.search(plain_text):
            return (None, og_image)
        return (result, og_image)
    except Exception:
        return (None, None)
def main():
    """Backfill press articles for every configured celebrity.

    Walks Google News in 1-week windows back WEEKS_BACK weeks, dedupes
    against existing URL hashes and (celebrity, title) pairs, extracts and
    caches article content/images, then inserts into press_articles.
    """
    # Get configured celebrities (JSON array of ids stored in press_config)
    env = os.environ.copy()
    env['PGPASSWORD'] = DB_PASSWORD
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAc', "SELECT celebrity_ids FROM press_config WHERE id = 1"],
        capture_output=True, text=True, env=env
    )
    celebrity_ids = json.loads(result.stdout.strip()) if result.stdout.strip() else []
    if not celebrity_ids:
        print("No celebrities configured in press_config")
        return
    # Get celebrity names
    # NOTE(review): ids come from our own DB as ints, so the f-string SQL
    # below is not an injection vector in practice; still worth confirming
    # celebrity_ids only ever contains integers.
    placeholders = ','.join(str(i) for i in celebrity_ids)
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAF', '|', '-c', f"SELECT id, name FROM celebrity_profiles WHERE id IN ({placeholders})"],
        capture_output=True, text=True, env=env
    )
    celebrities = []
    for line in result.stdout.strip().splitlines():
        if '|' in line:
            parts = line.split('|')
            celebrities.append({'id': int(parts[0]), 'name': parts[1].strip()})
    if not celebrities:
        print("No celebrities found")
        return
    # Get existing URL hashes for dedup
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAc', "SELECT url_hash FROM press_articles"],
        capture_output=True, text=True, env=env
    )
    existing_hashes = set(line.strip() for line in result.stdout.strip().splitlines() if line.strip())
    print(f"Existing articles: {len(existing_hashes)}")
    # Also get existing titles per celebrity for dedup
    result = subprocess.run(
        ['psql', '-h', 'localhost', '-U', 'media_downloader', '-d', 'media_downloader',
         '-tAF', '|', '-c', "SELECT celebrity_id, title FROM press_articles"],
        capture_output=True, text=True, env=env
    )
    existing_titles = set()
    for line in result.stdout.strip().splitlines():
        if '|' in line:
            # Split on the first '|' only — titles may themselves contain '|'.
            parts = line.split('|', 1)
            existing_titles.add((parts[0].strip(), parts[1].strip()))
    now = datetime.now()
    total_new = 0
    total_fetched = 0
    for celeb in celebrities:
        celeb_id = celeb['id']
        celeb_name = celeb['name']
        print(f"\n{'='*60}")
        print(f"Backfilling: {celeb_name} (id={celeb_id})")
        print(f"{'='*60}")
        celeb_new = 0
        # Query in 1-week windows going back
        for week in range(WEEKS_BACK):
            end_dt = now - timedelta(weeks=week)
            start_dt = now - timedelta(weeks=week + 1)
            start_str = start_dt.strftime('%Y-%m-%d')
            end_str = end_dt.strftime('%Y-%m-%d')
            week_label = f"Week -{week+1} ({start_dt.strftime('%b %d')} - {end_dt.strftime('%b %d')})"
            print(f"\n  {week_label}...", end='', flush=True)
            articles = fetch_google_news_window(celeb_name, start_str, end_str)
            total_fetched += len(articles)
            if not articles:
                print(f" no articles")
                continue
            # Warn if we hit the 100 cap (may be missing articles)
            cap_warning = " [HIT 100 CAP]" if len(articles) >= 100 else ""
            print(f" {len(articles)} found{cap_warning}", flush=True)
            week_new = 0
            for article in articles:
                google_url = article.get('url', '')
                if not google_url:
                    continue
                title = article.get('title', '').strip()
                # Title-level dedup keyed on (celebrity_id, title).
                if title and (str(celeb_id), title) in existing_titles:
                    continue
                # Only keep articles where celeb name appears in the title
                if not title or celeb_name.lower() not in title.lower():
                    continue
                # Decode Google News URL to real article URL
                article_url = decode_google_news_url(google_url)
                if not article_url:
                    continue
                # Skip domains that are JS-rendered or block scrapers
                parsed_check = urlparse(article_url)
                host = parsed_check.netloc.lower()
                # Check if host or any parent domain is in SKIP_DOMAINS
                if any(host == d or host.endswith('.' + d) for d in SKIP_DOMAINS):
                    continue
                url_hash = hashlib.sha256(article_url.encode('utf-8')).hexdigest()
                if url_hash in existing_hashes:
                    continue
                # Parse domain from real URL
                parsed = urlparse(article_url)
                domain = parsed.netloc.replace('www.', '')
                published_date = article.get('published_date', '')
                # NOTE(review): 'source' is assigned but not used below.
                source = article.get('source', '')
                # Extract content and og:image (with rate limiting to be polite)
                content, og_image = extract_content(article_url)
                # Cache all inline images in the content to local proxy
                if content:
                    content = cache_content_images(content)
                if content:
                    # Snippet = first 300 chars of tag-stripped content.
                    import re as _re3
                    snippet = _re3.sub(r'<[^>]+>', ' ', content)
                    snippet = ' '.join(snippet.split())[:300]
                else:
                    snippet = title[:300] if title else ''
                # Cache the og:image locally, fall back to first inline image
                image_url = cache_press_image(og_image) if og_image else None
                if not image_url and content:
                    import re as _re2
                    m = _re2.search(r'<img\s+src="(/api/press/images/[^"]+)"', content)
                    if m:
                        image_url = m.group(1)
                time.sleep(0.5)
                # Insert using parameterized query via psycopg2
                # NOTE(review): opens a fresh connection per article — fine for
                # a one-off backfill, but reuse a connection if this gets hot.
                import psycopg2
                try:
                    pg_conn = psycopg2.connect(
                        host='localhost', user='media_downloader',
                        password=env.get('PGPASSWORD', ''), dbname='media_downloader'
                    )
                    pg_cur = pg_conn.cursor()
                    pg_cur.execute("""INSERT INTO press_articles
                        (celebrity_id, title, url, url_hash, domain, published_date,
                         image_url, language, country, article_content, snippet, notified, read)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, 'en', '', %s, %s, 1, 0)
                        ON CONFLICT DO NOTHING""",
                        (celeb_id, title, article_url, url_hash, domain,
                         published_date, image_url or '', content, snippet))
                    # rowcount is 0 when ON CONFLICT suppressed the insert.
                    inserted = pg_cur.rowcount > 0
                    pg_conn.commit()
                    pg_cur.close()
                    pg_conn.close()
                except Exception as db_err:
                    print(f"    DB error: {db_err}")
                    inserted = False
                if inserted:
                    week_new += 1
                    # Keep in-memory dedup sets current for later windows.
                    existing_hashes.add(url_hash)
                    existing_titles.add((str(celeb_id), title))
            if week_new > 0:
                print(f"    Added {week_new} new articles")
            celeb_new += week_new
            # Small delay between queries to be polite
            time.sleep(1)
        total_new += celeb_new
        print(f"\n  {celeb_name}: {celeb_new} new articles added")
    print(f"\n{'='*60}")
    print(f"DONE: Fetched {total_fetched} total, added {total_new} new articles")
    print(f"{'='*60}")
# Script entry point.
if __name__ == '__main__':
    main()

404
scripts/backfill_tagged_users.py Executable file
View File

@@ -0,0 +1,404 @@
#!/opt/media-downloader/venv/bin/python3 -u
"""
One-off script to backfill tagged users for existing Instagram posts.
Tries imginn.com first (no rate limits), then falls back to instagram.com
for posts where imginn didn't have tag data.
Tracks checked posts in a local file so re-runs skip already-checked posts
without polluting the database.
Usage:
python3 /opt/media-downloader/scripts/backfill_tagged_users.py [--creator-id N] [--limit N] [--dry-run]
"""
import argparse
import json
import os
import random
import re
import sys
import time
sys.path.insert(0, '/opt/media-downloader')
import modules.db_bootstrap # noqa: F401 — triggers pgadapter monkey-patch
import sqlite3
# Local file tracking post DB IDs that have already been checked, so
# re-runs skip them without writing marker rows into the database.
CHECKED_FILE = '/tmp/backfill_checked_posts.txt'
# SECURITY NOTE(review): SOCKS proxy credentials are hard-coded in source;
# consider loading them from the environment or a secrets store instead.
NORDVPN_CREDS = 'Dc9mgrpJnFnkTtc5iQkGNwLM:fKd2ZEjBUJ3YDQ5hhcoTKsnW'
# Pool of NordVPN SOCKS5 endpoints, rotated on rate limits / persistent errors.
NORDVPN_SERVERS = [
    'us.socks.nordhold.net',
    'nl.socks.nordhold.net',
    'se.socks.nordhold.net',
    'amsterdam.nl.socks.nordhold.net',
    'atlanta.us.socks.nordhold.net',
    'chicago.us.socks.nordhold.net',
    'dallas.us.socks.nordhold.net',
    'los-angeles.us.socks.nordhold.net',
    'new-york.us.socks.nordhold.net',
    'stockholm.se.socks.nordhold.net',
]
# Index of the currently active server within NORDVPN_SERVERS.
_proxy_index = 0
def get_current_proxy():
    """Return the proxies dict for the currently selected server (no rotation)."""
    host = NORDVPN_SERVERS[_proxy_index % len(NORDVPN_SERVERS)]
    return {'https': f'socks5://{NORDVPN_CREDS}@{host}:1080'}
def rotate_proxy():
    """Advance to the next SOCKS server and return its proxies dict.

    Intended to be called when a request is rate-limited or keeps failing.
    """
    global _proxy_index
    _proxy_index += 1
    host = NORDVPN_SERVERS[_proxy_index % len(NORDVPN_SERVERS)]
    print(f'  Rotating proxy -> {host}', flush=True)
    return {'https': f'socks5://{NORDVPN_CREDS}@{host}:1080'}
def get_next_proxy():
    """Legacy: rotate and return. Only used in retry path."""
    # Thin alias kept for backward compatibility with older call sites.
    return rotate_proxy()
def load_checked_posts():
    """Return the set of already-checked post DB IDs from the tracking file.

    Missing file means a fresh run: returns an empty set. Non-numeric
    lines are ignored.
    """
    if not os.path.exists(CHECKED_FILE):
        return set()
    checked = set()
    with open(CHECKED_FILE, 'r') as fh:
        for raw_line in fh:
            token = raw_line.strip()
            if token.isdigit():
                checked.add(int(token))
    return checked
def save_checked_post(db_id):
    """Append one checked post DB ID to the local tracking file."""
    with open(CHECKED_FILE, 'a') as fh:
        fh.write(str(db_id) + '\n')
def get_session():
    """Build a curl_cffi session that impersonates a Chrome browser.

    Probes several impersonation targets for compatibility across
    curl_cffi versions, falling back to a plain session, then applies
    Instagram-appropriate default headers.
    """
    from curl_cffi.requests import Session
    session = None
    # Older/newer curl_cffi builds support different impersonation targets.
    for target in ("chrome131", "chrome136", "chrome"):
        try:
            session = Session(impersonate=target)
            break
        except Exception:
            continue
    if session is None:
        session = Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://www.instagram.com/',
    })
    return session
def get_posts_without_tags(conn, creator_id=None, limit=0):
    """Return Instagram posts with no tagged-user rows, oldest first.

    Excludes soft-deleted posts and story/highlight pseudo-posts; the
    anti-join on paid_content_post_tagged_users keeps only untagged posts.
    Returns a list of (post_db_id, post_id, creator_id, username) tuples.
    """
    sql = """
        SELECT p.id, p.post_id, p.creator_id, c.username
        FROM paid_content_posts p
        JOIN paid_content_creators c ON p.creator_id = c.id
        LEFT JOIN paid_content_post_tagged_users tu ON tu.post_id = p.id
        WHERE c.service_id = 'instagram'
          AND p.deleted_at IS NULL
          AND p.post_id NOT LIKE 'story_%%'
          AND p.post_id NOT LIKE 'highlight_%%'
          AND tu.post_id IS NULL
    """
    bind_values = []
    if creator_id:
        sql += " AND p.creator_id = ?"
        bind_values.append(creator_id)
    sql += " ORDER BY p.id ASC"
    if limit:
        sql += f" LIMIT {limit}"
    cur = conn.cursor()
    cur.execute(sql, bind_values)
    return [(r[0], r[1], r[2], r[3]) for r in cur.fetchall()]
def save_tagged_users(conn, post_db_id, usernames):
    """Insert tagged usernames for a post, ignoring duplicates.

    Uses ON CONFLICT DO NOTHING so re-running the backfill never creates
    duplicate (post, username) rows; commits once at the end.
    """
    timestamp = time.strftime('%Y-%m-%dT%H:%M:%S')
    cur = conn.cursor()
    for tagged_name in usernames:
        cur.execute("""
            INSERT INTO paid_content_post_tagged_users (post_id, username, created_at)
            VALUES (?, ?, ?)
            ON CONFLICT DO NOTHING
        """, (post_db_id, tagged_name, timestamp))
    conn.commit()
def extract_usertags_from_item(item):
    """Collect unique tagged usernames from an Instagram media item dict.

    Looks at the item's own 'usertags' first, then each carousel child's,
    preserving first-seen order and tolerating missing/None sub-dicts.
    """
    found = []
    containers = [item] + list(item.get('carousel_media') or [])
    for container in containers:
        tag_block = container.get('usertags') or {}
        for entry in tag_block.get('in', []):
            name = (entry.get('user') or {}).get('username')
            if name and name not in found:
                found.append(name)
    return found
def create_flaresolverr_session():
    """Ask the local FlareSolverr service for a persistent session.

    Returns the session ID string, or None when the service is unreachable
    or did not return a session.
    """
    import requests as std_requests
    try:
        reply = std_requests.post('http://localhost:8191/v1', json={
            'cmd': 'sessions.create',
        }, timeout=30)
        sid = reply.json().get('session')
    except Exception as exc:
        print(f" FlareSolverr unavailable: {exc}")
        return None
    if sid:
        print(f" FlareSolverr session: {sid}")
        return sid
    return None
def destroy_flaresolverr_session(session_id):
    """Tear down a FlareSolverr session; best-effort, all errors ignored."""
    if not session_id:
        return
    import requests as std_requests
    payload = {
        'cmd': 'sessions.destroy',
        'session': session_id,
    }
    try:
        std_requests.post('http://localhost:8191/v1', json=payload, timeout=10)
    except Exception:
        pass
def fetch_usertags_imginn(shortcode, flaresolverr_session=None):
    """Look up tagged users for a post on imginn.com through FlareSolverr.

    Returns (usernames, 'imginn_ok') on success, ([], 'imginn_no_tags') when
    the page has no tagged-user block, or (None, reason) on any failure.
    """
    import requests as std_requests
    url = f'https://imginn.com/p/{shortcode}/'
    try:
        body = {
            'cmd': 'request.get',
            'url': url,
            'maxTimeout': 60000,
        }
        if flaresolverr_session:
            body['session'] = flaresolverr_session
        reply = std_requests.post('http://localhost:8191/v1', json=body, timeout=70)
        result = reply.json()
        if result.get('status') != 'ok':
            return None, 'imginn_flaresolverr_fail'
        html = result.get('solution', {}).get('response', '')
        if not html:
            return None, 'imginn_empty'
        # imginn renders tagged accounts inside a "tagged-user-list" element;
        # scan a bounded window after it (same logic as instagram_adapter).
        start = html.find('class="tagged-user-list"')
        if start < 0:
            return [], 'imginn_no_tags'
        section = html[start:start + 5000]
        found = []
        for hit in re.finditer(r'class="name">\s*(\S+)\s*</div>', section):
            candidate = hit.group(1).strip()
            if re.match(r'^[a-zA-Z0-9_.]{1,30}$', candidate):
                found.append(candidate)
        return found, 'imginn_ok'
    except Exception as exc:
        return None, f'imginn_error: {exc}'
def fetch_usertags_instagram(session, shortcode, max_retries=3):
    """Fetch usertags from Instagram post page HTML via proxy.

    Sticks with current proxy server; only rotates on connection errors or
    rate limits.

    Returns:
        (usernames, 'ok') on success, ([], 'ok') for a post with no items,
        or (None, reason) on failure ('rate_limited', 'not_found',
        'login_required', 'no_embedded_data', 'no_items_data', 'json_error:…',
        or the last transport error after max_retries).
    """
    url = f'https://www.instagram.com/p/{shortcode}/'
    last_error = None
    for attempt in range(max_retries):
        # Fix: `server` must exist before the try block — the generic except
        # handler below logs it, but proxy selection or URL parsing can raise
        # before it would otherwise be assigned, which previously turned the
        # real error into a NameError.
        server = 'unknown'
        try:
            proxy = get_current_proxy() if attempt == 0 else rotate_proxy()
            # proxy maps scheme -> 'scheme://user:pass@host:port'; pull host for logs
            server = list(proxy.values())[0].split('@')[1].split(':')[0]
            resp = session.get(url, timeout=20, proxies=proxy)
            if resp.status_code == 429:
                rotate_proxy()
                return None, 'rate_limited'
            if resp.status_code == 404:
                return None, 'not_found'
            if resp.status_code != 200:
                last_error = f'http_{resp.status_code}'
                print(f' retry {attempt+1}/{max_retries}: http {resp.status_code} via {server}', flush=True)
                continue
            html = resp.text
            # Find embedded post JSON in the page
            idx = html.find('xdt_api__v1__media__shortcode__web_info')
            if idx < 0:
                if 'LoginAndSignupPage' in html or '"require_login":true' in html:
                    return None, 'login_required'
                return None, 'no_embedded_data'
            # Find the items JSON object
            items_start = html.find('{"items":[{"code"', idx)
            if items_start < 0:
                return None, 'no_items_data'
            # Find balanced braces to extract the full JSON (bounded scan)
            depth = 0
            end = items_start
            for i in range(items_start, min(items_start + 500000, len(html))):
                if html[i] == '{':
                    depth += 1
                elif html[i] == '}':
                    depth -= 1
                if depth == 0:
                    end = i + 1
                    break
            data = json.loads(html[items_start:end])
            items = data.get('items', [])
            if not items:
                return [], 'ok'
            tagged = extract_usertags_from_item(items[0])
            return tagged, 'ok'
        except json.JSONDecodeError as e:
            # Malformed embedded JSON is not transient — don't retry.
            return None, f'json_error: {e}'
        except Exception as e:
            last_error = str(e)
            short_err = str(e).split('.')[0][:80]
            print(f' retry {attempt+1}/{max_retries}: {short_err} via {server}', flush=True)
            # Connection/proxy errors → retry with next server
            continue
    return None, last_error or 'max_retries'
def fetch_post_usertags(session, shortcode, flaresolverr_session=None):
    """Fetch usertags from Instagram via proxy.

    NOTE(review): flaresolverr_session is accepted but currently ignored —
    the imginn/FlareSolverr fallback is not wired in here; confirm whether
    that is intentional.
    """
    return fetch_usertags_instagram(session, shortcode)
def main():
    """CLI entry point: backfill tagged users for Instagram posts.

    Queries posts lacking tagged-user rows, skips IDs already checked in
    previous runs, then fetches each post page through a proxy and saves
    any tagged usernames found.
    """
    parser = argparse.ArgumentParser(description='Backfill tagged users for Instagram posts')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without saving')
    parser.add_argument('--creator-id', type=int, help='Only process a specific creator')
    parser.add_argument('--limit', type=int, default=0, help='Max posts to process (0 = all)')
    args = parser.parse_args()
    # NOTE(review): this opens a relative sqlite file literally named
    # 'media_downloader' (no .db extension) and silently creates an empty DB
    # if it does not exist in the CWD — confirm the intended database path.
    conn = sqlite3.connect('media_downloader')
    posts = get_posts_without_tags(conn, args.creator_id, args.limit)
    if not posts:
        print("No posts need tagged user backfill.")
        return
    # Filter out already-checked posts (from previous runs)
    checked = load_checked_posts()
    if checked:
        before = len(posts)
        posts = [(db_id, sc, cid, u) for db_id, sc, cid, u in posts if db_id not in checked]
        if before != len(posts):
            print(f"Skipping {before - len(posts)} already-checked posts from previous runs")
    if not posts:
        print("All posts already checked.")
        return
    # Group by creator for display
    creators = {}
    for db_id, shortcode, creator_id, username in posts:
        if creator_id not in creators:
            creators[creator_id] = {'username': username, 'count': 0}
        creators[creator_id]['count'] += 1
    print(f"Found {len(posts)} Instagram posts to check across {len(creators)} creators")
    for cid, info in creators.items():
        print(f" @{info['username']}: {info['count']} posts")
    if args.dry_run:
        print("DRY RUN - no changes will be saved\n")
    session = get_session()
    total_tagged = 0
    total_no_tags = 0
    total_errors = 0
    # Exponential backoff for 429s: doubles per consecutive hit, capped at 600s.
    rate_limit_wait = 60
    try:
        for i, (db_id, shortcode, creator_id, username) in enumerate(posts):
            if i > 0:
                time.sleep(2)
            print(f" [{i+1}/{len(posts)}] @{username} {shortcode}: ", end='', flush=True)
            tagged, status = fetch_post_usertags(session, shortcode)
            if status == 'rate_limited':
                # Back off, then retry the same post exactly once.
                print(f"rate limited, waiting {rate_limit_wait}s...", flush=True)
                time.sleep(rate_limit_wait)
                rate_limit_wait = min(rate_limit_wait * 2, 600)
                print(f" [{i+1}/{len(posts)}] @{username} {shortcode}: ", end='', flush=True)
                tagged, status = fetch_post_usertags(session, shortcode)
            if tagged is None:
                print(f"ERROR {status}", flush=True)
                total_errors += 1
                if status == 'login_required':
                    print(" Instagram is requiring login. Stopping.")
                    break
                continue
            # Reset rate limit backoff on success
            rate_limit_wait = 60
            if tagged:
                if args.dry_run:
                    print(f"would tag {tagged}", flush=True)
                else:
                    save_tagged_users(conn, db_id, tagged)
                    print(f"tagged {tagged}", flush=True)
                total_tagged += 1
            else:
                # Remember tag-less posts locally so future runs skip them.
                if not args.dry_run:
                    save_checked_post(db_id)
                print(f"no tags", flush=True)
                total_no_tags += 1
            # Polite delay
            time.sleep(random.uniform(0.5, 1.5))
    except KeyboardInterrupt:
        print("\nInterrupted!")
    conn.close()
    print(f"\nDone! {total_tagged} posts had tagged users, {total_no_tags} had none, {total_errors} errors (out of {len(posts)} total)")
# Run only when executed as a script, not on import.
if __name__ == '__main__':
    main()

225
scripts/bellazon_scraper.py Normal file
View File

@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""
Bellazon Forum Thread Image Scraper
Downloads all full-size images from a Bellazon forum thread.
Bellazon uses <a href="full.jpg"><img src="full_thumb.jpg"></a> pattern.
"""
import re
import sys
import time
import hashlib
import requests
from pathlib import Path
from urllib.parse import urlparse, urljoin
from html import unescape
# Thread URL and output directory come from argv, with hard-coded defaults.
THREAD_URL = sys.argv[1] if len(sys.argv) > 1 else "https://www.bellazon.com/main/topic/39089-india-reynolds/"
OUTPUT_DIR = sys.argv[2] if len(sys.argv) > 2 else "/opt/media-downloader/data/bellazon/india-reynolds"
# Desktop-Chrome request headers so the forum serves normal pages.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.bellazon.com/',
}
# File extensions treated as downloadable media.
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff'}
VIDEO_EXTENSIONS = {'.mp4', '.webm', '.mov', '.avi', '.mkv'}
MEDIA_EXTENSIONS = IMAGE_EXTENSIONS | VIDEO_EXTENSIONS
# URL substrings identifying forum chrome (avatars, icons, UI assets) to skip.
SKIP_PATTERNS = [
    'avatar', 'emoji', 'icon', '/public/', 'rep_', 'style_',
    'star_', '/js/', '/css/', 'button', 'logo', 'loading',
    'spinner', 'pixel', 'spacer', '/default_photo',
    'profile_photo', '/skin_', '/set_resources/', 'screenshot',
]
def get_page_count(html: str) -> int:
    """Return the thread's total page count parsed from 'Page X of N' (1 if absent)."""
    m = re.search(r'Page\s+\d+\s+of\s+(\d+)', html)
    if m is None:
        return 1
    return int(m.group(1))
def is_media_url(url: str) -> bool:
    """True when the URL's path ends in a known image/video extension."""
    suffix = Path(urlparse(url).path).suffix
    return suffix.lower() in MEDIA_EXTENSIONS
def should_skip(url: str) -> bool:
    """True when the URL contains any junk pattern (avatars, icons, UI assets)."""
    haystack = url.lower()
    for pattern in SKIP_PATTERNS:
        if pattern in haystack:
            return True
    return False
def extract_images_from_html(html: str, base_url: str) -> list:
    """Extract full-size image URLs from page HTML.

    Priority: <a href="full.jpg"> wrapping <img src="thumb.jpg">
    Fallback: standalone <img src="image.jpg"> (non-thumb)

    Returns a de-duplicated, order-preserving list whose items are either
    plain URL strings, or (url, filename) tuples for attachment.php links
    (the filename comes from the link text). Callers must handle both shapes.
    """
    images = []
    thumb_urls = set()  # track thumbnails so we don't add them as standalone
    # Pattern 1: <a href="full-size"><img src="thumb"></a>
    # This catches the bellazon pattern where thumbnails link to full images
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']+)["\'][^>]*>\s*<img[^>]+src=["\']([^"\']+)["\']',
        html, re.IGNORECASE | re.DOTALL
    ):
        href = unescape(match.group(1))
        img_src = unescape(match.group(2))
        if is_media_url(href) and not should_skip(href):
            full_url = urljoin(base_url, href)
            images.append(full_url)
            # Track the thumbnail so we skip it later
            thumb_urls.add(urljoin(base_url, img_src))
    # Pattern 2: Standalone <img> tags not wrapped in links to full-size
    for match in re.finditer(r'<img[^>]+src=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        if should_skip(url):
            continue
        full_url = urljoin(base_url, url)
        # Skip if this is a thumbnail we already have the full version of
        if full_url in thumb_urls:
            continue
        # Skip anything with _thumb or .thumb in the name
        if '_thumb' in url or '.thumb.' in url:
            continue
        if is_media_url(url):
            images.append(full_url)
    # Pattern 3: Links to external image files (not bellazon)
    for match in re.finditer(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE):
        url = unescape(match.group(1))
        parsed = urlparse(url)
        if parsed.netloc and 'bellazon' not in parsed.netloc and is_media_url(url):
            images.append(url)
    # Pattern 4: Forum attachments (attachment.php?id=XXX) with video/image filenames
    # e.g. <a href="...attachment.php?id=6887160">B7A65853...MP4</a>
    for match in re.finditer(
        r'<a[^>]+href=["\']([^"\']*attachment\.php\?id=\d+)["\'][^>]*>([^<]+)</a>',
        html, re.IGNORECASE
    ):
        href = unescape(match.group(1))
        link_text = match.group(2).strip()
        ext = Path(link_text).suffix.lower()
        if ext in MEDIA_EXTENSIONS:
            full_url = urljoin(base_url, href)
            images.append((full_url, link_text))  # tuple: (url, filename)
    # Deduplicate preserving order (key on the URL for both shapes)
    seen = set()
    unique = []
    for item in images:
        key = item[0] if isinstance(item, tuple) else item
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique
def download_media(item, output_dir: Path, session: requests.Session, seen_hashes: set) -> bool:
    """Download one media item into output_dir, de-duplicating by MD5.

    item is either a bare URL string or a (url, original_filename) tuple
    (forum attachments carry their filename in the link text). Returns True
    only when a new file was actually written.
    """
    url, orig_filename = item if isinstance(item, tuple) else (item, None)
    try:
        resp = session.get(url, timeout=60)
        if resp.status_code != 200:
            return False
        # Reject responses that are not actual media payloads.
        content_type = resp.headers.get('content-type', '')
        if not any(t in content_type for t in ['image', 'video', 'octet-stream']):
            return False
        data = resp.content
        if len(data) < 5000:  # Skip tiny files (icons/placeholders)
            return False
        digest = hashlib.md5(data).hexdigest()
        if digest in seen_hashes:
            return False
        seen_hashes.add(digest)
        # Sanitize the filename; fall back to the content hash when empty.
        if orig_filename:
            safe_name = re.sub(r'[^\w\-_.]', '_', orig_filename)
        else:
            safe_name = re.sub(r'[^\w\-_.]', '_', Path(urlparse(url).path).name)
        if not safe_name or safe_name == '_':
            safe_name = f"{digest}.jpg"
        target = output_dir / safe_name
        if target.exists():
            # Same name, different content — disambiguate with a hash prefix.
            target = output_dir / f"{target.stem}_{digest[:8]}{target.suffix}"
        target.write_bytes(data)
        return True
    except Exception as e:
        display = url[:80] if not orig_filename else orig_filename
        print(f" Error: {display}: {e}", flush=True)
        return False
def main():
    """Walk every page of the thread and download all media found.

    Page 1 reuses the HTML fetched for the page-count probe; later pages
    are fetched individually and page-level errors are skipped, not fatal.
    """
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    session = requests.Session()
    session.headers.update(HEADERS)
    print(f"Fetching: {THREAD_URL}", flush=True)
    resp = session.get(THREAD_URL, timeout=30)
    resp.raise_for_status()
    total_pages = get_page_count(resp.text)
    print(f"Total pages: {total_pages}", flush=True)
    # Shared MD5 set so identical media posted on several pages is stored once.
    seen_hashes = set()
    total_downloaded = 0
    total_skipped = 0
    for page_num in range(1, total_pages + 1):
        if page_num == 1:
            page_url = THREAD_URL
            html = resp.text
        else:
            page_url = f"{THREAD_URL.rstrip('/')}/page/{page_num}/"
            try:
                resp = session.get(page_url, timeout=30)
                resp.raise_for_status()
                html = resp.text
            except Exception as e:
                print(f" Error fetching page {page_num}: {e}", flush=True)
                continue
        images = extract_images_from_html(html, page_url)
        page_dl = 0
        for img_url in images:
            if download_media(img_url, output_dir, session, seen_hashes):
                page_dl += 1
                total_downloaded += 1
            else:
                total_skipped += 1
        print(f"Page {page_num}/{total_pages}: {page_dl} downloaded ({len(images)} found, {total_downloaded} total)", flush=True)
        if page_num < total_pages:
            time.sleep(1)
    print(f"\nDone! {total_downloaded} images saved to {output_dir}", flush=True)
    print(f"Skipped: {total_skipped}", flush=True)
# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()

106
scripts/cleanup-old-logs.py Executable file
View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Log Cleanup Script for Media Downloader
Removes log files older than 7 days
Usage: python3 scripts/cleanup-old-logs.py
Cron: 0 0 * * * /opt/media-downloader/venv/bin/python3 /opt/media-downloader/scripts/cleanup-old-logs.py
"""
import sys
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from datetime import datetime, timedelta
import glob
from modules.universal_logger import get_logger
# Configuration
RETENTION_DAYS = 7  # delete log files whose mtime is older than this many days
LOG_DIR = Path("/opt/media-downloader/logs")  # directory holding all component logs
# Initialize logger
logger = get_logger('LogCleanup')
def cleanup_old_logs():
    """Delete log files in LOG_DIR older than RETENTION_DAYS.

    Covers both date-stamped logs (YYYYMMDD_component.log) and numerically
    rotated logs (component.log.N). Returns True when no per-file errors
    occurred, False otherwise.
    """
    logger.info("LogCleanup", f"Starting log cleanup (retention: {RETENTION_DAYS} days)")
    if not LOG_DIR.exists():
        logger.error("LogCleanup", f"Log directory not found: {LOG_DIR}")
        return False
    cutoff = datetime.now() - timedelta(days=RETENTION_DAYS)
    logger.debug("LogCleanup", f"Cutoff date: {cutoff.strftime('%Y-%m-%d %H:%M:%S')}")
    # Gather both naming schemes in one candidate list.
    candidates = list(LOG_DIR.glob("[0-9]*_*.log")) + list(LOG_DIR.glob("*.log.*"))
    logger.info("LogCleanup", f"Found {len(candidates)} total log files to check")
    deleted = 0
    freed_bytes = 0
    kept = 0
    failures = 0
    for path in candidates:
        try:
            modified = datetime.fromtimestamp(path.stat().st_mtime)
            age_days = (datetime.now() - modified).days
            size_bytes = path.stat().st_size
            if modified >= cutoff:
                # Still inside the retention window.
                kept += 1
                logger.debug("LogCleanup", f"Kept: {path.name} (age: {age_days} days)")
                continue
            try:
                path.unlink()
            except Exception as e:
                failures += 1
                logger.error("LogCleanup", f"Failed to remove {path.name}: {e}")
            else:
                deleted += 1
                freed_bytes += size_bytes
                logger.info("LogCleanup", f"Removed old log: {path.name} (age: {age_days} days, size: {size_bytes:,} bytes)")
        except Exception as e:
            # stat() failures (e.g. file vanished mid-scan) land here.
            failures += 1
            logger.error("LogCleanup", f"Error processing {path.name}: {e}")
    if deleted > 0:
        freed_mb = freed_bytes / (1024 * 1024)
        logger.success("LogCleanup", f"Cleanup complete: Removed {deleted} log file(s), freed {freed_mb:.2f} MB")
    else:
        logger.info("LogCleanup", f"No old logs to clean up (all {kept} logs are within {RETENTION_DAYS} days)")
    if failures > 0:
        logger.warning("LogCleanup", f"Encountered {failures} error(s) during cleanup")
    logger.info("LogCleanup", f"Summary: {deleted} removed, {kept} kept, {failures} errors")
    return failures == 0
def main():
    """Run the cleanup; exit 0 on success, 1 on any failure or fatal error."""
    try:
        ok = cleanup_old_logs()
    except Exception as e:
        logger.error("LogCleanup", f"Fatal error during log cleanup: {e}")
        sys.exit(1)
    sys.exit(0 if ok else 1)
# Run only when executed as a script, not on import.
if __name__ == '__main__':
    main()

506
scripts/cloud_backup_restore.sh Executable file
View File

@@ -0,0 +1,506 @@
#!/usr/bin/env bash
# ============================================================================
# Cloud Backup Restore Script
#
# Restores the full media-downloader + Immich stack from a B2 cloud backup.
# Run on a fresh Ubuntu 24.04 server (or the same machine after failure).
#
# Usage:
# sudo bash cloud_backup_restore.sh [--rclone-conf /path/to/rclone.conf]
#
# Prerequisites on a fresh machine:
# apt update && apt install -y rclone
# Then place your rclone.conf at /root/.config/rclone/rclone.conf
# (contains cloud-backup-remote + cloud-backup-crypt sections)
#
# The script is interactive — it will ask before each destructive step.
# ============================================================================
set -euo pipefail
# ── Configuration ──────────────────────────────────────────────────────────
IMMICH_BASE="/opt/immich"
APP_DIR="/opt/media-downloader"
# NOTE(review): RCLONE_CONF is assigned but never read again — all later code
# uses RCLONE_CONF_PATH. Looks like dead leftover from an earlier arg scheme.
RCLONE_CONF="${1:---rclone-conf}"
RESTORE_TMP="/tmp/cloud-backup-restore"
LOG_FILE="/tmp/cloud_backup_restore.log"
# If --rclone-conf was passed, grab the value
if [[ "${1:-}" == "--rclone-conf" ]]; then
    RCLONE_CONF_PATH="${2:-/root/.config/rclone/rclone.conf}"
else
    RCLONE_CONF_PATH="/root/.config/rclone/rclone.conf"
fi
RCLONE_CRYPT="cloud-backup-crypt"
# Colors (ANSI escape sequences for terminal output)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
# ── Helpers ────────────────────────────────────────────────────────────────
# Timestamped, colorized loggers; every message is also appended to LOG_FILE.
log() { echo -e "${GREEN}[$(date '+%H:%M:%S')]${NC} $*" | tee -a "$LOG_FILE"; }
warn() { echo -e "${YELLOW}[$(date '+%H:%M:%S')] WARNING:${NC} $*" | tee -a "$LOG_FILE"; }
err() { echo -e "${RED}[$(date '+%H:%M:%S')] ERROR:${NC} $*" | tee -a "$LOG_FILE"; }
step() { echo -e "\n${BLUE}━━━ $* ━━━${NC}" | tee -a "$LOG_FILE"; }
# Interactive yes/no prompt; returns success only on an explicit y/Y answer.
confirm() {
    local msg="$1"
    echo -en "${YELLOW}$msg [y/N]: ${NC}"
    read -r answer
    [[ "$answer" =~ ^[Yy]$ ]]
}
# Abort unless running as root (required for apt, systemd, /opt writes).
check_root() {
    if [[ $EUID -ne 0 ]]; then
        err "This script must be run as root"
        exit 1
    fi
}
# ── Pre-flight checks ─────────────────────────────────────────────────────
preflight() {
    # Verifies root, rclone availability, a usable rclone config, and remote
    # connectivity before any destructive step can run.
    step "Pre-flight checks"
    check_root
    # Check rclone
    if ! command -v rclone &>/dev/null; then
        err "rclone not installed. Install with: apt install -y rclone"
        exit 1
    fi
    log "rclone: $(rclone --version | head -1)"
    # Check rclone config
    if [[ ! -f "$RCLONE_CONF_PATH" ]]; then
        err "rclone config not found at $RCLONE_CONF_PATH"
        echo "You need the rclone.conf with [cloud-backup-remote] and [cloud-backup-crypt] sections."
        echo "If restoring to a new machine, copy rclone.conf from your backup records."
        exit 1
    fi
    log "rclone config: $RCLONE_CONF_PATH"
    # Test remote connection (listing the crypt root exercises the passwords too)
    log "Testing remote connection..."
    if rclone lsd "${RCLONE_CRYPT}:" --config "$RCLONE_CONF_PATH" --max-depth 1 &>/dev/null; then
        log "Remote connection: OK"
    else
        err "Cannot connect to remote. Check your rclone config and encryption passwords."
        exit 1
    fi
    # Show what's available on remote
    log "Remote directories:"
    rclone lsd "${RCLONE_CRYPT}:" --config "$RCLONE_CONF_PATH" --max-depth 1 2>/dev/null | tee -a "$LOG_FILE"
    mkdir -p "$RESTORE_TMP"
}
# ── Step 1: Download app_backup and db_dumps first ────────────────────────
download_configs() {
    # Pulls the small, critical artifacts (app tarball + DB dumps) first so
    # later steps can proceed even before the bulk media sync.
    step "Step 1: Downloading app_backup and db_dumps from remote"
    mkdir -p "$RESTORE_TMP/app_backup" "$RESTORE_TMP/db_dumps"
    log "Downloading app_backup..."
    rclone copy "${RCLONE_CRYPT}:app_backup" "$RESTORE_TMP/app_backup" \
        --config "$RCLONE_CONF_PATH" --progress 2>&1 | tee -a "$LOG_FILE"
    log "Downloading db_dumps..."
    rclone copy "${RCLONE_CRYPT}:db_dumps" "$RESTORE_TMP/db_dumps" \
        --config "$RCLONE_CONF_PATH" --progress 2>&1 | tee -a "$LOG_FILE"
    # Verify we got the essentials
    if [[ ! -f "$RESTORE_TMP/app_backup/media-downloader-app.tar.gz" ]]; then
        err "media-downloader-app.tar.gz not found in backup!"
        exit 1
    fi
    log "App archive size: $(du -sh "$RESTORE_TMP/app_backup/media-downloader-app.tar.gz" | cut -f1)"
    ls -la "$RESTORE_TMP/db_dumps/" | tee -a "$LOG_FILE"
    ls -la "$RESTORE_TMP/app_backup/" | tee -a "$LOG_FILE"
    ls -la "$RESTORE_TMP/app_backup/systemd/" 2>/dev/null | tee -a "$LOG_FILE"
}
# ── Step 2: Install system dependencies ───────────────────────────────────
install_dependencies() {
    # Installs every OS-level package the stack needs; skippable when
    # restoring onto a machine that already has them.
    step "Step 2: Install system dependencies"
    if ! confirm "Install system packages (python3, postgresql, docker, node, etc.)?"; then
        warn "Skipping dependency installation"
        return
    fi
    log "Updating package lists..."
    apt update
    log "Installing core packages..."
    apt install -y \
        python3 python3-venv python3-pip python3-dev \
        postgresql postgresql-client \
        docker.io docker-compose-v2 \
        nodejs npm \
        rclone \
        xvfb \
        python3-pyinotify \
        nginx \
        git curl wget jq \
        build-essential libffi-dev libssl-dev \
        libgl1-mesa-glx libglib2.0-0 \
        ffmpeg \
        2>&1 | tee -a "$LOG_FILE"
    # Enable and start essential services
    systemctl enable --now docker
    systemctl enable --now postgresql
    log "System dependencies installed"
}
# ── Step 3: Restore media-downloader application ──────────────────────────
restore_app() {
    # Unpacks the app tarball, then rebuilds everything that is NOT in the
    # backup: the Python venv, frontend node_modules/dist, and Playwright.
    step "Step 3: Restore media-downloader application"
    if [[ -d "$APP_DIR" ]]; then
        if confirm "$APP_DIR already exists. Back it up and replace?"; then
            local backup_name="${APP_DIR}.bak.$(date +%Y%m%d_%H%M%S)"
            log "Moving existing app to $backup_name"
            mv "$APP_DIR" "$backup_name"
        else
            warn "Skipping app restore"
            return
        fi
    fi
    log "Extracting media-downloader app..."
    mkdir -p /opt
    tar xzf "$RESTORE_TMP/app_backup/media-downloader-app.tar.gz" -C /opt
    log "App extracted to $APP_DIR"
    # Recreate venv
    log "Creating Python virtual environment..."
    python3 -m venv "$APP_DIR/venv"
    log "Installing Python dependencies (this may take a while)..."
    "$APP_DIR/venv/bin/pip" install --upgrade pip wheel 2>&1 | tail -3 | tee -a "$LOG_FILE"
    "$APP_DIR/venv/bin/pip" install -r "$APP_DIR/requirements.txt" 2>&1 | tail -10 | tee -a "$LOG_FILE"
    log "Python dependencies installed"
    # Rebuild frontend (note: this cd persists for the rest of the script run)
    log "Installing frontend dependencies..."
    cd "$APP_DIR/web/frontend"
    npm install 2>&1 | tail -5 | tee -a "$LOG_FILE"
    log "Building frontend..."
    # NOTE(review): the tail/tee redirection applies only to `npx vite build`,
    # so tsc output goes straight to the console — confirm that is intended.
    npx tsc && npx vite build 2>&1 | tail -5 | tee -a "$LOG_FILE"
    log "Frontend built"
    # Install Playwright browsers
    log "Installing Playwright browsers..."
    "$APP_DIR/venv/bin/python3" -m playwright install chromium firefox 2>&1 | tail -5 | tee -a "$LOG_FILE"
    "$APP_DIR/venv/bin/python3" -m playwright install-deps 2>&1 | tail -5 | tee -a "$LOG_FILE"
    # Create required directories
    mkdir -p "$APP_DIR/logs" "$APP_DIR/temp" "$APP_DIR/cache/thumbnails"
    log "Application restored"
}
# ── Step 4: Restore Immich ────────────────────────────────────────────────
restore_immich() {
    # Restores only Immich's compose file and .env; bulk media arrives later
    # in Step 7 and the database in Step 5.
    step "Step 4: Restore Immich configuration"
    mkdir -p "$IMMICH_BASE"
    # Restore docker-compose and .env
    if [[ -f "$RESTORE_TMP/app_backup/immich-docker-compose.yml" ]]; then
        cp "$RESTORE_TMP/app_backup/immich-docker-compose.yml" "$IMMICH_BASE/docker-compose.yml"
        log "Restored Immich docker-compose.yml"
    fi
    if [[ -f "$RESTORE_TMP/app_backup/immich-env" ]]; then
        cp "$RESTORE_TMP/app_backup/immich-env" "$IMMICH_BASE/.env"
        log "Restored Immich .env"
    fi
    # Create required directories
    mkdir -p "$IMMICH_BASE/upload" "$IMMICH_BASE/db" "$IMMICH_BASE/db_dumps" "$IMMICH_BASE/app_backup"
    log "Immich config restored. Media files will be synced in Step 7."
}
# ── Step 5: Restore databases ─────────────────────────────────────────────
restore_databases() {
    # Restores both PostgreSQL databases, auto-detecting the dump format:
    # parallel directory dump, pg_dump custom format, or legacy plain SQL.
    # NOTE(review): the media_downloader DB password is hard-coded below in
    # plain text (and matches the one in verify()) — consider sourcing it
    # from a protected env file instead.
    step "Step 5: Restore databases"
    # Media Downloader PostgreSQL (supports both .dump and legacy .sql)
    # Media Downloader PostgreSQL (supports directory dump, .dump, and legacy .sql)
    local md_dir="$RESTORE_TMP/db_dumps/media_downloader_dump"
    local md_dump="$RESTORE_TMP/db_dumps/media_downloader.dump"
    local md_sql="$RESTORE_TMP/db_dumps/media_downloader.sql"
    if [[ -d "$md_dir" || -f "$md_dump" || -f "$md_sql" ]]; then
        if confirm "Restore media_downloader PostgreSQL database?"; then
            log "Creating media_downloader database and user..."
            sudo -u postgres psql -c "CREATE USER media_downloader WITH PASSWORD 'PNsihOXvvuPwWiIvGlsc9Fh2YmMmB';" 2>/dev/null || true
            sudo -u postgres psql -c "DROP DATABASE IF EXISTS media_downloader;" 2>/dev/null || true
            sudo -u postgres psql -c "CREATE DATABASE media_downloader OWNER media_downloader;" 2>/dev/null || true
            if [[ -d "$md_dir" ]]; then
                log "Importing media_downloader dump (parallel directory format)..."
                PGPASSWORD=PNsihOXvvuPwWiIvGlsc9Fh2YmMmB pg_restore -h localhost -U media_downloader \
                    -d media_downloader --no-owner --no-acl -j 4 "$md_dir" 2>&1 | tail -5 | tee -a "$LOG_FILE"
            elif [[ -f "$md_dump" ]]; then
                log "Importing media_downloader dump (custom format)..."
                PGPASSWORD=PNsihOXvvuPwWiIvGlsc9Fh2YmMmB pg_restore -h localhost -U media_downloader \
                    -d media_downloader --no-owner --no-acl "$md_dump" 2>&1 | tail -5 | tee -a "$LOG_FILE"
            else
                log "Importing media_downloader dump (SQL format)..."
                PGPASSWORD=PNsihOXvvuPwWiIvGlsc9Fh2YmMmB psql -h localhost -U media_downloader \
                    -d media_downloader < "$md_sql" 2>&1 | tail -5 | tee -a "$LOG_FILE"
            fi
            log "media_downloader database restored"
        fi
    else
        warn "media_downloader dump not found in backup"
    fi
    # Immich PostgreSQL (supports .tar directory dump, .dump, and legacy .sql)
    local im_tar="$RESTORE_TMP/db_dumps/immich_dump.tar"
    local im_dump="$RESTORE_TMP/db_dumps/immich.dump"
    local im_sql="$RESTORE_TMP/db_dumps/immich.sql"
    if [[ -f "$im_tar" || -f "$im_dump" || -f "$im_sql" ]]; then
        if confirm "Restore Immich PostgreSQL database? (starts Immich containers first)"; then
            log "Starting Immich database container..."
            cd "$IMMICH_BASE"
            docker compose up -d database 2>&1 | tee -a "$LOG_FILE"
            log "Waiting for Immich PostgreSQL to be ready..."
            for i in $(seq 1 30); do
                if docker exec immich_postgres pg_isready -U postgres &>/dev/null; then
                    break
                fi
                sleep 2
            done
            if [[ -f "$im_tar" ]]; then
                log "Importing Immich dump (parallel directory format)..."
                docker cp "$im_tar" immich_postgres:/tmp/immich_dump.tar
                docker exec immich_postgres sh -c "cd /tmp && tar xf immich_dump.tar"
                docker exec immich_postgres pg_restore -U postgres -d immich \
                    --no-owner --no-acl -j 4 /tmp/immich_dump 2>&1 | tail -5 | tee -a "$LOG_FILE"
                docker exec immich_postgres sh -c "rm -rf /tmp/immich_dump /tmp/immich_dump.tar"
            elif [[ -f "$im_dump" ]]; then
                log "Importing Immich dump (custom format)..."
                docker cp "$im_dump" immich_postgres:/tmp/immich.dump
                docker exec immich_postgres pg_restore -U postgres -d immich \
                    --no-owner --no-acl /tmp/immich.dump 2>&1 | tail -5 | tee -a "$LOG_FILE"
                docker exec immich_postgres rm -f /tmp/immich.dump
            else
                log "Importing Immich dump (SQL format)..."
                docker exec -i immich_postgres psql -U postgres -d immich \
                    < "$im_sql" 2>&1 | tail -5 | tee -a "$LOG_FILE"
            fi
            log "Immich database restored"
        fi
    else
        warn "immich dump not found in backup"
    fi
}
# ── Step 6: Restore systemd services & rclone config ─────────────────────
restore_services() {
    # Reinstalls backed-up systemd unit files and the rclone config.
    step "Step 6: Restore systemd services and configs"
    # Systemd service files
    if [[ -d "$RESTORE_TMP/app_backup/systemd" ]]; then
        if confirm "Install systemd service files?"; then
            for svc in "$RESTORE_TMP/app_backup/systemd/"*; do
                local name=$(basename "$svc")
                cp "$svc" "/etc/systemd/system/$name"
                log "Installed $name"
            done
            systemctl daemon-reload
            log "systemd reloaded"
        fi
    fi
    # rclone config (0600: it contains remote credentials and crypt passwords)
    if [[ -f "$RESTORE_TMP/app_backup/rclone.conf" ]]; then
        if confirm "Restore rclone.conf?"; then
            mkdir -p /root/.config/rclone
            cp "$RESTORE_TMP/app_backup/rclone.conf" /root/.config/rclone/rclone.conf
            chmod 600 /root/.config/rclone/rclone.conf
            log "rclone.conf restored"
        fi
    fi
}
# ── Step 7: Download media files ──────────────────────────────────────────
restore_media() {
    # Bulk media download — every remote directory except the two config
    # directories already fetched in Step 1. Potentially very long-running.
    step "Step 7: Download media files from remote"
    local dirs=$(rclone lsd "${RCLONE_CRYPT}:" --config "$RCLONE_CONF_PATH" --max-depth 1 2>/dev/null | awk '{print $NF}')
    echo ""
    log "Available remote directories:"
    echo "$dirs" | while read -r d; do echo " - $d"; done
    if ! confirm "Download all media directories? (This may take a long time for large backups)"; then
        warn "Skipping media download. You can sync manually later with:"
        echo " rclone copy ${RCLONE_CRYPT}:<dir> ${IMMICH_BASE}/<dir> --config $RCLONE_CONF_PATH --progress --transfers 4"
        return
    fi
    echo "$dirs" | while read -r dir_name; do
        # Skip dirs we already downloaded
        [[ "$dir_name" == "app_backup" || "$dir_name" == "db_dumps" ]] && continue
        [[ -z "$dir_name" ]] && continue
        log "Syncing $dir_name..."
        mkdir -p "$IMMICH_BASE/$dir_name"
        rclone copy "${RCLONE_CRYPT}:${dir_name}" "$IMMICH_BASE/$dir_name" \
            --config "$RCLONE_CONF_PATH" \
            --progress \
            --transfers 4 \
            --checkers 8 \
            2>&1 | tee -a "$LOG_FILE"
        log "$dir_name done"
    done
    log "Media files restored"
}
# ── Step 8: Start services ───────────────────────────────────────────────
start_services() {
    # Brings up the Immich docker stack, then enables + starts all
    # media-downloader systemd units (optional units tolerate absence).
    step "Step 8: Start services"
    if confirm "Start all services?"; then
        # Start Immich stack
        log "Starting Immich..."
        cd "$IMMICH_BASE"
        docker compose up -d 2>&1 | tee -a "$LOG_FILE"
        # Enable and start media-downloader services
        log "Starting media-downloader services..."
        systemctl enable --now xvfb-media-downloader.service 2>/dev/null || true
        systemctl enable --now media-downloader-api.service
        systemctl enable --now media-downloader.service
        systemctl enable --now media-downloader-frontend.service 2>/dev/null || true
        systemctl enable --now media-downloader-db-cleanup.timer 2>/dev/null || true
        systemctl enable --now cloud-backup-sync.service
        sleep 5
        # Status check
        log "Service status:"
        for svc in media-downloader-api media-downloader cloud-backup-sync; do
            local status=$(systemctl is-active "$svc" 2>/dev/null || echo "not found")
            if [[ "$status" == "active" ]]; then
                echo -e " ${GREEN}${NC} $svc: $status"
            else
                echo -e " ${RED}${NC} $svc: $status"
            fi
        done
        # Docker containers
        log "Docker containers:"
        docker ps --format "table {{.Names}}\t{{.Status}}" | tee -a "$LOG_FILE"
    fi
}
# ── Step 9: Post-restore verification ─────────────────────────────────────
verify() {
    # Post-restore smoke tests. Counts failures rather than aborting — but
    # the script runs under `set -e`, so nothing here may return non-zero at
    # statement level.
    step "Step 9: Post-restore verification"
    local issues=0
    # Check API
    if curl -sf http://localhost:8000/api/health &>/dev/null; then
        log "API health check: OK"
    else
        warn "API health check: FAILED (may still be starting)"
        # Fix: `((issues++))` exits with status 1 when the pre-increment value
        # is 0, which killed the whole script under `set -e` on the very
        # first failed check. Plain arithmetic assignment always succeeds.
        issues=$((issues + 1))
    fi
    # Check Immich
    if curl -sf http://localhost:2283/api/server-info/ping &>/dev/null; then
        log "Immich health check: OK"
    else
        warn "Immich health check: FAILED (may still be starting)"
        issues=$((issues + 1))
    fi
    # Check database
    if PGPASSWORD=PNsihOXvvuPwWiIvGlsc9Fh2YmMmB psql -h localhost -U media_downloader -d media_downloader -c "SELECT 1" &>/dev/null; then
        log "Media Downloader DB: OK"
    else
        warn "Media Downloader DB: FAILED"
        issues=$((issues + 1))
    fi
    # Disk usage
    log "Disk usage:"
    df -h /opt/immich /opt/media-downloader 2>/dev/null | tee -a "$LOG_FILE"
    echo ""
    if [[ $issues -eq 0 ]]; then
        log "${GREEN}Restore completed successfully!${NC}"
    else
        warn "Restore completed with $issues issue(s). Check the log: $LOG_FILE"
    fi
    echo ""
    log "Restore log saved to: $LOG_FILE"
}
# ── Main ──────────────────────────────────────────────────────────────────
main() {
    # Prints the banner, asks for a global go-ahead, then runs every restore
    # step in order; each destructive step re-confirms individually.
    echo -e "${BLUE}"
    echo "╔══════════════════════════════════════════════════════════════╗"
    echo "║ Cloud Backup Restore — Media Downloader ║"
    echo "║ ║"
    echo "║ Restores: App, Databases, Media, Configs, Services ║"
    echo "║ Source: Backblaze B2 (rclone crypt encrypted) ║"
    echo "╚══════════════════════════════════════════════════════════════╝"
    echo -e "${NC}"
    echo "This script will restore your media-downloader + Immich stack"
    echo "from an encrypted B2 cloud backup. Each step asks for confirmation."
    echo ""
    echo "Log: $LOG_FILE"
    echo ""
    if ! confirm "Ready to begin restore?"; then
        echo "Aborted."
        exit 0
    fi
    # Truncate the log at the start of each run.
    echo "" > "$LOG_FILE"
    preflight
    download_configs
    install_dependencies
    restore_app
    restore_immich
    restore_databases
    restore_services
    restore_media
    start_services
    verify
}
main "$@"

1227
scripts/cloud_backup_sync.py Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,47 @@
#!/bin/bash
# Create version-stamped locked backup using backup-central
set -e

# Get version from VERSION file (strip all whitespace, including newline)
VERSION=$(tr -d '[:space:]' < /opt/media-downloader/VERSION)

# Create timestamp
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
BACKUP_NAME="${VERSION}-${TIMESTAMP}"

# Configuration
PROFILE_ID="profile-media-downloader"

echo "╔════════════════════════════════════════════════╗"
echo "║        Media Downloader Version Backup         ║"
echo "╠════════════════════════════════════════════════╣"
echo "║ Version: ${VERSION}"
echo "║ Name: ${BACKUP_NAME}"
echo "╚════════════════════════════════════════════════╝"
echo ""
echo "⏳ Starting backup using backup-central..."
echo ""

# Run backup using CLI with profile, custom name, and locked flag.
# BUGFIX: the previous `if [ $? -eq 0 ]` on the line after the command was
# dead code — under `set -e` a failing backup exits the script before the
# check runs, so the "Backup failed" branch was unreachable. Testing the
# command directly keeps both branches live under errexit.
if backup-central backup -P "$PROFILE_ID" -n "$BACKUP_NAME" -l; then
    echo ""
    echo "╔════════════════════════════════════════════════╗"
    echo "║              Backup Complete                   ║"
    echo "╠════════════════════════════════════════════════╣"
    echo "║ Name: ${BACKUP_NAME}"
    echo "║ Profile: Media Downloader                      ║"
    echo "║ Status: Locked & Protected                     ║"
    echo "║ Type: Incremental                              ║"
    echo "╚════════════════════════════════════════════════╝"
    echo ""
    echo "✓ Version backup created successfully!"
    echo ""
else
    echo ""
    echo "✗ Backup failed!"
    echo ""
    exit 1
fi

93
scripts/db-cleanup.sh Executable file
View File

@@ -0,0 +1,93 @@
#!/bin/bash
# Database Cleanup Script
# Scans database for missing files and removes their references
# Runs via systemd timer nightly at 3:00 AM
set -e

# Configuration
API_URL="http://localhost:8000/api/maintenance/cleanup/missing-files"
STATUS_URL="http://localhost:8000/api/maintenance/cleanup/status"
LOG_FILE="/opt/media-downloader/logs/db-cleanup.log"
TOKEN_SCRIPT="/opt/media-downloader/scripts/get-api-token.sh"

# Logging function: timestamped line to stdout and the log file
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Get API token
if [ ! -f "$TOKEN_SCRIPT" ]; then
    log "ERROR: API token script not found at $TOKEN_SCRIPT"
    exit 1
fi
# BUGFIX: `|| true` so a failing token script reaches the explicit error
# message below instead of dying silently under `set -e`.
$TOKEN_SCRIPT > /dev/null 2>&1 || true
TOKEN=$(cat /tmp/api_token.txt 2>/dev/null)
if [ -z "$TOKEN" ]; then
    log "ERROR: Failed to get API token"
    exit 1
fi

log "Starting database cleanup (dry_run=false)"

# Start cleanup.
# BUGFIX: checking $? on a separate line after the command was dead code
# under `set -e` (a failed curl aborts the script before the check runs);
# test the command directly so the error branch can actually execute.
if ! RESPONSE=$(curl -s -X POST "$API_URL" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $TOKEN" \
    -d '{"dry_run": false}'); then
    log "ERROR: Failed to start cleanup"
    exit 1
fi

log "Cleanup started, waiting for completion..."

# Poll for status
MAX_WAIT=300  # 5 minutes max
WAITED=0
INTERVAL=5
while [ $WAITED -lt $MAX_WAIT ]; do
    sleep $INTERVAL
    WAITED=$((WAITED + INTERVAL))
    # `|| true`: a transient poll failure falls through to the
    # unknown-status warning below instead of silently killing the loop
    # under `set -e`.
    STATUS=$(curl -s "$STATUS_URL" -H "Authorization: Bearer $TOKEN" || true)
    STATUS_CODE=$(echo "$STATUS" | grep -o '"status":"[^"]*"' | cut -d'"' -f4)
    case "$STATUS_CODE" in
        "completed")
            TOTAL_CHECKED=$(echo "$STATUS" | grep -o '"total_checked":[0-9]*' | cut -d':' -f2)
            TOTAL_MISSING=$(echo "$STATUS" | grep -o '"total_missing":[0-9]*' | cut -d':' -f2)
            TOTAL_REMOVED=$(echo "$STATUS" | grep -o '"total_removed":[0-9]*' | cut -d':' -f2)
            DURATION=$(echo "$STATUS" | grep -o '"duration_seconds":[0-9.]*' | cut -d':' -f2)
            log "SUCCESS: Cleanup completed"
            log " Checked: $TOTAL_CHECKED files"
            log " Missing: $TOTAL_MISSING files"
            log " Removed: $TOTAL_REMOVED references"
            log " Duration: ${DURATION}s"
            exit 0
            ;;
        "failed")
            ERROR=$(echo "$STATUS" | grep -o '"error":"[^"]*"' | cut -d'"' -f4)
            log "ERROR: Cleanup failed - $ERROR"
            exit 1
            ;;
        "running")
            log "Still running... (${WAITED}s elapsed)"
            ;;
        "no_scan")
            log "ERROR: Cleanup job not found"
            exit 1
            ;;
        *)
            log "WARNING: Unknown status - $STATUS_CODE"
            ;;
    esac
done

log "ERROR: Cleanup timed out after ${MAX_WAIT}s"
exit 1

125
scripts/fix_kylie_tags.py Normal file
View File

@@ -0,0 +1,125 @@
#!/usr/bin/env python3
"""Fix tagged users for recently backfilled kyliejenner posts.
Fetches each post via /api/v1/media/{code}/info/ and inserts tagged users.
"""
import json
import string
import sys
import time
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import modules.db_bootstrap # noqa: F401
import sqlite3
from curl_cffi.requests import Session as CurlSession
from datetime import datetime
CREATOR_ID = 110
SLEEP_BETWEEN = 1.5
CHARSET = string.ascii_uppercase + string.ascii_lowercase + string.digits + '-_'
def shortcode_to_media_id(code):
    """Convert an Instagram shortcode to its numeric media ID string.

    Shortcodes are base-64 numbers over the URL-safe alphabet
    A-Z a-z 0-9 - _ (most significant digit first).

    Args:
        code: the shortcode, e.g. the path segment of a /p/<code>/ URL.

    Returns:
        The media ID as a decimal string.

    Raises:
        ValueError: if `code` contains a character outside the alphabet.
    """
    # Build the alphabet locally (identical to the module-level CHARSET)
    # so the function is self-contained and independently testable.
    charset = string.ascii_uppercase + string.ascii_lowercase + string.digits + '-_'
    media_id = 0
    for char in code:
        media_id = media_id * 64 + charset.index(char)
    return str(media_id)
def main():
    """Backfill tagged users for recently added posts of one creator.

    Loads Instagram cookies from the scrapers table, fetches each post's
    media info via the private web API, and inserts any tagged usernames
    into paid_content_post_tagged_users (idempotent via ON CONFLICT
    DO NOTHING). Sleeps SLEEP_BETWEEN seconds between requests and backs
    off 60s on HTTP 429.
    """
    # NOTE(review): 'media_downloader' is passed as the sqlite3 database
    # name; modules.db_bootstrap (imported at module top) presumably
    # redirects the sqlite3 module to the project's real backend — confirm
    # before reusing this pattern elsewhere.
    conn = sqlite3.connect('media_downloader')
    cursor = conn.cursor()
    # Load cookies
    cursor.execute("SELECT cookies_json FROM scrapers WHERE id = 'instagram_browser'")
    cookie_list = json.loads(cursor.fetchone()[0])
    # Get the backfilled posts (cutoff timestamp marks the backfill run)
    cursor.execute("""
        SELECT p.id, p.post_id FROM paid_content_posts p
        WHERE p.creator_id = ? AND p.added_at >= '2026-03-28T21:00:00'
        ORDER BY p.id
    """, (CREATOR_ID,))
    posts = cursor.fetchall()
    print(f"Found {len(posts)} posts to check for tags")
    # Impersonate a real browser TLS fingerprint; headers mimic Edge 131.
    session = CurlSession(impersonate='edge101')
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
        'X-IG-App-ID': '936619743392459',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.instagram.com/',
        'Origin': 'https://www.instagram.com',
        'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'Sec-CH-UA-Mobile': '?0',
        'Sec-CH-UA-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    })
    for c in cookie_list:
        if c.get('name') and c.get('value'):
            session.cookies.set(c['name'], c['value'], domain=c.get('domain', '.instagram.com'))
    tagged_count = 0
    now = datetime.now().isoformat()
    for i, (db_id, code) in enumerate(posts):
        try:
            media_id = shortcode_to_media_id(code)
            resp = session.get(
                f'https://www.instagram.com/api/v1/media/{media_id}/info/',
                timeout=10
            )
            if resp.status_code != 200:
                print(f" [{i+1}/{len(posts)}] {code}: HTTP {resp.status_code}")
                if resp.status_code == 429:
                    # Back off hard on rate limiting before continuing.
                    print(" Rate limited, waiting 60s...")
                    time.sleep(60)
                continue
            data = resp.json()
            items = data.get('items', [])
            if not items:
                print(f" [{i+1}/{len(posts)}] {code}: no items")
                continue
            node = items[0]
            # Collect unique tagged usernames from the post itself and from
            # every carousel item (albums can tag users per slide).
            tagged_users = []
            for tag in (node.get('usertags') or {}).get('in', []):
                uname = (tag.get('user') or {}).get('username')
                if uname and uname not in tagged_users:
                    tagged_users.append(uname)
            for cm in node.get('carousel_media') or []:
                for tag in (cm.get('usertags') or {}).get('in', []):
                    uname = (tag.get('user') or {}).get('username')
                    if uname and uname not in tagged_users:
                        tagged_users.append(uname)
            if tagged_users:
                for uname in tagged_users:
                    cursor.execute(
                        """INSERT INTO paid_content_post_tagged_users (post_id, username, created_at)
                        VALUES (?, ?, ?) ON CONFLICT (post_id, username) DO NOTHING""",
                        (db_id, uname, now)
                    )
                conn.commit()
                tagged_count += 1
                print(f" [{i+1}/{len(posts)}] {code}: {', '.join(tagged_users)}")
            else:
                # Periodic progress line for long untagged stretches
                if (i + 1) % 50 == 0:
                    print(f" [{i+1}/{len(posts)}] progress... ({tagged_count} tagged so far)")
        except Exception as e:
            # Best-effort backfill: log and keep going on any per-post error
            print(f" [{i+1}/{len(posts)}] {code}: error: {e}")
        time.sleep(SLEEP_BETWEEN)
    conn.close()
    print(f"\nDone! Tagged {tagged_count} posts out of {len(posts)}.")


if __name__ == '__main__':
    main()

114
scripts/fix_special_dirs.py Normal file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""
Rename special directories (manual_*, PPV, import_*) to use date format.
For multiple posts on same date, use suffixes: YYYY-MM-DD, YYYY-MM-DD_2, etc.
"""
import os
import sys
from pathlib import Path
from collections import defaultdict
sys.path.insert(0, '/opt/media-downloader')
from modules.unified_database import UnifiedDatabase
def main():
    """Rename special post directories to date-based names.

    Scans /opt/immich/paid/fansly/puffinasmr/<date>/ for subdirectories
    named manual_*, import_* or PPV and renames them to the date itself
    (with _2, _3... suffixes for collisions on the same date), updating
    attachment local_paths and manual post_ids in the database to match.
    Supports --dry-run to preview without touching disk or DB.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Fix special directory names')
    parser.add_argument('--dry-run', action='store_true', help='Show changes without making them')
    args = parser.parse_args()
    db = UnifiedDatabase()
    base_path = Path('/opt/immich/paid/fansly/puffinasmr')
    stats = {'renamed': 0, 'db_updated': 0, 'errors': 0}
    # Find all special directories grouped by date
    date_dirs = defaultdict(list)
    for date_dir in base_path.iterdir():
        if not date_dir.is_dir():
            continue
        date_str = date_dir.name
        for post_dir in date_dir.iterdir():
            if not post_dir.is_dir():
                continue
            name = post_dir.name
            # Check if it's a special directory (not a numeric post_id)
            if name.startswith('manual_') or name.startswith('import_') or name == 'PPV':
                date_dirs[date_str].append(post_dir)
    # Process each date
    for date_str, dirs in sorted(date_dirs.items()):
        # Check if a date-named directory already exists
        existing_date_dir = base_path / date_str / date_str
        suffix = 1
        if existing_date_dir.exists():
            # Find next available suffix, e.g. if <date> and <date>_2
            # already exist, start renaming at <date>_3.
            while (base_path / date_str / f"{date_str}_{suffix + 1}").exists():
                suffix += 1
            suffix += 1
        for old_dir in sorted(dirs, key=lambda d: d.name):
            # Determine new name: bare date for the first, date_N afterwards
            if suffix == 1:
                new_name = date_str
            else:
                new_name = f"{date_str}_{suffix}"
            new_dir = old_dir.parent / new_name
            # Skip if target exists (still consume the suffix slot)
            if new_dir.exists():
                print(f" SKIP (exists): {old_dir} -> {new_dir}")
                suffix += 1
                continue
            print(f" {old_dir.name} -> {new_name}")
            if not args.dry_run:
                try:
                    old_dir.rename(new_dir)
                    stats['renamed'] += 1
                    # Update database paths to point at the renamed directory
                    with db.get_connection(for_write=True) as conn:
                        cursor = conn.cursor()
                        cursor.execute("""
                            UPDATE paid_content_attachments
                            SET local_path = REPLACE(local_path, ?, ?)
                            WHERE local_path LIKE ?
                        """, (str(old_dir), str(new_dir), f"%{old_dir}%"))
                        stats['db_updated'] += cursor.rowcount
                        # Also update posts table if post_id matches the old dir name
                        old_name = old_dir.name
                        if old_name.startswith('manual_'):
                            cursor.execute("""
                                UPDATE paid_content_posts
                                SET post_id = ?
                                WHERE post_id = ?
                            """, (new_name, old_name))
                        conn.commit()
                except Exception as e:
                    print(f" ERROR: {e}")
                    stats['errors'] += 1
            # Advance suffix even in dry-run so previewed names stay unique
            suffix += 1
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Directories renamed: {stats['renamed']}")
    print(f"DB records updated: {stats['db_updated']}")
    print(f"Errors: {stats['errors']}")
    if args.dry_run:
        print("\n(Dry run - no changes made)")


if __name__ == '__main__':
    main()

103
scripts/generate-embeddings.py Executable file
View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""
Nightly embedding generation script
Run via systemd timer to index new media files
"""
import sys
import os
# Add parent directory to path for imports
sys.path.insert(0, '/opt/media-downloader')
# Bootstrap database backend (must be before any sqlite3 imports)
import modules.db_bootstrap # noqa: E402,F401
from modules.universal_logger import get_logger
from modules.unified_database import UnifiedDatabase
from modules.semantic_search import SemanticSearch
logger = get_logger('EmbeddingGenerator')
def generate_embeddings(db):
    """Generate embeddings for files that don't have them yet.

    Processes missing files in batches (up to max_batches * batch_size per
    run) via SemanticSearch, logging per-batch progress.

    Args:
        db: UnifiedDatabase instance passed to SemanticSearch.

    Returns:
        int: number of embeddings successfully generated this run
        (0 when nothing was missing or on failure).
    """
    logger.info("=== Embedding Generation ===")
    try:
        semantic = SemanticSearch(db)
        # Get current stats
        stats = semantic.get_embedding_stats()
        logger.info(f"Current stats: {stats['total_embeddings']} embeddings, "
                    f"{stats['missing_embeddings']} missing, "
                    f"{stats['coverage_percent']}% coverage")
        if stats['missing_embeddings'] == 0:
            logger.info("All files already have embeddings, nothing to do")
            return 0
        # Process in batches of 1000 files
        batch_size = 1000
        total_processed = 0
        max_batches = 10  # Process up to 10000 files per night
        for batch_num in range(max_batches):
            if stats['missing_embeddings'] == 0:
                break
            logger.info(f"Processing batch {batch_num + 1}/{max_batches} "
                        f"({stats['missing_embeddings']} files remaining)")

            def progress_callback(processed, total, current_file):
                # Log every 100 files to keep journal volume reasonable
                if processed % 100 == 0:
                    logger.info(f" Progress: {processed}/{total} - {current_file}")

            results = semantic.generate_embeddings_batch(
                limit=batch_size,
                progress_callback=progress_callback
            )
            total_processed += results['success']
            logger.info(f"Batch {batch_num + 1} complete: "
                        f"{results['success']} success, "
                        f"{results['errors']} errors, "
                        f"{results['skipped']} skipped")
            # Update stats for next iteration
            stats = semantic.get_embedding_stats()
        # Final stats.
        # FIX: reuse the stats fetched after the last batch instead of
        # issuing a redundant second get_embedding_stats() query — `stats`
        # is always freshly refetched at the end of the last iteration.
        final_stats = stats
        logger.info(f"Embedding generation complete: {total_processed} new embeddings generated")
        logger.info(f"Final coverage: {final_stats['coverage_percent']}% "
                    f"({final_stats['total_embeddings']}/{final_stats['total_files']} files)")
        return total_processed
    except Exception as e:
        # Nightly job: log and report zero rather than crash the timer unit
        logger.error(f"Embedding generation failed: {e}")
        return 0
def main():
    """Entry point: generate embeddings for files that don't have them yet.

    Exits with status 1 on failure so the systemd oneshot unit records it.
    """
    logger.info("Starting nightly embedding generation")
    try:
        # Initialize database
        db = UnifiedDatabase()
        # Generate embeddings
        embeddings_processed = generate_embeddings(db)
        # FIX: dropped the pointless f-string prefix on a constant message
        logger.info("=== Nightly indexing complete ===")
        logger.info(f" Embeddings generated: {embeddings_processed}")
    except Exception as e:
        logger.error(f"Nightly indexing failed: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()

18
scripts/get-api-token.sh Executable file
View File

@@ -0,0 +1,18 @@
#!/bin/bash
# Get API token for claude_test account and save to /tmp/api_token.txt
# Usage: /opt/media-downloader/scripts/get-api-token.sh
#
# After running this, use api-call.sh to make authenticated requests:
# /opt/media-downloader/scripts/api-call.sh "/api/video-queue?limit=2"
#
# NOTE(review): credentials are hard-coded here; consider reading them
# from a root-only env file instead.

TOKEN=$(curl -s -X POST "http://localhost:8000/api/auth/login" \
    -H "Content-Type: application/json" \
    -d '{"username": "claude_test", "password": "ClaudeTest2025Secure"}' | jq -r '.token')

if [ "$TOKEN" != "null" ] && [ -n "$TOKEN" ]; then
    echo "$TOKEN" > /tmp/api_token.txt
    # SECURITY FIX: /tmp files are world-readable by default; restrict the
    # bearer token to the owner.
    chmod 600 /tmp/api_token.txt
    echo "Token saved to /tmp/api_token.txt"
else
    echo "Failed to get token"
    exit 1
fi

45
scripts/get-podchaser-token.sh Executable file
View File

@@ -0,0 +1,45 @@
#!/bin/bash
# Helper script to exchange Podchaser client credentials for an access token

if [ -z "$1" ] || [ -z "$2" ]; then
    echo "Usage: $0 <client_id> <client_secret>"
    echo ""
    echo "Example:"
    echo " $0 'your-client-id' 'your-client-secret'"
    echo ""
    echo "Get your credentials from: https://www.podchaser.com/creators/dashboard/api"
    exit 1
fi

CLIENT_ID="$1"
CLIENT_SECRET="$2"

echo "Exchanging Podchaser credentials for access token..."

# BUGFIX: the request body used to be assembled by interpolating the
# credentials straight into a hand-written JSON string, which broke (and
# allowed payload injection) whenever they contained quotes or backslashes.
# jq's --arg plus tojson escapes them correctly for both the JSON body and
# the embedded GraphQL string literal (GraphQL strings use JSON-compatible
# escape rules, and tojson emits the surrounding quotes).
PAYLOAD=$(jq -cn --arg id "$CLIENT_ID" --arg secret "$CLIENT_SECRET" \
    '{query: "mutation { requestAccessToken(input: { grant_type: CLIENT_CREDENTIALS client_id: \($id | tojson) client_secret: \($secret | tojson) }) { access_token } }"}')

RESPONSE=$(curl -s -X POST "https://api.podchaser.com/graphql" \
    -H "Content-Type: application/json" \
    -d "$PAYLOAD")

# Check for errors
if echo "$RESPONSE" | jq -e '.errors' > /dev/null 2>&1; then
    echo "❌ Error getting access token:"
    echo "$RESPONSE" | jq -r '.errors[].message'
    exit 1
fi

# Extract access token
ACCESS_TOKEN=$(echo "$RESPONSE" | jq -r '.data.requestAccessToken.access_token')

if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then
    echo "❌ Failed to get access token. Response:"
    echo "$RESPONSE" | jq '.'
    exit 1
fi

echo ""
echo "✅ Success! Your Podchaser access token:"
echo ""
echo "$ACCESS_TOKEN"
echo ""
echo "This token is valid for 1 year."
echo "Copy this token and paste it into Configuration > Appearances > Podchaser API Key"

674
scripts/install.sh Executable file
View File

@@ -0,0 +1,674 @@
#!/bin/bash
# Media Downloader Installer Script
# Version: 13.13.1
# Installs to /opt/media-downloader
set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Installation directory
INSTALL_DIR="/opt/media-downloader"
SERVICE_NAME="media-downloader"
# Repo root = parent of the directory containing this script
CURRENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

echo -e "${GREEN}╔════════════════════════════════════════════════╗${NC}"
echo -e "${GREEN}║ Media Downloader Installer v13.13.1 ║${NC}"
echo -e "${GREEN}╚════════════════════════════════════════════════╝${NC}"
echo ""

# Check if running as root
if [[ $EUID -ne 0 ]]; then
    echo -e "${RED}This script must be run as root (use sudo)${NC}"
    exit 1
fi

# Get the actual user who ran sudo (installed files should not be root-owned)
ACTUAL_USER="${SUDO_USER:-$USER}"
ACTUAL_HOME=$(getent passwd "$ACTUAL_USER" | cut -d: -f6)

echo -e "${YELLOW}Installation Settings:${NC}"
echo " Install directory: $INSTALL_DIR"
echo " Service name: $SERVICE_NAME"
echo " User: $ACTUAL_USER"
echo " Source: $CURRENT_DIR"
echo ""
read -p "Continue with installation? (y/n) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Installation cancelled"
    exit 1
fi

# Stop services if they exist (ignore errors on first install)
echo -e "${YELLOW}Stopping existing services...${NC}"
systemctl stop $SERVICE_NAME 2>/dev/null || true
systemctl stop media-downloader-api 2>/dev/null || true
systemctl stop media-downloader-frontend 2>/dev/null || true
systemctl stop xvfb-media-downloader 2>/dev/null || true

# Create installation directory
echo -e "${GREEN}Creating installation directory...${NC}"
mkdir -p "$INSTALL_DIR"

# Copy files (skip VCS metadata, build artifacts and environments)
echo -e "${GREEN}Copying files...${NC}"
rsync -a --exclude='.git' --exclude='node_modules' --exclude='venv' --exclude='__pycache__' \
    --exclude='.playwright' --exclude='dist' --exclude='*.pyc' \
    "$CURRENT_DIR/" "$INSTALL_DIR/"

# Create required directories
echo -e "${GREEN}Creating required directories...${NC}"
mkdir -p "$INSTALL_DIR/logs"
mkdir -p "$INSTALL_DIR/database"
mkdir -p "$INSTALL_DIR/cookies"
mkdir -p "$INSTALL_DIR/sessions"
mkdir -p "$INSTALL_DIR/config"
mkdir -p "$INSTALL_DIR/data"
mkdir -p "$INSTALL_DIR/data/face_references" # Face recognition reference images
mkdir -p "$INSTALL_DIR/data/cache/profile_images" # Cached creator avatars/banners
mkdir -p "/opt/immich/review" # Face recognition review queue
mkdir -p "/opt/immich/recycle" # Recycle bin
mkdir -p "/var/log/media-downloader" # System log directory

# Set permissions
echo -e "${GREEN}Setting permissions...${NC}"
chown -R "$ACTUAL_USER:$ACTUAL_USER" "$INSTALL_DIR"
chmod +x "$INSTALL_DIR/media-downloader.py"
chmod +x "$INSTALL_DIR/scripts/"*.sh
chmod +x "$INSTALL_DIR/scripts/"*.py 2>/dev/null || true

# Install system dependencies
# NOTE(review): apt output is discarded; under `set -e` an apt failure
# aborts the installer with no message — consider logging to a file.
echo -e "${GREEN}Installing system dependencies...${NC}"
apt-get update > /dev/null 2>&1
apt-get install -y cmake build-essential libopenblas-dev liblapack-dev \
    ffmpeg redis-server xvfb nodejs npm \
    libheif-examples imagemagick \
    postgresql postgresql-contrib libpq-dev > /dev/null 2>&1

# Start Redis if not running
systemctl enable redis-server
systemctl start redis-server

# Setup PostgreSQL (idempotent: create role/db only if absent)
echo -e "${GREEN}Setting up PostgreSQL...${NC}"
systemctl enable postgresql
systemctl start postgresql
sudo -u postgres psql -tc "SELECT 1 FROM pg_roles WHERE rolname='media_downloader'" | grep -q 1 || \
    sudo -u postgres psql -c "CREATE USER media_downloader WITH PASSWORD 'changeme';"
sudo -u postgres psql -tc "SELECT 1 FROM pg_database WHERE datname='media_downloader'" | grep -q 1 || \
    sudo -u postgres createdb -O media_downloader media_downloader
echo -e "${GREEN}✓ PostgreSQL configured (user: media_downloader, db: media_downloader)${NC}"
echo -e "${YELLOW}⚠ Remember to update DATABASE_URL in .env with the correct password${NC}"

# Create virtual environment (recreated from scratch on every install)
echo -e "${GREEN}Creating Python virtual environment...${NC}"
rm -rf "$INSTALL_DIR/venv" 2>/dev/null || true
python3 -m venv "$INSTALL_DIR/venv"
chown -R "$ACTUAL_USER:$ACTUAL_USER" "$INSTALL_DIR/venv"

# Install Python dependencies (as the target user so file ownership stays sane)
echo -e "${GREEN}Installing Python dependencies from requirements.txt...${NC}"
sudo -u "$ACTUAL_USER" "$INSTALL_DIR/venv/bin/python" -m pip install --upgrade pip
sudo -u "$ACTUAL_USER" "$INSTALL_DIR/venv/bin/python" -m pip install -r "$INSTALL_DIR/requirements.txt"

# Install playwright browsers
echo -e "${GREEN}Installing Playwright browsers...${NC}"
sudo -u "$ACTUAL_USER" bash -c "cd '$INSTALL_DIR' && '$INSTALL_DIR/venv/bin/python' -m playwright install chromium firefox"

# Install frontend dependencies
echo -e "${GREEN}Installing frontend dependencies...${NC}"
cd "$INSTALL_DIR/web/frontend"
sudo -u "$ACTUAL_USER" npm install
sudo -u "$ACTUAL_USER" npm run build

# Python interpreter inside the freshly created venv (used by all units below)
PYTHON_BIN="$INSTALL_DIR/venv/bin/python"

# ============================================================================
# CHECK DEPENDENCIES
# ============================================================================
echo ""
echo -e "${BLUE}Checking Dependencies...${NC}"

# Check for FlareSolverr (Cloudflare bypass helper, runs as a Docker container)
if command -v docker &> /dev/null; then
    if docker ps | grep -q flaresolverr; then
        echo -e "${GREEN}✓ FlareSolverr container is running${NC}"
    else
        echo -e "${YELLOW}⚠ FlareSolverr container not found${NC}"
        read -p "Install FlareSolverr Docker container now? (recommended) (y/n) " -n 1 -r
        echo
        if [[ $REPLY =~ ^[Yy]$ ]]; then
            docker run -d \
                --name flaresolverr \
                -p 8191:8191 \
                -e LOG_LEVEL=info \
                --restart unless-stopped \
                ghcr.io/flaresolverr/flaresolverr:latest
            echo -e "${GREEN}✓ FlareSolverr installed on port 8191${NC}"
        fi
    fi
else
    echo -e "${YELLOW}⚠ Docker not found. FlareSolverr requires Docker for Cloudflare bypass.${NC}"
fi
# ============================================================================
# CREATE SYSTEMD SERVICES
# ============================================================================
# Unquoted EOF delimiters: $ACTUAL_USER/$INSTALL_DIR/$PYTHON_BIN expand at
# install time, baking absolute paths into the unit files.
echo ""
echo -e "${GREEN}Creating systemd services...${NC}"
# 1. Xvfb Virtual Display Service (headless X display :100 for browser automation)
cat > "/etc/systemd/system/xvfb-media-downloader.service" << EOF
[Unit]
Description=Xvfb Virtual Display for Media Downloader
After=network.target
[Service]
Type=simple
User=root
ExecStart=/usr/bin/Xvfb :100 -screen 0 1920x1080x24 -nolisten tcp
Restart=always
RestartSec=5
[Install]
WantedBy=multi-user.target
EOF
# 2. Main Scheduler Service
cat > "/etc/systemd/system/$SERVICE_NAME.service" << EOF
[Unit]
Description=Media Downloader Scheduler Service
After=network.target xvfb-media-downloader.service redis-server.service
Wants=xvfb-media-downloader.service
[Service]
Type=simple
User=$ACTUAL_USER
Group=$ACTUAL_USER
WorkingDirectory=$INSTALL_DIR
ExecStart=$PYTHON_BIN $INSTALL_DIR/media-downloader.py --scheduler
Restart=on-failure
RestartSec=30
StandardOutput=append:$INSTALL_DIR/logs/service.log
StandardError=append:$INSTALL_DIR/logs/service.log
Environment="PYTHONUNBUFFERED=1"
Environment="PYTHONDONTWRITEBYTECODE=1"
Environment="DISPLAY=:100"
LimitNOFILE=65536
Nice=10
[Install]
WantedBy=multi-user.target
EOF
# 3. Web API Service
cat > "/etc/systemd/system/media-downloader-api.service" << EOF
[Unit]
Description=Media Downloader Web API
After=network.target redis-server.service
Wants=redis-server.service
[Service]
Type=simple
User=root
Group=root
WorkingDirectory=$INSTALL_DIR/web/backend
ExecStart=$PYTHON_BIN $INSTALL_DIR/web/backend/api.py
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
Environment="PYTHONUNBUFFERED=1"
Environment="PYTHONDONTWRITEBYTECODE=1"
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
EOF
# 4. Web Frontend Service (Production - serves pre-built static files)
# Note: For development with hot-reload, run: cd web/frontend && npm run dev
cat > "/etc/systemd/system/media-downloader-frontend.service" << EOF
[Unit]
Description=Media Downloader Web Frontend (Production)
After=network.target media-downloader-api.service
Wants=media-downloader-api.service
[Service]
Type=simple
User=root
Group=root
WorkingDirectory=$INSTALL_DIR/web/frontend
ExecStart=/usr/bin/npm run preview -- --host 0.0.0.0 --port 5173
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
Environment="NODE_ENV=production"
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
EOF
# 4b. Development Frontend Service (optional - hot reload)
cat > "/etc/systemd/system/media-downloader-frontend-dev.service" << EOF
[Unit]
Description=Media Downloader Web Frontend (Development - Hot Reload)
After=network.target media-downloader-api.service
Wants=media-downloader-api.service
[Service]
Type=simple
User=$ACTUAL_USER
Group=$ACTUAL_USER
WorkingDirectory=$INSTALL_DIR/web/frontend
ExecStart=/usr/bin/npm run dev -- --host 0.0.0.0 --port 5173
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
Environment="NODE_ENV=development"
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
EOF
# 5. Thumbnail Cache Builder Service + Timer
# (oneshot batch job, heavily throttled: Nice=19, idle IO, 50% CPU quota)
cat > "/etc/systemd/system/media-cache-builder.service" << EOF
[Unit]
Description=Media Thumbnail and Metadata Cache Builder
After=network.target
[Service]
Type=oneshot
User=root
WorkingDirectory=$INSTALL_DIR
ExecStart=$PYTHON_BIN $INSTALL_DIR/modules/thumbnail_cache_builder.py
StandardOutput=journal
StandardError=journal
TimeoutStartSec=3600
Nice=19
IOSchedulingClass=idle
CPUQuota=50%
[Install]
WantedBy=multi-user.target
EOF
cat > "/etc/systemd/system/media-cache-builder.timer" << EOF
[Unit]
Description=Daily Media Cache Builder Timer
Requires=media-cache-builder.service
[Timer]
OnCalendar=*-*-* 03:00:00
Persistent=true
RandomizedDelaySec=30min
[Install]
WantedBy=timers.target
EOF
# 6. Embedding Generator Service + Timer
cat > "/etc/systemd/system/media-embedding-generator.service" << EOF
[Unit]
Description=Media Downloader Embedding Generator (CLIP)
After=network.target
[Service]
Type=oneshot
User=root
WorkingDirectory=$INSTALL_DIR
ExecStart=$PYTHON_BIN $INSTALL_DIR/scripts/generate-embeddings.py
StandardOutput=journal
StandardError=journal
TimeoutStartSec=3600
[Install]
WantedBy=multi-user.target
EOF
cat > "/etc/systemd/system/media-embedding-generator.timer" << EOF
[Unit]
Description=Nightly Media Embedding Generation Timer
Requires=media-embedding-generator.service
[Timer]
OnCalendar=*-*-* 03:00:00
RandomizedDelaySec=1800
Persistent=true
[Install]
WantedBy=timers.target
EOF
# 7. Celebrity Enrichment Service + Timer
cat > "/etc/systemd/system/media-celebrity-enrichment.service" << EOF
[Unit]
Description=Media Downloader Celebrity Metadata Enrichment
After=network.target
[Service]
Type=oneshot
User=root
WorkingDirectory=$INSTALL_DIR
ExecStart=$PYTHON_BIN $INSTALL_DIR/scripts/enrich_celebrity_metadata.py
StandardOutput=journal
StandardError=journal
TimeoutStartSec=3600
[Install]
WantedBy=multi-user.target
EOF
cat > "/etc/systemd/system/media-celebrity-enrichment.timer" << EOF
[Unit]
Description=Nightly Celebrity Metadata Enrichment Timer
Requires=media-celebrity-enrichment.service
[Timer]
OnCalendar=*-*-* 04:00:00
RandomizedDelaySec=300
Persistent=true
[Install]
WantedBy=timers.target
EOF
# 8. Plex Matching Service + Timer
# NOTE(review): this service has no [Install] section — it is only ever
# started by its timer; confirm that is intended.
cat > "/etc/systemd/system/plex-match.service" << EOF
[Unit]
Description=Match appearances to Plex library
After=media-downloader-api.service
Requires=media-downloader-api.service
[Service]
Type=oneshot
ExecStart=$INSTALL_DIR/scripts/plex-match.sh
User=root
EOF
cat > "/etc/systemd/system/plex-match.timer" << EOF
[Unit]
Description=Run Plex matching twice daily
[Timer]
OnCalendar=*-*-* 06:00:00
OnCalendar=*-*-* 18:00:00
Persistent=true
[Install]
WantedBy=timers.target
EOF
# 9. Database Cleanup Service + Timer
cat > "/etc/systemd/system/media-downloader-db-cleanup.service" << EOF
[Unit]
Description=Media Downloader Database Cleanup
After=network.target media-downloader-api.service
[Service]
Type=oneshot
User=root
WorkingDirectory=$INSTALL_DIR
ExecStart=$INSTALL_DIR/scripts/db-cleanup.sh
StandardOutput=append:$INSTALL_DIR/logs/db-cleanup.log
StandardError=append:$INSTALL_DIR/logs/db-cleanup.log
MemoryMax=512M
CPUQuota=50%
Restart=no
[Install]
WantedBy=multi-user.target
EOF
cat > "/etc/systemd/system/media-downloader-db-cleanup.timer" << EOF
[Unit]
Description=Media Downloader Database Cleanup Timer
Requires=media-downloader-db-cleanup.service
[Timer]
OnCalendar=*-*-* 03:00:00
Persistent=true
OnBootSec=5min
RandomizedDelaySec=10min
[Install]
WantedBy=timers.target
EOF
# Create command-line wrapper
# (\$@ is escaped so argument forwarding happens at run time, not install time)
echo -e "${GREEN}Creating command-line wrapper...${NC}"
cat > "/usr/local/bin/media-downloader" << EOF
#!/bin/bash
cd $INSTALL_DIR
export DISPLAY=:100
$PYTHON_BIN $INSTALL_DIR/media-downloader.py "\$@"
EOF
chmod +x "/usr/local/bin/media-downloader"

# Copy config if it doesn't exist (never overwrite an existing settings.json)
if [ ! -f "$INSTALL_DIR/config/settings.json" ]; then
    echo -e "${GREEN}Copying default configuration...${NC}"
    if [ -f "$INSTALL_DIR/config/settings.example.json" ]; then
        cp "$INSTALL_DIR/config/settings.example.json" "$INSTALL_DIR/config/settings.json"
    fi
    chown -R "$ACTUAL_USER:$ACTUAL_USER" "$INSTALL_DIR/config"
    # Settings may contain credentials — restrict to owner
    chmod 600 "$INSTALL_DIR/config/settings.json" 2>/dev/null || true
fi

# Reload systemd so the freshly written units are picked up
echo -e "${GREEN}Reloading systemd...${NC}"
systemctl daemon-reload

# ============================================================================
# ENABLE AND START SERVICES
# ============================================================================
echo ""
echo -e "${BLUE}Service Configuration${NC}"
read -p "Enable and start all services? (y/n) " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
    echo -e "${GREEN}Enabling services...${NC}"
    # Core services
    systemctl enable xvfb-media-downloader.service
    systemctl enable $SERVICE_NAME.service
    systemctl enable media-downloader-api.service
    # Timers
    systemctl enable media-cache-builder.timer
    systemctl enable media-embedding-generator.timer
    systemctl enable media-celebrity-enrichment.timer
    systemctl enable plex-match.timer
    systemctl enable media-downloader-db-cleanup.timer
    echo -e "${GREEN}Starting services...${NC}"
    # Start in order (display server before the browser-driven scheduler)
    systemctl start xvfb-media-downloader.service
    sleep 2
    systemctl start media-downloader-api.service
    sleep 2
    systemctl start $SERVICE_NAME.service
    # Start timers
    systemctl start media-cache-builder.timer
    systemctl start media-embedding-generator.timer
    systemctl start media-celebrity-enrichment.timer
    systemctl start plex-match.timer
    systemctl start media-downloader-db-cleanup.timer
    echo -e "${GREEN}✓ All services started${NC}"
fi
# ============================================================================
# OPTIONAL: NGINX REVERSE PROXY
# ============================================================================
echo ""
if command -v nginx &> /dev/null; then
    read -p "Configure nginx reverse proxy? (recommended for production) (y/n) " -n 1 -r
    echo
    if [[ $REPLY =~ ^[Yy]$ ]]; then
        echo -e "${GREEN}Creating nginx configuration...${NC}"
        # Quoted delimiter ('NGINX_EOF') prevents expansion of nginx's $vars
        cat > "/etc/nginx/sites-available/media-downloader" << 'NGINX_EOF'
# Media Downloader Nginx Configuration
# Reverse proxy for API (8000) and Frontend (5173)
server {
    listen 80;
    server_name _; # Change to your domain
    # Frontend (Vite)
    location / {
        proxy_pass http://127.0.0.1:5173;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection 'upgrade';
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_cache_bypass $http_upgrade;
    }
    # API Backend
    location /api/ {
        proxy_pass http://127.0.0.1:8000/api/;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        # WebSocket support
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        # Timeouts for long-running requests
        proxy_connect_timeout 60s;
        proxy_send_timeout 300s;
        proxy_read_timeout 300s;
    }
    # WebSocket endpoint
    location /ws {
        proxy_pass http://127.0.0.1:8000/ws;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_read_timeout 86400;
    }
    # Media files (if serving directly)
    location /media/ {
        alias /opt/immich/media/;
        autoindex off;
    }
    # Thumbnails
    location /thumbnails/ {
        proxy_pass http://127.0.0.1:8000/api/thumbnails/;
        proxy_cache_valid 200 1d;
    }
    # Increase max upload size for imports
    client_max_body_size 500M;
}
NGINX_EOF
        # Enable site
        ln -sf /etc/nginx/sites-available/media-downloader /etc/nginx/sites-enabled/ 2>/dev/null || true
        # Test and reload nginx (non-fatal: installer continues either way)
        if nginx -t 2>/dev/null; then
            systemctl reload nginx
            echo -e "${GREEN}✓ Nginx configured and reloaded${NC}"
        else
            echo -e "${YELLOW}⚠ Nginx config has errors - please check manually${NC}"
        fi
    fi
else
    echo -e "${YELLOW}Note: nginx not installed. For production, consider:${NC}"
    echo " sudo apt install nginx"
    echo " Then re-run installer or manually configure reverse proxy"
fi

# ============================================================================
# COMPLETION MESSAGE
# ============================================================================
echo ""
echo -e "${GREEN}╔════════════════════════════════════════════════╗${NC}"
echo -e "${GREEN}║ Installation Complete! ║${NC}"
echo -e "${GREEN}╚════════════════════════════════════════════════╝${NC}"
echo ""
echo -e "${BLUE}Installation location:${NC} $INSTALL_DIR"
echo -e "${BLUE}Configuration file:${NC} $INSTALL_DIR/config/settings.json"
echo -e "${BLUE}Database directory:${NC} $INSTALL_DIR/database"
echo -e "${BLUE}Logs directory:${NC} $INSTALL_DIR/logs"
echo ""
echo -e "${YELLOW}Services:${NC}"
echo " media-downloader - Main scheduler service"
echo " media-downloader-api - Web API (port 8000)"
echo " media-downloader-frontend - Web UI production (port 5173)"
echo " media-downloader-frontend-dev - Web UI development with hot-reload"
echo " xvfb-media-downloader - Virtual display for browser automation"
echo ""
echo -e "${YELLOW}Scheduled Tasks (timers):${NC}"
echo " media-cache-builder - Thumbnail cache (daily 3 AM)"
echo " media-embedding-generator - CLIP embeddings (daily 3 AM)"
echo " media-downloader-db-cleanup- Database cleanup (daily 3 AM)"
echo " media-celebrity-enrichment - Celebrity metadata (daily 4 AM)"
echo " plex-match - Plex library matching (6 AM, 6 PM)"
echo ""
echo -e "${YELLOW}Commands:${NC}"
echo " media-downloader - Run manual download"
echo " media-downloader --scheduler - Run scheduler"
echo " media-downloader --scheduler-status - Check scheduler status"
echo " media-downloader --platform instagram - Download specific platform"
echo ""
echo -e "${YELLOW}Service Management:${NC}"
echo " sudo systemctl status media-downloader - Check status"
echo " sudo systemctl restart media-downloader - Restart scheduler"
echo " sudo systemctl restart media-downloader-api- Restart API"
echo " sudo journalctl -u media-downloader -f - View logs"
echo ""
echo -e "${YELLOW}Web Interface:${NC}"
echo " API: http://localhost:8000"
echo " Frontend: http://localhost:5173"
echo ""
echo -e "${YELLOW}Development Mode:${NC}"
echo " # Switch to development frontend (with hot-reload):"
echo " sudo systemctl stop media-downloader-frontend"
echo " sudo systemctl start media-downloader-frontend-dev"
echo ""
echo -e "${YELLOW}To uninstall:${NC}"
echo " sudo $INSTALL_DIR/scripts/uninstall.sh"

307
scripts/mds Executable file
View File

@@ -0,0 +1,307 @@
#!/bin/bash
# mds - Media Downloader Services manager
# Usage: mds [command] [service(s)...]
#
# Commands:
# status - Show status of services (default)
# start - Start service(s)
# stop - Stop service(s)
# restart - Restart service(s)
# logs - Show recent logs for a service
#
# Services:
# all - All services
# scheduler - media-downloader (scheduler)
# api - media-downloader-api
# frontend - media-downloader-frontend
# xvfb - xvfb-media-downloader
# proxy - unified-proxy (Docker)
# cache - media-cache-builder
# enrich - media-celebrity-enrichment
# embeddings - media-embedding-generator
# dbcleanup - media-downloader-db-cleanup
# backup - cloud-backup-sync
# backupui - backup-central (web UI)
#
# Examples:
# mds # status of all services
# mds status # same
# mds restart api # restart just the API
# mds restart api frontend # restart API and frontend
# mds restart all # restart all services
# mds stop scheduler # stop the scheduler
# mds logs scheduler # show scheduler logs
set -euo pipefail

# NOTE(review): the `help` command prints lines 3-27 of this file
# (`head -27 | tail -25`), so do not insert lines above `set -euo pipefail`
# without also updating those numbers.

# Color codes (ANSI escapes; NC resets)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
BOLD='\033[1m'
DIM='\033[2m'
NC='\033[0m' # No Color

# Service name mapping: short CLI alias → systemd unit name.
# "proxy" is deliberately absent — it is a Docker container and every
# caller special-cases it.
declare -A SERVICE_MAP=(
    [scheduler]="media-downloader"
    [api]="media-downloader-api"
    [frontend]="media-downloader-frontend"
    [xvfb]="xvfb-media-downloader"
    [cache]="media-cache-builder"
    [enrich]="media-celebrity-enrichment"
    [embeddings]="media-embedding-generator"
    [dbcleanup]="media-downloader-db-cleanup"
    [backup]="cloud-backup-sync"
    [backupui]="backup-central"
)

# Display/processing order used when the user asks for "all".
ALL_ALIASES=(scheduler api frontend xvfb proxy cache enrich embeddings dbcleanup backup backupui)
resolve_service() {
    # Translate a short alias (or a literal unit name) into its service name.
    # Prints "proxy" for the Docker proxy, the systemd unit for known
    # aliases, the input itself if it is already a known unit name, or ""
    # when nothing matches.
    local name="$1"
    if [[ "$name" == "proxy" ]]; then
        echo "proxy"
        return
    fi
    if [[ -n "${SERVICE_MAP[$name]+x}" ]]; then
        echo "${SERVICE_MAP[$name]}"
        return
    fi
    # Accept a literal service name if it matches one of the mapped units.
    local key
    for key in "${!SERVICE_MAP[@]}"; do
        if [[ "${SERVICE_MAP[$key]}" == "$name" ]]; then
            echo "$name"
            return
        fi
    done
    echo ""
}
get_alias() {
    # Reverse lookup: systemd unit name → short alias.
    # Echoes the input unchanged when no alias is registered for it.
    local unit="$1"
    local k
    for k in "${!SERVICE_MAP[@]}"; do
        if [[ "${SERVICE_MAP[$k]}" == "$unit" ]]; then
            echo "$k"
            return
        fi
    done
    echo "$unit"
}
print_status_line() {
    # Print one aligned, colour-coded status row for a service.
    #   $1 = short alias, $2 = resolved service name (or "proxy")
    # Colours: green=active, dim=cleanly stopped, yellow=deactivating,
    # blue=activating, red=failed/unknown.
    local alias="$1"
    local service="$2"
    if [[ "$alias" == "proxy" ]]; then
        # The proxy is a Docker container, not a systemd unit.
        # (Fix: dropped an unused `uptime=$(docker inspect ...StartedAt...)`
        # local that cost an extra docker call and was never printed.)
        local state
        state=$(docker inspect -f '{{.State.Status}}' unified-proxy 2>/dev/null || echo "not found")
        local color="$RED"
        local symbol="●"
        if [[ "$state" == "running" ]]; then
            color="$GREEN"
        fi
        printf " ${color}${symbol}${NC} %-12s %-30s %s\n" "$alias" "unified-proxy (docker)" "$state"
        return
    fi
    local active_state sub_state
    active_state=$(systemctl show -p ActiveState --value "$service" 2>/dev/null || echo "unknown")
    sub_state=$(systemctl show -p SubState --value "$service" 2>/dev/null || echo "unknown")
    # Pick the dot colour from the unit state.
    local color="$RED"
    local symbol="●"
    case "$active_state" in
        active)
            color="$GREEN"
            ;;
        inactive)
            if [[ "$sub_state" == "failed" ]]; then
                color="$RED"
            else
                color="$DIM"
            fi
            ;;
        deactivating)
            color="$YELLOW"
            ;;
        activating)
            color="$BLUE"
            ;;
        failed)
            color="$RED"
            ;;
    esac
    # Gather memory and PID details for running units only.
    local memory=""
    local pid=""
    if [[ "$active_state" == "active" ]]; then
        memory=$(systemctl show -p MemoryCurrent --value "$service" 2>/dev/null || echo "")
        pid=$(systemctl show -p MainPID --value "$service" 2>/dev/null || echo "")
        if [[ -n "$memory" && "$memory" != "[not set]" && "$memory" != "infinity" ]]; then
            # Convert bytes to human readable (one decimal of G above 1 GiB).
            local mem_mb=$((memory / 1024 / 1024))
            if [[ $mem_mb -gt 1024 ]]; then
                local mem_gb=$((mem_mb / 1024))
                local mem_frac=$(( (mem_mb % 1024) * 10 / 1024 ))
                memory="${mem_gb}.${mem_frac}G"
            else
                memory="${mem_mb}M"
            fi
        else
            memory=""
        fi
    fi
    local details
    if [[ "$active_state" == "inactive" && "$sub_state" == "dead" ]]; then
        details="stopped"
    elif [[ "$active_state" == "failed" || "$sub_state" == "failed" ]]; then
        details="failed"
    elif [[ "$active_state" == "active" && "$sub_state" == "running" ]]; then
        details="running"
    else
        details="$active_state ($sub_state)"
    fi
    if [[ -n "$memory" ]]; then
        details="$details ${DIM}mem: ${memory}${NC}"
    fi
    if [[ -n "$pid" && "$pid" != "0" ]]; then
        details="$details ${DIM}pid: ${pid}${NC}"
    fi
    printf " ${color}${symbol}${NC} %-12s %-30s %b\n" "$alias" "$service" "$details"
}
do_status() {
    # Render a status row for each requested alias; with no args, show all.
    local requested=("$@")
    if [[ ${#requested[@]} -eq 0 ]]; then
        requested=("${ALL_ALIASES[@]}")
    fi
    echo -e "\n${BOLD}Media Downloader Services${NC}\n"
    local alias resolved
    for alias in "${requested[@]}"; do
        resolved=$(resolve_service "$alias")
        if [[ -n "$resolved" ]]; then
            print_status_line "$alias" "$resolved"
        else
            echo -e " ${RED}?${NC} ${alias} (unknown service)"
        fi
    done
    echo ""
}
do_action() {
    # Apply start/stop/restart to one or more aliases.
    #   $1 = action; remaining args = aliases ("all" expands to everything).
    # The Docker-managed proxy is driven with `docker`, everything else
    # with `sudo systemctl`. The `cmd && ok || fail` chains report per-unit
    # results without aborting the loop under `set -e`.
    local action="$1"
    shift
    local services=("$@")
    if [[ ${#services[@]} -eq 0 ]]; then
        echo -e "${RED}Error: specify service(s) or 'all'${NC}"
        echo "Usage: mds $action [service(s)...]"
        exit 1
    fi
    # Expand 'all'
    if [[ "${services[0]}" == "all" ]]; then
        services=("${ALL_ALIASES[@]}")
    fi
    for alias in "${services[@]}"; do
        local service
        service=$(resolve_service "$alias")
        if [[ -z "$service" ]]; then
            echo -e " ${RED}✗${NC} ${alias}: unknown service"
            continue
        fi
        if [[ "$alias" == "proxy" ]]; then
            # Docker container path — no systemd unit exists for the proxy.
            case "$action" in
                start)
                    echo -e " ${CYAN}▶${NC} Starting unified-proxy..."
                    docker start unified-proxy 2>/dev/null && echo -e " ${GREEN}✓${NC} unified-proxy started" || echo -e " ${RED}✗${NC} Failed to start unified-proxy"
                    ;;
                stop)
                    echo -e " ${YELLOW}■${NC} Stopping unified-proxy..."
                    docker stop unified-proxy 2>/dev/null && echo -e " ${GREEN}✓${NC} unified-proxy stopped" || echo -e " ${RED}✗${NC} Failed to stop unified-proxy"
                    ;;
                restart)
                    echo -e " ${CYAN}↻${NC} Restarting unified-proxy..."
                    docker restart unified-proxy 2>/dev/null && echo -e " ${GREEN}✓${NC} unified-proxy restarted" || echo -e " ${RED}✗${NC} Failed to restart unified-proxy"
                    ;;
            esac
            continue
        fi
        # systemd-managed units.
        case "$action" in
            start)
                echo -e " ${CYAN}▶${NC} Starting ${alias} (${service})..."
                sudo systemctl start "$service" && echo -e " ${GREEN}✓${NC} ${alias} started" || echo -e " ${RED}✗${NC} Failed to start ${alias}"
                ;;
            stop)
                echo -e " ${YELLOW}■${NC} Stopping ${alias} (${service})..."
                sudo systemctl stop "$service" && echo -e " ${GREEN}✓${NC} ${alias} stopped" || echo -e " ${RED}✗${NC} Failed to stop ${alias}"
                ;;
            restart)
                echo -e " ${CYAN}↻${NC} Restarting ${alias} (${service})..."
                sudo systemctl restart "$service" && echo -e " ${GREEN}✓${NC} ${alias} restarted" || echo -e " ${RED}✗${NC} Failed to restart ${alias}"
                ;;
        esac
    done
}
do_logs() {
    # Show the last 50 log lines for a single service (default: scheduler).
    local target="${1:-scheduler}"
    local unit
    unit=$(resolve_service "$target")
    if [[ -z "$unit" ]]; then
        echo -e "${RED}Unknown service: ${target}${NC}"
        exit 1
    fi
    if [[ "$target" == "proxy" ]]; then
        # Docker container → docker logs, not the journal.
        docker logs --tail 50 unified-proxy
        return
    fi
    sudo journalctl -u "$unit" --no-pager -n 50
}
# Main
# First positional argument selects the command (default: status); the
# remaining arguments are service aliases passed through to the handler.
command="${1:-status}"
shift 2>/dev/null || true
case "$command" in
    status|st|s)
        do_status "$@"
        ;;
    start)
        do_action start "$@"
        ;;
    stop)
        do_action stop "$@"
        ;;
    restart|rs|r)
        do_action restart "$@"
        ;;
    logs|log|l)
        do_logs "$@"
        ;;
    help|--help|-h)
        # Print the usage comment at the top of this file (file lines 3-27).
        # NOTE(review): these numbers are coupled to the header's exact
        # length — update them whenever the header comment changes.
        head -27 "$0" | tail -25
        ;;
    *)
        echo -e "${RED}Unknown command: ${command}${NC}"
        echo "Commands: status, start, stop, restart, logs"
        exit 1
        ;;
esac

View File

@@ -0,0 +1,433 @@
#!/usr/bin/env python3
"""
Migrate Immich assets into file_inventory and face_recognition_scans.
Connects to Immich PostgreSQL (via docker exec) and app PostgreSQL directly.
Idempotent — safe to re-run. Uses ON CONFLICT DO NOTHING for file_inventory
and checks for existing immich_import scans before inserting face data.
Path mapping:
/mnt/media/evalongoria/ → /opt/immich/el/
/mnt/media/elvideo/ → /opt/immich/elv/
/mnt/media/md/ → SKIPPED (already in file_inventory)
Platform inference from subdirectories:
evalongoria: IG→instagram, TT→tiktok, X→twitter, Discord→discord,
Flickr→flickr, rest→unknown
elvideo: YT→youtube, rest→unknown
"""
import subprocess
import sys
import time
import psycopg2
import psycopg2.extras
# ── Configuration ──────────────────────────────────────────────────────────

# NOTE(review): the DSN embeds a plaintext password in source control —
# prefer reading it from an environment variable (e.g. DATABASE_URL).
APP_DB_DSN = "postgresql://media_downloader:PNsihOXvvuPwWiIvGlsc9Fh2YmMmB@localhost/media_downloader"
# Immich's Postgres is reached via `docker exec` into this container.
IMMICH_CONTAINER = "immich_postgres"
IMMICH_DB = "immich"
IMMICH_USER = "postgres"
# Rows per execute_values() batch / commit.
BATCH_SIZE = 5000
# Immich person UUID whose face detections are migrated (Eva Longoria).
EVA_PERSON_UUID = "0154270a-8c30-4fb7-b73b-3fb3acc49483"

# Path prefix replacements (Immich → local)
PATH_MAP = {
    "/mnt/media/evalongoria/": "/opt/immich/el/",
    "/mnt/media/elvideo/": "/opt/immich/elv/",
}

# Subdirectory → platform mapping for evalongoria
EVALONGORIA_PLATFORM_MAP = {
    "IG": "instagram",
    "TT": "tiktok",
    "X": "twitter",
    "Discord": "discord",
    "Flickr": "flickr",
    "SC": "unknown",
    "Caps": "unknown",
    "Clips": "unknown",
    "CT": "unknown",
    "HQ": "unknown",
    "Misc": "unknown",
}

# Subdirectory → platform mapping for elvideo
ELVIDEO_PLATFORM_MAP = {
    "YT": "youtube",
    "Misc": "unknown",
}
# ── Immich DB helper ───────────────────────────────────────────────────────
def immich_query(sql):
    """Run *sql* inside the Immich Postgres container via ``docker exec``.

    Output is psql tuples-only, unaligned, 0x1f-delimited text. Returns the
    raw stdout stripped of surrounding whitespace; exits the process with
    status 1 if psql reports an error.
    """
    command = [
        "docker", "exec", IMMICH_CONTAINER,
        "psql", "-U", IMMICH_USER, "-d", IMMICH_DB,
        # tuples-only, unaligned, unit-separator field delimiter
        "-t", "-A", "-F", "\x1f",
        "-c", sql,
    ]
    proc = subprocess.run(command, capture_output=True, text=True, timeout=300)
    if proc.returncode != 0:
        print(f"ERROR running Immich query: {proc.stderr}", file=sys.stderr)
        sys.exit(1)
    return proc.stdout.strip()
def immich_query_rows(sql, columns):
    """Run *sql* and parse the 0x1f-delimited output into dicts.

    Rows whose field count does not match *columns* are silently dropped
    (e.g. stray psql noise). Returns [] for an empty result.
    """
    raw = immich_query(sql)
    if not raw:
        return []
    parsed = []
    for record in raw.split("\n"):
        if not record.strip():
            continue
        values = record.split("\x1f")
        if len(values) == len(columns):
            parsed.append(dict(zip(columns, values)))
    return parsed
# ── Path & platform helpers ────────────────────────────────────────────────
def map_path(immich_path):
    """Translate an Immich originalPath to its local equivalent.

    Returns None for paths outside PATH_MAP (e.g. /mnt/media/md/), which
    callers treat as "skip this asset".
    """
    for src_prefix, dst_prefix in PATH_MAP.items():
        if immich_path.startswith(src_prefix):
            return dst_prefix + immich_path[len(src_prefix):]
    return None
def infer_platform(immich_path):
    """Infer the source platform from the first path component under a root.

    Only the two Immich roots are recognised; anything unmapped (including
    files that sit directly under a root) yields "unknown".
    """
    roots = (
        ("/mnt/media/evalongoria/", EVALONGORIA_PLATFORM_MAP),
        ("/mnt/media/elvideo/", ELVIDEO_PLATFORM_MAP),
    )
    for root, platform_map in roots:
        if not immich_path.startswith(root):
            continue
        tail = immich_path[len(root):]
        head = tail.split("/")[0] if "/" in tail else None
        if head and head in platform_map:
            return platform_map[head]
        return "unknown"
    return "unknown"
def infer_content_type(asset_type):
    """Map Immich's asset type enum ("IMAGE"/"VIDEO") to a content_type string."""
    return {"IMAGE": "image", "VIDEO": "video"}.get(asset_type, "unknown")
# ── Main migration ─────────────────────────────────────────────────────────
def migrate_assets(app_conn):
    """Fetch assets from Immich and insert into file_inventory.

    Pulls every evalongoria/elvideo asset (joined with EXIF for file size)
    out of the Immich Postgres, remaps paths to the local mounts, and
    bulk-inserts with ON CONFLICT DO NOTHING so re-runs are idempotent.

    Args:
        app_conn: open psycopg2 connection to the app database.
    Returns:
        Number of rows actually inserted (conflicts excluded).
    """
    print("=" * 60)
    print("Phase 1: Migrating Immich assets → file_inventory")
    print("=" * 60)
    # Fetch all evalongoria + elvideo assets from Immich.
    # Columns are cast to text because the docker-exec transport is plain text.
    sql = """
    SELECT
        a.id::text,
        a."originalPath",
        a."originalFileName",
        a.type,
        a."fileCreatedAt"::text,
        a."deletedAt"::text,
        a.width::text,
        a.height::text,
        encode(a.checksum, 'hex') as file_hash,
        COALESCE(e."fileSizeInByte"::text, '') as file_size
    FROM asset a
    LEFT JOIN asset_exif e ON a.id = e."assetId"
    WHERE (a."originalPath" LIKE '/mnt/media/evalongoria/%'
        OR a."originalPath" LIKE '/mnt/media/elvideo/%')
    ORDER BY a."fileCreatedAt"
    """
    print("Fetching assets from Immich...")
    columns = [
        "id", "originalPath", "originalFileName", "type",
        "fileCreatedAt", "deletedAt", "width", "height",
        "file_hash", "file_size",
    ]
    rows = immich_query_rows(sql, columns)
    total = len(rows)
    print(f" Found {total:,} assets to process")
    # Prepare and batch-insert
    inserted = 0
    skipped = 0
    batch = []
    cur = app_conn.cursor()
    insert_sql = """
    INSERT INTO file_inventory
    (file_path, filename, platform, source, content_type,
    file_size, file_hash, width, height, location, created_date)
    VALUES %s
    ON CONFLICT (file_path) DO NOTHING
    """
    for i, row in enumerate(rows):
        local_path = map_path(row["originalPath"])
        if local_path is None:
            # /mnt/media/md/ (already inventoried) or an unknown prefix.
            skipped += 1
            continue
        platform = infer_platform(row["originalPath"])
        content_type = infer_content_type(row["type"])
        # Immich soft-deletes set deletedAt; mirror that as location='recycle'.
        location = "recycle" if row["deletedAt"] else "final"
        width = int(row["width"]) if row["width"] else None
        height = int(row["height"]) if row["height"] else None
        file_size = int(row["file_size"]) if row["file_size"] else None
        # Parse timestamp — strip timezone info for timestamp without time zone column
        created_date = row["fileCreatedAt"]
        if created_date:
            # Remove timezone suffix like +00 or +00:00 for naive timestamp
            created_date = created_date.replace("+00:00", "").replace("+00", "").strip()
        # NOTE(review): source is hard-coded to "evalongoria" even for
        # elvideo rows — presumably intentional (one collection); confirm.
        batch.append((
            local_path,
            row["originalFileName"],
            platform,
            "evalongoria",
            content_type,
            file_size,
            row["file_hash"],
            width,
            height,
            location,
            created_date if created_date else None,
        ))
        if len(batch) >= BATCH_SIZE:
            # Flush + commit once per batch so progress survives interruption.
            psycopg2.extras.execute_values(
                cur, insert_sql, batch,
                template="(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            )
            inserted += cur.rowcount
            app_conn.commit()
            processed = i + 1
            print(f" Progress: {processed:,}/{total:,} processed, {inserted:,} inserted")
            batch = []
    # Final batch
    if batch:
        psycopg2.extras.execute_values(
            cur, insert_sql, batch,
            template="(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        )
        inserted += cur.rowcount
        app_conn.commit()
    cur.close()
    print(f"\n DONE: {inserted:,} rows inserted, {skipped:,} skipped (md/ paths)")
    return inserted
def migrate_face_detections(app_conn):
    """Migrate Eva Longoria face detections from Immich → face_recognition_scans.

    Groups Immich face rows per asset, maps the path to its local mount,
    and inserts one scan row per file. Guarded by an up-front check so a
    re-run never duplicates 'immich_import' scans.

    Args:
        app_conn: open psycopg2 connection to the app database.
    Returns:
        Number of scan rows inserted (0 when skipped).
    """
    print("\n" + "=" * 60)
    print("Phase 2: Migrating face detections → face_recognition_scans")
    print("=" * 60)
    # First, check if we already ran this migration
    cur = app_conn.cursor()
    cur.execute("SELECT COUNT(*) FROM face_recognition_scans WHERE scan_type = 'immich_import'")
    existing = cur.fetchone()[0]
    if existing > 0:
        print(f" Found {existing:,} existing immich_import scans — skipping face migration")
        print(" (Delete existing immich_import scans first if you want to re-run)")
        cur.close()
        return 0
    # Get distinct assets with Eva Longoria face + face count + path in one query.
    # The f-string interpolation is safe here: EVA_PERSON_UUID is a hard-coded
    # module constant, not user input.
    print("Fetching face detection data with paths from Immich...")
    sql = f"""
    SELECT
        a."originalPath",
        COUNT(*) as eva_faces
    FROM asset_face af
    JOIN asset a ON af."assetId" = a.id
    WHERE af."personId" = '{EVA_PERSON_UUID}'
    AND af."deletedAt" IS NULL
    AND (a."originalPath" LIKE '/mnt/media/evalongoria/%'
        OR a."originalPath" LIKE '/mnt/media/elvideo/%')
    GROUP BY a."originalPath"
    """
    columns = ["originalPath", "face_count"]
    face_rows = immich_query_rows(sql, columns)
    print(f" Found {len(face_rows):,} assets with Eva Longoria face detections")
    # Build file_path lookup from file_inventory (for /opt/immich/el/ and /opt/immich/elv/ paths)
    print("Building file_inventory lookup...")
    cur.execute("""
        SELECT file_path FROM file_inventory
        WHERE file_path LIKE '/opt/immich/el/%' OR file_path LIKE '/opt/immich/elv/%'
    """)
    inventory_paths = set(row[0] for row in cur.fetchall())
    print(f" {len(inventory_paths):,} paths in file_inventory for el/elv")
    # Prepare face scan inserts
    insert_sql = """
    INSERT INTO face_recognition_scans
    (file_path, has_match, matched_person, confidence, face_count, scan_type)
    VALUES %s
    """
    batch = []
    inserted = 0
    skipped_not_in_inventory = 0
    total = len(face_rows)
    for i, row in enumerate(face_rows):
        local_path = map_path(row["originalPath"])
        if local_path is None:
            continue
        if local_path not in inventory_paths:
            # Asset never made it into file_inventory (e.g. skipped in phase 1).
            skipped_not_in_inventory += 1
            continue
        face_count = int(row["face_count"])
        # confidence is pinned at 1.0: Immich's person assignment is treated
        # as ground truth for imported scans.
        batch.append((
            local_path,
            True,
            "Eva Longoria",
            1.0,
            face_count,
            "immich_import",
        ))
        if len(batch) >= BATCH_SIZE:
            psycopg2.extras.execute_values(
                cur, insert_sql, batch,
                template="(%s, %s, %s, %s, %s, %s)",
            )
            inserted += cur.rowcount
            app_conn.commit()
            print(f" Progress: {i + 1:,}/{total:,} processed, {inserted:,} inserted")
            batch = []
    if batch:
        psycopg2.extras.execute_values(
            cur, insert_sql, batch,
            template="(%s, %s, %s, %s, %s, %s)",
        )
        inserted += cur.rowcount
        app_conn.commit()
    cur.close()
    print(f"\n DONE: {inserted:,} face scans inserted")
    print(f" Skipped: {skipped_not_in_inventory:,} (not in file_inventory)")
    return inserted
def verify(app_conn):
    """Print verification counts for the migrated rows (read-only)."""
    print("\n" + "=" * 60)
    print("Verification")
    print("=" * 60)
    cur = app_conn.cursor()
    # file_inventory counts per local mount ('el/' does not match 'elv/'
    # because the trailing slash is part of the LIKE prefix).
    cur.execute("SELECT COUNT(*) FROM file_inventory WHERE file_path LIKE '/opt/immich/el/%'")
    el_count = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM file_inventory WHERE file_path LIKE '/opt/immich/elv/%'")
    elv_count = cur.fetchone()[0]
    cur.execute("""
        SELECT location, COUNT(*)
        FROM file_inventory
        WHERE file_path LIKE '/opt/immich/el/%' OR file_path LIKE '/opt/immich/elv/%'
        GROUP BY location
    """)
    location_counts = dict(cur.fetchall())
    cur.execute("""
        SELECT platform, COUNT(*)
        FROM file_inventory
        WHERE file_path LIKE '/opt/immich/el/%' OR file_path LIKE '/opt/immich/elv/%'
        GROUP BY platform
        ORDER BY 2 DESC
    """)
    platform_counts = cur.fetchall()
    # face_recognition_scans counts
    cur.execute("SELECT COUNT(*) FROM face_recognition_scans WHERE scan_type = 'immich_import'")
    face_count = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM face_recognition_scans")
    total_face_scans = cur.fetchone()[0]
    # Total file_inventory
    cur.execute("SELECT COUNT(*) FROM file_inventory")
    total_inventory = cur.fetchone()[0]
    cur.close()
    print(f"\n file_inventory:")
    print(f" /opt/immich/el/* (evalongoria): {el_count:,}")
    print(f" /opt/immich/elv/* (elvideo): {elv_count:,}")
    print(f" Total new: {el_count + elv_count:,}")
    print(f" By location: {dict(location_counts)}")
    print(f" By platform:")
    for platform, count in platform_counts:
        print(f" {platform:12s}: {count:,}")
    print(f"\n face_recognition_scans:")
    print(f" immich_import: {face_count:,}")
    print(f" Total scans: {total_face_scans:,}")
    print(f"\n Total file_inventory rows: {total_inventory:,}")
def main():
    """Run the full migration: assets, then face detections, then verification.

    Fails fast (via immich_query's sys.exit) if the Immich container is
    unreachable; the app connection is always closed via try/finally.
    """
    start = time.time()
    print("Immich → file_inventory migration")
    print("=" * 60)
    # Test Immich connection
    print("Testing Immich database connection...")
    test = immich_query("SELECT COUNT(*) FROM asset")
    print(f" Immich has {int(test):,} assets")
    # Connect to app database
    print("Connecting to app database...")
    app_conn = psycopg2.connect(APP_DB_DSN)
    try:
        assets_inserted = migrate_assets(app_conn)
        faces_inserted = migrate_face_detections(app_conn)
        verify(app_conn)
    finally:
        app_conn.close()
    elapsed = time.time() - start
    # Fix: the per-phase insert counts were collected but never used —
    # surface them in the completion summary.
    print(f"\nCompleted in {elapsed:.1f}s "
          f"({assets_inserted:,} assets, {faces_inserted:,} face scans inserted)")

if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,127 @@
#!/usr/bin/env python3
"""
Move Immich soft-deleted files from /opt/immich/el/ and /opt/immich/elv/
into the actual recycle bin (/opt/immich/recycle/) with proper DB entries.
For each file with location='recycle' in file_inventory:
1. Move the file to /opt/immich/recycle/<uuid>.<ext>
2. Insert a row into recycle_bin
3. Delete from file_inventory
"""
import os
import shutil
import sys
import time
import uuid
from pathlib import Path
import psycopg2
# NOTE(review): plaintext DB password in source — prefer an env var.
APP_DB_DSN = "postgresql://media_downloader:PNsihOXvvuPwWiIvGlsc9Fh2YmMmB@localhost/media_downloader"
# Destination directory for recycled files (UUID-named to avoid collisions).
RECYCLE_DIR = Path("/opt/immich/recycle")
# Commit to Postgres once per this many processed rows.
BATCH_SIZE = 500
def main():
    """Move soft-deleted el/elv files into the recycle bin with DB bookkeeping.

    For each file_inventory row with location='recycle':
    move file → /opt/immich/recycle/<uuid>.<ext>, insert a recycle_bin row,
    delete the file_inventory row. Commits every BATCH_SIZE rows.
    """
    start = time.time()
    print("Moving Immich soft-deleted files to recycle bin")
    print("=" * 60)
    conn = psycopg2.connect(APP_DB_DSN)
    cur = conn.cursor()
    # Get all recycled entries from el/elv
    cur.execute("""
        SELECT id, file_path, filename, file_size, file_hash, created_date
        FROM file_inventory
        WHERE (file_path LIKE '/opt/immich/el/%' OR file_path LIKE '/opt/immich/elv/%')
        AND location = 'recycle'
        ORDER BY id
    """)
    rows = cur.fetchall()
    total = len(rows)
    print(f" Found {total:,} recycled entries to move")
    if total == 0:
        print(" Nothing to do.")
        conn.close()
        return
    RECYCLE_DIR.mkdir(parents=True, exist_ok=True)
    moved = 0
    missing = 0
    errors = 0
    for i, (inv_id, file_path, filename, file_size, file_hash, created_date) in enumerate(rows):
        src = Path(file_path)
        if not src.exists():
            # File doesn't exist on disk — just remove from file_inventory
            cur.execute("DELETE FROM file_inventory WHERE id = %s", (inv_id,))
            missing += 1
            if missing <= 5:
                print(f" MISSING (removed from DB): {file_path}")
            continue
        # Generate recycle path (UUID filename keeps originals collision-free)
        ext = src.suffix or ""
        recycle_id = str(uuid.uuid4())
        recycle_path = RECYCLE_DIR / f"{recycle_id}{ext}"
        try:
            # Get file mtime before moving
            mtime = src.stat().st_mtime
            actual_size = src.stat().st_size
            # Move the file.
            # NOTE(review): a crash between this move and the next commit
            # rolls back the DB rows but leaves the file moved; a re-run
            # then counts it as "missing" — confirm that is acceptable.
            shutil.move(str(src), str(recycle_path))
            # Insert into recycle_bin
            cur.execute("""
                INSERT INTO recycle_bin
                (id, original_path, original_filename, recycle_path,
                file_extension, file_size, original_mtime,
                deleted_from, deleted_by, file_hash)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """, (
                recycle_id,
                file_path,
                filename,
                str(recycle_path),
                ext.lstrip(".") if ext else None,
                # Falls back to the DB-recorded size when stat reported 0.
                actual_size or file_size,
                mtime,
                "immich_deleted",
                "immich_migration",
                file_hash,
            ))
            # Delete from file_inventory
            cur.execute("DELETE FROM file_inventory WHERE id = %s", (inv_id,))
            moved += 1
        except Exception as e:
            # Best-effort per file: count the failure and keep going.
            errors += 1
            if errors <= 5:
                print(f" ERROR moving {file_path}: {e}")
        if (i + 1) % BATCH_SIZE == 0:
            conn.commit()
            print(f" Progress: {i + 1:,}/{total:,} — moved: {moved:,}, missing: {missing:,}, errors: {errors:,}")
    conn.commit()
    cur.close()
    conn.close()
    elapsed = time.time() - start
    print(f"\n DONE in {elapsed:.1f}s:")
    print(f" Moved to recycle: {moved:,}")
    print(f" Missing on disk: {missing:,}")
    print(f" Errors: {errors:,}")

if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Paid Content Service Health Check
Standalone script to check the health of all paid content services.
Designed to be run via systemd timer every 4 hours.
"""
import sys
import os
import asyncio
from datetime import datetime
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from modules.paid_content import (
PaidContentDBAdapter,
PaidContentAPIClient,
FanslyDirectClient,
YouTubeClient,
TwitchClient
)
from modules.unified_database import UnifiedDatabase
def log(message: str, level: str = "info"):
    """Emit a timestamped, level-tagged line to stdout (journal-friendly)."""
    now = datetime.now()
    print("[{}] [{}] {}".format(now.strftime("%Y-%m-%d %H:%M:%S"), level.upper(), message))
async def check_service_health(service: dict, pc_db: PaidContentDBAdapter) -> dict:
    """Check health of a single service.

    Dispatches on service['id']: yt-dlp availability for youtube/twitch, an
    authenticated probe for fansly_direct, and a generic health endpoint for
    everything else (Coomer/Kemono). On success the service row is updated
    with health_status + last_health_check.

    Returns: {'service_id': ..., 'status': ..., 'message': ...}, with
    status 'error' when the check itself raised.
    """
    service_id = service['id']
    health = {'status': 'unknown', 'message': ''}
    try:
        if service_id == 'youtube':
            youtube = YouTubeClient()
            if youtube.is_available():
                health = {'status': 'healthy', 'message': 'yt-dlp is available'}
            else:
                health = {'status': 'down', 'message': 'yt-dlp not found'}
        elif service_id == 'twitch':
            twitch = TwitchClient()
            if twitch.is_available():
                health = {'status': 'healthy', 'message': 'yt-dlp is available for Twitch'}
            else:
                health = {'status': 'down', 'message': 'yt-dlp not found'}
        elif service_id == 'fansly_direct':
            auth_token = service.get('session_cookie')
            if not auth_token:
                health = {'status': 'down', 'message': 'Auth token not configured'}
            else:
                client = FanslyDirectClient(auth_token=auth_token)
                try:
                    result = await client.check_auth()
                    if result.get('valid'):
                        health = {
                            'status': 'healthy',
                            'message': f"Connected as {result.get('username', 'unknown')}"
                        }
                    else:
                        health = {'status': 'down', 'message': result.get('error', 'Auth failed')}
                finally:
                    # Always release the HTTP session, even on probe failure.
                    await client.close()
        else:
            # Coomer/Kemono services
            client = PaidContentAPIClient(
                service_id,
                session_cookie=service.get('session_cookie'),
                base_url=service.get('base_url')
            )
            try:
                health = await client.check_health()
            finally:
                await client.close()
        # Update database.
        # NOTE(review): if a check above raises, this update is skipped and
        # the stored health_status/last_health_check go stale until the next
        # successful run — confirm that is acceptable for a 4-hourly timer.
        pc_db.update_service(service_id, {
            'health_status': health.get('status', 'unknown'),
            'last_health_check': datetime.now().isoformat()
        })
        return {'service_id': service_id, **health}
    except Exception as e:
        log(f"Health check failed for {service_id}: {e}", "error")
        return {'service_id': service_id, 'status': 'error', 'message': str(e)}
async def main():
    """Main health check routine.

    Checks every configured paid-content service sequentially, logs one line
    per service plus a summary. Returns 0 only when all services are healthy
    (non-zero surfaces as a failed run under the systemd timer).
    """
    log("Starting paid content service health check")
    try:
        # Initialize database
        db = UnifiedDatabase()
        pc_db = PaidContentDBAdapter(db)
        # Get all services
        services = pc_db.get_services()
        if not services:
            log("No services configured", "warning")
            return 0
        log(f"Checking {len(services)} services...")
        # Check each service (sequentially; probes are cheap and serial
        # output keeps the journal readable)
        results = []
        for service in services:
            result = await check_service_health(service, pc_db)
            results.append(result)
            # NOTE(review): both icon branches are empty strings — the
            # ✓/✗ status icons appear to have been lost; restore if intended.
            status_icon = "" if result['status'] == 'healthy' else ""
            log(f" {status_icon} {result['service_id']}: {result['status']} - {result.get('message', '')}")
        # Summary
        healthy = sum(1 for r in results if r['status'] == 'healthy')
        total = len(results)
        log(f"Health check complete: {healthy}/{total} services healthy")
        return 0 if healthy == total else 1
    except Exception as e:
        log(f"Health check failed: {e}", "error")
        return 1

if __name__ == "__main__":
    exit_code = asyncio.run(main())
    sys.exit(exit_code)

8
scripts/plex-match.sh Executable file
View File

@@ -0,0 +1,8 @@
#!/bin/bash
# Trigger Plex matching for appearances
# Runs via cron to populate plex_rating_key and plex_show_rating_key
# NOTE(review): both calls discard output AND exit status, so the log line
# below is appended even when the token refresh or API call fails —
# consider checking exit codes before logging.

# Refresh the cached API token (best-effort).
/opt/media-downloader/scripts/get-api-token.sh >/dev/null 2>&1
# Kick off the Plex match job via the local API.
/opt/media-downloader/scripts/api-call.sh "/api/appearances/plex/match" -X POST -H "Content-Type: application/json" >/dev/null 2>&1
echo "$(date): Plex matching triggered" >> /var/log/media-downloader/plex-match.log

View File

@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Bulk pre-generate thumbnails for file_inventory entries that don't have cached thumbnails.
Targets /opt/immich/el/ and /opt/immich/elv/ paths (Immich migration).
Uses multiprocessing to generate thumbnails in parallel.
"""
import hashlib
import io
import sqlite3
import subprocess
import sys
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
from PIL import Image
# SQLite cache that stores thumbnail JPEG blobs keyed by file hash.
THUMB_DB = "/opt/media-downloader/database/thumbnails.db"
# NOTE(review): plaintext DB password in source — prefer an env var.
APP_DB_DSN = "postgresql://media_downloader:PNsihOXvvuPwWiIvGlsc9Fh2YmMmB@localhost/media_downloader"
# Maximum thumbnail dimensions (aspect ratio preserved by PIL.thumbnail).
MAX_SIZE = (300, 300)
# Worker processes for parallel generation (CPU-bound PIL/ffmpeg work).
WORKERS = 6
# Results buffered in memory before each SQLite flush.
BATCH_SIZE = 200
def generate_image_thumbnail(file_path, max_size=(300, 300)):
    """Return JPEG thumbnail bytes for an image file, or None on failure.

    Transparent/palette images are flattened onto a white background before
    JPEG encoding (JPEG has no alpha channel). Failures of any kind are
    deliberately swallowed — callers treat None as "could not thumbnail".
    """
    try:
        # Fix: use the context manager so the underlying file handle is
        # closed promptly instead of lingering until garbage collection
        # (matters with many files per worker process).
        with Image.open(file_path) as img:
            img.thumbnail(max_size, Image.Resampling.LANCZOS)
            if img.mode in ('RGBA', 'LA', 'P'):
                background = Image.new('RGB', img.size, (255, 255, 255))
                if img.mode == 'P':
                    img = img.convert('RGBA')
                # Use the alpha channel as the paste mask when present.
                background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
                img = background
            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
    except Exception:
        return None
def generate_video_thumbnail(file_path, max_size=(300, 300)):
    """Grab one frame with ffmpeg and return it as JPEG bytes, or None.

    Tries seeking to t=1s first, then t=0 (for clips shorter than a second).
    Any failure — ffmpeg error, timeout, undecodable frame — moves on to the
    next seek point; exhausting both yields None.
    """
    for seek in ('00:00:01.000', '00:00:00.000'):
        cmd = [
            'ffmpeg', '-ss', seek,
            '-i', str(file_path),
            '-vframes', '1',
            '-f', 'image2pipe',
            '-vcodec', 'mjpeg', '-',
        ]
        try:
            proc = subprocess.run(cmd, capture_output=True, timeout=30)
            if proc.returncode != 0 or not proc.stdout:
                continue
            frame = Image.open(io.BytesIO(proc.stdout))
            frame.thumbnail(max_size, Image.Resampling.LANCZOS)
            out = io.BytesIO()
            frame.save(out, format='JPEG', quality=85)
            return out.getvalue()
        except Exception:
            continue
    return None
def process_file(args):
    """Worker-process entry point: build one thumbnail.

    Args: (file_path, content_type, file_hash) tuple.
    Returns: (file_hash, file_path, jpeg_bytes_or_None, mtime, status)
    where status is 'ok', 'failed', or 'missing'.
    """
    file_path, content_type, file_hash = args
    path = Path(file_path)
    if not path.exists():
        return (file_hash, file_path, None, 0, 'missing')
    try:
        mtime = path.stat().st_mtime
    except OSError:
        mtime = 0
    maker = generate_video_thumbnail if content_type == 'video' else generate_image_thumbnail
    data = maker(path, MAX_SIZE)
    status = 'ok' if data else 'failed'
    return (file_hash, file_path, data, mtime, status)
def get_files_needing_thumbnails():
    """Query file_inventory for el/elv files, check which lack thumbnails.

    Returns a list of (file_path, content_type, cache_key) tuples for files
    with no entry in the SQLite thumbnail cache. cache_key is the stored
    content hash when present, otherwise a SHA-256 of the path.
    """
    import psycopg2
    conn = psycopg2.connect(APP_DB_DSN)
    cur = conn.cursor()
    # NOTE(review): no parameters are passed, so psycopg2 sends the string
    # verbatim and '%%' reaches SQL as two LIKE wildcards — equivalent to
    # one but misleading; a single '%' would be clearer.
    cur.execute("""
        SELECT file_path, content_type, file_hash
        FROM file_inventory
        WHERE (file_path LIKE '/opt/immich/el/%%' OR file_path LIKE '/opt/immich/elv/%%')
        AND location = 'final'
        ORDER BY id
    """)
    all_files = cur.fetchall()
    cur.close()
    conn.close()
    # Check which already have thumbnails
    thumb_conn = sqlite3.connect(THUMB_DB, timeout=30)
    thumb_cur = thumb_conn.cursor()
    # Get all existing thumbnail hashes in one query — set membership beats
    # a per-file SELECT.
    thumb_cur.execute("SELECT file_hash FROM thumbnails")
    existing_hashes = set(row[0] for row in thumb_cur.fetchall())
    thumb_conn.close()
    needed = []
    for file_path, content_type, file_hash in all_files:
        # Use content hash if available, else path hash
        cache_key = file_hash if file_hash else hashlib.sha256(file_path.encode()).hexdigest()
        if cache_key not in existing_hashes:
            needed.append((file_path, content_type, cache_key))
    return needed
def save_batch(results):
    """Persist successful thumbnails from *results* into the SQLite cache.

    Entries without image data (failed/missing) are skipped. Returns the
    number of rows written. INSERT OR REPLACE makes re-runs harmless.
    """
    from datetime import datetime, timezone
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S')
    db = sqlite3.connect(THUMB_DB, timeout=60)
    written = 0
    for file_hash, file_path, data, mtime, _status in results:
        if not data:
            continue
        db.execute("""
            INSERT OR REPLACE INTO thumbnails
            (file_hash, file_path, thumbnail_data, created_at, file_mtime)
            VALUES (?, ?, ?, ?, ?)
        """, (file_hash, file_path, data, timestamp, mtime))
        written += 1
    db.commit()
    db.close()
    return written
def main():
    """Generate missing thumbnails in parallel and cache them in SQLite."""
    start = time.time()
    print("Bulk thumbnail pre-generation")
    print("=" * 60)
    print("Finding files needing thumbnails...")
    needed = get_files_needing_thumbnails()
    total = len(needed)
    print(f" {total:,} files need thumbnails")
    if total == 0:
        print(" Nothing to do!")
        return
    generated = 0
    failed = 0
    missing = 0
    batch_results = []
    # Thumbnail generation is CPU-bound (PIL/ffmpeg), so use processes.
    with ProcessPoolExecutor(max_workers=WORKERS) as executor:
        futures = {executor.submit(process_file, item): item for item in needed}
        for i, future in enumerate(as_completed(futures), 1):
            result = future.result()
            batch_results.append(result)
            status = result[4]
            if status == 'ok':
                generated += 1
            elif status == 'missing':
                missing += 1
            else:
                failed += 1
            # Flush to SQLite (and report progress) once per BATCH_SIZE results.
            if len(batch_results) >= BATCH_SIZE:
                save_batch(batch_results)
                batch_results = []
                elapsed = time.time() - start
                rate = i / elapsed if elapsed > 0 else 0
                eta = (total - i) / rate if rate > 0 else 0
                print(f" {i:,}/{total:,} ({generated:,} ok, {failed:,} failed, {missing:,} missing) "
                      f"[{rate:.0f}/s, ETA {eta:.0f}s]")
    # Final batch
    if batch_results:
        save_batch(batch_results)
    elapsed = time.time() - start
    print(f"\nDone in {elapsed:.1f}s:")
    print(f" Generated: {generated:,}")
    print(f" Failed: {failed:,}")
    print(f" Missing: {missing:,}")

if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""Full scheduler startup profiler - mimics media-downloader.py --scheduler exactly.
Adds memory logging at every stage and a background thread that monitors RSS every 2 seconds.
"""
import os
import sys
import gc
import threading
import time
# Set up environment exactly like the systemd service
os.environ['PYTHONUNBUFFERED'] = '1'
os.environ['PYTHONDONTWRITEBYTECODE'] = '1'
os.environ['DATABASE_BACKEND'] = 'postgresql'
os.environ['DATABASE_URL'] = 'postgresql://media_downloader:PNsihOXvvuPwWiIvGlsc9Fh2YmMmB@localhost/media_downloader'
os.environ['HOME'] = '/root'
os.environ['PLAYWRIGHT_BROWSERS_PATH'] = '/root/.cache/ms-playwright'
os.environ.setdefault('DISPLAY', ':100')
os.chdir('/opt/media-downloader')
sys.path.insert(0, '/opt/media-downloader')
def get_rss_mb():
    """Return this process's resident set size in MB, or 0 if unreadable.

    Reads the VmRSS line (value in kB) from /proc/self/status, so this is
    Linux-only; on other platforms the open() fails and 0 is returned.
    """
    try:
        with open('/proc/self/status') as f:
            for line in f:
                if line.startswith('VmRSS:'):
                    return int(line.split()[1]) / 1024
    except (OSError, ValueError, IndexError):
        # Fix: was a bare `except:` which also swallowed KeyboardInterrupt
        # and SystemExit; narrowed to the errors open/parse can raise.
        pass
    return 0
def get_child_rss_mb():
    """Return the combined RSS of direct child processes, in MB.

    Shells out to ``ps --ppid <pid> -o rss=`` (the rss column is in kB).
    Returns 0 if ``ps`` is missing, times out, or emits unparseable
    output -- the monitor only needs a best-effort number.
    """
    import subprocess
    try:
        result = subprocess.run(
            ['ps', '--ppid', str(os.getpid()), '-o', 'rss='],
            capture_output=True, text=True, timeout=5
        )
        total_kb = 0
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if line:
                total_kb += int(line)
        return total_kb / 1024  # kB to MB
    except (OSError, subprocess.SubprocessError, ValueError):
        # Narrowed from a bare `except:`: covers missing ps, timeouts
        # (TimeoutExpired is a SubprocessError) and bad int parses without
        # also eating KeyboardInterrupt/SystemExit.
        return 0
# Memory monitoring thread state: `stop_monitor` is flipped by the shutdown
# handler to end the loop; `peak_rss` tracks the process high-water mark.
stop_monitor = False
peak_rss = 0
def memory_monitor():
    """Background thread: log RSS of this process and its children every 2s."""
    global peak_rss
    while not stop_monitor:
        rss = get_rss_mb()
        child_rss = get_child_rss_mb()
        total = rss + child_rss
        if rss > peak_rss:
            peak_rss = rss
        # Logs unconditionally on every 2-second tick; written to stderr so it
        # interleaves with the [STAGE] markers emitted by the main thread.
        sys.stderr.write(f"[MEMORY] RSS={rss:.0f}MB Children={child_rss:.0f}MB Total={total:.0f}MB Peak={peak_rss:.0f}MB\n")
        sys.stderr.flush()
        time.sleep(2)
# Start memory monitoring
monitor_thread = threading.Thread(target=memory_monitor, daemon=True)
monitor_thread.start()
sys.stderr.write(f"[STAGE] Baseline: {get_rss_mb():.0f}MB\n")
# Now do EXACTLY what media-downloader.py does
# --- Module-level code from media-downloader.py ---
# nest_asyncio is optional; when present it allows re-entrant event loops.
try:
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    pass
import warnings
warnings.filterwarnings("ignore", message=".*pkg_resources is deprecated.*")
# Must run before anything touches sqlite3 so calls are routed to PostgreSQL.
import modules.db_bootstrap
import json, sqlite3, logging, argparse, subprocess, random
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Set, Tuple
import requests
from dataclasses import dataclass
# Mirror media-downloader.py's datetime <-> TEXT adapters for sqlite3.
sqlite3.register_adapter(datetime, lambda d: d.isoformat())
sqlite3.register_converter("datetime", lambda s: datetime.fromisoformat(s.decode()))
sys.path.insert(0, str(Path('/opt/media-downloader')))
sys.path.insert(0, str(Path('/opt/media-downloader') / 'modules'))
try:
from modules.instaloader_module import InstaLoaderModule as InstaLoaderDownloader
from modules.fastdl_module import FastDLDownloader
from modules.imginn_module import ImgInnDownloader
from modules.imginn_api_module import ImgInnAPIDownloader
from modules.instagram_client_module import InstagramClientDownloader
from modules.toolzu_module import ToolzuDownloader
from modules.snapchat_scraper import SnapchatDirectScraper
from modules.snapchat_client_module import SnapchatClientDownloader
from modules.tiktok_module import TikTokDownloader
from modules.forum_downloader import ForumDownloader
from modules.coppermine_module import CoppermineDownloader
from modules.download_manager import DownloadManager, DownloadItem
from modules.settings_manager import SettingsManager
from modules.date_utils import DateHandler, extract_date, update_timestamps
from modules.move_module import MoveManager
from modules.unified_database import UnifiedDatabase
from modules.universal_logger import get_logger
from modules.forum_db_adapter import ForumDatabaseAdapter
from modules.pushover_notifier import PushoverNotifier, create_notifier_from_config
from modules.service_health_monitor import ServiceHealthMonitor
from modules.dependency_updater import DependencyUpdater
from modules.downloader_monitor import get_monitor
from modules.activity_status import get_activity_manager
except ImportError as e:
print(f"Error importing modules: {e}")
sys.exit(1)
sys.stderr.write(f"[STAGE] All imports done: {get_rss_mb():.0f}MB\n")
# --- Scheduler section (what main() does with --scheduler) ---
from modules.scheduler import DownloadScheduler
from modules.unified_database import UnifiedDatabase
import signal
sys.stderr.write(f"[STAGE] Scheduler imported: {get_rss_mb():.0f}MB\n")
# Create unified database -- pooled, with the same parameters the real
# scheduler path uses, so the memory profile matches production.
unified_db = UnifiedDatabase('database/media_downloader.db', use_pool=True, pool_size=5)
sys.stderr.write(f"[STAGE] UnifiedDatabase created: {get_rss_mb():.0f}MB\n")
# Create SettingsManager
sm = SettingsManager('database/media_downloader.db')
# Create scheduler - pass settings_manager like main() does
scheduler = DownloadScheduler(unified_db=unified_db, settings_manager=sm)
sys.stderr.write(f"[STAGE] DownloadScheduler created: {get_rss_mb():.0f}MB\n")
# Set up graceful shutdown; flag guards against re-entrant signal delivery.
shutdown_requested = False
def graceful_shutdown(signum, frame):
    """Signal handler: stop the monitor and scheduler, clean up, and exit 0.

    Idempotent -- a second signal while shutdown is in progress is ignored.
    Relies on module globals: scheduler, unified_db, peak_rss.
    """
    global shutdown_requested, stop_monitor
    if shutdown_requested:
        return
    shutdown_requested = True
    stop_monitor = True  # ends the memory_monitor loop
    sys.stderr.write(f"\n[SHUTDOWN] Signal received, stopping...\n")
    sys.stderr.write(f"[SHUTDOWN] Final RSS: {get_rss_mb():.0f}MB, Peak: {peak_rss:.0f}MB\n")
    scheduler.stop()
    # Remove any temp dirs left behind, but only if a downloader was created.
    dl = getattr(scheduler, 'downloader', None)
    if dl:
        dl.cleanup_all_temp_dirs()
    unified_db.close()
    sys.exit(0)
# Route SIGTERM (systemd stop) and SIGINT (Ctrl-C) through the graceful handler.
signal.signal(signal.SIGTERM, graceful_shutdown)
signal.signal(signal.SIGINT, graceful_shutdown)
sys.stderr.write(f"[STAGE] About to call scheduler.start() - this will exec_module, create MediaDownloader, then enter main loop\n")
sys.stderr.write(f"[STAGE] Pre-start RSS: {get_rss_mb():.0f}MB\n")
sys.stderr.flush()
# Start scheduler (this blocks - enters main loop)
scheduler.start()

View File

@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""Profile memory usage at each stage of scheduler startup to find the 8GB culprit."""
import os
import sys
import gc
sys.path.insert(0, '/opt/media-downloader')
os.chdir('/opt/media-downloader')
# Set environment like the service does
# SECURITY NOTE(review): DB credentials are hard-coded (duplicated from the
# systemd unit); prefer inheriting DATABASE_URL from the environment.
os.environ['DATABASE_BACKEND'] = 'postgresql'
os.environ['DATABASE_URL'] = 'postgresql://media_downloader:PNsihOXvvuPwWiIvGlsc9Fh2YmMmB@localhost/media_downloader'
os.environ['PYTHONUNBUFFERED'] = '1'
def get_rss_mb():
    """Return current process RSS in MB, parsed from /proc/self/status.

    Returns 0 instead of raising when /proc is unavailable (non-Linux) or
    the line is unparseable, so a metrics read can never abort the profiler.
    This matches the guarded version used by the sibling startup profiler.
    """
    try:
        with open('/proc/self/status') as f:
            for line in f:
                if line.startswith('VmRSS:'):
                    return int(line.split()[1]) / 1024  # kB to MB
    except (OSError, ValueError, IndexError):
        pass
    return 0
def log_mem(label):
    """Force a GC pass, print the current RSS tagged with *label*, return it (MB)."""
    # Collect first so the reading reflects live objects, not pending garbage.
    gc.collect()
    current_mb = get_rss_mb()
    print(f"[{current_mb:7.1f} MB] {label}")
    return current_mb
# Stage 0: Baseline
log_mem("BASELINE (python + script)")
# Stage 1: Basic imports (what media-downloader.py does at top level)
import warnings
warnings.filterwarnings("ignore", message=".*pkg_resources is deprecated.*")
log_mem("After warnings")
import modules.db_bootstrap
log_mem("After db_bootstrap")
import json, sqlite3, logging, argparse, time, subprocess, random
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Set, Tuple
import requests
from dataclasses import dataclass
log_mem("After stdlib + requests")
# Stage 2: Module imports (lines 52-80 of media-downloader.py)
from modules.instaloader_module import InstaLoaderModule as InstaLoaderDownloader
log_mem("After instaloader_module import")
from modules.fastdl_module import FastDLDownloader
log_mem("After fastdl_module import")
from modules.imginn_module import ImgInnDownloader
log_mem("After imginn_module import")
from modules.imginn_api_module import ImgInnAPIDownloader
log_mem("After imginn_api_module import")
from modules.instagram_client_module import InstagramClientDownloader
log_mem("After instagram_client_module import")
from modules.toolzu_module import ToolzuDownloader
log_mem("After toolzu_module import")
from modules.snapchat_scraper import SnapchatDirectScraper
log_mem("After snapchat_scraper import")
from modules.snapchat_client_module import SnapchatClientDownloader
log_mem("After snapchat_client_module import")
from modules.tiktok_module import TikTokDownloader
log_mem("After tiktok_module import")
from modules.forum_downloader import ForumDownloader
log_mem("After forum_downloader import (has Playwright)")
from modules.coppermine_module import CoppermineDownloader
log_mem("After coppermine_module import")
from modules.download_manager import DownloadManager, DownloadItem
log_mem("After download_manager import")
from modules.settings_manager import SettingsManager
from modules.date_utils import DateHandler, extract_date, update_timestamps
from modules.move_module import MoveManager
from modules.unified_database import UnifiedDatabase
from modules.universal_logger import get_logger
from modules.forum_db_adapter import ForumDatabaseAdapter
from modules.pushover_notifier import PushoverNotifier, create_notifier_from_config
from modules.service_health_monitor import ServiceHealthMonitor
from modules.dependency_updater import DependencyUpdater
from modules.downloader_monitor import get_monitor
from modules.activity_status import get_activity_manager
log_mem("After ALL module imports")
# Stage 3: Import scheduler and its dependencies
from modules.scheduler import DownloadScheduler
log_mem("After scheduler import (includes monitors)")
# Stage 4: Create UnifiedDatabase
db_path = '/opt/media-downloader/database/media_downloader.db'
unified_db = UnifiedDatabase(db_path, use_pool=True, pool_size=5)
log_mem("After UnifiedDatabase creation")
# Stage 5: Create DownloadScheduler
from modules.settings_manager import SettingsManager
sm = SettingsManager(db_path)
scheduler = DownloadScheduler(
    config_path=None,
    unified_db=unified_db,
    settings_manager=sm
)
log_mem("After DownloadScheduler creation")
# Stage 6: exec_module to load media-downloader.py (what scheduler.start() does)
# NOTE: this re-executes the whole file as a second module instance.
import importlib.util
spec = importlib.util.spec_from_file_location(
    "media_downloader",
    Path("/opt/media-downloader/media-downloader.py")
)
media_downloader = importlib.util.module_from_spec(spec)
spec.loader.exec_module(media_downloader)
MediaDownloader = media_downloader.MediaDownloader
log_mem("After exec_module (re-loads media-downloader.py)")
# Stage 7: Create MediaDownloader instance
downloader = MediaDownloader(enable_notifications=True, unified_db=unified_db)
log_mem("After MediaDownloader creation (lazy modules)")
# Stage 8: Access one lazy module to see how much it adds
print("\n--- Testing individual module instantiation ---")
if 'fastdl' in downloader.modules:
    _ = downloader.modules['fastdl']
    log_mem("After instantiating FastDL module")
    downloader.modules.release('fastdl')
    gc.collect()
    log_mem("After releasing FastDL module")
if 'forum' in downloader.modules or 'forums' in downloader.modules:
    key = 'forums' if 'forums' in downloader.modules else 'forum'
    _ = downloader.modules[key]
    log_mem(f"After instantiating {key} module (Playwright-based)")
    downloader.modules.release(key)
    gc.collect()
    log_mem(f"After releasing {key} module")
# Stage 9: Create the monitors that scheduler creates
print("\n--- Testing monitor creation ---")
from modules.youtube_channel_monitor import YouTubeChannelMonitor
from modules.easynews_monitor import EasynewsMonitor
from modules.reddit_community_monitor import RedditCommunityMonitor
yt = YouTubeChannelMonitor(db_path, get_activity_manager(unified_db))
log_mem("After YouTubeChannelMonitor creation")
en = EasynewsMonitor(db_path, get_activity_manager(unified_db))
log_mem("After EasynewsMonitor creation")
rd = RedditCommunityMonitor(db_path, get_activity_manager(unified_db))
log_mem("After RedditCommunityMonitor creation")
# Stage 10: Simulate what happens when a background task runs
print("\n--- Simulating background task execution ---")
# Test: easynews check_all_celebrities
print("Running Easynews check_all_celebrities...")
try:
    result = en.check_all_celebrities(from_scheduler=True)
    log_mem(f"After Easynews check (results: {result.get('results_found', 0)})")
except Exception as e:
    log_mem(f"After Easynews check (error: {e})")
gc.collect()
log_mem("After gc.collect")
# Test: reddit check_all_now
print("Running Reddit check_all_now...")
try:
    import asyncio
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        count = loop.run_until_complete(rd.check_all_now(from_scheduler=True))
        log_mem(f"After Reddit check (media: {count})")
    finally:
        loop.close()
except Exception as e:
    log_mem(f"After Reddit check (error: {e})")
gc.collect()
log_mem("After gc.collect")
# Test: youtube check_all_now
# NOTE(review): reuses the `asyncio` name bound inside the previous try block;
# if that stage failed before the import, this raises NameError (caught below).
print("Running YouTube check_all_now...")
try:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        count = loop.run_until_complete(yt.check_all_now(from_scheduler=True))
        log_mem(f"After YouTube check (videos: {count})")
    finally:
        loop.close()
except Exception as e:
    log_mem(f"After YouTube check (error: {e})")
gc.collect()
log_mem("After gc.collect")
print("\n--- DONE ---")
print(f"Final RSS: {get_rss_mb():.1f} MB")
print(f"Final RSS: {get_rss_mb():.1f} MB")

112
scripts/quick_face_backfill.py Executable file
View File

@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""
Quick backfill of face recognition scans for existing files
This scans all media files currently in /opt/immich/md and logs results to database
"""
import os
import sys
from pathlib import Path
# Make the project root importable when invoked as scripts/<name>.py
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Bootstrap PostgreSQL adapter before any database imports
from modules.db_bootstrap import bootstrap_database
bootstrap_database()
from modules.face_recognition_module import FaceRecognitionModule
from modules.unified_database import UnifiedDatabase
from modules.settings_manager import SettingsManager
# Configuration
SCAN_BASE_DIR = "/opt/immich/md"  # tree that gets walked recursively
DATABASE_PATH = "/opt/media-downloader/database/media_downloader.db"
# Supported file extensions
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.heic'}
VIDEO_EXTENSIONS = {'.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv', '.m4v'}
SUPPORTED_EXTENSIONS = IMAGE_EXTENSIONS | VIDEO_EXTENSIONS
def main():
    """Scan all supported media under SCAN_BASE_DIR and log face-recognition results.

    Files that already have a stored scan result are skipped. A progress line
    is printed every 50 files and a summary table at the end. Exits with
    status 1 if face recognition is disabled in settings.
    """
    print("🔄 Quick Face Recognition Backfill")
    print("=" * 70)
    db = UnifiedDatabase()
    settings_manager = SettingsManager(DATABASE_PATH)
    face_module = FaceRecognitionModule(unified_db=db)
    # Get settings
    settings = settings_manager.get('face_recognition', {})
    if not settings.get('enabled', False):
        print("✗ Face recognition is disabled in settings")
        sys.exit(1)
    tolerance = settings.get('tolerance', 0.6)
    print(f"Scanning: {SCAN_BASE_DIR}")
    print(f"Tolerance: {tolerance}")
    print("=" * 70)
    stats = {'total': 0, 'matched': 0, 'no_match': 0, 'errors': 0, 'already_scanned': 0}
    try:
        # Walk through all files
        for root, dirs, files in os.walk(SCAN_BASE_DIR):
            for filename in files:
                file_path = os.path.join(root, filename)
                file_ext = os.path.splitext(filename)[1].lower()
                if file_ext not in SUPPORTED_EXTENSIONS:
                    continue
                stats['total'] += 1
                # BUGFIX: progress used to print only when the 50th file
                # happened to be already-scanned; report unconditionally.
                if stats['total'] % 50 == 0:
                    print(f"Progress: {stats['total']} files processed, {stats['already_scanned']} already scanned, {stats['matched']} newly matched...")
                # Check if already scanned
                existing = db.get_face_recognition_result(file_path)
                if existing:
                    stats['already_scanned'] += 1
                    continue
                try:
                    # Scan the file and persist the result
                    is_video = file_ext in VIDEO_EXTENSIONS
                    result = face_module.check_image(file_path, tolerance=tolerance, is_video=is_video)
                    db.log_face_recognition_scan(
                        file_path=file_path,
                        has_match=result.get('has_match', False),
                        matched_person=result.get('person_name'),
                        confidence=result.get('confidence'),
                        face_count=result.get('face_count', 0),
                        scan_type='quick_backfill'
                    )
                    if result.get('has_match'):
                        stats['matched'] += 1
                        person = result.get('person_name', 'Unknown')
                        conf = result.get('confidence', 0)
                        print(f"✓ [{stats['total']}] {filename[:60]} - MATCHED: {person} ({conf:.1%})")
                    else:
                        stats['no_match'] += 1
                except Exception as e:
                    stats['errors'] += 1
                    print(f"✗ [{stats['total']}] {filename[:60]} - ERROR: {e}")
        print("\n" + "=" * 70)
        print("📊 BACKFILL COMPLETE")
        print("=" * 70)
        print(f"Total files: {stats['total']}")
        print(f"Already scanned: {stats['already_scanned']}")
        print(f"Newly matched: {stats['matched']}")
        print(f"No match: {stats['no_match']}")
        print(f"Errors: {stats['errors']}")
        print("=" * 70)
    finally:
        # BUGFIX: release the database even if the walk raises unexpectedly.
        db.close()


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""Regenerate all thumbnails for Fansly attachments."""
import os
import sys
import subprocess
import io
from pathlib import Path
from PIL import Image
# Bootstrap PostgreSQL adapter before any sqlite3 imports
sys.path.insert(0, '/opt/media-downloader')
from modules.db_bootstrap import bootstrap_database
bootstrap_database()
import sqlite3
# Database path (routed to PostgreSQL via pgadapter)
DB_PATH = '/opt/media-downloader/database/media_downloader.db'
# Output directory: one <attachment_id>.jpg per attachment.
THUMB_CACHE = Path('/opt/media-downloader/cache/thumbnails/large')
# Maximum thumbnail dimensions (w, h); Image.thumbnail preserves aspect ratio.
MAX_SIZE = (800, 800)
def generate_thumbnail(file_path, file_type):
    """Return JPEG thumbnail bytes for an image or video, or None on failure.

    Images are downscaled with PIL; for videos a single frame is grabbed at
    t=1s via ffmpeg and then downscaled the same way. Any error is printed
    and reported as None so the caller can count it as a failure.
    """
    def _encode_jpeg(img):
        # Shared resize-and-encode path for both branches (was duplicated).
        img.thumbnail(MAX_SIZE, Image.LANCZOS)
        # BUGFIX: JPEG can only store L/RGB/CMYK. The old check converted only
        # RGBA/P, so e.g. 'LA' grayscale-alpha PNGs raised on save and were
        # silently counted as failures.
        if img.mode not in ('RGB', 'L', 'CMYK'):
            img = img.convert('RGB')
        buffer = io.BytesIO()
        img.save(buffer, format='JPEG', quality=85)
        return buffer.getvalue()

    try:
        if file_type == 'image':
            with Image.open(file_path) as img:
                return _encode_jpeg(img)
        elif file_type == 'video':
            # Grab one frame 1 second in, streamed to stdout as MJPEG.
            cmd = [
                'ffmpeg', '-y', '-ss', '1', '-i', str(file_path),
                '-vframes', '1', '-f', 'image2pipe', '-vcodec', 'mjpeg', '-'
            ]
            result = subprocess.run(cmd, capture_output=True, timeout=30)
            if result.returncode == 0 and result.stdout:
                with Image.open(io.BytesIO(result.stdout)) as img:
                    return _encode_jpeg(img)
    except Exception as e:
        print(f" Error: {e}")
    return None
def main():
    """Regenerate large-size thumbnails for all completed Fansly attachments.

    Queries the paid-content tables for 'fansly_direct' attachments with a
    completed local file, writes <attachment_id>.jpg into THUMB_CACHE, and
    prints generated/failed/missing counts.
    """
    THUMB_CACHE.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.execute("""
        SELECT a.id, a.local_path, a.file_type
        FROM paid_content_attachments a
        JOIN paid_content_posts p ON a.post_id = p.id
        JOIN paid_content_creators c ON p.creator_id = c.id
        WHERE c.service_id = 'fansly_direct'
        AND a.status = 'completed'
        AND a.local_path IS NOT NULL
        ORDER BY a.id
        """)
        attachments = cursor.fetchall()
    finally:
        # BUGFIX: the connection used to leak if the query raised; always
        # close it, and do so before the (long) thumbnail loop starts.
        conn.close()
    print(f"Regenerating thumbnails for {len(attachments)} files...")
    generated = 0
    failed = 0
    missing = 0
    for i, (att_id, local_path, file_type) in enumerate(attachments):
        if i % 100 == 0:
            print(f"Progress: {i}/{len(attachments)} (generated: {generated}, failed: {failed})")
        file_path = Path(local_path)
        if not file_path.exists():
            missing += 1
            continue
        thumb_data = generate_thumbnail(file_path, file_type)
        if thumb_data:
            thumb_file = THUMB_CACHE / f"{att_id}.jpg"
            thumb_file.write_bytes(thumb_data)
            generated += 1
        else:
            failed += 1
            print(f" Failed: {att_id} - {local_path}")
    print(f"\nDone!")
    print(f" Generated: {generated}")
    print(f" Failed: {failed}")
    print(f" Missing files: {missing}")


if __name__ == '__main__':
    main()

293
scripts/retroactive_face_scan.py Executable file
View File

@@ -0,0 +1,293 @@
#!/usr/bin/env python3
"""
Retroactive Face Recognition Scanner
Scans existing files in a directory and moves unmatched files to review queue
while storing their original destination paths for later restoration.
"""
import os
import sys
import json
import shutil
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Bootstrap PostgreSQL adapter before any database imports
from modules.db_bootstrap import bootstrap_database
bootstrap_database()
from modules.face_recognition_module import FaceRecognitionModule
from modules.unified_database import UnifiedDatabase
from modules.settings_manager import SettingsManager
# Configuration
SCAN_BASE_DIR = "/opt/immich/md"   # tree that gets scanned
REVIEW_DIR = "/opt/immich/review"  # quarantine for unmatched files
DATABASE_PATH = "/opt/media-downloader/database/media_downloader.db"
# Supported file extensions
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.heic'}
VIDEO_EXTENSIONS = {'.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv', '.m4v'}
SUPPORTED_EXTENSIONS = IMAGE_EXTENSIONS | VIDEO_EXTENSIONS
class RetroactiveFaceScanner:
    """Scans a directory tree with face recognition and quarantines unmatched files.

    Matched files stay in place. Unmatched files are moved under REVIEW_DIR
    (mirroring their original layout) and their original location is recorded
    in the downloads table metadata as 'intended_path' so they can be restored
    later. With scan_only=True nothing is moved -- results are only logged.
    """

    def __init__(self, scan_only=False):
        self.db = UnifiedDatabase()
        self.settings_manager = SettingsManager(DATABASE_PATH)
        self.face_module = FaceRecognitionModule(unified_db=self.db)
        # When True, log scan results but never move files.
        self.scan_only = scan_only
        # Running counters reported by print_stats() at the end.
        self.stats = {
            'total_files': 0,
            'matched': 0,
            'unmatched': 0,
            'errors': 0,
            'skipped': 0
        }

    def get_relative_path(self, full_path):
        """Return *full_path* relative to SCAN_BASE_DIR (unchanged if outside it)."""
        try:
            return os.path.relpath(full_path, SCAN_BASE_DIR)
        except ValueError:
            # Cannot be expressed relative to the base: fall back to raw path.
            return full_path

    def scan_directory(self, directory):
        """Recursively scan *directory*, process every supported media file,
        then print the summary."""
        print(f"\n🔍 Scanning directory: {directory}")
        print("=" * 70)
        for root, dirs, files in os.walk(directory):
            for filename in files:
                file_path = os.path.join(root, filename)
                file_ext = os.path.splitext(filename)[1].lower()
                if file_ext not in SUPPORTED_EXTENSIONS:
                    continue
                self.stats['total_files'] += 1
                self.process_file(file_path, file_ext in VIDEO_EXTENSIONS)
        self.print_stats()

    def process_file(self, file_path, is_video):
        """Run face recognition on one file, log the result, and route it.

        Unmatched files are moved to the review queue unless scan_only is set.
        """
        filename = os.path.basename(file_path)
        relative_path = self.get_relative_path(os.path.dirname(file_path))
        # BUGFIX: previously printed a literal "(unknown)" placeholder while
        # `filename` was computed but never used -- show the actual file name.
        print(f"\n[{self.stats['total_files']}] {filename}")
        print(f" Location: {relative_path}")
        try:
            # Get face recognition settings
            settings = self.settings_manager.get('face_recognition', {})
            if not settings.get('enabled', False):
                print(" ⚠ Face recognition is disabled in settings")
                self.stats['skipped'] += 1
                return
            tolerance = settings.get('tolerance', 0.6)
            # Check for faces
            print(f" 🔍 Checking for faces (tolerance: {tolerance})...")
            result = self.face_module.check_image(file_path, tolerance=tolerance, is_video=is_video)
            # Log scan result to database (non-fatal if logging fails)
            try:
                self.db.log_face_recognition_scan(
                    file_path=file_path,
                    has_match=result.get('has_match', False),
                    matched_person=result.get('person_name'),
                    confidence=result.get('confidence'),
                    face_count=result.get('face_count', 0),
                    scan_type='retroactive'
                )
            except Exception as db_err:
                print(f" ⚠ Warning: Failed to log to database: {db_err}")
            if result.get('has_match'):
                person_name = result.get('person_name', 'Unknown')
                confidence = result.get('confidence', 0)
                print(f" ✓ MATCH: {person_name} (confidence: {confidence:.2%})")
                self.stats['matched'] += 1
            else:
                if self.scan_only:
                    print(f" ✗ NO MATCH (scan-only mode, not moving file)")
                    self.stats['unmatched'] += 1
                else:
                    print(f" ✗ NO MATCH - Moving to review queue...")
                    self.move_to_review(file_path, file_path)  # Pass full path as original path
                    self.stats['unmatched'] += 1
        except Exception as e:
            print(f" ✗ ERROR: {str(e)}")
            self.stats['errors'] += 1

    def move_to_review(self, file_path, original_path):
        """Move *file_path* into REVIEW_DIR and record its original location.

        The file's subpath under SCAN_BASE_DIR is preserved inside REVIEW_DIR;
        the downloads row (if any) is repointed at the review copy and the
        original location is stored in metadata['intended_path']. Re-raises on
        failure so the caller counts it as an error.
        """
        try:
            # Maintain directory structure in review queue. (Path is already
            # imported at module level; the redundant local import is gone.)
            base_path = Path(SCAN_BASE_DIR)
            file_path_obj = Path(file_path)
            if file_path_obj.is_relative_to(base_path):
                # Recreate the relative subpath under the review directory
                relative_path = file_path_obj.relative_to(base_path)
                review_path = Path(REVIEW_DIR) / relative_path
            else:
                # Fallback to flat structure if not under base path
                review_path = Path(REVIEW_DIR) / file_path_obj.name
            # Ensure parent directory exists
            review_path.parent.mkdir(parents=True, exist_ok=True)
            # Move file
            shutil.move(file_path, str(review_path))
            # Update database entry with new review path and store intended_path in metadata
            with self.db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # Find the download entry for this file
                cursor.execute('SELECT id, metadata FROM downloads WHERE file_path = ?', (file_path,))
                row = cursor.fetchone()
                if row:
                    download_id = row['id']
                    existing_metadata = json.loads(row['metadata']) if row['metadata'] else {}
                    # Add intended_path to metadata so the file can be restored
                    existing_metadata['intended_path'] = file_path
                    # Update the download record with new review path and metadata
                    cursor.execute('''
                        UPDATE downloads
                        SET file_path = ?, metadata = ?
                        WHERE id = ?
                    ''', (str(review_path), json.dumps(existing_metadata), download_id))
                    print(f" → Moved to: {review_path}")
                    print(f" → Original path stored in database: {file_path}")
                else:
                    print(f" ⚠ Warning: No database entry found for {file_path}")
                    print(f" → Moved to: {review_path} (not tracked in database)")
        except Exception as e:
            print(f" ✗ Failed to move file: {e}")
            raise

    def print_stats(self):
        """Print the final scan summary table."""
        print("\n" + "=" * 70)
        print("📊 SCAN COMPLETE")
        print("=" * 70)
        print(f"Total files scanned: {self.stats['total_files']}")
        print(f"✓ Matched: {self.stats['matched']}")
        print(f"✗ Unmatched (moved): {self.stats['unmatched']}")
        print(f"⚠ Errors: {self.stats['errors']}")
        print(f"⊘ Skipped: {self.stats['skipped']}")
        print("=" * 70)
def main():
    """CLI entry point: parse arguments, confirm with the user, and run the scan."""
    import argparse
    parser = argparse.ArgumentParser(
        description='Retroactively scan existing files with face recognition',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Scan social media directory
python3 scripts/retroactive_face_scan.py "social media"
# Scan specific subdirectory
python3 scripts/retroactive_face_scan.py "social media/instagram"
# Scan with full path
python3 scripts/retroactive_face_scan.py "/opt/immich/md/social media"
Note: Original paths are stored in the database metadata as 'intended_path'.
Use the Review UI to keep/delete/add reference to moved files.
"""
    )
    parser.add_argument(
        'directory',
        help='Directory to scan (relative to /opt/immich/md or absolute path)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without moving files'
    )
    parser.add_argument(
        '--scan-only',
        action='store_true',
        help='Scan and update database only - do not move unmatched files to review'
    )
    args = parser.parse_args()
    # Determine scan directory (relative inputs resolve under SCAN_BASE_DIR)
    if os.path.isabs(args.directory):
        scan_dir = args.directory
    else:
        scan_dir = os.path.join(SCAN_BASE_DIR, args.directory)
    if not os.path.exists(scan_dir):
        print(f"✗ Error: Directory does not exist: {scan_dir}")
        sys.exit(1)
    if not os.path.isdir(scan_dir):
        print(f"✗ Error: Not a directory: {scan_dir}")
        sys.exit(1)
    if args.dry_run:
        # BUGFIX: dry-run is not implemented; bail out up front instead of
        # first asking the user to confirm a run that would immediately abort.
        print("🔍 DRY RUN MODE - No files will be moved")
        print("=" * 70)
        print("\n⚠ Dry run mode not yet implemented")
        sys.exit(1)
    print(f"\n🎯 Retroactive Face Recognition Scan")
    print(f"Scan directory: {scan_dir}")
    print(f"Review queue: {REVIEW_DIR}")
    # Confirm before touching anything; accept 'y' or 'yes'.
    response = input("\nContinue? (y/n): ")
    if response.strip().lower() not in ('y', 'yes'):
        print("Cancelled.")
        sys.exit(0)
    # Run scan
    scanner = RetroactiveFaceScanner(scan_only=args.scan_only)
    if args.scan_only:
        print("🔍 SCAN-ONLY MODE - Files will NOT be moved to review")
        print("=" * 70)
    scanner.scan_directory(scan_dir)
    print(f"\n✓ Scan complete!")
    if args.scan_only:
        print(f"\n📝 Scan-only mode: Database updated with face recognition results.")
        print(f"No files were moved. Use the GUI to filter by 'Not Scanned' or 'No Match'.")
    else:
        print(f"\nUnmatched files have been moved to: {REVIEW_DIR}")
        print(f"Use the Review UI at http://your-server:5173/review to process them.")
        print(f"\nOriginal paths stored in database metadata.")


if __name__ == '__main__':
    main()

103
scripts/run-dependency-updates.sh Executable file
View File

@@ -0,0 +1,103 @@
#!/bin/bash
# Dependency Update Script
# Safely stops services, runs updates, and restarts services
# Designed to be run by systemd timer at scheduled intervals
set -e

LOG_FILE="/opt/media-downloader/logs/dependency-updates.log"
LOCK_FILE="/tmp/dependency-updates.lock"

# Make sure the log directory exists before the first tee call.
mkdir -p "$(dirname "$LOG_FILE")"

log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') $1" | tee -a "$LOG_FILE"
}

# Acquire the lock atomically with noclobber instead of the racy
# "test -f then write" pattern, so two timer firings cannot both proceed.
if ! (set -o noclobber; echo $$ > "$LOCK_FILE") 2>/dev/null; then
    log "[WARN] Another update process is running. Exiting."
    exit 0
fi
# Remove the lock on any exit path now that we own it.
trap 'rm -f "$LOCK_FILE"' EXIT

log "[INFO] =========================================="
log "[INFO] Starting dependency update process"
log "[INFO] =========================================="

# Stop the scheduler during updates (remember whether it was running so we
# only restart it if we stopped it).
SCHEDULER_WAS_RUNNING=false
if systemctl is-active --quiet media-downloader; then
    SCHEDULER_WAS_RUNNING=true
    log "[INFO] Stopping scheduler for updates..."
    systemctl stop media-downloader
    # Wait for clean shutdown
    sleep 5
    log "[INFO] Scheduler stopped"
fi

# Run dependency updates
log "[INFO] Running dependency updates..."
cd /opt/media-downloader
/opt/media-downloader/venv/bin/python3 -c "
import sys
sys.path.insert(0, '/opt/media-downloader')
from modules.dependency_updater import DependencyUpdater
from modules.settings_manager import SettingsManager
# Load config
settings = SettingsManager()
config = settings.get_all()
update_config = config.get('dependency_updater', {}) or config.get('dependency_updates', {})
if not update_config.get('enabled', True):
    print('[INFO] Dependency updates disabled in config')
    sys.exit(0)
updater = DependencyUpdater(config=update_config, scheduler_mode=True)
results = updater.force_update_check()
print('[INFO] Update results:')
for component, updated in results.items():
    status = 'Updated' if updated else 'Current'
    print(f' - {component}: {status}')
" 2>&1 | tee -a "$LOG_FILE"
# BUGFIX: $? after a pipeline is tee's exit status (always 0), so Python
# failures were always reported as success. PIPESTATUS[0] holds the
# interpreter's real exit code; the pipeline itself succeeds, so set -e
# does not abort before we can log the error and restart services.
UPDATE_STATUS=${PIPESTATUS[0]}

if [ "$UPDATE_STATUS" -eq 0 ]; then
    log "[INFO] Dependency updates completed successfully"
else
    log "[ERROR] Dependency updates failed with status $UPDATE_STATUS"
fi

# Restart API to pick up any Python package changes
log "[INFO] Restarting API service..."
systemctl restart media-downloader-api
sleep 2
if systemctl is-active --quiet media-downloader-api; then
    log "[INFO] API service restarted successfully"
else
    log "[ERROR] API service failed to restart!"
fi

# Restart scheduler if it was running
if [ "$SCHEDULER_WAS_RUNNING" = true ]; then
    log "[INFO] Restarting scheduler..."
    systemctl start media-downloader
    sleep 3
    if systemctl is-active --quiet media-downloader; then
        log "[INFO] Scheduler restarted successfully"
    else
        log "[ERROR] Scheduler failed to restart!"
    fi
fi

log "[INFO] =========================================="
log "[INFO] Dependency update process complete"
log "[INFO] =========================================="
5
scripts/run-with-xvfb.sh Executable file
View File

@@ -0,0 +1,5 @@
#!/bin/bash
# Run media-downloader with Xvfb virtual display
# This allows headed browsers (ImgInn, Toolzu) to run without a GUI
env DISPLAY=:100 HOME=/root XAUTHORITY= "$@"

205
scripts/uninstall.sh Executable file
View File

@@ -0,0 +1,205 @@
#!/bin/bash
# Media Downloader Uninstaller Script
# Version: 11.27.0
#
# Stops and disables every Media Downloader systemd unit, deletes the unit
# files, backs up user data (config/sessions/cookies/database/data) into the
# invoking user's home directory, then removes the installation directory
# and the CLI wrapper.  Must be run as root.
set -e
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Installation directory
INSTALL_DIR="/opt/media-downloader"
echo -e "${RED}╔════════════════════════════════════════════════╗${NC}"
echo -e "${RED}║          Media Downloader Uninstaller          ║${NC}"
echo -e "${RED}╚════════════════════════════════════════════════╝${NC}"
echo ""
# Check if running as root
# (EUID is bash-specific and is 0 for root, including under sudo)
if [[ $EUID -ne 0 ]]; then
    echo -e "${RED}This script must be run as root (use sudo)${NC}"
    exit 1
fi
echo -e "${YELLOW}This will remove:${NC}"
echo "  - Installation directory: $INSTALL_DIR"
echo "  - All systemd services and timers"
echo "  - Command wrapper: /usr/local/bin/media-downloader"
echo ""
echo -e "${YELLOW}Services to be removed:${NC}"
echo "  - media-downloader (scheduler)"
echo "  - media-downloader-api (web API)"
echo "  - media-downloader-frontend (web UI)"
echo "  - xvfb-media-downloader (virtual display)"
echo "  - media-cache-builder (timer)"
echo "  - media-embedding-generator (timer)"
echo "  - media-celebrity-enrichment (timer)"
echo ""
echo -e "${YELLOW}This will NOT remove:${NC}"
echo "  - Downloaded media files (if stored elsewhere)"
echo "  - Python packages installed system-wide"
echo "  - Redis server"
echo "  - FlareSolverr Docker container"
echo ""
# Single-keypress confirmation; -r keeps backslashes literal
read -p "Continue with uninstallation? (y/n) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Uninstallation cancelled"
    exit 1
fi
# ============================================================================
# STOP ALL SERVICES
# ============================================================================
echo -e "${YELLOW}Stopping all services...${NC}"
# The four long-running services and the three scheduled timer units.
SERVICE_UNITS="media-downloader media-downloader-api media-downloader-frontend xvfb-media-downloader"
TIMER_UNITS="media-cache-builder media-embedding-generator media-celebrity-enrichment"
# Stop main services
for service in $SERVICE_UNITS; do
    if systemctl is-active --quiet "$service" 2>/dev/null; then
        echo "  Stopping $service..."
        systemctl stop "$service"
    fi
done
# Stop timers
for timer in $TIMER_UNITS; do
    if systemctl is-active --quiet "$timer.timer" 2>/dev/null; then
        echo "  Stopping $timer.timer..."
        systemctl stop "$timer.timer"
    fi
done
# ============================================================================
# DISABLE SERVICES
# ============================================================================
echo -e "${YELLOW}Disabling services...${NC}"
# Disable main services
for service in $SERVICE_UNITS; do
    if systemctl is-enabled --quiet "$service" 2>/dev/null; then
        echo "  Disabling $service..."
        systemctl disable "$service"
    fi
done
# Disable timers
for timer in $TIMER_UNITS; do
    if systemctl is-enabled --quiet "$timer.timer" 2>/dev/null; then
        echo "  Disabling $timer.timer..."
        systemctl disable "$timer.timer"
    fi
done
# ============================================================================
# REMOVE SYSTEMD FILES
# ============================================================================
echo -e "${YELLOW}Removing systemd files...${NC}"
# Four service units plus the three timer pairs (.service + .timer).
for unit in \
    media-downloader.service \
    media-downloader-api.service \
    media-downloader-frontend.service \
    xvfb-media-downloader.service \
    media-cache-builder.service \
    media-cache-builder.timer \
    media-embedding-generator.service \
    media-embedding-generator.timer \
    media-celebrity-enrichment.service \
    media-celebrity-enrichment.timer; do
    rm -f "/etc/systemd/system/$unit"
done
# Reload systemd so the removed units disappear from its unit list
systemctl daemon-reload
# ============================================================================
# BACKUP DATA
# ============================================================================
# Before deleting anything, snapshot the user-data directories into a
# timestamped folder under $HOME (root's home when run via sudo).
if [ -d "$INSTALL_DIR" ]; then
    BACKUP_DIR="$HOME/media-downloader-backup-$(date +%Y%m%d-%H%M%S)"
    echo -e "${GREEN}Creating backup at $BACKUP_DIR${NC}"
    mkdir -p "$BACKUP_DIR"
    # Copy each data directory that exists, reporting a human-readable label.
    for entry in \
        "config:Configuration directory" \
        "sessions:Sessions" \
        "cookies:Forum cookies" \
        "database:Database directory" \
        "data:Data directory"; do
        subdir="${entry%%:*}"
        label="${entry#*:}"
        if [ -d "$INSTALL_DIR/$subdir" ]; then
            cp -r "$INSTALL_DIR/$subdir" "$BACKUP_DIR/"
            echo "  ✓ ${label} backed up"
        fi
    done
fi
# ============================================================================
# REMOVE INSTALLATION
# ============================================================================
# Remove installation directory
# (the backup section above has already captured the user-data directories)
if [ -d "$INSTALL_DIR" ]; then
    echo -e "${YELLOW}Removing installation directory...${NC}"
    rm -rf "$INSTALL_DIR"
fi
# Remove command wrapper
if [ -f "/usr/local/bin/media-downloader" ]; then
    echo -e "${YELLOW}Removing command wrapper...${NC}"
    rm -f "/usr/local/bin/media-downloader"
fi
# ============================================================================
# COMPLETION
# ============================================================================
echo ""
echo -e "${GREEN}╔════════════════════════════════════════════════╗${NC}"
echo -e "${GREEN}║            Uninstallation Complete!            ║${NC}"
echo -e "${GREEN}╚════════════════════════════════════════════════╝${NC}"
echo ""
# BACKUP_DIR is only set when $INSTALL_DIR existed at backup time
if [ -n "$BACKUP_DIR" ] && [ -d "$BACKUP_DIR" ]; then
    echo -e "${BLUE}Backup created at:${NC} $BACKUP_DIR"
    echo ""
    echo -e "${YELLOW}To restore your data to a new installation:${NC}"
    echo "  1. Install Media Downloader again:"
    echo "     sudo ./scripts/install.sh"
    echo ""
    echo "  2. Restore your backed up data:"
    echo "     sudo cp -r $BACKUP_DIR/database/* /opt/media-downloader/database/"
    echo "     sudo cp -r $BACKUP_DIR/config/* /opt/media-downloader/config/"
    echo "     sudo cp -r $BACKUP_DIR/sessions /opt/media-downloader/"
    echo "     sudo cp -r $BACKUP_DIR/cookies /opt/media-downloader/"
    echo "     sudo cp -r $BACKUP_DIR/data/* /opt/media-downloader/data/"
    echo "     sudo chown -R \$USER:\$USER /opt/media-downloader/"
    echo ""
    echo "  3. Restart the services:"
    echo "     sudo systemctl restart media-downloader"
    echo "     sudo systemctl restart media-downloader-api"
fi

131
scripts/update-all-versions.sh Executable file
View File

@@ -0,0 +1,131 @@
#!/bin/bash
#
# Comprehensive Version Update Script for Media Downloader
# Updates ALL version references across the entire codebase
#
# Usage: ./scripts/update-all-versions.sh <new_version>
# Example: ./scripts/update-all-versions.sh 6.11.0
#
set -e
# Require the new version number as the first (and only) argument
if [ -z "$1" ]; then
    echo "Error: Version number required"
    echo "Usage: $0 <version>"
    echo "Example: $0 6.11.0"
    exit 1
fi
NEW_VERSION="$1"
# Resolve the project root relative to this script's own location so the
# script works regardless of the caller's current directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
echo "╔════════════════════════════════════════════════╗"
echo "║        Media Downloader Version Update         ║"
echo "╠════════════════════════════════════════════════╣"
echo "║  New Version: $NEW_VERSION"
echo "╚════════════════════════════════════════════════╝"
echo ""
# Read current version (tolerate a missing VERSION file on a fresh checkout)
CURRENT_VERSION=$(cat "$PROJECT_ROOT/VERSION" 2>/dev/null || echo "unknown")
echo "Current version: $CURRENT_VERSION"
echo "New version: $NEW_VERSION"
echo ""
# Apply a sed expression to a file in place (if it exists) and report status.
#   $1 - path of the file to edit
#   $2 - complete sed expression, e.g. "s/old/new/g" (replacement text is
#        embedded in the expression itself)
#   $3 - human-readable description for the status line
# FIX: the previous signature declared an unused "replacement" parameter as
# $3, shifting the description into $4.  Every call site in this script
# passes exactly three arguments, so success lines printed an empty
# description ("✓ Updated: ").
update_file() {
    local file=$1
    local pattern=$2
    local description=$3
    if [ -f "$file" ]; then
        sed -i "$pattern" "$file"
        echo "✓ Updated: $description"
    else
        echo "⚠ Skipped: $file (not found)"
    fi
}
echo "Updating version files..."
echo "─────────────────────────────────────────────────"
# 1. VERSION file (written directly; the single source of truth)
echo "$NEW_VERSION" > "$PROJECT_ROOT/VERSION"
echo "✓ Updated: VERSION file"
# 2. README.md
# Each update_file call passes: file, complete sed expression, description.
update_file "$PROJECT_ROOT/README.md" \
    "s/\*\*Version:\*\* [0-9.]\+/**Version:** $NEW_VERSION/g" \
    "README.md (header)"
update_file "$PROJECT_ROOT/README.md" \
    "s/VERSION.*# Version number ([0-9.]\+)/VERSION # Version number ($NEW_VERSION)/g" \
    "README.md (directory structure comment)"
# 3. Frontend files
echo ""
echo "Updating frontend files..."
echo "─────────────────────────────────────────────────"
update_file "$PROJECT_ROOT/web/frontend/src/pages/Login.tsx" \
    "s/<p>v[0-9.]\+<\/p>/<p>v$NEW_VERSION<\/p>/g" \
    "Login.tsx"
update_file "$PROJECT_ROOT/web/frontend/src/App.tsx" \
    "s/v[0-9.]\+<\/p>/v$NEW_VERSION<\/p>/g" \
    "App.tsx (all occurrences)"
update_file "$PROJECT_ROOT/web/frontend/src/pages/Configuration.tsx" \
    "s/Version [0-9.]\+/Version $NEW_VERSION/g" \
    "Configuration.tsx"
update_file "$PROJECT_ROOT/web/frontend/src/pages/Configuration.tsx" \
    "s/v[0-9.]\+)/v$NEW_VERSION)/g" \
    "Configuration.tsx (comments)"
update_file "$PROJECT_ROOT/web/frontend/package.json" \
    "s/\"version\": \"[0-9.]\+\"/\"version\": \"$NEW_VERSION\"/g" \
    "package.json"
# 4. Backend API version
echo ""
echo "Updating backend files..."
echo "─────────────────────────────────────────────────"
update_file "$PROJECT_ROOT/web/backend/api.py" \
    "s/version=\"[0-9.]\+\"/version=\"$NEW_VERSION\"/g" \
    "api.py"
update_file "$PROJECT_ROOT/web/backend/core/config.py" \
    "s/API_VERSION: str = \"[0-9.]\+\"/API_VERSION: str = \"$NEW_VERSION\"/g" \
    "core/config.py (API_VERSION)"
# 5. Installer script
echo ""
echo "Updating installer..."
echo "─────────────────────────────────────────────────"
update_file "$PROJECT_ROOT/scripts/install.sh" \
    "s/# Version: [0-9.]\+/# Version: $NEW_VERSION/g" \
    "install.sh (header comment)"
update_file "$PROJECT_ROOT/scripts/install.sh" \
    "s/Installer v[0-9.]\+/Installer v$NEW_VERSION/g" \
    "install.sh (banner)"
echo ""
echo "╔════════════════════════════════════════════════╗"
echo "║            Version Update Complete             ║"
echo "╠════════════════════════════════════════════════╣"
echo "║  Version: $NEW_VERSION"
echo "║  Files Updated: 10"
echo "╚════════════════════════════════════════════════╝"
echo ""
echo "Next steps:"
echo "1. Update data/changelog.json with new version"
echo "2. Update docs/CHANGELOG.md with release notes"
echo "3. Rebuild frontend (automatically if dev server running)"
echo "4. Run: ./scripts/create-version-backup.sh"
echo ""

165
scripts/update-version.sh Executable file
View File

@@ -0,0 +1,165 @@
#!/bin/bash
################################################################################
# Media Downloader Version Update Script
# Updates version numbers across all application files
# Usage: bash scripts/update-version.sh 6.4.3
################################################################################
set -e # Exit on error
NEW_VERSION="$1"
# Current version from the installed tree; "unknown" if the file is missing
OLD_VERSION=$(cat /opt/media-downloader/VERSION 2>/dev/null || echo "unknown")
# Color codes
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Function to print colored output
print_success() { echo -e "${GREEN}${NC} $1"; }
print_error() { echo -e "${RED}${NC} $1"; }
print_info() { echo -e "${BLUE}${NC} $1"; }
print_warning() { echo -e "${YELLOW}${NC} $1"; }
################################################################################
# Validate Input
################################################################################
# A version argument is mandatory
if [ -z "$NEW_VERSION" ]; then
    print_error "No version specified!"
    echo ""
    echo "Usage: $0 <version>"
    echo "Example: $0 6.4.3"
    echo ""
    exit 1
fi
# Validate version format (X.X.X) -- strict three-part semver-style digits
if ! [[ "$NEW_VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
    print_error "Invalid version format: $NEW_VERSION"
    echo "Version must be in format X.X.X (e.g., 6.4.3)"
    exit 1
fi
################################################################################
# Header
################################################################################
echo ""
echo "╔════════════════════════════════════════════════╗"
echo "║        Media Downloader Version Update         ║"
echo "╠════════════════════════════════════════════════╣"
echo "║  Current: ${OLD_VERSION} "
echo "║  New:     ${NEW_VERSION} "
echo "╚════════════════════════════════════════════════╝"
echo ""
# Confirm with user (default answer is "no"; anything but y/Y cancels)
read -p "Update version from ${OLD_VERSION} to ${NEW_VERSION}? (y/N) " -n 1 -r
echo ""
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    print_info "Version update cancelled"
    exit 0
fi
################################################################################
# Update Files
################################################################################
echo ""
print_info "Updating version files..."
echo ""
# 1. Update VERSION file
print_info "1/5 Updating VERSION file..."
echo "$NEW_VERSION" > /opt/media-downloader/VERSION
print_success "VERSION file updated"
# 2. Update backend API (FastAPI app version string)
print_info "2/5 Updating backend API (web/backend/api.py)..."
sed -i "s/version=\"[0-9]\+\.[0-9]\+\.[0-9]\+\"/version=\"$NEW_VERSION\"/" \
    /opt/media-downloader/web/backend/api.py
print_success "Backend API version updated"
# 3. Update package.json
print_info "3/5 Updating frontend package.json..."
sed -i "s/\"version\": \"[0-9]\+\.[0-9]\+\.[0-9]\+\"/\"version\": \"$NEW_VERSION\"/" \
    /opt/media-downloader/web/frontend/package.json
print_success "package.json updated"
# 4. Update App.tsx (both locations; /g replaces every occurrence)
print_info "4/5 Updating App.tsx (desktop and mobile menus)..."
sed -i "s/>v[0-9]\+\.[0-9]\+\.[0-9]\+</>v$NEW_VERSION</g" \
    /opt/media-downloader/web/frontend/src/App.tsx
print_success "App.tsx updated (2 locations)"
# 5. Update Configuration.tsx (two distinct version strings in the About tab)
print_info "5/5 Updating Configuration.tsx (About tab)..."
sed -i "s/Version [0-9]\+\.[0-9]\+\.[0-9]\+/Version $NEW_VERSION/" \
    /opt/media-downloader/web/frontend/src/pages/Configuration.tsx
sed -i "s/currently v[0-9]\+\.[0-9]\+\.[0-9]\+/currently v$NEW_VERSION/" \
    /opt/media-downloader/web/frontend/src/pages/Configuration.tsx
print_success "Configuration.tsx updated"
################################################################################
# Verification
################################################################################
echo ""
print_info "Verifying version updates..."
echo ""
# Check each file
# Confirm that a grep pattern is present in a file and report the outcome.
#   $1 - file to inspect
#   $2 - grep pattern expected to appear in the file
#   $3 - label used in the success / failure message
# Returns non-zero when the pattern is absent (under `set -e` this aborts
# the script at the first failed verification).
check_file() {
    local target="$1"
    local expect="$2"
    local label="$3"
    if ! grep -q "$expect" "$target" 2>/dev/null; then
        print_error "$label - NOT FOUND!"
        return 1
    fi
    print_success "$label"
}
# Verify that every target file now carries the new version string
check_file "/opt/media-downloader/VERSION" "^$NEW_VERSION$" "VERSION file"
check_file "/opt/media-downloader/web/backend/api.py" "version=\"$NEW_VERSION\"" "Backend API"
check_file "/opt/media-downloader/web/frontend/package.json" "\"version\": \"$NEW_VERSION\"" "package.json"
check_file "/opt/media-downloader/web/frontend/src/App.tsx" "v$NEW_VERSION" "App.tsx (menus)"
check_file "/opt/media-downloader/web/frontend/src/pages/Configuration.tsx" "Version $NEW_VERSION" "Configuration.tsx (About tab)"
################################################################################
# Manual Steps Reminder
################################################################################
echo ""
echo "╔════════════════════════════════════════════════╗"
echo "║            Version Update Complete             ║"
echo "╚════════════════════════════════════════════════╝"
echo ""
print_warning "MANUAL STEPS REQUIRED:"
echo ""
echo "1. Update data/changelog.json:"
echo "   - Add new version entry at the TOP of the array"
echo "   - Include version, date, title, type, and changes"
echo ""
echo "2. Update CHANGELOG.md:"
echo "   - Add new version section at the TOP (after header)"
echo "   - Document all changes, fixes, and features"
echo ""
echo "3. Restart services:"
echo "   ${BLUE}sudo systemctl restart media-downloader-api${NC}"
echo ""
echo "4. Create version backup:"
echo "   ${BLUE}bash scripts/create-version-backup.sh${NC}"
echo ""
echo "5. Verify in browser:"
echo "   - Check Health page loads correctly"
echo "   - Check Configuration → About tab shows v$NEW_VERSION"
echo "   - Check desktop/mobile menu shows v$NEW_VERSION"
echo ""
print_info "See docs/VERSION_UPDATE_CHECKLIST.md for full checklist"
echo ""

View File

@@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""
Check Fansly attachments for 4K variants and upgrade if available.
This script:
1. Finds all non-4K video attachments from Fansly Direct
2. Re-fetches media info from the Fansly API
3. Checks if a higher resolution variant is available
4. Updates the attachment URL and resets for re-download if upgrade found
"""
import asyncio
import sys
import os
# NOTE(review): `os` appears unused in this module -- confirm before removing.
# Add project to path
sys.path.insert(0, '/opt/media-downloader')
# Bootstrap PostgreSQL adapter before any database imports
# (must run before the paid_content modules below are imported)
from modules.db_bootstrap import bootstrap_database
bootstrap_database()
from modules.paid_content.fansly_direct_client import FanslyDirectClient
from modules.paid_content.db_adapter import PaidContentDB
async def check_and_upgrade_attachments():
    """Check all non-4K Fansly attachments for upgrades.

    Walks every Fansly Direct video attachment stored below 4K, re-fetches
    the owning post's media via the Fansly timeline API, and when a 4K
    variant with a usable location URL is found, rewrites the attachment row
    (URL + dimensions) and resets it to 'pending' so the normal download
    pipeline re-fetches it.  Prints a per-attachment log and a final summary.
    """
    db = PaidContentDB('/opt/media-downloader/database/media_downloader.db')
    # Get Fansly auth token (stored as the service row's session cookie)
    service = db.get_service('fansly_direct')
    if not service or not service.get('session_cookie'):
        print("ERROR: No Fansly auth token configured")
        return
    auth_token = service['session_cookie']
    client = FanslyDirectClient(auth_token)
    # Find non-4K video attachments from Fansly Direct
    # 4K is 3840x2160 or 2160x3840 (portrait)
    # Rows with synthetic post IDs (manual_/import_/date-shaped '20%-%') are
    # excluded because they cannot be resolved through the timeline API.
    query = """
        SELECT a.id, a.name, a.width, a.height, a.status, p.post_id, p.id as db_post_id
        FROM paid_content_attachments a
        JOIN paid_content_posts p ON a.post_id = p.id
        JOIN paid_content_creators c ON p.creator_id = c.id
        WHERE c.service_id = 'fansly_direct'
          AND a.file_type = 'video'
          AND a.width IS NOT NULL
          AND a.height IS NOT NULL
          AND NOT (
              (a.width >= 3840 AND a.height >= 2160) OR
              (a.width >= 2160 AND a.height >= 3840)
          )
          AND p.post_id NOT LIKE 'manual_%'
          AND p.post_id NOT LIKE 'import_%'
          AND p.post_id NOT LIKE '20%-%'
        ORDER BY a.id
    """
    cursor = db.conn.execute(query)
    attachments = cursor.fetchall()
    print(f"Found {len(attachments)} non-4K video attachments to check")
    print("-" * 80)
    upgrades_found = 0
    errors = 0
    already_best = 0
    for att in attachments:
        att_id, name, width, height, status, post_id, db_post_id = att
        current_res = f"{width}x{height}"
        print(f"\nChecking: {name} (ID: {att_id}, current: {current_res})")
        try:
            # Extract media ID from filename (e.g., "12345.mp4" -> "12345")
            media_id = name.replace('.mp4', '').replace('.mov', '')
            # Fetch media info from Fansly API
            # We need to get the account media for this post
            # NOTE(review): _fetch_timeline_page is a private client method;
            # before=post_id+1 assumes numeric post IDs and that the returned
            # page contains this post -- confirm against FanslyDirectClient.
            async with client:
                # Get the post to find media info
                posts, _, media_dict, account_media_dict, bundle_dict = await client._fetch_timeline_page(
                    account_id=None,  # We'll search by post ID
                    before=str(int(post_id) + 1),  # Get this post
                    account={}
                )
                # Find media in the dictionaries
                found_4k = False
                best_width = width
                best_height = height
                best_url = None
                # Check account_media_dict for this media
                for am_id, am_data in account_media_dict.items():
                    media = am_data.get('media', {})
                    if str(media.get('id')) == media_id:
                        # Found the media, check variants
                        variants = media.get('variants', [])
                        print(f"  Found media with {len(variants)} variants")
                        for v in variants:
                            v_w = v.get('width', 0) or 0
                            v_h = v.get('height', 0) or 0
                            v_locs = v.get('locations', [])
                            # Check if this is a higher resolution
                            # (compared by pixel count against the best so far)
                            if v_w * v_h > best_width * best_height:
                                for loc in v_locs:
                                    loc_url = loc.get('location', '')
                                    # Prefer streaming formats for 4K
                                    if '.m3u8' in loc_url or '.mp4' in loc_url or '.mov' in loc_url:
                                        best_width = v_w
                                        best_height = v_h
                                        # Construct signed URL if metadata present
                                        # (presumably CloudFront-style signed
                                        # query parameters -- verify)
                                        metadata = loc.get('metadata', {})
                                        if metadata:
                                            params = []
                                            for key in ['Key-Pair-Id', 'Signature', 'Policy']:
                                                if key in metadata:
                                                    params.append(f"{key}={metadata[key]}")
                                            if params:
                                                best_url = loc_url + '?' + '&'.join(params)
                                            else:
                                                best_url = loc_url
                                        else:
                                            best_url = loc_url
                            # Stop scanning variants once a 4K one is seen
                            if v_w >= 3840 or v_h >= 3840:
                                found_4k = True
                                break
                        # Media located; no need to scan other account media
                        break
                if found_4k and best_url:
                    print(f"  ✓ UPGRADE FOUND: {best_width}x{best_height}")
                    upgrades_found += 1
                    # Update the attachment: new URL/dimensions, reset status
                    # and error state so the downloader picks it up again.
                    db.conn.execute("""
                        UPDATE paid_content_attachments
                        SET download_url = ?,
                            width = ?,
                            height = ?,
                            status = 'pending',
                            download_attempts = 0,
                            error_message = NULL,
                            local_path = NULL,
                            local_filename = NULL
                        WHERE id = ?
                    """, (best_url, best_width, best_height, att_id))
                    db.conn.commit()
                    print(f"  → Updated and queued for re-download")
                elif best_width > width or best_height > height:
                    # A better (but sub-4K) variant exists; report only,
                    # do not rewrite the row.
                    print(f"  ~ Better quality available: {best_width}x{best_height} (not 4K)")
                else:
                    print(f"  - Already at best available quality")
                    already_best += 1
        except Exception as e:
            # Per-attachment failures are counted and logged, not fatal
            print(f"  ✗ Error: {e}")
            errors += 1
        # Rate limiting between API calls
        await asyncio.sleep(0.5)
    print("\n" + "=" * 80)
    print(f"Summary:")
    print(f"  Upgrades found and queued: {upgrades_found}")
    print(f"  Already at best quality: {already_best}")
    print(f"  Errors: {errors}")
    print(f"  Total checked: {len(attachments)}")
if __name__ == '__main__':
    # Entry point: run the async check/upgrade pass to completion.
    asyncio.run(check_and_upgrade_attachments())