#!/usr/bin/env python3
"""
Backfill Paid Content from existing downloaded files.

This script:
1. Scans a source directory containing previously downloaded content
2. Matches files to posts/attachments in the database by ID
3. Copies files to the proper download location
4. Generates thumbnails
5. Updates database records as if they were freshly downloaded

Usage:
    python3 backfill_paid_content.py /path/to/source/files --creator puffinasmr --platform fansly
"""

import argparse
import hashlib
import os
import re
import shutil
import sqlite3
import subprocess
import sys
from datetime import datetime
from io import BytesIO
from pathlib import Path
from typing import Optional

# Add project root to path so project-local modules resolve.
sys.path.insert(0, '/opt/media-downloader')


def get_file_hash(file_path: Path) -> str:
    """Return the SHA-256 hex digest of file_path, streamed in 8 KiB chunks."""
    sha256 = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            sha256.update(chunk)
    return sha256.hexdigest()


def generate_thumbnail(file_path: Path, file_type: str) -> Optional[bytes]:
    """Generate a JPEG thumbnail for an image or video file.

    Returns the JPEG bytes, or None when the type is unsupported or
    generation fails.  Failures are logged and swallowed deliberately:
    a missing thumbnail must not abort the backfill.
    """
    try:
        if file_type == 'image':
            from PIL import Image
            img = Image.open(file_path)
            img.thumbnail((400, 400), Image.Resampling.LANCZOS)
            # JPEG cannot store alpha or palette modes.
            if img.mode in ('RGBA', 'P'):
                img = img.convert('RGB')
            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
        elif file_type == 'video':
            # Use ffmpeg to grab a frame 1 second in, scaled to 400px wide,
            # piped back as a single JPEG on stdout.
            result = subprocess.run([
                'ffmpeg',
                '-i', str(file_path),
                '-ss', '00:00:01',  # 1 second in
                '-vframes', '1',
                '-vf', 'scale=400:-1',
                '-f', 'image2pipe',
                '-vcodec', 'mjpeg',
                '-'
            ], capture_output=True, timeout=30)
            if result.returncode == 0 and result.stdout:
                return result.stdout
    except Exception as e:
        print(f"  Warning: Failed to generate thumbnail: {e}")
    return None


def get_file_type(filename: str) -> str:
    """Classify a filename as 'image', 'video', or 'other' by extension."""
    ext = Path(filename).suffix.lower()
    if ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']:
        return 'image'
    elif ext in ['.mp4', '.webm', '.mov', '.avi', '.mkv', '.m4v']:
        return 'video'
    else:
        return 'other'


def sanitize_filename(name: str) -> str:
    """Sanitize a string for use in a filename/directory.

    Strips characters illegal on common filesystems, collapses whitespace
    runs to single hyphens, and falls back to 'unnamed' for empty results.
    """
    name = re.sub(r'[<>:"/\\|?*]', '', name)
    name = re.sub(r'\s+', '-', name.strip())
    return name or 'unnamed'


def _find_matching_file(att_name, att_index, source_files, used):
    """Locate the source file for one attachment.

    Tries, in order: exact filename match, stem match, and Fansly-style
    numeric media-ID stem contained in the attachment name (e.g.
    286246551964098560.png).  Falls back to matching by sorted position.
    Files already claimed by an earlier attachment (in *used*) are skipped
    so two attachments never share one source file.

    Returns (matching_file_or_None, matched_by_index_flag).
    """
    att_stem = Path(att_name).stem if att_name else None
    for src_file in source_files:
        if src_file in used:
            continue
        if att_name and src_file.name == att_name:
            return src_file, False
        if att_stem and src_file.stem == att_stem:
            return src_file, False
        # Numeric stem could be an attachment/media ID embedded in the name.
        if src_file.stem.isdigit() and att_name and src_file.stem in att_name:
            return src_file, False
    # Last resort: pick by position among the name-sorted source files.
    if att_index < len(source_files):
        candidate = sorted(source_files, key=lambda f: f.name)[att_index]
        if candidate not in used:
            return candidate, True
    return None, False


def main():
    parser = argparse.ArgumentParser(description='Backfill paid content from existing files')
    parser.add_argument('source_dir', help='Source directory containing downloaded files')
    parser.add_argument('--creator', required=True, help='Creator username')
    parser.add_argument('--platform', required=True, help='Platform (fansly, onlyfans, etc.)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of posts to process')
    args = parser.parse_args()

    source_dir = Path(args.source_dir)
    if not source_dir.exists():
        print(f"Error: Source directory does not exist: {source_dir}")
        sys.exit(1)

    # Imported here rather than at module top so the pure helpers above can
    # be imported without the project package on sys.path.
    from modules.unified_database import UnifiedDatabase

    # Initialize database
    db = UnifiedDatabase()

    # Get config for base download path
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT base_download_path FROM paid_content_config WHERE id = 1")
        row = cursor.fetchone()
        base_path = Path(row[0] if row else '/opt/immich/paid')

    # Find the creator in database (case-insensitive on both keys)
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT id, username, platform, service_id
            FROM paid_content_creators
            WHERE LOWER(username) = LOWER(?) AND LOWER(platform) = LOWER(?)
        """, (args.creator, args.platform))
        creator = cursor.fetchone()

    if not creator:
        print(f"Error: Creator '{args.creator}' on platform '{args.platform}' not found in database")
        sys.exit(1)

    creator_id, username, platform, service_id = creator
    print(f"Found creator: {username} ({platform}) - ID: {creator_id}")

    # Scan source directory for post folders (folder name == numeric post ID)
    post_folders = [d for d in source_dir.iterdir() if d.is_dir() and d.name.isdigit()]
    print(f"Found {len(post_folders)} post folders in source directory")

    if args.limit:
        post_folders = post_folders[:args.limit]
        print(f"Limited to {args.limit} posts")

    # Stats
    stats = {
        'posts_found': 0,
        'posts_matched': 0,
        'files_copied': 0,
        'files_skipped': 0,
        'thumbnails_generated': 0,
        'errors': 0
    }

    for post_folder in post_folders:
        post_id = post_folder.name

        # Find post in database
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, title, published_at
                FROM paid_content_posts
                WHERE creator_id = ? AND post_id = ?
            """, (creator_id, post_id))
            post = cursor.fetchone()

        if not post:
            # Try partial match (post_id might be truncated in DB)
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT id, title, published_at, post_id
                    FROM paid_content_posts
                    WHERE creator_id = ? AND post_id LIKE ?
                """, (creator_id, f"{post_id[:12]}%"))
                post = cursor.fetchone()
                if post:
                    post_id = post[3]  # Use the full post_id from DB

        if not post:
            print(f"  Post {post_id}: Not found in database, skipping")
            continue

        post_db_id, post_title, published_at = post[0], post[1], post[2]
        stats['posts_matched'] += 1

        # Build destination directory - matches scraper's _build_file_path structure
        # Format: /base/platform/username/date/post_id/
        post_date = published_at[:10] if published_at else 'unknown-date'
        post_dir_name = post_id  # Just post_id, no prefix
        dest_dir = base_path / platform / sanitize_filename(username) / post_date / post_dir_name

        print(f"  Post {post_id}: {post_title or '(no title)'}")
        print(f"    -> {dest_dir}")

        # Get attachments for this post
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, name, server_path, status, local_path, attachment_index
                FROM paid_content_attachments
                WHERE post_id = ?
                ORDER BY attachment_index
            """, (post_db_id,))
            attachments = cursor.fetchall()

        # Scan files in source folder
        source_files = [f for f in post_folder.iterdir() if f.is_file()]
        print(f"    Found {len(source_files)} files, {len(attachments)} attachments in DB")

        # Track files already assigned so one source file never serves
        # two attachments.
        used_files = set()

        for att in attachments:
            att_id, att_name, server_path, status, local_path, att_index = att

            # Skip if already completed with valid local_path
            if status == 'completed' and local_path and Path(local_path).exists():
                print(f"    [{att_index}] Already downloaded: {att_name}")
                stats['files_skipped'] += 1
                continue

            matching_file, by_index = _find_matching_file(att_name, att_index, source_files, used_files)
            if matching_file and by_index:
                print(f"    [{att_index}] Matched by index: {matching_file.name}")

            if not matching_file:
                print(f"    [{att_index}] No matching file found for: {att_name}")
                stats['errors'] += 1
                continue
            used_files.add(matching_file)

            # Determine file type and extension.  NOTE: parenthesized so the
            # real file suffix wins even when att_name is missing (the
            # original unparenthesized expression forced '.bin' then).
            file_type = get_file_type(matching_file.name)
            ext = matching_file.suffix or (Path(att_name).suffix if att_name else '.bin')

            # Build destination filename - matches scraper's _build_file_path.
            # Fansly uses just media ID (unique), other platforms use index prefix.
            if att_name:
                sanitized_name = sanitize_filename(att_name)
                # Ensure extension is preserved (sanitize the stem too).
                if not sanitized_name.lower().endswith(ext.lower()):
                    sanitized_name = sanitize_filename(Path(att_name).stem) + ext
                dest_filename = sanitized_name  # Fansly: no index prefix needed
            else:
                # Fallback to source filename
                dest_filename = matching_file.name

            dest_path = dest_dir / dest_filename
            print(f"    [{att_index}] {matching_file.name} -> {dest_filename}")

            if args.dry_run:
                stats['files_copied'] += 1
                continue

            # Create destination directory
            dest_dir.mkdir(parents=True, exist_ok=True)

            # Copy file (copy2 preserves mtime/metadata)
            try:
                shutil.copy2(matching_file, dest_path)
                stats['files_copied'] += 1
            except Exception as e:
                print(f"    Error copying file: {e}")
                stats['errors'] += 1
                continue

            # Compute file hash and size of the copied file
            file_hash = get_file_hash(dest_path)
            file_size = dest_path.stat().st_size

            # Generate thumbnail (best-effort; None is stored on failure)
            thumbnail_data = generate_thumbnail(dest_path, file_type)
            if thumbnail_data:
                stats['thumbnails_generated'] += 1

            # Update database as if freshly downloaded
            now = datetime.now().isoformat()
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    UPDATE paid_content_attachments
                    SET status = 'completed',
                        local_path = ?,
                        local_filename = ?,
                        file_hash = ?,
                        file_size = ?,
                        file_type = ?,
                        downloaded_at = ?,
                        thumbnail_data = ?
                    WHERE id = ?
                """, (str(dest_path), dest_filename, file_hash, file_size,
                      file_type, now, thumbnail_data, att_id))
                conn.commit()

        # Update post downloaded status once every attachment is completed
        if not args.dry_run:
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # Check if all attachments are now completed
                cursor.execute("""
                    SELECT COUNT(*) FROM paid_content_attachments
                    WHERE post_id = ? AND status != 'completed'
                """, (post_db_id,))
                pending = cursor.fetchone()[0]
                if pending == 0:
                    cursor.execute("""
                        UPDATE paid_content_posts
                        SET downloaded = 1, download_date = ?
                        WHERE id = ?
                    """, (datetime.now().isoformat(), post_db_id))
                    conn.commit()

        stats['posts_found'] += 1

    # Print summary
    print("\n" + "=" * 50)
    print("BACKFILL SUMMARY")
    print("=" * 50)
    print(f"Posts found in source: {len(post_folders)}")
    print(f"Posts matched in DB: {stats['posts_matched']}")
    print(f"Files copied: {stats['files_copied']}")
    print(f"Files skipped (existing): {stats['files_skipped']}")
    print(f"Thumbnails generated: {stats['thumbnails_generated']}")
    print(f"Errors: {stats['errors']}")

    if args.dry_run:
        print("\n(Dry run - no changes made)")


if __name__ == '__main__':
    main()