Files
media-downloader/scripts/backfill_paid_content.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

359 lines
13 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Backfill Paid Content from existing downloaded files.
This script:
1. Scans a source directory containing previously downloaded content
2. Matches files to posts/attachments in the database by ID
3. Copies files to the proper download location
4. Generates thumbnails
5. Updates database records as if they were freshly downloaded
Usage:
python3 backfill_paid_content.py /path/to/source/files --creator puffinasmr --platform fansly
"""
import argparse
import hashlib
import os
import re
import shutil
import sqlite3
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from io import BytesIO
# Add project root to path
sys.path.insert(0, '/opt/media-downloader')
from modules.unified_database import UnifiedDatabase
def get_file_hash(file_path: Path) -> str:
"""Compute SHA256 hash of file"""
sha256 = hashlib.sha256()
with open(file_path, 'rb') as f:
for chunk in iter(lambda: f.read(8192), b''):
sha256.update(chunk)
return sha256.hexdigest()
def generate_thumbnail(file_path: Path, file_type: str) -> bytes:
"""Generate thumbnail for image or video"""
try:
if file_type == 'image':
from PIL import Image
img = Image.open(file_path)
img.thumbnail((400, 400), Image.Resampling.LANCZOS)
if img.mode in ('RGBA', 'P'):
img = img.convert('RGB')
buffer = BytesIO()
img.save(buffer, format='JPEG', quality=85)
return buffer.getvalue()
elif file_type == 'video':
# Use ffmpeg to extract a frame
result = subprocess.run([
'ffmpeg', '-i', str(file_path),
'-ss', '00:00:01', # 1 second in
'-vframes', '1',
'-vf', 'scale=400:-1',
'-f', 'image2pipe',
'-vcodec', 'mjpeg',
'-'
], capture_output=True, timeout=30)
if result.returncode == 0 and result.stdout:
return result.stdout
except Exception as e:
print(f" Warning: Failed to generate thumbnail: {e}")
return None
def get_file_type(filename: str) -> str:
"""Determine file type from extension"""
ext = Path(filename).suffix.lower()
if ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']:
return 'image'
elif ext in ['.mp4', '.webm', '.mov', '.avi', '.mkv', '.m4v']:
return 'video'
else:
return 'other'
def sanitize_filename(name: str) -> str:
"""Sanitize string for use in filename/directory"""
name = re.sub(r'[<>:"/\\|?*]', '', name)
name = re.sub(r'\s+', '-', name.strip())
return name or 'unnamed'
def main():
parser = argparse.ArgumentParser(description='Backfill paid content from existing files')
parser.add_argument('source_dir', help='Source directory containing downloaded files')
parser.add_argument('--creator', required=True, help='Creator username')
parser.add_argument('--platform', required=True, help='Platform (fansly, onlyfans, etc.)')
parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
parser.add_argument('--limit', type=int, help='Limit number of posts to process')
args = parser.parse_args()
source_dir = Path(args.source_dir)
if not source_dir.exists():
print(f"Error: Source directory does not exist: {source_dir}")
sys.exit(1)
# Initialize database
db = UnifiedDatabase()
# Get config for base download path
with db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT base_download_path FROM paid_content_config WHERE id = 1")
row = cursor.fetchone()
base_path = Path(row[0] if row else '/opt/immich/paid')
# Find the creator in database
with db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("""
SELECT id, username, platform, service_id
FROM paid_content_creators
WHERE LOWER(username) = LOWER(?) AND LOWER(platform) = LOWER(?)
""", (args.creator, args.platform))
creator = cursor.fetchone()
if not creator:
print(f"Error: Creator '{args.creator}' on platform '{args.platform}' not found in database")
sys.exit(1)
creator_id, username, platform, service_id = creator
print(f"Found creator: {username} ({platform}) - ID: {creator_id}")
# Scan source directory for post folders
post_folders = [d for d in source_dir.iterdir() if d.is_dir() and d.name.isdigit()]
print(f"Found {len(post_folders)} post folders in source directory")
if args.limit:
post_folders = post_folders[:args.limit]
print(f"Limited to {args.limit} posts")
# Stats
stats = {
'posts_found': 0,
'posts_matched': 0,
'files_copied': 0,
'files_skipped': 0,
'thumbnails_generated': 0,
'errors': 0
}
for post_folder in post_folders:
post_id = post_folder.name
# Find post in database
with db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("""
SELECT id, title, published_at
FROM paid_content_posts
WHERE creator_id = ? AND post_id = ?
""", (creator_id, post_id))
post = cursor.fetchone()
if not post:
# Try partial match (post_id might be truncated in DB)
with db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("""
SELECT id, title, published_at, post_id
FROM paid_content_posts
WHERE creator_id = ? AND post_id LIKE ?
""", (creator_id, f"{post_id[:12]}%"))
post = cursor.fetchone()
if post:
post_id = post[3] # Use the full post_id from DB
if not post:
print(f" Post {post_id}: Not found in database, skipping")
continue
post_db_id, post_title, published_at = post[0], post[1], post[2]
stats['posts_matched'] += 1
# Build destination directory - matches scraper's _build_file_path structure
# Format: /base/platform/username/date/post_id/
post_date = published_at[:10] if published_at else 'unknown-date'
post_dir_name = post_id # Just post_id, no prefix
dest_dir = base_path / platform / sanitize_filename(username) / post_date / post_dir_name
print(f" Post {post_id}: {post_title or '(no title)'}")
print(f" -> {dest_dir}")
# Get attachments for this post
with db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("""
SELECT id, name, server_path, status, local_path, attachment_index
FROM paid_content_attachments
WHERE post_id = ?
ORDER BY attachment_index
""", (post_db_id,))
attachments = cursor.fetchall()
# Scan files in source folder
source_files = list(post_folder.iterdir())
source_files = [f for f in source_files if f.is_file()]
print(f" Found {len(source_files)} files, {len(attachments)} attachments in DB")
for att in attachments:
att_id, att_name, server_path, status, local_path, att_index = att
# Skip if already completed with valid local_path
if status == 'completed' and local_path and Path(local_path).exists():
print(f" [{att_index}] Already downloaded: {att_name}")
stats['files_skipped'] += 1
continue
# Try to find matching file in source
# Files might be named with attachment ID or just the filename
matching_file = None
# Extract potential file ID from server_path or name
if server_path:
# Server path like /27/37/2737100bd05f040ae0a0b10c452be9efdf54816577e53775b96b035eac200cde.jpg
server_filename = Path(server_path).stem # Get hash without extension
for src_file in source_files:
src_stem = src_file.stem
src_name = src_file.name
# Match by various patterns
if att_name and src_name == att_name:
matching_file = src_file
break
if att_name and src_stem == Path(att_name).stem:
matching_file = src_file
break
# Match by attachment ID in filename (Fansly style: 286246551964098560.png)
if src_stem.isdigit():
# Could be attachment ID
if att_name and src_stem in att_name:
matching_file = src_file
break
if not matching_file:
# Try to match by index
if att_index < len(source_files):
# Sort source files and pick by index
sorted_files = sorted(source_files, key=lambda f: f.name)
matching_file = sorted_files[att_index]
print(f" [{att_index}] Matched by index: {matching_file.name}")
if not matching_file:
print(f" [{att_index}] No matching file found for: {att_name}")
stats['errors'] += 1
continue
# Determine file type and extension
file_type = get_file_type(matching_file.name)
ext = matching_file.suffix or Path(att_name).suffix if att_name else '.bin'
# Build destination filename - matches scraper's _build_file_path
# Fansly uses just media ID (unique), other platforms use index prefix
if att_name:
sanitized_name = sanitize_filename(att_name)
# Ensure extension is preserved
if not sanitized_name.lower().endswith(ext.lower()):
sanitized_name = Path(att_name).stem + ext
dest_filename = sanitized_name # Fansly: no index prefix needed
else:
# Fallback to source filename
dest_filename = matching_file.name
dest_path = dest_dir / dest_filename
print(f" [{att_index}] {matching_file.name} -> {dest_filename}")
if args.dry_run:
stats['files_copied'] += 1
continue
# Create destination directory
dest_dir.mkdir(parents=True, exist_ok=True)
# Copy file
try:
shutil.copy2(matching_file, dest_path)
stats['files_copied'] += 1
except Exception as e:
print(f" Error copying file: {e}")
stats['errors'] += 1
continue
# Compute file hash
file_hash = get_file_hash(dest_path)
file_size = dest_path.stat().st_size
# Generate thumbnail
thumbnail_data = generate_thumbnail(dest_path, file_type)
if thumbnail_data:
stats['thumbnails_generated'] += 1
# Update database
now = datetime.now().isoformat()
with db.get_connection(for_write=True) as conn:
cursor = conn.cursor()
cursor.execute("""
UPDATE paid_content_attachments
SET status = 'completed',
local_path = ?,
local_filename = ?,
file_hash = ?,
file_size = ?,
file_type = ?,
downloaded_at = ?,
thumbnail_data = ?
WHERE id = ?
""", (str(dest_path), dest_filename, file_hash, file_size, file_type, now, thumbnail_data, att_id))
conn.commit()
# Update post downloaded status
if not args.dry_run:
with db.get_connection(for_write=True) as conn:
cursor = conn.cursor()
# Check if all attachments are now completed
cursor.execute("""
SELECT COUNT(*) FROM paid_content_attachments
WHERE post_id = ? AND status != 'completed'
""", (post_db_id,))
pending = cursor.fetchone()[0]
if pending == 0:
cursor.execute("""
UPDATE paid_content_posts
SET downloaded = 1, download_date = ?
WHERE id = ?
""", (datetime.now().isoformat(), post_db_id))
conn.commit()
stats['posts_found'] += 1
# Print summary
print("\n" + "=" * 50)
print("BACKFILL SUMMARY")
print("=" * 50)
print(f"Posts found in source: {len(post_folders)}")
print(f"Posts matched in DB: {stats['posts_matched']}")
print(f"Files copied: {stats['files_copied']}")
print(f"Files skipped (existing): {stats['files_skipped']}")
print(f"Thumbnails generated: {stats['thumbnails_generated']}")
print(f"Errors: {stats['errors']}")
if args.dry_run:
print("\n(Dry run - no changes made)")
if __name__ == '__main__':
main()