#!/usr/bin/env python3
"""
Backfill Paid Content from existing downloaded files.

This script:
1. Scans a source directory containing previously downloaded content
2. Matches files to posts/attachments in the database by ID
3. Copies files to the proper download location
4. Generates thumbnails
5. Updates database records as if they were freshly downloaded

Usage:
    python3 backfill_paid_content.py /path/to/source/files --creator puffinasmr --platform fansly
"""
|
|
|
|
import argparse
import hashlib
import os
import re
import shutil
import sqlite3
import subprocess
import sys
from datetime import datetime
from io import BytesIO
from pathlib import Path
from typing import Optional

# Add project root to path
sys.path.insert(0, '/opt/media-downloader')

from modules.unified_database import UnifiedDatabase
|
|
|
|
|
|
def get_file_hash(file_path: Path) -> str:
    """Return the hex-encoded SHA256 digest of the file at *file_path*.

    Reads the file in 8 KiB chunks so arbitrarily large media files can be
    hashed without loading them fully into memory.
    """
    digest = hashlib.sha256()
    with file_path.open('rb') as fh:
        while True:
            block = fh.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
|
|
|
|
|
def generate_thumbnail(file_path: Path, file_type: str) -> Optional[bytes]:
    """Generate a JPEG thumbnail for an image or video file.

    Args:
        file_path: Path to the media file on disk.
        file_type: 'image' or 'video'; any other value yields no thumbnail.

    Returns:
        JPEG-encoded thumbnail bytes, or None when the type is unsupported,
        ffmpeg produced no frame, or generation failed for any reason.
        (Fix: the original annotation claimed ``-> bytes`` although None is
        returned on every failure path.)
    """
    try:
        if file_type == 'image':
            from PIL import Image
            img = Image.open(file_path)
            # Bound the longest side at 400px, preserving aspect ratio.
            img.thumbnail((400, 400), Image.Resampling.LANCZOS)
            # JPEG cannot store alpha or palette modes - convert first.
            if img.mode in ('RGBA', 'P'):
                img = img.convert('RGB')
            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
        elif file_type == 'video':
            # Use ffmpeg to extract a single frame 1s in, scaled to 400px
            # wide, emitted as MJPEG on stdout.
            result = subprocess.run([
                'ffmpeg', '-i', str(file_path),
                '-ss', '00:00:01',  # 1 second in
                '-vframes', '1',
                '-vf', 'scale=400:-1',
                '-f', 'image2pipe',
                '-vcodec', 'mjpeg',
                '-'
            ], capture_output=True, timeout=30)
            if result.returncode == 0 and result.stdout:
                return result.stdout
    except Exception as e:
        # Best-effort: a missing thumbnail must not abort the backfill.
        print(f" Warning: Failed to generate thumbnail: {e}")
    return None
|
|
|
|
|
|
def get_file_type(filename: str) -> str:
    """Classify *filename* as 'image', 'video', or 'other' by its extension.

    The comparison is case-insensitive; files with no extension fall into
    'other'.
    """
    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}
    video_exts = {'.mp4', '.webm', '.mov', '.avi', '.mkv', '.m4v'}
    suffix = Path(filename).suffix.lower()
    if suffix in image_exts:
        return 'image'
    if suffix in video_exts:
        return 'video'
    return 'other'
|
|
|
|
|
|
def sanitize_filename(name: str) -> str:
    """Make *name* safe for use as a filename or directory component.

    Removes characters that are illegal on common filesystems, collapses
    runs of whitespace into single dashes, and falls back to 'unnamed'
    when nothing printable remains.
    """
    cleaned = re.sub(r'[<>:"/\\|?*]', '', name)
    cleaned = cleaned.strip()
    cleaned = re.sub(r'\s+', '-', cleaned)
    return cleaned if cleaned else 'unnamed'
|
|
|
|
|
|
def main():
    """CLI entry point: match previously downloaded files to DB attachments
    and backfill them into the scraper's canonical directory layout.

    Exits with status 1 when the source directory or the creator record is
    missing. All per-file failures are counted and reported, not fatal.
    """
    parser = argparse.ArgumentParser(description='Backfill paid content from existing files')
    parser.add_argument('source_dir', help='Source directory containing downloaded files')
    parser.add_argument('--creator', required=True, help='Creator username')
    parser.add_argument('--platform', required=True, help='Platform (fansly, onlyfans, etc.)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of posts to process')
    args = parser.parse_args()

    source_dir = Path(args.source_dir)
    if not source_dir.exists():
        print(f"Error: Source directory does not exist: {source_dir}")
        sys.exit(1)

    # Initialize database
    db = UnifiedDatabase()

    # Get config for base download path
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT base_download_path FROM paid_content_config WHERE id = 1")
        row = cursor.fetchone()
        base_path = Path(row[0] if row else '/opt/immich/paid')

    # Find the creator in database (case-insensitive on both fields)
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT id, username, platform, service_id
            FROM paid_content_creators
            WHERE LOWER(username) = LOWER(?) AND LOWER(platform) = LOWER(?)
        """, (args.creator, args.platform))
        creator = cursor.fetchone()

    if not creator:
        print(f"Error: Creator '{args.creator}' on platform '{args.platform}' not found in database")
        sys.exit(1)

    creator_id, username, platform, service_id = creator
    print(f"Found creator: {username} ({platform}) - ID: {creator_id}")

    # Scan source directory for post folders (folder name == numeric post id)
    post_folders = [d for d in source_dir.iterdir() if d.is_dir() and d.name.isdigit()]
    print(f"Found {len(post_folders)} post folders in source directory")

    if args.limit:
        post_folders = post_folders[:args.limit]
        print(f"Limited to {args.limit} posts")

    # Stats
    stats = {
        'posts_found': 0,
        'posts_matched': 0,
        'files_copied': 0,
        'files_skipped': 0,
        'thumbnails_generated': 0,
        'errors': 0
    }

    for post_folder in post_folders:
        post_id = post_folder.name

        # Find post in database by exact post_id
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, title, published_at
                FROM paid_content_posts
                WHERE creator_id = ? AND post_id = ?
            """, (creator_id, post_id))
            post = cursor.fetchone()

        if not post:
            # Try partial match (post_id might be truncated in DB)
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT id, title, published_at, post_id
                    FROM paid_content_posts
                    WHERE creator_id = ? AND post_id LIKE ?
                """, (creator_id, f"{post_id[:12]}%"))
                post = cursor.fetchone()
                if post:
                    post_id = post[3]  # Use the full post_id from DB

        if not post:
            print(f" Post {post_id}: Not found in database, skipping")
            continue

        post_db_id, post_title, published_at = post[0], post[1], post[2]
        stats['posts_matched'] += 1

        # Build destination directory - matches scraper's _build_file_path structure
        # Format: /base/platform/username/date/post_id/
        post_date = published_at[:10] if published_at else 'unknown-date'
        post_dir_name = post_id  # Just post_id, no prefix
        dest_dir = base_path / platform / sanitize_filename(username) / post_date / post_dir_name

        print(f" Post {post_id}: {post_title or '(no title)'}")
        print(f" -> {dest_dir}")

        # Get attachments for this post
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, name, server_path, status, local_path, attachment_index
                FROM paid_content_attachments
                WHERE post_id = ?
                ORDER BY attachment_index
            """, (post_db_id,))
            attachments = cursor.fetchall()

        # Scan files in source folder
        source_files = [f for f in post_folder.iterdir() if f.is_file()]

        print(f" Found {len(source_files)} files, {len(attachments)} attachments in DB")

        for att in attachments:
            att_id, att_name, server_path, status, local_path, att_index = att

            # Skip if already completed with valid local_path
            if status == 'completed' and local_path and Path(local_path).exists():
                print(f" [{att_index}] Already downloaded: {att_name}")
                stats['files_skipped'] += 1
                continue

            # Try to find matching file in source.
            # Files might be named with attachment ID or just the filename.
            matching_file = None

            # Extract potential file ID from server_path or name
            if server_path:
                # Server path like /27/37/2737100bd05f040ae0a0b10c452be9efdf54816577e53775b96b035eac200cde.jpg
                # NOTE(review): server_filename is currently unused by the
                # matching loop below - kept for a future hash-based match.
                server_filename = Path(server_path).stem  # Get hash without extension

            for src_file in source_files:
                src_stem = src_file.stem
                src_name = src_file.name

                # Match by various patterns
                if att_name and src_name == att_name:
                    matching_file = src_file
                    break
                if att_name and src_stem == Path(att_name).stem:
                    matching_file = src_file
                    break
                # Match by attachment ID in filename (Fansly style: 286246551964098560.png)
                if src_stem.isdigit():
                    # Could be attachment ID
                    if att_name and src_stem in att_name:
                        matching_file = src_file
                        break

            if not matching_file:
                # Try to match by index
                if att_index < len(source_files):
                    # Sort source files and pick by index
                    sorted_files = sorted(source_files, key=lambda f: f.name)
                    matching_file = sorted_files[att_index]
                    print(f" [{att_index}] Matched by index: {matching_file.name}")

            if not matching_file:
                print(f" [{att_index}] No matching file found for: {att_name}")
                stats['errors'] += 1
                continue

            # Determine file type and extension.
            file_type = get_file_type(matching_file.name)
            # BUGFIX: the original expression
            #   matching_file.suffix or Path(att_name).suffix if att_name else '.bin'
            # parsed as (a or b) if att_name else '.bin', which discarded a
            # real source-file suffix whenever att_name was empty. Fall back
            # in priority order: source suffix -> DB name suffix -> '.bin'.
            ext = matching_file.suffix or (Path(att_name).suffix if att_name else '.bin')

            # Build destination filename - matches scraper's _build_file_path
            # Fansly uses just media ID (unique), other platforms use index prefix
            if att_name:
                sanitized_name = sanitize_filename(att_name)
                # Ensure extension is preserved
                if not sanitized_name.lower().endswith(ext.lower()):
                    sanitized_name = Path(att_name).stem + ext
                dest_filename = sanitized_name  # Fansly: no index prefix needed
            else:
                # Fallback to source filename
                dest_filename = matching_file.name

            dest_path = dest_dir / dest_filename

            print(f" [{att_index}] {matching_file.name} -> {dest_filename}")

            if args.dry_run:
                stats['files_copied'] += 1
                continue

            # Create destination directory
            dest_dir.mkdir(parents=True, exist_ok=True)

            # Copy file
            try:
                shutil.copy2(matching_file, dest_path)
                stats['files_copied'] += 1
            except Exception as e:
                print(f" Error copying file: {e}")
                stats['errors'] += 1
                continue

            # Compute file hash
            file_hash = get_file_hash(dest_path)
            file_size = dest_path.stat().st_size

            # Generate thumbnail
            thumbnail_data = generate_thumbnail(dest_path, file_type)
            if thumbnail_data:
                stats['thumbnails_generated'] += 1

            # Update database record as if freshly downloaded
            now = datetime.now().isoformat()
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    UPDATE paid_content_attachments
                    SET status = 'completed',
                        local_path = ?,
                        local_filename = ?,
                        file_hash = ?,
                        file_size = ?,
                        file_type = ?,
                        downloaded_at = ?,
                        thumbnail_data = ?
                    WHERE id = ?
                """, (str(dest_path), dest_filename, file_hash, file_size, file_type, now, thumbnail_data, att_id))
                conn.commit()

        # Update post downloaded status
        if not args.dry_run:
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # Check if all attachments are now completed
                cursor.execute("""
                    SELECT COUNT(*) FROM paid_content_attachments
                    WHERE post_id = ? AND status != 'completed'
                """, (post_db_id,))
                pending = cursor.fetchone()[0]

                if pending == 0:
                    cursor.execute("""
                        UPDATE paid_content_posts
                        SET downloaded = 1, download_date = ?
                        WHERE id = ?
                    """, (datetime.now().isoformat(), post_db_id))
                    conn.commit()

        stats['posts_found'] += 1

    # Print summary
    print("\n" + "=" * 50)
    print("BACKFILL SUMMARY")
    print("=" * 50)
    print(f"Posts found in source: {len(post_folders)}")
    print(f"Posts matched in DB: {stats['posts_matched']}")
    print(f"Files copied: {stats['files_copied']}")
    print(f"Files skipped (existing): {stats['files_skipped']}")
    print(f"Thumbnails generated: {stats['thumbnails_generated']}")
    print(f"Errors: {stats['errors']}")

    if args.dry_run:
        print("\n(Dry run - no changes made)")
|
|
|
|
|
|
# Script entry point: run the backfill CLI when executed directly.
if __name__ == '__main__':
    main()
|