358
scripts/backfill_paid_content.py
Executable file
358
scripts/backfill_paid_content.py
Executable file
@@ -0,0 +1,358 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Backfill Paid Content from existing downloaded files.
|
||||
|
||||
This script:
|
||||
1. Scans a source directory containing previously downloaded content
|
||||
2. Matches files to posts/attachments in the database by ID
|
||||
3. Copies files to the proper download location
|
||||
4. Generates thumbnails
|
||||
5. Updates database records as if they were freshly downloaded
|
||||
|
||||
Usage:
|
||||
python3 backfill_paid_content.py /path/to/source/files --creator puffinasmr --platform fansly
|
||||
"""
|
||||
|
||||
import argparse
import hashlib
import os
import re
import shutil
import sqlite3
import subprocess
import sys
from datetime import datetime
from io import BytesIO
from pathlib import Path
from typing import Optional
|
||||
|
||||
# Add project root to path
|
||||
sys.path.insert(0, '/opt/media-downloader')
|
||||
|
||||
from modules.unified_database import UnifiedDatabase
|
||||
|
||||
|
||||
def get_file_hash(file_path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *file_path*.

    Reads in 8 KiB blocks so arbitrarily large media files never have to
    fit in memory.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as fh:
        while True:
            block = fh.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def generate_thumbnail(file_path: Path, file_type: str) -> Optional[bytes]:
    """Generate a JPEG thumbnail for an image or video file.

    Args:
        file_path: Path to the media file on disk.
        file_type: ``'image'``, ``'video'``, or anything else
            (unsupported types produce no thumbnail).

    Returns:
        JPEG bytes on success, or ``None`` when the type is unsupported
        or generation fails. Failures are logged and never raised —
        thumbnails are best-effort and must not abort the backfill.
    """
    # Fix: the original signature claimed `-> bytes`, but None is returned
    # for unsupported types and on every failure path.
    try:
        if file_type == 'image':
            # Lazy import: Pillow is only needed when images are processed.
            from PIL import Image
            img = Image.open(file_path)
            img.thumbnail((400, 400), Image.Resampling.LANCZOS)
            # JPEG cannot store alpha or palette data; flatten to RGB first.
            if img.mode in ('RGBA', 'P'):
                img = img.convert('RGB')
            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
        elif file_type == 'video':
            # Use ffmpeg to extract a frame
            result = subprocess.run([
                'ffmpeg', '-i', str(file_path),
                '-ss', '00:00:01',  # 1 second in
                '-vframes', '1',
                '-vf', 'scale=400:-1',
                '-f', 'image2pipe',
                '-vcodec', 'mjpeg',
                '-'
            ], capture_output=True, timeout=30)
            # NOTE(review): videos shorter than 1s will fail the -ss seek and
            # fall through to None — confirm whether a 0s fallback is wanted.
            if result.returncode == 0 and result.stdout:
                return result.stdout
    except Exception as e:
        # Broad catch is deliberate: missing Pillow/ffmpeg, corrupt media,
        # and subprocess timeouts all degrade to "no thumbnail".
        print(f" Warning: Failed to generate thumbnail: {e}")
    return None
|
||||
|
||||
|
||||
def get_file_type(filename: str) -> str:
    """Classify *filename* as 'image', 'video', or 'other' by extension.

    The comparison is case-insensitive; files with no extension are 'other'.
    """
    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}
    video_exts = {'.mp4', '.webm', '.mov', '.avi', '.mkv', '.m4v'}
    suffix = Path(filename).suffix.lower()
    if suffix in image_exts:
        return 'image'
    if suffix in video_exts:
        return 'video'
    return 'other'
|
||||
|
||||
|
||||
def sanitize_filename(name: str) -> str:
    """Make *name* safe for use as a file or directory component.

    Removes characters that are illegal on common filesystems, then
    collapses each run of whitespace to a single hyphen. An input that
    sanitizes to nothing yields 'unnamed'.
    """
    cleaned = re.sub(r'[<>:"/\\|?*]', '', name)
    cleaned = re.sub(r'\s+', '-', cleaned.strip())
    if cleaned:
        return cleaned
    return 'unnamed'
|
||||
|
||||
|
||||
def main():
    """Backfill previously downloaded paid content into the database.

    Workflow: parse CLI args, resolve the creator row, scan the source
    directory for numeric post folders, match each folder to a post and
    its attachment rows, copy files into the canonical download layout,
    generate thumbnails, and mark attachments/posts as completed.
    Exits with status 1 if the source dir or creator cannot be found.
    """
    parser = argparse.ArgumentParser(description='Backfill paid content from existing files')
    parser.add_argument('source_dir', help='Source directory containing downloaded files')
    parser.add_argument('--creator', required=True, help='Creator username')
    parser.add_argument('--platform', required=True, help='Platform (fansly, onlyfans, etc.)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of posts to process')
    args = parser.parse_args()

    source_dir = Path(args.source_dir)
    if not source_dir.exists():
        print(f"Error: Source directory does not exist: {source_dir}")
        sys.exit(1)

    # Initialize database
    db = UnifiedDatabase()

    # Get config for base download path; fall back to the default install
    # location when no config row exists yet.
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT base_download_path FROM paid_content_config WHERE id = 1")
        row = cursor.fetchone()
        base_path = Path(row[0] if row else '/opt/immich/paid')

    # Find the creator in database (case-insensitive on both keys)
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT id, username, platform, service_id
            FROM paid_content_creators
            WHERE LOWER(username) = LOWER(?) AND LOWER(platform) = LOWER(?)
        """, (args.creator, args.platform))
        creator = cursor.fetchone()

    if not creator:
        print(f"Error: Creator '{args.creator}' on platform '{args.platform}' not found in database")
        sys.exit(1)

    creator_id, username, platform, service_id = creator
    print(f"Found creator: {username} ({platform}) - ID: {creator_id}")

    # Scan source directory for post folders (folder name == numeric post_id)
    post_folders = [d for d in source_dir.iterdir() if d.is_dir() and d.name.isdigit()]
    print(f"Found {len(post_folders)} post folders in source directory")

    if args.limit:
        post_folders = post_folders[:args.limit]
        print(f"Limited to {args.limit} posts")

    # Stats accumulated across all posts and printed in the summary below.
    stats = {
        'posts_found': 0,
        'posts_matched': 0,
        'files_copied': 0,
        'files_skipped': 0,
        'thumbnails_generated': 0,
        'errors': 0
    }

    for post_folder in post_folders:
        post_id = post_folder.name

        # Find post in database by exact post_id first
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, title, published_at
                FROM paid_content_posts
                WHERE creator_id = ? AND post_id = ?
            """, (creator_id, post_id))
            post = cursor.fetchone()

        if not post:
            # Try partial match (post_id might be truncated in DB);
            # matches on the first 12 characters of the folder name.
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT id, title, published_at, post_id
                    FROM paid_content_posts
                    WHERE creator_id = ? AND post_id LIKE ?
                """, (creator_id, f"{post_id[:12]}%"))
                post = cursor.fetchone()
                if post:
                    post_id = post[3]  # Use the full post_id from DB

        if not post:
            print(f" Post {post_id}: Not found in database, skipping")
            continue

        post_db_id, post_title, published_at = post[0], post[1], post[2]
        stats['posts_matched'] += 1

        # Build destination directory - matches scraper's _build_file_path structure
        # Format: /base/platform/username/date/post_id/
        post_date = published_at[:10] if published_at else 'unknown-date'
        post_dir_name = post_id  # Just post_id, no prefix
        dest_dir = base_path / platform / sanitize_filename(username) / post_date / post_dir_name

        print(f" Post {post_id}: {post_title or '(no title)'}")
        print(f" -> {dest_dir}")

        # Get attachments for this post, in their original display order
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, name, server_path, status, local_path, attachment_index
                FROM paid_content_attachments
                WHERE post_id = ?
                ORDER BY attachment_index
            """, (post_db_id,))
            attachments = cursor.fetchall()

        # Scan files in source folder (plain files only, subdirs ignored)
        source_files = list(post_folder.iterdir())
        source_files = [f for f in source_files if f.is_file()]

        print(f" Found {len(source_files)} files, {len(attachments)} attachments in DB")

        for att in attachments:
            att_id, att_name, server_path, status, local_path, att_index = att

            # Skip if already completed with valid local_path
            if status == 'completed' and local_path and Path(local_path).exists():
                print(f" [{att_index}] Already downloaded: {att_name}")
                stats['files_skipped'] += 1
                continue

            # Try to find matching file in source
            # Files might be named with attachment ID or just the filename
            matching_file = None

            # Extract potential file ID from server_path or name
            if server_path:
                # Server path like /27/37/2737100bd05f040ae0a0b10c452be9efdf54816577e53775b96b035eac200cde.jpg
                server_filename = Path(server_path).stem  # Get hash without extension
                # NOTE(review): server_filename is never used below —
                # dead code, or an unfinished hash-matching path? Confirm.

            for src_file in source_files:
                src_stem = src_file.stem
                src_name = src_file.name

                # Match by various patterns: exact name, stem, or numeric ID
                if att_name and src_name == att_name:
                    matching_file = src_file
                    break
                if att_name and src_stem == Path(att_name).stem:
                    matching_file = src_file
                    break
                # Match by attachment ID in filename (Fansly style: 286246551964098560.png)
                if src_stem.isdigit():
                    # Could be attachment ID
                    if att_name and src_stem in att_name:
                        matching_file = src_file
                        break

            if not matching_file:
                # Try to match by index
                # NOTE(review): positional fallback may re-match a file
                # already claimed by an earlier attachment — confirm.
                if att_index < len(source_files):
                    # Sort source files and pick by index
                    sorted_files = sorted(source_files, key=lambda f: f.name)
                    matching_file = sorted_files[att_index]
                    print(f" [{att_index}] Matched by index: {matching_file.name}")

            if not matching_file:
                print(f" [{att_index}] No matching file found for: {att_name}")
                stats['errors'] += 1
                continue

            # Determine file type and extension
            # (precedence: source file's suffix, then att_name's, then '.bin')
            file_type = get_file_type(matching_file.name)
            ext = matching_file.suffix or Path(att_name).suffix if att_name else '.bin'

            # Build destination filename - matches scraper's _build_file_path
            # Fansly uses just media ID (unique), other platforms use index prefix
            if att_name:
                sanitized_name = sanitize_filename(att_name)
                # Ensure extension is preserved
                if not sanitized_name.lower().endswith(ext.lower()):
                    sanitized_name = Path(att_name).stem + ext
                dest_filename = sanitized_name  # Fansly: no index prefix needed
            else:
                # Fallback to source filename
                dest_filename = matching_file.name

            dest_path = dest_dir / dest_filename

            print(f" [{att_index}] {matching_file.name} -> {dest_filename}")

            if args.dry_run:
                stats['files_copied'] += 1
                continue

            # Create destination directory
            dest_dir.mkdir(parents=True, exist_ok=True)

            # Copy file (copy2 preserves timestamps/metadata)
            try:
                shutil.copy2(matching_file, dest_path)
                stats['files_copied'] += 1
            except Exception as e:
                print(f" Error copying file: {e}")
                stats['errors'] += 1
                continue

            # Compute file hash
            file_hash = get_file_hash(dest_path)
            file_size = dest_path.stat().st_size

            # Generate thumbnail (best-effort; None on failure)
            thumbnail_data = generate_thumbnail(dest_path, file_type)
            if thumbnail_data:
                stats['thumbnails_generated'] += 1

            # Update database: mark the attachment as freshly downloaded
            now = datetime.now().isoformat()
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    UPDATE paid_content_attachments
                    SET status = 'completed',
                        local_path = ?,
                        local_filename = ?,
                        file_hash = ?,
                        file_size = ?,
                        file_type = ?,
                        downloaded_at = ?,
                        thumbnail_data = ?
                    WHERE id = ?
                """, (str(dest_path), dest_filename, file_hash, file_size, file_type, now, thumbnail_data, att_id))
                conn.commit()

        # Update post downloaded status once every attachment is completed
        if not args.dry_run:
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # Check if all attachments are now completed
                cursor.execute("""
                    SELECT COUNT(*) FROM paid_content_attachments
                    WHERE post_id = ? AND status != 'completed'
                """, (post_db_id,))
                pending = cursor.fetchone()[0]

                if pending == 0:
                    cursor.execute("""
                        UPDATE paid_content_posts
                        SET downloaded = 1, download_date = ?
                        WHERE id = ?
                    """, (datetime.now().isoformat(), post_db_id))
                    conn.commit()

        # NOTE(review): 'posts_found' counts matched posts but is never
        # printed (the summary uses len(post_folders)) — confirm intent.
        stats['posts_found'] += 1

    # Print summary
    print("\n" + "=" * 50)
    print("BACKFILL SUMMARY")
    print("=" * 50)
    print(f"Posts found in source: {len(post_folders)}")
    print(f"Posts matched in DB: {stats['posts_matched']}")
    print(f"Files copied: {stats['files_copied']}")
    print(f"Files skipped (existing): {stats['files_skipped']}")
    print(f"Thumbnails generated: {stats['thumbnails_generated']}")
    print(f"Errors: {stats['errors']}")

    if args.dry_run:
        print("\n(Dry run - no changes made)")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user