358
scripts/backfill_paid_content.py
Executable file
358
scripts/backfill_paid_content.py
Executable file
@@ -0,0 +1,358 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Backfill Paid Content from existing downloaded files.
|
||||
|
||||
This script:
|
||||
1. Scans a source directory containing previously downloaded content
|
||||
2. Matches files to posts/attachments in the database by ID
|
||||
3. Copies files to the proper download location
|
||||
4. Generates thumbnails
|
||||
5. Updates database records as if they were freshly downloaded
|
||||
|
||||
Usage:
|
||||
python3 backfill_paid_content.py /path/to/source/files --creator puffinasmr --platform fansly
|
||||
"""
|
||||
|
||||
import argparse
import hashlib
import os
import re
import shutil
import sqlite3
import subprocess
import sys
from datetime import datetime
from io import BytesIO
from pathlib import Path
from typing import Optional
|
||||
|
||||
# Add project root to path
|
||||
sys.path.insert(0, '/opt/media-downloader')
|
||||
|
||||
from modules.unified_database import UnifiedDatabase
|
||||
|
||||
|
||||
def get_file_hash(file_path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *file_path*.

    Reads in 8 KiB blocks so arbitrarily large media files never have to
    fit in memory.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as fh:
        while True:
            block = fh.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def generate_thumbnail(file_path: Path, file_type: str) -> Optional[bytes]:
    """Generate a JPEG thumbnail for an image or video file.

    Args:
        file_path: Path to the media file on disk.
        file_type: ``'image'``, ``'video'``, or anything else
            (unsupported types produce no thumbnail).

    Returns:
        JPEG bytes on success, or ``None`` when the type is unsupported
        or generation fails. Failures are logged and never raised —
        thumbnails are best-effort and must not abort the backfill.
    """
    # Fix: the original signature claimed `-> bytes`, but None is returned
    # for unsupported types and on every failure path.
    try:
        if file_type == 'image':
            # Lazy import: Pillow is only needed when images are processed.
            from PIL import Image
            img = Image.open(file_path)
            img.thumbnail((400, 400), Image.Resampling.LANCZOS)
            # JPEG cannot store alpha or palette data; flatten to RGB first.
            if img.mode in ('RGBA', 'P'):
                img = img.convert('RGB')
            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
        elif file_type == 'video':
            # Use ffmpeg to extract a frame
            result = subprocess.run([
                'ffmpeg', '-i', str(file_path),
                '-ss', '00:00:01',  # 1 second in
                '-vframes', '1',
                '-vf', 'scale=400:-1',
                '-f', 'image2pipe',
                '-vcodec', 'mjpeg',
                '-'
            ], capture_output=True, timeout=30)
            # NOTE(review): videos shorter than 1s will fail the -ss seek and
            # fall through to None — confirm whether a 0s fallback is wanted.
            if result.returncode == 0 and result.stdout:
                return result.stdout
    except Exception as e:
        # Broad catch is deliberate: missing Pillow/ffmpeg, corrupt media,
        # and subprocess timeouts all degrade to "no thumbnail".
        print(f" Warning: Failed to generate thumbnail: {e}")
    return None
|
||||
|
||||
|
||||
def get_file_type(filename: str) -> str:
    """Classify *filename* as 'image', 'video', or 'other' by extension.

    The comparison is case-insensitive; files with no extension are 'other'.
    """
    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}
    video_exts = {'.mp4', '.webm', '.mov', '.avi', '.mkv', '.m4v'}
    suffix = Path(filename).suffix.lower()
    if suffix in image_exts:
        return 'image'
    if suffix in video_exts:
        return 'video'
    return 'other'
|
||||
|
||||
|
||||
def sanitize_filename(name: str) -> str:
    """Make *name* safe for use as a file or directory component.

    Removes characters that are illegal on common filesystems, then
    collapses each run of whitespace to a single hyphen. An input that
    sanitizes to nothing yields 'unnamed'.
    """
    cleaned = re.sub(r'[<>:"/\\|?*]', '', name)
    cleaned = re.sub(r'\s+', '-', cleaned.strip())
    if cleaned:
        return cleaned
    return 'unnamed'
|
||||
|
||||
|
||||
def main():
    """Backfill previously downloaded paid content into the database.

    Workflow: parse CLI args, resolve the creator row, scan the source
    directory for numeric post folders, match each folder to a post and
    its attachment rows, copy files into the canonical download layout,
    generate thumbnails, and mark attachments/posts as completed.
    Exits with status 1 if the source dir or creator cannot be found.
    """
    parser = argparse.ArgumentParser(description='Backfill paid content from existing files')
    parser.add_argument('source_dir', help='Source directory containing downloaded files')
    parser.add_argument('--creator', required=True, help='Creator username')
    parser.add_argument('--platform', required=True, help='Platform (fansly, onlyfans, etc.)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of posts to process')
    args = parser.parse_args()

    source_dir = Path(args.source_dir)
    if not source_dir.exists():
        print(f"Error: Source directory does not exist: {source_dir}")
        sys.exit(1)

    # Initialize database
    db = UnifiedDatabase()

    # Get config for base download path; fall back to the default install
    # location when no config row exists yet.
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT base_download_path FROM paid_content_config WHERE id = 1")
        row = cursor.fetchone()
        base_path = Path(row[0] if row else '/opt/immich/paid')

    # Find the creator in database (case-insensitive on both keys)
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT id, username, platform, service_id
            FROM paid_content_creators
            WHERE LOWER(username) = LOWER(?) AND LOWER(platform) = LOWER(?)
        """, (args.creator, args.platform))
        creator = cursor.fetchone()

    if not creator:
        print(f"Error: Creator '{args.creator}' on platform '{args.platform}' not found in database")
        sys.exit(1)

    creator_id, username, platform, service_id = creator
    print(f"Found creator: {username} ({platform}) - ID: {creator_id}")

    # Scan source directory for post folders (folder name == numeric post_id)
    post_folders = [d for d in source_dir.iterdir() if d.is_dir() and d.name.isdigit()]
    print(f"Found {len(post_folders)} post folders in source directory")

    if args.limit:
        post_folders = post_folders[:args.limit]
        print(f"Limited to {args.limit} posts")

    # Stats accumulated across all posts and printed in the summary below.
    stats = {
        'posts_found': 0,
        'posts_matched': 0,
        'files_copied': 0,
        'files_skipped': 0,
        'thumbnails_generated': 0,
        'errors': 0
    }

    for post_folder in post_folders:
        post_id = post_folder.name

        # Find post in database by exact post_id first
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, title, published_at
                FROM paid_content_posts
                WHERE creator_id = ? AND post_id = ?
            """, (creator_id, post_id))
            post = cursor.fetchone()

        if not post:
            # Try partial match (post_id might be truncated in DB);
            # matches on the first 12 characters of the folder name.
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT id, title, published_at, post_id
                    FROM paid_content_posts
                    WHERE creator_id = ? AND post_id LIKE ?
                """, (creator_id, f"{post_id[:12]}%"))
                post = cursor.fetchone()
                if post:
                    post_id = post[3]  # Use the full post_id from DB

        if not post:
            print(f" Post {post_id}: Not found in database, skipping")
            continue

        post_db_id, post_title, published_at = post[0], post[1], post[2]
        stats['posts_matched'] += 1

        # Build destination directory - matches scraper's _build_file_path structure
        # Format: /base/platform/username/date/post_id/
        post_date = published_at[:10] if published_at else 'unknown-date'
        post_dir_name = post_id  # Just post_id, no prefix
        dest_dir = base_path / platform / sanitize_filename(username) / post_date / post_dir_name

        print(f" Post {post_id}: {post_title or '(no title)'}")
        print(f" -> {dest_dir}")

        # Get attachments for this post, in their original display order
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, name, server_path, status, local_path, attachment_index
                FROM paid_content_attachments
                WHERE post_id = ?
                ORDER BY attachment_index
            """, (post_db_id,))
            attachments = cursor.fetchall()

        # Scan files in source folder (plain files only, subdirs ignored)
        source_files = list(post_folder.iterdir())
        source_files = [f for f in source_files if f.is_file()]

        print(f" Found {len(source_files)} files, {len(attachments)} attachments in DB")

        for att in attachments:
            att_id, att_name, server_path, status, local_path, att_index = att

            # Skip if already completed with valid local_path
            if status == 'completed' and local_path and Path(local_path).exists():
                print(f" [{att_index}] Already downloaded: {att_name}")
                stats['files_skipped'] += 1
                continue

            # Try to find matching file in source
            # Files might be named with attachment ID or just the filename
            matching_file = None

            # Extract potential file ID from server_path or name
            if server_path:
                # Server path like /27/37/2737100bd05f040ae0a0b10c452be9efdf54816577e53775b96b035eac200cde.jpg
                server_filename = Path(server_path).stem  # Get hash without extension
                # NOTE(review): server_filename is never used below —
                # dead code, or an unfinished hash-matching path? Confirm.

            for src_file in source_files:
                src_stem = src_file.stem
                src_name = src_file.name

                # Match by various patterns: exact name, stem, or numeric ID
                if att_name and src_name == att_name:
                    matching_file = src_file
                    break
                if att_name and src_stem == Path(att_name).stem:
                    matching_file = src_file
                    break
                # Match by attachment ID in filename (Fansly style: 286246551964098560.png)
                if src_stem.isdigit():
                    # Could be attachment ID
                    if att_name and src_stem in att_name:
                        matching_file = src_file
                        break

            if not matching_file:
                # Try to match by index
                # NOTE(review): positional fallback may re-match a file
                # already claimed by an earlier attachment — confirm.
                if att_index < len(source_files):
                    # Sort source files and pick by index
                    sorted_files = sorted(source_files, key=lambda f: f.name)
                    matching_file = sorted_files[att_index]
                    print(f" [{att_index}] Matched by index: {matching_file.name}")

            if not matching_file:
                print(f" [{att_index}] No matching file found for: {att_name}")
                stats['errors'] += 1
                continue

            # Determine file type and extension
            # (precedence: source file's suffix, then att_name's, then '.bin')
            file_type = get_file_type(matching_file.name)
            ext = matching_file.suffix or Path(att_name).suffix if att_name else '.bin'

            # Build destination filename - matches scraper's _build_file_path
            # Fansly uses just media ID (unique), other platforms use index prefix
            if att_name:
                sanitized_name = sanitize_filename(att_name)
                # Ensure extension is preserved
                if not sanitized_name.lower().endswith(ext.lower()):
                    sanitized_name = Path(att_name).stem + ext
                dest_filename = sanitized_name  # Fansly: no index prefix needed
            else:
                # Fallback to source filename
                dest_filename = matching_file.name

            dest_path = dest_dir / dest_filename

            print(f" [{att_index}] {matching_file.name} -> {dest_filename}")

            if args.dry_run:
                stats['files_copied'] += 1
                continue

            # Create destination directory
            dest_dir.mkdir(parents=True, exist_ok=True)

            # Copy file (copy2 preserves timestamps/metadata)
            try:
                shutil.copy2(matching_file, dest_path)
                stats['files_copied'] += 1
            except Exception as e:
                print(f" Error copying file: {e}")
                stats['errors'] += 1
                continue

            # Compute file hash
            file_hash = get_file_hash(dest_path)
            file_size = dest_path.stat().st_size

            # Generate thumbnail (best-effort; None on failure)
            thumbnail_data = generate_thumbnail(dest_path, file_type)
            if thumbnail_data:
                stats['thumbnails_generated'] += 1

            # Update database: mark the attachment as freshly downloaded
            now = datetime.now().isoformat()
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    UPDATE paid_content_attachments
                    SET status = 'completed',
                        local_path = ?,
                        local_filename = ?,
                        file_hash = ?,
                        file_size = ?,
                        file_type = ?,
                        downloaded_at = ?,
                        thumbnail_data = ?
                    WHERE id = ?
                """, (str(dest_path), dest_filename, file_hash, file_size, file_type, now, thumbnail_data, att_id))
                conn.commit()

        # Update post downloaded status once every attachment is completed
        if not args.dry_run:
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # Check if all attachments are now completed
                cursor.execute("""
                    SELECT COUNT(*) FROM paid_content_attachments
                    WHERE post_id = ? AND status != 'completed'
                """, (post_db_id,))
                pending = cursor.fetchone()[0]

                if pending == 0:
                    cursor.execute("""
                        UPDATE paid_content_posts
                        SET downloaded = 1, download_date = ?
                        WHERE id = ?
                    """, (datetime.now().isoformat(), post_db_id))
                    conn.commit()

        # NOTE(review): 'posts_found' counts matched posts but is never
        # printed (the summary uses len(post_folders)) — confirm intent.
        stats['posts_found'] += 1

    # Print summary
    print("\n" + "=" * 50)
    print("BACKFILL SUMMARY")
    print("=" * 50)
    print(f"Posts found in source: {len(post_folders)}")
    print(f"Posts matched in DB: {stats['posts_matched']}")
    print(f"Files copied: {stats['files_copied']}")
    print(f"Files skipped (existing): {stats['files_skipped']}")
    print(f"Thumbnails generated: {stats['thumbnails_generated']}")
    print(f"Errors: {stats['errors']}")

    if args.dry_run:
        print("\n(Dry run - no changes made)")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user