media-downloader/scripts/backfill_paid_content.py

#!/usr/bin/env python3
"""
Backfill Paid Content from existing downloaded files.

This script:
1. Scans a source directory containing previously downloaded content
2. Matches files to posts/attachments in the database by ID
3. Copies files to the proper download location
4. Generates thumbnails
5. Updates database records as if they were freshly downloaded

Usage:
    python3 backfill_paid_content.py /path/to/source/files --creator puffinasmr --platform fansly
"""

import argparse
import hashlib
import os
import re
import shutil
import sqlite3
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from io import BytesIO

# Add project root to path
sys.path.insert(0, '/opt/media-downloader')

from modules.unified_database import UnifiedDatabase


def get_file_hash(file_path: Path) -> str:
    """Compute SHA256 hash of file"""
    sha256 = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            sha256.update(chunk)
    return sha256.hexdigest()


def generate_thumbnail(file_path: Path, file_type: str) -> bytes:
    """Generate thumbnail for image or video"""
    try:
        if file_type == 'image':
            from PIL import Image
            img = Image.open(file_path)
            img.thumbnail((400, 400), Image.Resampling.LANCZOS)
            if img.mode in ('RGBA', 'P'):
                img = img.convert('RGB')
            buffer = BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return buffer.getvalue()
        elif file_type == 'video':
            # Use ffmpeg to extract a frame
            result = subprocess.run([
                'ffmpeg', '-i', str(file_path),
                '-ss', '00:00:01',  # 1 second in
                '-vframes', '1',
                '-vf', 'scale=400:-1',
                '-f', 'image2pipe',
                '-vcodec', 'mjpeg',
                '-'
            ], capture_output=True, timeout=30)
            if result.returncode == 0 and result.stdout:
                return result.stdout
    except Exception as e:
        print(f"    Warning: Failed to generate thumbnail: {e}")
    return None


def get_file_type(filename: str) -> str:
    """Determine file type from extension"""
    ext = Path(filename).suffix.lower()
    if ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']:
        return 'image'
    elif ext in ['.mp4', '.webm', '.mov', '.avi', '.mkv', '.m4v']:
        return 'video'
    else:
        return 'other'


def sanitize_filename(name: str) -> str:
    """Sanitize string for use in filename/directory"""
    name = re.sub(r'[<>:"/\\|?*]', '', name)
    name = re.sub(r'\s+', '-', name.strip())
    return name or 'unnamed'


def main():
    parser = argparse.ArgumentParser(description='Backfill paid content from existing files')
    parser.add_argument('source_dir', help='Source directory containing downloaded files')
    parser.add_argument('--creator', required=True, help='Creator username')
    parser.add_argument('--platform', required=True, help='Platform (fansly, onlyfans, etc.)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--limit', type=int, help='Limit number of posts to process')
    args = parser.parse_args()

    source_dir = Path(args.source_dir)
    if not source_dir.exists():
        print(f"Error: Source directory does not exist: {source_dir}")
        sys.exit(1)

    # Initialize database
    db = UnifiedDatabase()

    # Get config for base download path
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT base_download_path FROM paid_content_config WHERE id = 1")
        row = cursor.fetchone()
        base_path = Path(row[0] if row else '/opt/immich/paid')

    # Find the creator in database
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT id, username, platform, service_id
            FROM paid_content_creators
            WHERE LOWER(username) = LOWER(?) AND LOWER(platform) = LOWER(?)
        """, (args.creator, args.platform))
        creator = cursor.fetchone()

        if not creator:
            print(f"Error: Creator '{args.creator}' on platform '{args.platform}' not found in database")
            sys.exit(1)

        creator_id, username, platform, service_id = creator
        print(f"Found creator: {username} ({platform}) - ID: {creator_id}")

    # Scan source directory for post folders
    post_folders = [d for d in source_dir.iterdir() if d.is_dir() and d.name.isdigit()]
    print(f"Found {len(post_folders)} post folders in source directory")

    if args.limit:
        post_folders = post_folders[:args.limit]
        print(f"Limited to {args.limit} posts")

    # Stats
    stats = {
        'posts_found': 0,
        'posts_matched': 0,
        'files_copied': 0,
        'files_skipped': 0,
        'thumbnails_generated': 0,
        'errors': 0
    }

    for post_folder in post_folders:
        post_id = post_folder.name

        # Find post in database
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, title, published_at
                FROM paid_content_posts
                WHERE creator_id = ? AND post_id = ?
            """, (creator_id, post_id))
            post = cursor.fetchone()

        if not post:
            # Try partial match (post_id might be truncated in DB)
            with db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT id, title, published_at, post_id
                    FROM paid_content_posts
                    WHERE creator_id = ? AND post_id LIKE ?
                """, (creator_id, f"{post_id[:12]}%"))
                post = cursor.fetchone()
                if post:
                    post_id = post[3]  # Use the full post_id from DB

        if not post:
            print(f"  Post {post_id}: Not found in database, skipping")
            continue

        post_db_id, post_title, published_at = post[0], post[1], post[2]
        stats['posts_matched'] += 1

        # Build destination directory - matches scraper's _build_file_path structure
        # Format: /base/platform/username/date/post_id/
        post_date = published_at[:10] if published_at else 'unknown-date'
        post_dir_name = post_id  # Just post_id, no prefix
        dest_dir = base_path / platform / sanitize_filename(username) / post_date / post_dir_name

        print(f"  Post {post_id}: {post_title or '(no title)'}")
        print(f"    -> {dest_dir}")

        # Get attachments for this post
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, name, server_path, status, local_path, attachment_index
                FROM paid_content_attachments
                WHERE post_id = ?
                ORDER BY attachment_index
            """, (post_db_id,))
            attachments = cursor.fetchall()

        # Scan files in source folder
        source_files = list(post_folder.iterdir())
        source_files = [f for f in source_files if f.is_file()]

        print(f"    Found {len(source_files)} files, {len(attachments)} attachments in DB")

        for att in attachments:
            att_id, att_name, server_path, status, local_path, att_index = att

            # Skip if already completed with valid local_path
            if status == 'completed' and local_path and Path(local_path).exists():
                print(f"      [{att_index}] Already downloaded: {att_name}")
                stats['files_skipped'] += 1
                continue

            # Try to find matching file in source
            # Files might be named with attachment ID or just the filename
            matching_file = None

            # Extract potential file ID from server_path or name
            if server_path:
                # Server path like /27/37/2737100bd05f040ae0a0b10c452be9efdf54816577e53775b96b035eac200cde.jpg
                server_filename = Path(server_path).stem  # Get hash without extension

            for src_file in source_files:
                src_stem = src_file.stem
                src_name = src_file.name

                # Match by various patterns
                if att_name and src_name == att_name:
                    matching_file = src_file
                    break
                if att_name and src_stem == Path(att_name).stem:
                    matching_file = src_file
                    break
                # Match by attachment ID in filename (Fansly style: 286246551964098560.png)
                if src_stem.isdigit():
                    # Could be attachment ID
                    if att_name and src_stem in att_name:
                        matching_file = src_file
                        break

            if not matching_file:
                # Try to match by index
                if att_index < len(source_files):
                    # Sort source files and pick by index
                    sorted_files = sorted(source_files, key=lambda f: f.name)
                    matching_file = sorted_files[att_index]
                    print(f"      [{att_index}] Matched by index: {matching_file.name}")

            if not matching_file:
                print(f"      [{att_index}] No matching file found for: {att_name}")
                stats['errors'] += 1
                continue

            # Determine file type and extension
            file_type = get_file_type(matching_file.name)
            ext = matching_file.suffix or Path(att_name).suffix if att_name else '.bin'

            # Build destination filename - matches scraper's _build_file_path
            # Fansly uses just media ID (unique), other platforms use index prefix
            if att_name:
                sanitized_name = sanitize_filename(att_name)
                # Ensure extension is preserved
                if not sanitized_name.lower().endswith(ext.lower()):
                    sanitized_name = Path(att_name).stem + ext
                dest_filename = sanitized_name  # Fansly: no index prefix needed
            else:
                # Fallback to source filename
                dest_filename = matching_file.name

            dest_path = dest_dir / dest_filename

            print(f"      [{att_index}] {matching_file.name} -> {dest_filename}")

            if args.dry_run:
                stats['files_copied'] += 1
                continue

            # Create destination directory
            dest_dir.mkdir(parents=True, exist_ok=True)

            # Copy file
            try:
                shutil.copy2(matching_file, dest_path)
                stats['files_copied'] += 1
            except Exception as e:
                print(f"        Error copying file: {e}")
                stats['errors'] += 1
                continue

            # Compute file hash
            file_hash = get_file_hash(dest_path)
            file_size = dest_path.stat().st_size

            # Generate thumbnail
            thumbnail_data = generate_thumbnail(dest_path, file_type)
            if thumbnail_data:
                stats['thumbnails_generated'] += 1

            # Update database
            now = datetime.now().isoformat()
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    UPDATE paid_content_attachments
                    SET status = 'completed',
                        local_path = ?,
                        local_filename = ?,
                        file_hash = ?,
                        file_size = ?,
                        file_type = ?,
                        downloaded_at = ?,
                        thumbnail_data = ?
                    WHERE id = ?
                """, (str(dest_path), dest_filename, file_hash, file_size, file_type, now, thumbnail_data, att_id))
                conn.commit()

        # Update post downloaded status
        if not args.dry_run:
            with db.get_connection(for_write=True) as conn:
                cursor = conn.cursor()
                # Check if all attachments are now completed
                cursor.execute("""
                    SELECT COUNT(*) FROM paid_content_attachments
                    WHERE post_id = ? AND status != 'completed'
                """, (post_db_id,))
                pending = cursor.fetchone()[0]

                if pending == 0:
                    cursor.execute("""
                        UPDATE paid_content_posts
                        SET downloaded = 1, download_date = ?
                        WHERE id = ?
                    """, (datetime.now().isoformat(), post_db_id))
                    conn.commit()

        stats['posts_found'] += 1

    # Print summary
    print("\n" + "=" * 50)
    print("BACKFILL SUMMARY")
    print("=" * 50)
    print(f"Posts found in source:    {len(post_folders)}")
    print(f"Posts matched in DB:      {stats['posts_matched']}")
    print(f"Files copied:             {stats['files_copied']}")
    print(f"Files skipped (existing): {stats['files_skipped']}")
    print(f"Thumbnails generated:     {stats['thumbnails_generated']}")
    print(f"Errors:                   {stats['errors']}")

    if args.dry_run:
        print("\n(Dry run - no changes made)")


if __name__ == '__main__':
    main()