Files
media-downloader/tests/test_perceptual_with_sources.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

286 lines
9.6 KiB
Python

#!/usr/bin/env python3
"""
Perceptual Duplicate Detection with Proper Source Mapping
Maps UUID filenames from recycle bin back to original Instagram sources
using the media-downloader's recycle_bin database.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from modules.unified_database import UnifiedDatabase
from modules.instagram_perceptual_duplicate_detector import InstagramPerceptualDuplicateDetector
import sqlite3
import json
from datetime import datetime
from collections import defaultdict
import re
def get_source_mapping(backup_db_path):
    """Map UUID recycle-bin file paths back to their original Instagram sources.

    Reads the ``recycle_bin`` table of the backup database and, for every file
    deleted within the last 3 days, derives the source account name from the
    Instagram filename convention ``source_YYYYMMDD_id.ext``.

    Args:
        backup_db_path: Path to the SQLite database containing ``recycle_bin``.

    Returns:
        dict keyed by recycle path, each value a dict with keys
        ``source`` ('unknown' when the filename does not match the pattern),
        ``original_filename``, ``original_path`` and ``deleted_at``.
    """
    print("Loading recycle bin source mappings...")
    conn = sqlite3.connect(backup_db_path)
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        cursor.execute("""
            SELECT
                recycle_path,
                original_filename,
                original_path,
                deleted_at
            FROM recycle_bin
            WHERE deleted_at > datetime('now', '-3 days')
        """)
        mapping = {}
        for row in cursor.fetchall():
            recycle_path = row['recycle_path']
            original_filename = row['original_filename']
            # Extract source from Instagram filename pattern: source_date_id.ext
            source = 'unknown'
            match = re.match(r'^([a-z0-9._]+)_\d{8}', original_filename.lower())
            if match:
                source = match.group(1)
            mapping[recycle_path] = {
                'source': source,
                'original_filename': original_filename,
                'original_path': row['original_path'],
                'deleted_at': row['deleted_at']
            }
    finally:
        # Always release the connection — the original leaked it when
        # the query raised (e.g. missing table / locked database).
        conn.close()
    print(f" Mapped {len(mapping)} recycled files to original sources")
    return mapping
def main():
    """Scan recent Instagram media (active downloads + recycle bin) for
    perceptual duplicates grouped per source account and print a report.

    NOTE(review): this is a dry-run report only — it prints what "would be
    removed" and never deletes anything.
    """
    print("=" * 80)
    print("INSTAGRAM PERCEPTUAL DUPLICATES - WITH SOURCE MAPPING")
    print("=" * 80)
    print()
    # Load source mapping from recycle bin database
    backup_db = Path(__file__).parent.parent / 'data' / 'backup_cache.db'
    source_mapping = get_source_mapping(str(backup_db))
    # Load comprehensive scan results
    db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
    db = UnifiedDatabase(str(db_path))
    # Get all files
    print("\nCollecting Instagram files...")
    all_files = {}  # file path -> {'source', 'filename', 'file_path', 'location'}
    # Database files: active Instagram downloads from the last 3 days,
    # excluding paths already tagged by other tooling.
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT filename, source, file_path, download_date
            FROM downloads
            WHERE platform = 'instagram'
            AND download_date > datetime('now', '-3 days')
            AND file_path IS NOT NULL
            AND file_path NOT LIKE '%_phrase_checked_%'
            AND file_path NOT LIKE '%_old_post_%'
        """)
        for row in cursor.fetchall():
            # Ignore DB rows whose file no longer exists on disk
            if Path(row[2]).exists():
                all_files[row[2]] = {
                    'source': row[1],
                    'filename': row[0],
                    'file_path': row[2],
                    'location': 'active'
                }
    # Recycle bin files with proper source mapping
    recycle_path = Path('/opt/immich/recycle')
    if recycle_path.exists():
        # Same 3-day window as the DB query, but based on file mtime
        cutoff = datetime.now().timestamp() - (3 * 24 * 60 * 60)
        for ext in ['*.mp4', '*.jpg', '*.jpeg', '*.webp', '*.png', '*.heic']:
            for file_path in recycle_path.rglob(ext):
                if file_path.stat().st_mtime > cutoff:
                    file_path_str = str(file_path)
                    # Look up source from mapping
                    source_info = source_mapping.get(file_path_str, {})
                    source = source_info.get('source', 'unknown')
                    original_filename = source_info.get('original_filename', file_path.name)
                    all_files[file_path_str] = {
                        'source': source,
                        'filename': original_filename,
                        'file_path': file_path_str,
                        'location': 'recycle_bin'
                    }
    print(f"Total files to analyze: {len(all_files)}")
    print()
    # Initialize detector
    detector = InstagramPerceptualDuplicateDetector(
        unified_db=db,
        log_callback=lambda msg, lvl: None  # Suppress logs
    )
    # Analyze files: perceptual hash + text-overlay and quality scores per file
    print("Analyzing files (this may take a while)...")
    file_data = []
    for i, (path, info) in enumerate(all_files.items(), 1):
        if i % 50 == 0:
            print(f" Progress: {i}/{len(all_files)}...")
        phash = detector._calculate_perceptual_hash(path)
        if not phash:
            # No hash produced — skip this file entirely
            continue
        text_count, text_chars = detector._detect_text_overlays(path)
        quality_metrics = detector._get_quality_metrics(path)
        clean_score = detector._calculate_clean_score(text_count, text_chars)
        quality_score = detector._calculate_quality_score(quality_metrics)
        file_data.append({
            'info': info,
            'phash': phash,
            'text_count': text_count,
            'text_chars': text_chars,
            'clean_score': clean_score,
            'quality_score': quality_score,
            'quality_metrics': quality_metrics,
            # Ranking weight: cleanliness counts 3x as much as quality
            'total_score': (clean_score * 3) + (quality_score * 1)
        })
    print(f"Analyzed {len(file_data)} files")
    print()
    # Find duplicates by source
    print("=" * 80)
    print("DUPLICATE DETECTION BY SOURCE")
    print("=" * 80)
    print()
    # Group by source first
    by_source = defaultdict(list)
    for data in file_data:
        by_source[data['info']['source']].append(data)
    # Find duplicates within each source (never compared across sources)
    duplicate_groups = []
    for source, files in by_source.items():
        if source == 'unknown' or len(files) < 2:
            continue
        processed = set()  # indices already assigned to a duplicate group
        for i, data1 in enumerate(files):
            if i in processed:
                continue
            group = [data1]
            for j, data2 in enumerate(files[i+1:], start=i+1):
                if j in processed:
                    continue
                distance = detector._hamming_distance(data1['phash'], data2['phash'])
                if distance <= 12:  # threshold
                    group.append(data2)
                    processed.add(j)
            if len(group) > 1:
                # Best file first: highest combined clean+quality score
                group.sort(key=lambda x: x['total_score'], reverse=True)
                duplicate_groups.append((source, group))
                processed.add(i)
    if len(duplicate_groups) == 0:
        print("✅ No duplicates found (excluding 'unknown' sources)")
        return
    # Report by source
    print(f"Found {len(duplicate_groups)} duplicate group(s) across {len(set(s for s, _ in duplicate_groups))} sources")
    print()
    # Group by source for reporting
    by_source_report = defaultdict(list)
    for source, group in duplicate_groups:
        by_source_report[source].append(group)
    total_would_remove = 0
    total_size_freed = 0  # bytes
    for source in sorted(by_source_report.keys()):
        groups = by_source_report[source]
        print(f"\n{'=' * 80}")
        print(f"SOURCE: @{source}")
        print(f"{'=' * 80}")
        print(f"Duplicate groups: {len(groups)}")
        print()
        for group_num, group in enumerate(groups, 1):
            print(f"\n Group {group_num} ({len(group)} files):")
            print(f" {'-' * 76}")
            # First entry is the keeper (groups were sorted best-first)
            best = group[0]
            print(f" ✅ KEEP: {best['info']['filename'][:60]}")
            print(f" Location: {best['info']['location']}")
            print(f" Clean: {best['clean_score']:.0f}/100 ({best['text_count']} text), Quality: {best['quality_score']:.0f}/100")
            print(f" Resolution: {best['quality_metrics']['width']}x{best['quality_metrics']['height']}, Size: {best['quality_metrics']['file_size']/1024/1024:.1f}MB")
            print()
            for data in group[1:]:
                total_would_remove += 1
                total_size_freed += data['quality_metrics']['file_size']
                distance = detector._hamming_distance(best['phash'], data['phash'])
                print(f" ❌ REMOVE: {data['info']['filename'][:60]}")
                print(f" Location: {data['info']['location']}")
                print(f" Clean: {data['clean_score']:.0f}/100 ({data['text_count']} text), Quality: {data['quality_score']:.0f}/100")
                print(f" Hash distance: {distance}")
                # Explain why this copy loses to the keeper
                reasons = []
                if data['clean_score'] < best['clean_score'] - 5:
                    reasons.append(f"More text ({data['text_count']} vs {best['text_count']})")
                if data['quality_score'] < best['quality_score']:
                    reasons.append("Lower quality")
                if reasons:
                    print(f" Reason: {', '.join(reasons)}")
                print()
    print()
    print("=" * 80)
    print("SUMMARY BY SOURCE")
    print("=" * 80)
    # Per-source totals, ordered by number of removable files
    source_stats = defaultdict(lambda: {'groups': 0, 'would_remove': 0})
    for source, group in duplicate_groups:
        source_stats[source]['groups'] += 1
        source_stats[source]['would_remove'] += len(group) - 1
    print()
    for source in sorted(source_stats.keys(), key=lambda s: source_stats[s]['would_remove'], reverse=True):
        stats = source_stats[source]
        print(f" @{source:30s} : {stats['groups']:2d} groups, {stats['would_remove']:3d} files to remove")
    print()
    print("=" * 80)
    print("OVERALL SUMMARY")
    print("=" * 80)
    print(f"Sources with duplicates: {len(source_stats)}")
    print(f"Total duplicate groups: {len(duplicate_groups)}")
    print(f"Files that would be removed: {total_would_remove}")
    print(f"Storage that would be freed: {total_size_freed / 1024 / 1024:.1f} MB")
    print()
# Script entry point: run the full duplicate-detection report.
if __name__ == '__main__':
    main()