#!/usr/bin/env python3
"""
Dry-run test of Instagram Perceptual Duplicate Detection.

Scans the last 3 days of downloads and reports what would be considered
duplicates WITHOUT actually moving or deleting anything.
"""

import sys
from pathlib import Path

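# Make the project root (one level above this script's directory) importable so the
# `modules` package imports below resolve when the script is run directly.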
sys.path.insert(0, str(Path(__file__).parent.parent))

from modules.unified_database import UnifiedDatabase
from modules.instagram_perceptual_duplicate_detector import InstagramPerceptualDuplicateDetector

import json
from datetime import datetime, timedelta
from collections import defaultdict

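
# Used below as the detector's log_callback; records every message and echoes it to stdout.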
class DryRunLogger:
    """Log callback that records every message and echoes it to stdout."""

    def __init__(self):
        self.messages = []

    def __call__(self, msg, level):
        self.messages.append((level, msg))
        print(f"[{level.upper()}] {msg}")


def main():
    print("=" * 80)
    print("INSTAGRAM PERCEPTUAL DUPLICATE DETECTION - DRY RUN")
    print("=" * 80)
    print()

    # Initialize database
    db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
    db = UnifiedDatabase(str(db_path))

    # Get all Instagram files from the last 3 days
    with db.get_connection() as conn:
        cursor = conn.cursor()
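        # The NOT LIKE filters skip files whose paths carry the '_phrase_checked_' or
        # '_old_post_' markers (presumably files already routed elsewhere by other
        # checks), so they are never re-evaluated by this dry run.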
        cursor.execute("""
            SELECT
                filename,
                source,
                file_path,
                file_hash,
                download_date,
                content_type
            FROM downloads
            WHERE platform = 'instagram'
              AND download_date > datetime('now', '-3 days')
              AND file_path IS NOT NULL
              AND file_path NOT LIKE '%_phrase_checked_%'
              AND file_path NOT LIKE '%_old_post_%'
            ORDER BY source, download_date
        """)

        files = []
        for row in cursor.fetchall():
            if row[2] and Path(row[2]).exists():  # Only include files that still exist on disk
                files.append({
                    'filename': row[0],
                    'source': row[1],
                    'file_path': row[2],
                    'file_hash': row[3],
                    'download_date': row[4],
                    'content_type': row[5] or 'unknown'
                })

    print(f"Found {len(files)} Instagram files from the last 3 days that exist on disk")
    print()

    if len(files) == 0:
        print("No files to analyze!")
        return

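    # Dry run: only the detector's private hashing/scoring helpers are called below;
    # this script itself never moves or deletes any files.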
    # Initialize detector
    logger = DryRunLogger()
    detector = InstagramPerceptualDuplicateDetector(
        unified_db=db,
        log_callback=logger
    )

    # Default settings, hard-coded here because the feature is disabled in configuration
    settings = {
        'enabled': False,
        'perceptual_hash_threshold': 12,
        'text_detection_enabled': True,
        'clean_score_weight': 3,
        'quality_score_weight': 1,
        'min_text_difference': 5
    }
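    # perceptual_hash_threshold: maximum Hamming distance at which two images are grouped as duplicates.
    # clean_score_weight / quality_score_weight: weights applied when computing the total score below.
    # min_text_difference: margin the clean scores must differ by before "more text overlays" is reported.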

    print("Using settings:")
    print(f"  - Perceptual hash threshold: {settings['perceptual_hash_threshold']}")
    print(f"  - Clean score weight: {settings['clean_score_weight']}")
    print(f"  - Quality score weight: {settings['quality_score_weight']}")
    print(f"  - Text detection: {'Enabled' if settings['text_detection_enabled'] else 'Disabled'}")
    print()

    # Process each file and collect data
    print("Analyzing files...")
    print("-" * 80)

    file_data = []

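    # For each file: compute the perceptual hash, detect text overlays, gather quality
    # metrics, then combine the clean and quality scores using the weights above.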
    for i, file_info in enumerate(files, 1):
        file_path = file_info['file_path']
        source = file_info['source']

        print(f"\n[{i}/{len(files)}] Processing: {Path(file_path).name}")

        # Calculate perceptual hash
        phash = detector._calculate_perceptual_hash(file_path)
        if not phash:
            print("  ⚠️ Could not calculate perceptual hash - skipping")
            continue

        # Detect text overlays
        if settings['text_detection_enabled']:
            text_count, text_chars = detector._detect_text_overlays(file_path)
        else:
            text_count, text_chars = 0, 0

        # Get quality metrics
        quality_metrics = detector._get_quality_metrics(file_path)

        # Calculate scores
        clean_score = detector._calculate_clean_score(text_count, text_chars)
        quality_score = detector._calculate_quality_score(quality_metrics)
        total_score = (clean_score * settings['clean_score_weight']) + (quality_score * settings['quality_score_weight'])

        print(f"  Hash: {phash[:16]}...")
        print(f"  Text overlays: {text_count} regions, {text_chars} chars")
        print(f"  Resolution: {quality_metrics['width']}x{quality_metrics['height']}")
        print(f"  File size: {quality_metrics['file_size'] / 1024 / 1024:.1f} MB")
        print(f"  Clean score: {clean_score:.1f}/100")
        print(f"  Quality score: {quality_score:.1f}/100")
        print(f"  Total score: {total_score:.1f}")

        file_data.append({
            'file_info': file_info,
            'phash': phash,
            'text_count': text_count,
            'text_chars': text_chars,
            'clean_score': clean_score,
            'quality_score': quality_score,
            'quality_metrics': quality_metrics,
            'total_score': total_score
        })

    print()
    print("=" * 80)
    print("DUPLICATE DETECTION ANALYSIS")
    print("=" * 80)
    print()

    # Find duplicates by comparing perceptual hashes
    duplicates = []
    processed = set()

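    # Greedy grouping: for each unvisited file, pull in every later file from the same
    # source whose perceptual-hash Hamming distance is within the threshold. Each group
    # is sorted by total score so that group[0] is the file that would be kept.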
    for i, data1 in enumerate(file_data):
        if i in processed:
            continue

        group = [data1]

        for j, data2 in enumerate(file_data[i+1:], start=i+1):
            if j in processed:
                continue

            # Same source only
            if data1['file_info']['source'] != data2['file_info']['source']:
                continue

            # Calculate Hamming distance
            distance = detector._hamming_distance(data1['phash'], data2['phash'])

            if distance <= settings['perceptual_hash_threshold']:
                group.append(data2)
                processed.add(j)

        if len(group) > 1:
            # Sort by total score (highest first)
            group.sort(key=lambda x: x['total_score'], reverse=True)
            duplicates.append(group)

        processed.add(i)

    if len(duplicates) == 0:
        print("✅ No perceptual duplicates found!")
        print()
        print("All files are unique or sufficiently different.")
        return

    print(f"Found {len(duplicates)} duplicate group(s):")
    print()

    total_would_remove = 0

    for group_num, group in enumerate(duplicates, 1):
        print(f"\n{'=' * 80}")
        print(f"DUPLICATE GROUP #{group_num}")
        print(f"{'=' * 80}")
        print(f"Source: {group[0]['file_info']['source']}")
        print(f"Files in group: {len(group)}")
        print()

        best = group[0]
        print("✅ WOULD KEEP:")
        print(f"   File: {Path(best['file_info']['file_path']).name}")
        print(f"   Path: {best['file_info']['file_path']}")
        print(f"   Clean score: {best['clean_score']:.1f}/100 ({best['text_count']} text regions)")
        print(f"   Quality score: {best['quality_score']:.1f}/100 ({best['quality_metrics']['width']}x{best['quality_metrics']['height']}, {best['quality_metrics']['file_size']/1024/1024:.1f}MB)")
        print(f"   Total score: {best['total_score']:.1f}")
        print(f"   Download date: {best['file_info']['download_date']}")
        print()

        print(f"❌ WOULD REMOVE ({len(group)-1} file(s)):")
        for data in group[1:]:
            total_would_remove += 1
            print(f"\n   File: {Path(data['file_info']['file_path']).name}")
            print(f"   Path: {data['file_info']['file_path']}")
            print(f"   Clean score: {data['clean_score']:.1f}/100 ({data['text_count']} text regions)")
            print(f"   Quality score: {data['quality_score']:.1f}/100 ({data['quality_metrics']['width']}x{data['quality_metrics']['height']}, {data['quality_metrics']['file_size']/1024/1024:.1f}MB)")
            print(f"   Total score: {data['total_score']:.1f}")
            print(f"   Download date: {data['file_info']['download_date']}")

            # Calculate hash distance from the kept file
            distance = detector._hamming_distance(best['phash'], data['phash'])
            print(f"   Hash distance from best: {distance}")

            # Explain why it would be removed; min_text_difference acts as a margin so
            # small clean-score gaps are not reported as "more text overlays"
            reasons = []
            if data['clean_score'] < best['clean_score'] - settings['min_text_difference']:
                reasons.append(f"Has more text overlays ({data['text_count']} vs {best['text_count']})")
            if data['quality_score'] < best['quality_score']:
                reasons.append(f"Lower quality ({data['quality_metrics']['width']}x{data['quality_metrics']['height']} vs {best['quality_metrics']['width']}x{best['quality_metrics']['height']})")
            if data['total_score'] < best['total_score']:
                reasons.append(f"Lower total score ({data['total_score']:.1f} vs {best['total_score']:.1f})")

            if reasons:
                print(f"   Reason(s): {'; '.join(reasons)}")

    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total files analyzed: {len(file_data)}")
    print(f"Duplicate groups found: {len(duplicates)}")
    print(f"Files that would be kept (one per group): {len(duplicates)}")
    print(f"Files that would be removed: {total_would_remove}")
    print()
    print("⚠️ NOTE: This is a DRY RUN - no files were actually moved or deleted!")
    print("   To enable this feature, set 'enabled: true' in Configuration > Instagram Perceptual Duplicate Detection")
    print()


if __name__ == '__main__':
    main()