#!/usr/bin/env python3
"""
Dry-run test of Instagram Perceptual Duplicate Detection

Scans last 3 days of downloads and reports what would be considered
duplicates WITHOUT actually moving or deleting anything.
"""
import sys
from pathlib import Path

# Make the project root importable when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

import json
from collections import defaultdict
from datetime import datetime, timedelta

from modules.unified_database import UnifiedDatabase
from modules.instagram_perceptual_duplicate_detector import InstagramPerceptualDuplicateDetector


class DryRunLogger:
    """Logger that captures all messages while echoing them to stdout."""

    def __init__(self):
        # (level, message) tuples in the order received.
        self.messages = []

    def __call__(self, msg, level="info"):
        # Default level so callbacks that pass only a message don't raise
        # TypeError (backward-compatible: two-argument calls are unchanged).
        self.messages.append((level, msg))
        print(f"[{level.upper()}] {msg}")


def _fetch_recent_files(db):
    """Return Instagram downloads from the last 3 days that still exist on disk.

    Rows whose paths were already routed through the phrase-check or
    old-post pipelines are excluded in SQL; rows whose file is missing
    on disk are filtered out here.
    """
    files = []
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT filename, source, file_path, file_hash, download_date, content_type
            FROM downloads
            WHERE platform = 'instagram'
            AND download_date > datetime('now', '-3 days')
            AND file_path IS NOT NULL
            AND file_path NOT LIKE '%_phrase_checked_%'
            AND file_path NOT LIKE '%_old_post_%'
            ORDER BY source, download_date
        """)
        for row in cursor.fetchall():
            if row[2] and Path(row[2]).exists():  # Only include files that exist
                files.append({
                    'filename': row[0],
                    'source': row[1],
                    'file_path': row[2],
                    'file_hash': row[3],
                    'download_date': row[4],
                    'content_type': row[5] or 'unknown',
                })
    return files


def _analyze_files(detector, files, settings):
    """Hash, text-scan and score every file; return per-file analysis dicts.

    Files whose perceptual hash cannot be computed are skipped (and
    therefore absent from the returned list).
    """
    print("Analyzing files...")
    print("-" * 80)
    file_data = []
    for i, file_info in enumerate(files, 1):
        file_path = file_info['file_path']
        print(f"\n[{i}/{len(files)}] Processing: {Path(file_path).name}")

        # Perceptual hash is the clustering key; without it the file
        # cannot participate in duplicate detection at all.
        phash = detector._calculate_perceptual_hash(file_path)
        if not phash:
            print(" ⚠️ Could not calculate perceptual hash - skipping")
            continue

        if settings['text_detection_enabled']:
            text_count, text_chars = detector._detect_text_overlays(file_path)
        else:
            text_count, text_chars = 0, 0

        quality_metrics = detector._get_quality_metrics(file_path)
        clean_score = detector._calculate_clean_score(text_count, text_chars)
        quality_score = detector._calculate_quality_score(quality_metrics)
        # Weighted ranking score; computed once and reused for display,
        # sorting and reporting.
        total_score = (clean_score * settings['clean_score_weight']) + (quality_score * settings['quality_score_weight'])

        print(f" Hash: {phash[:16]}...")
        print(f" Text overlays: {text_count} regions, {text_chars} chars")
        print(f" Resolution: {quality_metrics['width']}x{quality_metrics['height']}")
        print(f" File size: {quality_metrics['file_size'] / 1024 / 1024:.1f} MB")
        print(f" Clean score: {clean_score:.1f}/100")
        print(f" Quality score: {quality_score:.1f}/100")
        print(f" Total score: {total_score:.1f}")

        file_data.append({
            'file_info': file_info,
            'phash': phash,
            'text_count': text_count,
            'text_chars': text_chars,
            'clean_score': clean_score,
            'quality_score': quality_score,
            'quality_metrics': quality_metrics,
            'total_score': total_score,
        })
    return file_data


def _find_duplicate_groups(detector, file_data, settings):
    """Greedily cluster same-source files with near-identical perceptual hashes.

    Each not-yet-processed file seeds a group; every later unprocessed file
    from the same source whose Hamming distance to the SEED's hash is within
    the threshold joins that group.  Only groups with more than one member
    are returned, each sorted best-first by total score.
    """
    duplicates = []
    processed = set()
    threshold = settings['perceptual_hash_threshold']
    for i, seed in enumerate(file_data):
        if i in processed:
            continue
        group = [seed]
        for j in range(i + 1, len(file_data)):
            if j in processed:
                continue
            candidate = file_data[j]
            # Duplicates are only meaningful within the same source account.
            if seed['file_info']['source'] != candidate['file_info']['source']:
                continue
            distance = detector._hamming_distance(seed['phash'], candidate['phash'])
            if distance <= threshold:
                group.append(candidate)
                processed.add(j)
        if len(group) > 1:
            # Sort by total score (highest first) — group[0] becomes "best".
            group.sort(key=lambda x: x['total_score'], reverse=True)
            duplicates.append(group)
        processed.add(i)
    return duplicates


def _report_groups(detector, duplicates, settings):
    """Print keep/remove detail for every duplicate group; return removal count."""
    total_would_remove = 0
    for group_num, group in enumerate(duplicates, 1):
        print(f"\n{'=' * 80}")
        print(f"DUPLICATE GROUP #{group_num}")
        print(f"{'=' * 80}")
        print(f"Source: {group[0]['file_info']['source']}")
        print(f"Files in group: {len(group)}")
        print()

        best = group[0]
        print("✅ WOULD KEEP:")
        print(f" File: {Path(best['file_info']['file_path']).name}")
        print(f" Path: {best['file_info']['file_path']}")
        print(f" Clean score: {best['clean_score']:.1f}/100 ({best['text_count']} text regions)")
        print(f" Quality score: {best['quality_score']:.1f}/100 ({best['quality_metrics']['width']}x{best['quality_metrics']['height']}, {best['quality_metrics']['file_size']/1024/1024:.1f}MB)")
        print(f" Total score: {best['total_score']:.1f}")
        print(f" Download date: {best['file_info']['download_date']}")
        print()

        print(f"❌ WOULD REMOVE ({len(group)-1} file(s)):")
        for data in group[1:]:
            total_would_remove += 1
            print(f"\n File: {Path(data['file_info']['file_path']).name}")
            print(f" Path: {data['file_info']['file_path']}")
            print(f" Clean score: {data['clean_score']:.1f}/100 ({data['text_count']} text regions)")
            print(f" Quality score: {data['quality_score']:.1f}/100 ({data['quality_metrics']['width']}x{data['quality_metrics']['height']}, {data['quality_metrics']['file_size']/1024/1024:.1f}MB)")
            print(f" Total score: {data['total_score']:.1f}")
            print(f" Download date: {data['file_info']['download_date']}")

            # Distance is reported against the group's best file, which may
            # differ from the seed the group was built around.
            distance = detector._hamming_distance(best['phash'], data['phash'])
            print(f" Hash distance from best: {distance}")

            # Explain why this copy loses to the kept one.
            reasons = []
            if data['clean_score'] < best['clean_score'] - settings['min_text_difference']:
                reasons.append(f"Has more text overlays ({data['text_count']} vs {best['text_count']})")
            if data['quality_score'] < best['quality_score']:
                reasons.append(f"Lower quality ({data['quality_metrics']['width']}x{data['quality_metrics']['height']} vs {best['quality_metrics']['width']}x{best['quality_metrics']['height']})")
            if data['total_score'] < best['total_score']:
                reasons.append(f"Lower total score ({data['total_score']:.1f} vs {best['total_score']:.1f})")
            if reasons:
                print(f" Reason(s): {'; '.join(reasons)}")
    return total_would_remove


def main():
    """Run the dry-run analysis end to end and print a full report.

    Read-only: nothing is moved or deleted; all results go to stdout.
    """
    print("=" * 80)
    print("INSTAGRAM PERCEPTUAL DUPLICATE DETECTION - DRY RUN")
    print("=" * 80)
    print()

    # Initialize database
    db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
    db = UnifiedDatabase(str(db_path))

    files = _fetch_recent_files(db)
    print(f"Found {len(files)} Instagram files from last 3 days that exist on disk")
    print()
    if not files:
        print("No files to analyze!")
        return

    # Initialize detector
    logger = DryRunLogger()
    detector = InstagramPerceptualDuplicateDetector(
        unified_db=db,
        log_callback=logger
    )

    # Get settings (will use defaults since feature is disabled)
    settings = {
        'enabled': False,
        'perceptual_hash_threshold': 12,
        'text_detection_enabled': True,
        'clean_score_weight': 3,
        'quality_score_weight': 1,
        'min_text_difference': 5,
    }
    print("Using settings:")
    print(f" - Perceptual hash threshold: {settings['perceptual_hash_threshold']}")
    print(f" - Clean score weight: {settings['clean_score_weight']}")
    print(f" - Quality score weight: {settings['quality_score_weight']}")
    print(f" - Text detection: {'Enabled' if settings['text_detection_enabled'] else 'Disabled'}")
    print()

    file_data = _analyze_files(detector, files, settings)

    print()
    print("=" * 80)
    print("DUPLICATE DETECTION ANALYSIS")
    print("=" * 80)
    print()

    duplicates = _find_duplicate_groups(detector, file_data, settings)
    if not duplicates:
        print("✅ No perceptual duplicates found!")
        print()
        print("All files are unique or sufficiently different.")
        return

    print(f"Found {len(duplicates)} duplicate group(s):")
    print()

    total_would_remove = _report_groups(detector, duplicates, settings)

    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total files analyzed: {len(file_data)}")
    print(f"Duplicate groups found: {len(duplicates)}")
    # One file is kept per group, so kept count == group count.
    print(f"Files that would be kept: {len(duplicates)}")
    print(f"Files that would be removed: {total_would_remove}")
    print()
    print("⚠️ NOTE: This is a DRY RUN - no files were actually moved or deleted!")
    print(" To enable this feature, set 'enabled: true' in Configuration > Instagram Perceptual Duplicate Detection")
    print()


if __name__ == '__main__':
    main()