#!/usr/bin/env python3
"""
Perceptual Duplicate Detection with Proper Source Mapping

Maps UUID filenames from recycle bin back to original Instagram sources
using the media-downloader's recycle_bin database.
"""

import json
import re
import sqlite3
import sys
from collections import defaultdict
from contextlib import closing
from datetime import datetime
from pathlib import Path

# Make the project root importable before pulling in the project modules.
sys.path.insert(0, str(Path(__file__).parent.parent))

from modules.unified_database import UnifiedDatabase
from modules.instagram_perceptual_duplicate_detector import InstagramPerceptualDuplicateDetector

# Maximum Hamming distance between two perceptual hashes for the files to be
# reported as duplicates of each other.
HASH_DISTANCE_THRESHOLD = 12

# Directory scanned for recently recycled media and the glob patterns used.
RECYCLE_DIR = Path('/opt/immich/recycle')
MEDIA_PATTERNS = ['*.mp4', '*.jpg', '*.jpeg', '*.webp', '*.png', '*.heic']

# Instagram filename pattern: source_date_id.ext.  The source group is
# non-greedy so it stops at the FIRST "_<8 digits>"; a greedy group would
# backtrack to the last such run and swallow the date when the trailing id
# is itself a long number.
_SOURCE_PATTERN = re.compile(r'^([a-z0-9._]+?)_\d{8}')


def _extract_source(original_filename):
    """Return the lower-cased source prefix of a filename, or 'unknown'."""
    if not original_filename:
        # NULL / empty filename in the database: nothing to parse.
        return 'unknown'
    match = _SOURCE_PATTERN.match(original_filename.lower())
    return match.group(1) if match else 'unknown'


def get_source_mapping(backup_db_path):
    """Map UUID recycle filenames to original sources.

    Reads the rows of the ``recycle_bin`` table whose ``deleted_at`` is
    within the last 3 days and derives the source from the original filename.

    Args:
        backup_db_path: Path of the SQLite database holding ``recycle_bin``.

    Returns:
        Dict keyed by ``recycle_path``; each value is a dict with the keys
        ``source``, ``original_filename``, ``original_path`` and
        ``deleted_at``.  ``source`` is ``'unknown'`` when the filename is
        missing or does not match the expected pattern.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    print("Loading recycle bin source mappings...")

    mapping = {}
    # sqlite3's own context manager only commits/rolls back; closing()
    # guarantees the connection is released even when the query raises.
    with closing(sqlite3.connect(backup_db_path)) as conn:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        cursor.execute("""
            SELECT recycle_path, original_filename, original_path, deleted_at
            FROM recycle_bin
            WHERE deleted_at > datetime('now', '-3 days')
        """)

        for row in cursor.fetchall():
            original_filename = row['original_filename']
            mapping[row['recycle_path']] = {
                'source': _extract_source(original_filename),
                'original_filename': original_filename,
                'original_path': row['original_path'],
                'deleted_at': row['deleted_at'],
            }

    print(f" Mapped {len(mapping)} recycled files to original sources")
    return mapping


def _collect_active_files(db):
    """Return recent Instagram downloads from the database that exist on disk."""
    files = {}
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT filename, source, file_path, download_date
            FROM downloads
            WHERE platform = 'instagram'
              AND download_date > datetime('now', '-3 days')
              AND file_path IS NOT NULL
              AND file_path NOT LIKE '%_phrase_checked_%'
              AND file_path NOT LIKE '%_old_post_%'
        """)

        for filename, source, file_path, _download_date in cursor.fetchall():
            if Path(file_path).exists():
                files[file_path] = {
                    'source': source,
                    'filename': filename,
                    'file_path': file_path,
                    'location': 'active',
                }
    return files


def _collect_recycled_files(source_mapping):
    """Return recycle-bin media modified in the last 3 days, with sources resolved."""
    files = {}
    if not RECYCLE_DIR.exists():
        return files

    cutoff = datetime.now().timestamp() - (3 * 24 * 60 * 60)
    for pattern in MEDIA_PATTERNS:
        for file_path in RECYCLE_DIR.rglob(pattern):
            try:
                modified = file_path.stat().st_mtime
            except OSError:
                # File vanished (or is a broken symlink) between the
                # directory walk and the stat call; skip it.
                continue
            if modified <= cutoff:
                continue

            file_path_str = str(file_path)
            # Look up source from mapping; fall back to the on-disk name
            # when the file is unmapped or the stored filename is NULL.
            source_info = source_mapping.get(file_path_str, {})
            files[file_path_str] = {
                'source': source_info.get('source', 'unknown'),
                'filename': source_info.get('original_filename') or file_path.name,
                'file_path': file_path_str,
                'location': 'recycle_bin',
            }
    return files


def _analyze_files(detector, all_files):
    """Hash and score every file; files without a perceptual hash are skipped."""
    file_data = []
    total = len(all_files)
    for i, (path, info) in enumerate(all_files.items(), 1):
        if i % 50 == 0:
            print(f" Progress: {i}/{total}...")

        phash = detector._calculate_perceptual_hash(path)
        if not phash:
            continue

        text_count, text_chars = detector._detect_text_overlays(path)
        quality_metrics = detector._get_quality_metrics(path)
        clean_score = detector._calculate_clean_score(text_count, text_chars)
        quality_score = detector._calculate_quality_score(quality_metrics)

        file_data.append({
            'info': info,
            'phash': phash,
            'text_count': text_count,
            'text_chars': text_chars,
            'clean_score': clean_score,
            'quality_score': quality_score,
            'quality_metrics': quality_metrics,
            # The clean score is weighted three times the quality score.
            'total_score': (clean_score * 3) + (quality_score * 1),
        })
    return file_data


def _find_duplicate_groups(detector, file_data):
    """Return ``(source, group)`` pairs of near-identical files per source.

    Each group is sorted best-first by ``total_score``.  Files whose source
    is ``'unknown'`` are never grouped.
    """
    by_source = defaultdict(list)
    for data in file_data:
        by_source[data['info']['source']].append(data)

    duplicate_groups = []
    for source, files in by_source.items():
        if source == 'unknown' or len(files) < 2:
            continue

        processed = set()
        for i, anchor in enumerate(files):
            if i in processed:
                continue

            # Every later, not-yet-grouped file is compared to the anchor only.
            group = [anchor]
            for j, candidate in enumerate(files[i + 1:], start=i + 1):
                if j in processed:
                    continue
                distance = detector._hamming_distance(anchor['phash'], candidate['phash'])
                if distance <= HASH_DISTANCE_THRESHOLD:
                    group.append(candidate)
                    processed.add(j)

            if len(group) > 1:
                group.sort(key=lambda x: x['total_score'], reverse=True)
                duplicate_groups.append((source, group))
    return duplicate_groups


def _print_report(detector, duplicate_groups):
    """Print the per-source duplicate listing followed by the summaries."""
    by_source_report = defaultdict(list)
    for source, group in duplicate_groups:
        by_source_report[source].append(group)

    print(f"Found {len(duplicate_groups)} duplicate group(s) across {len(by_source_report)} sources")
    print()

    total_would_remove = 0
    total_size_freed = 0

    for source in sorted(by_source_report):
        groups = by_source_report[source]

        print(f"\n{'=' * 80}")
        print(f"SOURCE: @{source}")
        print(f"{'=' * 80}")
        print(f"Duplicate groups: {len(groups)}")
        print()

        for group_num, group in enumerate(groups, 1):
            print(f"\n Group {group_num} ({len(group)} files):")
            print(f" {'-' * 76}")

            # Groups are sorted best-first, so the head is the file to keep.
            best = group[0]
            print(f" ✅ KEEP: {best['info']['filename'][:60]}")
            print(f" Location: {best['info']['location']}")
            print(f" Clean: {best['clean_score']:.0f}/100 ({best['text_count']} text), Quality: {best['quality_score']:.0f}/100")
            print(f" Resolution: {best['quality_metrics']['width']}x{best['quality_metrics']['height']}, Size: {best['quality_metrics']['file_size']/1024/1024:.1f}MB")
            print()

            for data in group[1:]:
                total_would_remove += 1
                total_size_freed += data['quality_metrics']['file_size']

                distance = detector._hamming_distance(best['phash'], data['phash'])

                print(f" ❌ REMOVE: {data['info']['filename'][:60]}")
                print(f" Location: {data['info']['location']}")
                print(f" Clean: {data['clean_score']:.0f}/100 ({data['text_count']} text), Quality: {data['quality_score']:.0f}/100")
                print(f" Hash distance: {distance}")

                reasons = []
                if data['clean_score'] < best['clean_score'] - 5:
                    reasons.append(f"More text ({data['text_count']} vs {best['text_count']})")
                if data['quality_score'] < best['quality_score']:
                    reasons.append("Lower quality")
                if reasons:
                    print(f" Reason: {', '.join(reasons)}")
                print()

    print()
    print("=" * 80)
    print("SUMMARY BY SOURCE")
    print("=" * 80)

    source_stats = {
        source: {
            'groups': len(groups),
            'would_remove': sum(len(group) - 1 for group in groups),
        }
        for source, groups in by_source_report.items()
    }

    print()
    for source in sorted(source_stats, key=lambda s: source_stats[s]['would_remove'], reverse=True):
        stats = source_stats[source]
        print(f" @{source:30s} : {stats['groups']:2d} groups, {stats['would_remove']:3d} files to remove")

    print()
    print("=" * 80)
    print("OVERALL SUMMARY")
    print("=" * 80)
    print(f"Sources with duplicates: {len(source_stats)}")
    print(f"Total duplicate groups: {len(duplicate_groups)}")
    print(f"Files that would be removed: {total_would_remove}")
    print(f"Storage that would be freed: {total_size_freed / 1024 / 1024:.1f} MB")
    print()


def main():
    """Report perceptual duplicates among recent Instagram files, per source.

    Collects active downloads and recently recycled files, scores each with
    the perceptual duplicate detector, and prints which files would be kept
    or removed.  Nothing is deleted; the output is a report only.
    """
    print("=" * 80)
    print("INSTAGRAM PERCEPTUAL DUPLICATES - WITH SOURCE MAPPING")
    print("=" * 80)
    print()

    project_root = Path(__file__).parent.parent

    # Load source mapping from recycle bin database
    backup_db = project_root / 'data' / 'backup_cache.db'
    source_mapping = get_source_mapping(str(backup_db))

    # Load comprehensive scan results
    db_path = project_root / 'database' / 'media_downloader.db'
    db = UnifiedDatabase(str(db_path))

    # Get all files: database entries first, then the recycle bin.
    print("\nCollecting Instagram files...")
    all_files = _collect_active_files(db)
    all_files.update(_collect_recycled_files(source_mapping))

    print(f"Total files to analyze: {len(all_files)}")
    print()

    # Initialize detector
    detector = InstagramPerceptualDuplicateDetector(
        unified_db=db,
        log_callback=lambda msg, lvl: None  # Suppress logs
    )

    # Analyze files
    print("Analyzing files (this may take a while)...")
    file_data = _analyze_files(detector, all_files)

    print(f"Analyzed {len(file_data)} files")
    print()

    # Find duplicates by source
    print("=" * 80)
    print("DUPLICATE DETECTION BY SOURCE")
    print("=" * 80)
    print()

    duplicate_groups = _find_duplicate_groups(detector, file_data)

    if not duplicate_groups:
        print("✅ No duplicates found (excluding 'unknown' sources)")
        return

    _print_report(detector, duplicate_groups)


if __name__ == '__main__':
    main()