Files
media-downloader/tests/test_perceptual_with_sources.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

286 lines
9.6 KiB
Python

#!/usr/bin/env python3
"""
Perceptual Duplicate Detection with Proper Source Mapping
Maps UUID filenames from recycle bin back to original Instagram sources
using the media-downloader's recycle_bin database.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from modules.unified_database import UnifiedDatabase
from modules.instagram_perceptual_duplicate_detector import InstagramPerceptualDuplicateDetector
import sqlite3
import json
from datetime import datetime
from collections import defaultdict
import re
def get_source_mapping(backup_db_path):
    """Map UUID recycle-bin file paths back to their original Instagram sources.

    Reads the ``recycle_bin`` table of the backup database and, for every file
    deleted within the last 3 days, derives the source account name from the
    Instagram filename convention ``source_YYYYMMDD_id.ext``.

    Args:
        backup_db_path: Path to the SQLite database containing ``recycle_bin``.

    Returns:
        dict keyed by recycle path, each value a dict with keys
        ``source`` ('unknown' when the filename does not match the pattern),
        ``original_filename``, ``original_path`` and ``deleted_at``.
    """
    print("Loading recycle bin source mappings...")
    conn = sqlite3.connect(backup_db_path)
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        cursor.execute("""
            SELECT
                recycle_path,
                original_filename,
                original_path,
                deleted_at
            FROM recycle_bin
            WHERE deleted_at > datetime('now', '-3 days')
        """)
        mapping = {}
        for row in cursor.fetchall():
            recycle_path = row['recycle_path']
            original_filename = row['original_filename']
            # Extract source from Instagram filename pattern: source_date_id.ext
            source = 'unknown'
            match = re.match(r'^([a-z0-9._]+)_\d{8}', original_filename.lower())
            if match:
                source = match.group(1)
            mapping[recycle_path] = {
                'source': source,
                'original_filename': original_filename,
                'original_path': row['original_path'],
                'deleted_at': row['deleted_at']
            }
    finally:
        # Always release the connection — the original leaked it when
        # the query raised (e.g. missing table / locked database).
        conn.close()
    print(f" Mapped {len(mapping)} recycled files to original sources")
    return mapping
def main():
    """Scan recent Instagram media (active downloads + recycle bin) for
    perceptual duplicates grouped per source account and print a report.

    NOTE(review): this is a dry-run report only — it prints what "would be
    removed" and never deletes anything.
    """
    print("=" * 80)
    print("INSTAGRAM PERCEPTUAL DUPLICATES - WITH SOURCE MAPPING")
    print("=" * 80)
    print()
    # Load source mapping from recycle bin database
    backup_db = Path(__file__).parent.parent / 'data' / 'backup_cache.db'
    source_mapping = get_source_mapping(str(backup_db))
    # Load comprehensive scan results
    db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
    db = UnifiedDatabase(str(db_path))
    # Get all files
    print("\nCollecting Instagram files...")
    all_files = {}  # file path -> {'source', 'filename', 'file_path', 'location'}
    # Database files: active Instagram downloads from the last 3 days,
    # excluding paths already tagged by other tooling.
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT filename, source, file_path, download_date
            FROM downloads
            WHERE platform = 'instagram'
            AND download_date > datetime('now', '-3 days')
            AND file_path IS NOT NULL
            AND file_path NOT LIKE '%_phrase_checked_%'
            AND file_path NOT LIKE '%_old_post_%'
        """)
        for row in cursor.fetchall():
            # Ignore DB rows whose file no longer exists on disk
            if Path(row[2]).exists():
                all_files[row[2]] = {
                    'source': row[1],
                    'filename': row[0],
                    'file_path': row[2],
                    'location': 'active'
                }
    # Recycle bin files with proper source mapping
    recycle_path = Path('/opt/immich/recycle')
    if recycle_path.exists():
        # Same 3-day window as the DB query, but based on file mtime
        cutoff = datetime.now().timestamp() - (3 * 24 * 60 * 60)
        for ext in ['*.mp4', '*.jpg', '*.jpeg', '*.webp', '*.png', '*.heic']:
            for file_path in recycle_path.rglob(ext):
                if file_path.stat().st_mtime > cutoff:
                    file_path_str = str(file_path)
                    # Look up source from mapping
                    source_info = source_mapping.get(file_path_str, {})
                    source = source_info.get('source', 'unknown')
                    original_filename = source_info.get('original_filename', file_path.name)
                    all_files[file_path_str] = {
                        'source': source,
                        'filename': original_filename,
                        'file_path': file_path_str,
                        'location': 'recycle_bin'
                    }
    print(f"Total files to analyze: {len(all_files)}")
    print()
    # Initialize detector
    detector = InstagramPerceptualDuplicateDetector(
        unified_db=db,
        log_callback=lambda msg, lvl: None  # Suppress logs
    )
    # Analyze files: perceptual hash + text-overlay and quality scores per file
    print("Analyzing files (this may take a while)...")
    file_data = []
    for i, (path, info) in enumerate(all_files.items(), 1):
        if i % 50 == 0:
            print(f" Progress: {i}/{len(all_files)}...")
        phash = detector._calculate_perceptual_hash(path)
        if not phash:
            # No hash produced — skip this file entirely
            continue
        text_count, text_chars = detector._detect_text_overlays(path)
        quality_metrics = detector._get_quality_metrics(path)
        clean_score = detector._calculate_clean_score(text_count, text_chars)
        quality_score = detector._calculate_quality_score(quality_metrics)
        file_data.append({
            'info': info,
            'phash': phash,
            'text_count': text_count,
            'text_chars': text_chars,
            'clean_score': clean_score,
            'quality_score': quality_score,
            'quality_metrics': quality_metrics,
            # Ranking weight: cleanliness counts 3x as much as quality
            'total_score': (clean_score * 3) + (quality_score * 1)
        })
    print(f"Analyzed {len(file_data)} files")
    print()
    # Find duplicates by source
    print("=" * 80)
    print("DUPLICATE DETECTION BY SOURCE")
    print("=" * 80)
    print()
    # Group by source first
    by_source = defaultdict(list)
    for data in file_data:
        by_source[data['info']['source']].append(data)
    # Find duplicates within each source (never compared across sources)
    duplicate_groups = []
    for source, files in by_source.items():
        if source == 'unknown' or len(files) < 2:
            continue
        processed = set()  # indices already assigned to a duplicate group
        for i, data1 in enumerate(files):
            if i in processed:
                continue
            group = [data1]
            for j, data2 in enumerate(files[i+1:], start=i+1):
                if j in processed:
                    continue
                distance = detector._hamming_distance(data1['phash'], data2['phash'])
                if distance <= 12:  # threshold
                    group.append(data2)
                    processed.add(j)
            if len(group) > 1:
                # Best file first: highest combined clean+quality score
                group.sort(key=lambda x: x['total_score'], reverse=True)
                duplicate_groups.append((source, group))
                processed.add(i)
    if len(duplicate_groups) == 0:
        print("✅ No duplicates found (excluding 'unknown' sources)")
        return
    # Report by source
    print(f"Found {len(duplicate_groups)} duplicate group(s) across {len(set(s for s, _ in duplicate_groups))} sources")
    print()
    # Group by source for reporting
    by_source_report = defaultdict(list)
    for source, group in duplicate_groups:
        by_source_report[source].append(group)
    total_would_remove = 0
    total_size_freed = 0  # bytes
    for source in sorted(by_source_report.keys()):
        groups = by_source_report[source]
        print(f"\n{'=' * 80}")
        print(f"SOURCE: @{source}")
        print(f"{'=' * 80}")
        print(f"Duplicate groups: {len(groups)}")
        print()
        for group_num, group in enumerate(groups, 1):
            print(f"\n Group {group_num} ({len(group)} files):")
            print(f" {'-' * 76}")
            # First entry is the keeper (groups were sorted best-first)
            best = group[0]
            print(f" ✅ KEEP: {best['info']['filename'][:60]}")
            print(f" Location: {best['info']['location']}")
            print(f" Clean: {best['clean_score']:.0f}/100 ({best['text_count']} text), Quality: {best['quality_score']:.0f}/100")
            print(f" Resolution: {best['quality_metrics']['width']}x{best['quality_metrics']['height']}, Size: {best['quality_metrics']['file_size']/1024/1024:.1f}MB")
            print()
            for data in group[1:]:
                total_would_remove += 1
                total_size_freed += data['quality_metrics']['file_size']
                distance = detector._hamming_distance(best['phash'], data['phash'])
                print(f" ❌ REMOVE: {data['info']['filename'][:60]}")
                print(f" Location: {data['info']['location']}")
                print(f" Clean: {data['clean_score']:.0f}/100 ({data['text_count']} text), Quality: {data['quality_score']:.0f}/100")
                print(f" Hash distance: {distance}")
                # Explain why this copy loses to the keeper
                reasons = []
                if data['clean_score'] < best['clean_score'] - 5:
                    reasons.append(f"More text ({data['text_count']} vs {best['text_count']})")
                if data['quality_score'] < best['quality_score']:
                    reasons.append("Lower quality")
                if reasons:
                    print(f" Reason: {', '.join(reasons)}")
                print()
    print()
    print("=" * 80)
    print("SUMMARY BY SOURCE")
    print("=" * 80)
    # Per-source totals, ordered by number of removable files
    source_stats = defaultdict(lambda: {'groups': 0, 'would_remove': 0})
    for source, group in duplicate_groups:
        source_stats[source]['groups'] += 1
        source_stats[source]['would_remove'] += len(group) - 1
    print()
    for source in sorted(source_stats.keys(), key=lambda s: source_stats[s]['would_remove'], reverse=True):
        stats = source_stats[source]
        print(f" @{source:30s} : {stats['groups']:2d} groups, {stats['would_remove']:3d} files to remove")
    print()
    print("=" * 80)
    print("OVERALL SUMMARY")
    print("=" * 80)
    print(f"Sources with duplicates: {len(source_stats)}")
    print(f"Total duplicate groups: {len(duplicate_groups)}")
    print(f"Files that would be removed: {total_would_remove}")
    print(f"Storage that would be freed: {total_size_freed / 1024 / 1024:.1f} MB")
    print()
# Script entry point: run the full duplicate-detection report.
if __name__ == '__main__':
    main()