#!/usr/bin/env python3
"""
Perceptual Duplicate Detection with Proper Source Mapping

Maps UUID filenames from recycle bin back to original Instagram sources
using the media-downloader's recycle_bin database.
"""

import json
import re
import sqlite3
import sys
from collections import defaultdict
from datetime import datetime
from pathlib import Path

# Make the project root importable; this must run BEFORE the project imports
# below, so the two import groups cannot be merged.
sys.path.insert(0, str(Path(__file__).parent.parent))

from modules.instagram_perceptual_duplicate_detector import InstagramPerceptualDuplicateDetector
from modules.unified_database import UnifiedDatabase
def get_source_mapping(backup_db_path, days=3):
    """Map recycle-bin file paths (UUID filenames) back to their original sources.

    Reads the ``recycle_bin`` table of the backup-cache database and, for each
    recently deleted file, recovers the Instagram source account from the
    original filename pattern ``source_YYYYMMDD_id.ext``.

    Args:
        backup_db_path: Path (str) to the backup cache SQLite database.
        days: Only include files deleted within this many days. Defaults to 3,
            matching the download window used by the rest of this script.

    Returns:
        dict mapping recycle_path -> {'source', 'original_filename',
        'original_path', 'deleted_at'}; 'source' is 'unknown' when the
        filename does not match the Instagram naming pattern.
    """
    print("Loading recycle bin source mappings...")

    conn = sqlite3.connect(backup_db_path)
    conn.row_factory = sqlite3.Row  # access columns by name
    try:
        cursor = conn.cursor()
        # SQLite datetime() modifiers are plain strings, so the window can be
        # bound as a parameter instead of baked into the SQL text.
        cursor.execute(
            """
            SELECT
                recycle_path,
                original_filename,
                original_path,
                deleted_at
            FROM recycle_bin
            WHERE deleted_at > datetime('now', ?)
            """,
            (f'-{int(days)} days',),
        )

        mapping = {}
        for row in cursor.fetchall():
            original_filename = row['original_filename']

            # Extract source from Instagram filename pattern: source_date_id.ext
            match = re.match(r'^([a-z0-9._]+)_\d{8}', original_filename.lower())
            source = match.group(1) if match else 'unknown'

            mapping[row['recycle_path']] = {
                'source': source,
                'original_filename': original_filename,
                'original_path': row['original_path'],
                'deleted_at': row['deleted_at'],
            }
    finally:
        # Release the DB handle even if the query raises.
        conn.close()

    print(f" Mapped {len(mapping)} recycled files to original sources")
    return mapping
def main():
    """Report perceptual duplicates among recent Instagram files, per source.

    Dry-run only: nothing is deleted. The script
      1. maps recycle-bin UUID filenames back to source accounts,
      2. collects active DB-tracked files plus recent recycle-bin files,
      3. perceptually hashes and scores every file,
      4. groups near-duplicates (hamming distance <= 12) within each source,
      5. prints which file each group would KEEP (highest score) vs REMOVE.
    """
    print("=" * 80)
    print("INSTAGRAM PERCEPTUAL DUPLICATES - WITH SOURCE MAPPING")
    print("=" * 80)
    print()

    # Load source mapping from recycle bin database
    backup_db = Path(__file__).parent.parent / 'data' / 'backup_cache.db'
    source_mapping = get_source_mapping(str(backup_db))

    # Load comprehensive scan results
    db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
    db = UnifiedDatabase(str(db_path))

    # Collect candidate files keyed by absolute path (dict de-dupes overlaps).
    print("\nCollecting Instagram files...")
    all_files = {}

    # 1) Active files tracked in the downloads table (last 3 days only,
    #    excluding paths already tagged by other pipeline stages).
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT filename, source, file_path, download_date
            FROM downloads
            WHERE platform = 'instagram'
            AND download_date > datetime('now', '-3 days')
            AND file_path IS NOT NULL
            AND file_path NOT LIKE '%_phrase_checked_%'
            AND file_path NOT LIKE '%_old_post_%'
        """)

        for row in cursor.fetchall():
            # Skip rows whose file has since disappeared from disk.
            if Path(row[2]).exists():
                all_files[row[2]] = {
                    'source': row[1],
                    'filename': row[0],
                    'file_path': row[2],
                    'location': 'active'
                }

    # 2) Recycle-bin files (UUID names) with proper source mapping.
    recycle_path = Path('/opt/immich/recycle')
    if recycle_path.exists():
        cutoff = datetime.now().timestamp() - (3 * 24 * 60 * 60)  # 3 days ago
        for ext in ['*.mp4', '*.jpg', '*.jpeg', '*.webp', '*.png', '*.heic']:
            for file_path in recycle_path.rglob(ext):
                if file_path.stat().st_mtime > cutoff:
                    file_path_str = str(file_path)

                    # Look up source from mapping; unmapped files stay 'unknown'.
                    source_info = source_mapping.get(file_path_str, {})
                    source = source_info.get('source', 'unknown')
                    original_filename = source_info.get('original_filename', file_path.name)

                    all_files[file_path_str] = {
                        'source': source,
                        'filename': original_filename,
                        'file_path': file_path_str,
                        'location': 'recycle_bin'
                    }

    print(f"Total files to analyze: {len(all_files)}")
    print()

    # Initialize detector
    detector = InstagramPerceptualDuplicateDetector(
        unified_db=db,
        log_callback=lambda msg, lvl: None  # Suppress logs
    )

    # Per-file analysis: perceptual hash plus text-overlay/quality scoring.
    print("Analyzing files (this may take a while)...")
    file_data = []

    for i, (path, info) in enumerate(all_files.items(), 1):
        if i % 50 == 0:
            print(f" Progress: {i}/{len(all_files)}...")

        phash = detector._calculate_perceptual_hash(path)
        if not phash:
            continue  # unreadable/unsupported file: skip

        text_count, text_chars = detector._detect_text_overlays(path)
        quality_metrics = detector._get_quality_metrics(path)
        clean_score = detector._calculate_clean_score(text_count, text_chars)
        quality_score = detector._calculate_quality_score(quality_metrics)

        file_data.append({
            'info': info,
            'phash': phash,
            'text_count': text_count,
            'text_chars': text_chars,
            'clean_score': clean_score,
            'quality_score': quality_score,
            'quality_metrics': quality_metrics,
            # Cleanliness (less text overlay) weighted 3x over raw quality
            # when ranking which copy of a duplicate to keep.
            'total_score': (clean_score * 3) + (quality_score * 1)
        })

    print(f"Analyzed {len(file_data)} files")
    print()

    # Find duplicates by source
    print("=" * 80)
    print("DUPLICATE DETECTION BY SOURCE")
    print("=" * 80)
    print()

    # Group by source first so files are only compared within one account.
    by_source = defaultdict(list)
    for data in file_data:
        by_source[data['info']['source']].append(data)

    # Find duplicates within each source
    duplicate_groups = []

    for source, files in by_source.items():
        # 'unknown' sources cannot be compared safely; singletons can't dupe.
        if source == 'unknown' or len(files) < 2:
            continue

        processed = set()

        for i, data1 in enumerate(files):
            if i in processed:
                continue

            group = [data1]

            for j, data2 in enumerate(files[i+1:], start=i+1):
                if j in processed:
                    continue

                distance = detector._hamming_distance(data1['phash'], data2['phash'])

                if distance <= 12:  # threshold
                    group.append(data2)
                    processed.add(j)

            if len(group) > 1:
                # Best candidate (to KEEP) first.
                group.sort(key=lambda x: x['total_score'], reverse=True)
                duplicate_groups.append((source, group))
            processed.add(i)

    if len(duplicate_groups) == 0:
        print("✅ No duplicates found (excluding 'unknown' sources)")
        return

    # Report by source
    print(f"Found {len(duplicate_groups)} duplicate group(s) across {len(set(s for s, _ in duplicate_groups))} sources")
    print()

    # Group by source for reporting
    by_source_report = defaultdict(list)
    for source, group in duplicate_groups:
        by_source_report[source].append(group)

    total_would_remove = 0
    total_size_freed = 0

    for source in sorted(by_source_report.keys()):
        groups = by_source_report[source]

        print(f"\n{'=' * 80}")
        print(f"SOURCE: @{source}")
        print(f"{'=' * 80}")
        print(f"Duplicate groups: {len(groups)}")
        print()

        for group_num, group in enumerate(groups, 1):
            print(f"\n Group {group_num} ({len(group)} files):")
            print(f" {'-' * 76}")

            # group is sorted best-first, so index 0 is the keeper.
            best = group[0]
            print(f" ✅ KEEP: {best['info']['filename'][:60]}")
            print(f" Location: {best['info']['location']}")
            print(f" Clean: {best['clean_score']:.0f}/100 ({best['text_count']} text), Quality: {best['quality_score']:.0f}/100")
            print(f" Resolution: {best['quality_metrics']['width']}x{best['quality_metrics']['height']}, Size: {best['quality_metrics']['file_size']/1024/1024:.1f}MB")
            print()

            for data in group[1:]:
                total_would_remove += 1
                total_size_freed += data['quality_metrics']['file_size']

                distance = detector._hamming_distance(best['phash'], data['phash'])

                print(f" ❌ REMOVE: {data['info']['filename'][:60]}")
                print(f" Location: {data['info']['location']}")
                print(f" Clean: {data['clean_score']:.0f}/100 ({data['text_count']} text), Quality: {data['quality_score']:.0f}/100")
                print(f" Hash distance: {distance}")

                # Explain the removal when there's a measurable gap; a 5-point
                # clean-score margin avoids flagging noise as "more text".
                reasons = []
                if data['clean_score'] < best['clean_score'] - 5:
                    reasons.append(f"More text ({data['text_count']} vs {best['text_count']})")
                if data['quality_score'] < best['quality_score']:
                    reasons.append("Lower quality")
                if reasons:
                    print(f" Reason: {', '.join(reasons)}")
                print()

    print()
    print("=" * 80)
    print("SUMMARY BY SOURCE")
    print("=" * 80)

    source_stats = defaultdict(lambda: {'groups': 0, 'would_remove': 0})
    for source, group in duplicate_groups:
        source_stats[source]['groups'] += 1
        source_stats[source]['would_remove'] += len(group) - 1  # keep 1 per group

    print()
    for source in sorted(source_stats.keys(), key=lambda s: source_stats[s]['would_remove'], reverse=True):
        stats = source_stats[source]
        print(f" @{source:30s} : {stats['groups']:2d} groups, {stats['would_remove']:3d} files to remove")

    print()
    print("=" * 80)
    print("OVERALL SUMMARY")
    print("=" * 80)
    print(f"Sources with duplicates: {len(source_stats)}")
    print(f"Total duplicate groups: {len(duplicate_groups)}")
    print(f"Files that would be removed: {total_would_remove}")
    print(f"Storage that would be freed: {total_size_freed / 1024 / 1024:.1f} MB")
    print()


if __name__ == '__main__':
    main()