Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,257 @@
#!/usr/bin/env python3
"""
Dry-run test of Instagram Perceptual Duplicate Detection
Scans last 3 days of downloads and reports what would be considered duplicates
WITHOUT actually moving or deleting anything.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from modules.unified_database import UnifiedDatabase
from modules.instagram_perceptual_duplicate_detector import InstagramPerceptualDuplicateDetector
import json
from datetime import datetime, timedelta
from collections import defaultdict
class DryRunLogger:
    """Callable log sink that records every message it receives.

    Instances are passed as the detector's ``log_callback``.  Each call
    appends a ``(level, message)`` tuple to ``self.messages`` for later
    inspection and echoes the message to stdout in ``[LEVEL] message``
    form.
    """

    def __init__(self) -> None:
        # Chronological list of (level, message) tuples captured so far.
        self.messages: list = []

    def __call__(self, msg: str, level: str = "info") -> None:
        """Record *msg* under *level* and echo it to stdout.

        ``level`` defaults to ``"info"`` so callers that follow the common
        log-callback convention of passing only a message no longer raise
        ``TypeError``; existing two-argument callers are unaffected.
        """
        self.messages.append((level, msg))
        print(f"[{level.upper()}] {msg}")
def main():
    """Run a read-only duplicate-detection pass over recent Instagram downloads.

    Pulls every Instagram row from the last 3 days out of the unified
    database, computes a perceptual hash plus text-overlay and quality
    scores for each still-existing file via
    InstagramPerceptualDuplicateDetector, groups files whose hashes fall
    within the configured Hamming-distance threshold (same source only),
    and prints which file in each group would be kept versus removed.
    No files are moved or deleted (dry run).
    """
    print("=" * 80)
    print("INSTAGRAM PERCEPTUAL DUPLICATE DETECTION - DRY RUN")
    print("=" * 80)
    print()
    # Initialize database.  Path assumes the repo layout
    # <repo>/database/media_downloader.db relative to this script's parent dir.
    db_path = Path(__file__).parent.parent / 'database' / 'media_downloader.db'
    db = UnifiedDatabase(str(db_path))
    # Get all Instagram files from last 3 days.  Rows whose file_path carries
    # the '_phrase_checked_' / '_old_post_' markers are excluded -- those were
    # already routed through other pipelines.
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT
                filename,
                source,
                file_path,
                file_hash,
                download_date,
                content_type
            FROM downloads
            WHERE platform = 'instagram'
            AND download_date > datetime('now', '-3 days')
            AND file_path IS NOT NULL
            AND file_path NOT LIKE '%_phrase_checked_%'
            AND file_path NOT LIKE '%_old_post_%'
            ORDER BY source, download_date
        """)
        files = []
        for row in cursor.fetchall():
            if row[2] and Path(row[2]).exists():  # Only include files that exist
                files.append({
                    'filename': row[0],
                    'source': row[1],
                    'file_path': row[2],
                    'file_hash': row[3],
                    'download_date': row[4],
                    'content_type': row[5] or 'unknown'
                })
    print(f"Found {len(files)} Instagram files from last 3 days that exist on disk")
    print()
    if len(files) == 0:
        print("No files to analyze!")
        return
    # Initialize detector; DryRunLogger captures and echoes its log output.
    logger = DryRunLogger()
    detector = InstagramPerceptualDuplicateDetector(
        unified_db=db,
        log_callback=logger
    )
    # Get settings (will use defaults since feature is disabled).
    settings = {
        'enabled': False,
        'perceptual_hash_threshold': 12,  # max Hamming distance to treat two images as duplicates
        'text_detection_enabled': True,
        'clean_score_weight': 3,  # overlay-free images weighted 3x vs raw quality in the total score
        'quality_score_weight': 1,
        'min_text_difference': 5  # clean-score gap required to cite text overlays as a removal reason
    }
    print(f"Using settings:")
    print(f" - Perceptual hash threshold: {settings['perceptual_hash_threshold']}")
    print(f" - Clean score weight: {settings['clean_score_weight']}")
    print(f" - Quality score weight: {settings['quality_score_weight']}")
    print(f" - Text detection: {'Enabled' if settings['text_detection_enabled'] else 'Disabled'}")
    print()
    # Process each file and collect data (hash, overlay stats, quality, scores).
    print("Analyzing files...")
    print("-" * 80)
    file_data = []
    for i, file_info in enumerate(files, 1):
        file_path = file_info['file_path']
        source = file_info['source']
        print(f"\n[{i}/{len(files)}] Processing: {Path(file_path).name}")
        # Calculate perceptual hash; files the detector cannot hash are skipped.
        phash = detector._calculate_perceptual_hash(file_path)
        if not phash:
            print(f" ⚠️ Could not calculate perceptual hash - skipping")
            continue
        # Detect text overlays (region count and total characters).
        if settings['text_detection_enabled']:
            text_count, text_chars = detector._detect_text_overlays(file_path)
        else:
            text_count, text_chars = 0, 0
        # Get quality metrics (resolution, file size, ...).
        quality_metrics = detector._get_quality_metrics(file_path)
        # Calculate scores: "clean" penalizes text overlays, "quality" rewards
        # resolution/size; both appear to be on a 0-100 scale per the report
        # strings below -- confirm against the detector implementation.
        clean_score = detector._calculate_clean_score(text_count, text_chars)
        quality_score = detector._calculate_quality_score(quality_metrics)
        print(f" Hash: {phash[:16]}...")
        print(f" Text overlays: {text_count} regions, {text_chars} chars")
        print(f" Resolution: {quality_metrics['width']}x{quality_metrics['height']}")
        print(f" File size: {quality_metrics['file_size'] / 1024 / 1024:.1f} MB")
        print(f" Clean score: {clean_score:.1f}/100")
        print(f" Quality score: {quality_score:.1f}/100")
        print(f" Total score: {(clean_score * settings['clean_score_weight']) + (quality_score * settings['quality_score_weight']):.1f}")
        file_data.append({
            'file_info': file_info,
            'phash': phash,
            'text_count': text_count,
            'text_chars': text_chars,
            'clean_score': clean_score,
            'quality_score': quality_score,
            'quality_metrics': quality_metrics,
            # Weighted combination used to rank files within a duplicate group.
            'total_score': (clean_score * settings['clean_score_weight']) + (quality_score * settings['quality_score_weight'])
        })
    print()
    print("=" * 80)
    print("DUPLICATE DETECTION ANALYSIS")
    print("=" * 80)
    print()
    # Find duplicates by comparing hashes: O(n^2) pairwise scan, restricted to
    # files from the same source.  `processed` holds indices already claimed by
    # a group so each file lands in at most one group.
    duplicates = []
    processed = set()
    for i, data1 in enumerate(file_data):
        if i in processed:
            continue
        group = [data1]
        for j, data2 in enumerate(file_data[i+1:], start=i+1):
            if j in processed:
                continue
            # Same source only
            if data1['file_info']['source'] != data2['file_info']['source']:
                continue
            # Calculate Hamming distance between perceptual hashes.
            distance = detector._hamming_distance(data1['phash'], data2['phash'])
            if distance <= settings['perceptual_hash_threshold']:
                group.append(data2)
                processed.add(j)
        if len(group) > 1:
            # Sort by total score (highest first) -- group[0] becomes the keeper.
            group.sort(key=lambda x: x['total_score'], reverse=True)
            duplicates.append(group)
            processed.add(i)
    if len(duplicates) == 0:
        print("✅ No perceptual duplicates found!")
        print()
        print("All files are unique or sufficiently different.")
        return
    print(f"Found {len(duplicates)} duplicate group(s):")
    print()
    total_would_remove = 0
    # Report each duplicate group: the best-scoring file would be kept, the
    # rest would be removed, each with the reason(s) it lost.
    for group_num, group in enumerate(duplicates, 1):
        print(f"\n{'=' * 80}")
        print(f"DUPLICATE GROUP #{group_num}")
        print(f"{'=' * 80}")
        print(f"Source: {group[0]['file_info']['source']}")
        print(f"Files in group: {len(group)}")
        print()
        best = group[0]  # highest total score after the sort above
        print(f"✅ WOULD KEEP:")
        print(f" File: {Path(best['file_info']['file_path']).name}")
        print(f" Path: {best['file_info']['file_path']}")
        print(f" Clean score: {best['clean_score']:.1f}/100 ({best['text_count']} text regions)")
        print(f" Quality score: {best['quality_score']:.1f}/100 ({best['quality_metrics']['width']}x{best['quality_metrics']['height']}, {best['quality_metrics']['file_size']/1024/1024:.1f}MB)")
        print(f" Total score: {best['total_score']:.1f}")
        print(f" Download date: {best['file_info']['download_date']}")
        print()
        print(f"❌ WOULD REMOVE ({len(group)-1} file(s)):")
        for data in group[1:]:
            total_would_remove += 1
            print(f"\n File: {Path(data['file_info']['file_path']).name}")
            print(f" Path: {data['file_info']['file_path']}")
            print(f" Clean score: {data['clean_score']:.1f}/100 ({data['text_count']} text regions)")
            print(f" Quality score: {data['quality_score']:.1f}/100 ({data['quality_metrics']['width']}x{data['quality_metrics']['height']}, {data['quality_metrics']['file_size']/1024/1024:.1f}MB)")
            print(f" Total score: {data['total_score']:.1f}")
            print(f" Download date: {data['file_info']['download_date']}")
            # Calculate hash distance from the keeper for context.
            distance = detector._hamming_distance(best['phash'], data['phash'])
            print(f" Hash distance from best: {distance}")
            # Explain why it would be removed (text overlays only count when the
            # clean-score gap exceeds min_text_difference).
            reasons = []
            if data['clean_score'] < best['clean_score'] - settings['min_text_difference']:
                reasons.append(f"Has more text overlays ({data['text_count']} vs {best['text_count']})")
            if data['quality_score'] < best['quality_score']:
                reasons.append(f"Lower quality ({data['quality_metrics']['width']}x{data['quality_metrics']['height']} vs {best['quality_metrics']['width']}x{best['quality_metrics']['height']})")
            if data['total_score'] < best['total_score']:
                reasons.append(f"Lower total score ({data['total_score']:.1f} vs {best['total_score']:.1f})")
            if reasons:
                print(f" Reason(s): {'; '.join(reasons)}")
    print()
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total files analyzed: {len(file_data)}")
    print(f"Duplicate groups found: {len(duplicates)}")
    print(f"Files that would be kept: {len(duplicates)}")
    print(f"Files that would be removed: {total_would_remove}")
    print()
    print("⚠️ NOTE: This is a DRY RUN - no files were actually moved or deleted!")
    print(" To enable this feature, set 'enabled: true' in Configuration > Instagram Perceptual Duplicate Detection")
    print()
# Script entry point: run the dry-run analysis only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()