# media-downloader/tests/test_repost_detection_manual.py

#!/usr/bin/env python3
"""
Manual Integration Test for Instagram Repost Detector

This script tests the repost detector with real files and can be run manually
to validate the implementation before integrating into the main system.

Usage:
    python3 tests/test_repost_detection_manual.py [test_file_path] [source_username] [--live]

    Without --live the full detection workflow (which downloads content) is skipped.

Example:
    python3 tests/test_repost_detection_manual.py \
        "/media/.../evalongoria_20251109_154548_story6.mp4" \
        "evalongoria"
"""

import sys
import os
from pathlib import Path

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from modules.instagram_repost_detector import InstagramRepostDetector
from modules.unified_database import UnifiedDatabase


def test_dependencies():
    """Check that every dependency of the repost detector is available.

    Imports each required Python package (pytesseract + PIL, cv2, imagehash)
    and then looks for the tesseract binary, printing a ✓/✗ line and an
    install hint for each item, followed by a one-line summary.

    The binary is probed through ``pytesseract.get_tesseract_version()`` when
    pytesseract is importable. When it is not, the check falls back to a PATH
    lookup so that a missing Python package is not misreported as a missing
    tesseract binary.

    Returns:
        bool: True when nothing is missing, False otherwise.
    """
    # Function-scope imports: only this check needs them.
    import importlib
    import shutil

    print("=" * 70)
    print("CHECKING DEPENDENCIES")
    print("=" * 70)

    missing = []

    # (modules to import, success line, failure line, install hint, summary label)
    python_packages = (
        (
            ("pytesseract", "PIL.Image"),
            "✓ pytesseract and PIL installed",
            "✗ pytesseract or PIL not installed",
            "  Install: pip3 install pytesseract pillow",
            "pytesseract/PIL",
        ),
        (
            ("cv2",),
            "✓ opencv-python installed",
            "✗ opencv-python not installed",
            "  Install: pip3 install opencv-python",
            "opencv-python",
        ),
        (
            ("imagehash",),
            "✓ imagehash installed",
            "✗ imagehash not installed",
            "  Install: pip3 install imagehash",
            "imagehash",
        ),
    )

    for modules, ok_line, fail_line, hint, label in python_packages:
        try:
            for module_name in modules:
                importlib.import_module(module_name)
        except ImportError:
            print(fail_line)
            print(hint)
            missing.append(label)
        else:
            print(ok_line)

    # Check tesseract binary
    try:
        import pytesseract
    except ImportError:
        # pytesseract itself is missing (already reported above), so ask the
        # PATH directly instead of blaming the binary for the import failure.
        binary_found = shutil.which("tesseract") is not None
    else:
        try:
            pytesseract.get_tesseract_version()
        except Exception:
            # Deliberately broad: any failure to query the version is treated
            # as "binary not usable" for this diagnostic.
            binary_found = False
        else:
            binary_found = True

    if binary_found:
        print("✓ tesseract-ocr binary installed")
    else:
        print("✗ tesseract-ocr binary not installed")
        print("  Install: sudo apt-get install tesseract-ocr tesseract-ocr-eng")
        missing.append("tesseract-ocr")

    print()

    if missing:
        print(f"❌ Missing dependencies: {', '.join(missing)}")
        print("\nPlease install missing dependencies before running tests.")
        return False

    print("✅ All dependencies installed")
    return True


def test_ocr_extraction(file_path: str):
    """Run the detector's username extraction on one file and report the result.

    Args:
        file_path: Path of the media file handed to the detector.

    Returns:
        The extracted username, or None when the detector returned nothing
        truthy.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST 1: OCR USERNAME EXTRACTION")
    print(rule)
    print(f"File: {file_path}")

    # A detector backed by a newly constructed UnifiedDatabase instance
    detector = InstagramRepostDetector(unified_db=UnifiedDatabase())
    extracted = detector._extract_username_from_repost(file_path)

    if not extracted:
        print("❌ FAILED: No username found")
        return None

    print(f"✅ SUCCESS: Extracted username: @{extracted}")
    return extracted


def test_monitored_check(username: str):
    """Ask the detector whether a username is a monitored account and report it.

    Args:
        username: Account name (without the leading ``@``) to look up.

    Returns:
        Whatever ``_is_monitored_account`` returned for the username.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST 2: MONITORED ACCOUNT CHECK")
    print(rule)
    print(f"Username: @{username}")

    detector = InstagramRepostDetector(unified_db=UnifiedDatabase())
    monitored = detector._is_monitored_account(username)

    # Explain what the result means for the download path
    outcome = (
        f"✅ @{username} IS monitored (will use normal download path)"
        if monitored
        else f"ℹ️  @{username} NOT monitored (will use temp queue)"
    )
    print(outcome)

    return monitored


def test_perceptual_hash(file_path: str):
    """Compute the detector's perceptual hash for one file and report it.

    Args:
        file_path: Path of the media file to hash.

    Returns:
        The hash value, or None when the detector returned nothing truthy.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST 3: PERCEPTUAL HASH CALCULATION")
    print(rule)
    print(f"File: {file_path}")

    detector = InstagramRepostDetector(unified_db=UnifiedDatabase())
    digest = detector._get_perceptual_hash(file_path)

    if not digest:
        print("❌ FAILED: Could not calculate hash")
        return None

    print(f"✅ SUCCESS: Hash = {digest}")
    return digest


def test_full_detection(file_path: str, source_username: str, dry_run: bool = True):
    """Test full repost detection workflow"""
    print("\n" + "=" * 70)
    print("TEST 4: FULL REPOST DETECTION WORKFLOW")
    print("=" * 70)
    print(f"File: {file_path}")
    print(f"Source: @{source_username}")
    print(f"Mode: {'DRY RUN (no downloads)' if dry_run else 'LIVE (will download)'}")

    if dry_run:
        print("\n⚠️  DRY RUN MODE - Will not download content from ImgInn")
        print("To test with actual downloads, run with --live flag")
        return None

    db = UnifiedDatabase()
    detector = InstagramRepostDetector(unified_db=db)

    print("\nStarting detection...")
    replacement = detector.check_and_replace_repost(file_path, source_username)

    if replacement:
        print(f"\n✅ SUCCESS: Repost replaced!")
        print(f"Original file: {file_path}")
        print(f"Replacement file: {replacement}")
        return replacement
    else:
        print("\n❌ FAILED: No replacement found")
        print("Possible reasons:")
        print("  - No @username detected in the file")
        print("  - Original content not available")
        print("  - No matching content found via perceptual hash")
        return None


def test_database_tracking():
    """Report on the two repost tracking tables in the unified database.

    Looks up ``repost_fetch_cache`` and ``repost_replacements`` in
    ``sqlite_master``. For each table that exists, prints its row count; for
    ``repost_replacements`` it also lists the five most recently detected
    entries. For a table that does not exist, prints a note that it will be
    created on first use.
    """
    print("\n" + "=" * 70)
    print("TEST 5: DATABASE TRACKING")
    print("=" * 70)

    database = UnifiedDatabase()

    # First pass: probe sqlite_master for both tracking tables
    with database.get_connection() as conn:
        cur = conn.cursor()

        cur.execute("""
            SELECT name FROM sqlite_master
            WHERE type='table' AND name='repost_fetch_cache'
        """)
        cache_row = cur.fetchone()

        cur.execute("""
            SELECT name FROM sqlite_master
            WHERE type='table' AND name='repost_replacements'
        """)
        replacements_row = cur.fetchone()

    # Fetch cache summary
    if cache_row is None:
        print("ℹ️  repost_fetch_cache table will be created on first use")
    else:
        print("✓ repost_fetch_cache table exists")

        with database.get_connection() as conn:
            cur = conn.cursor()
            cur.execute("SELECT COUNT(*) FROM repost_fetch_cache")
            cached_total = cur.fetchone()[0]
            print(f"  {cached_total} usernames in cache")

    # Replacement history summary
    if replacements_row is None:
        print("ℹ️  repost_replacements table will be created on first use")
    else:
        print("✓ repost_replacements table exists")

        with database.get_connection() as conn:
            cur = conn.cursor()
            cur.execute("SELECT COUNT(*) FROM repost_replacements")
            tracked_total = cur.fetchone()[0]
            print(f"  {tracked_total} replacements tracked")

            if tracked_total > 0:
                print("\nRecent replacements:")
                cur.execute("""
                    SELECT repost_source, original_username, repost_filename, detected_at
                    FROM repost_replacements
                    ORDER BY detected_at DESC
                    LIMIT 5
                """)
                for entry in cur.fetchall():
                    print(f"  - @{entry[0]} reposted from @{entry[1]}: {entry[2]} ({entry[3]})")


def main():
    """Run the manual test suite from the command line.

    Command line (read from ``sys.argv``)::

        test_repost_detection_manual.py [file_path] [source_username] [--live]

    ``--live`` may appear anywhere on the command line; it is never treated
    as the file path or the source username. When no file path is given, the
    usage text is printed and only the dependency and database checks run.
    When the path does not name an existing file, an error is printed and
    nothing else runs. Otherwise the individual tests run in order, and the
    full detection workflow runs only when ``--live`` was supplied.

    Returns:
        None
    """
    print("\n" + "=" * 70)
    print("INSTAGRAM REPOST DETECTOR - MANUAL TEST SUITE")
    print("=" * 70)

    # Split the flag from positional arguments so "file.mp4 --live" does not
    # turn "--live" into the source username (or the file path).
    cli_args = sys.argv[1:]
    dry_run = "--live" not in cli_args
    positionals = [arg for arg in cli_args if arg != "--live"]

    # Check if test file provided
    if not positionals:
        print("\nUsage:")
        print("  python3 tests/test_repost_detection_manual.py [file_path] [source_username] [--live]")
        print("\nExamples:")
        print("  # Test with real example file (dry run)")
        print('  python3 tests/test_repost_detection_manual.py \\')
        print('      "/media/.../evalongoria_20251109_154548_story6.mp4" \\')
        print('      "evalongoria"')
        print()
        print("  # Test with actual downloads")
        print('  python3 tests/test_repost_detection_manual.py \\')
        print('      "/media/.../evalongoria_20251109_154548_story6.mp4" \\')
        print('      "evalongoria" \\')
        print('      --live')
        print()

        # Run dependency check and database check only
        deps_ok = test_dependencies()
        if deps_ok:
            test_database_tracking()
        return

    file_path = positionals[0]
    source_username = positionals[1] if len(positionals) >= 2 else "unknown"

    # Validate that the path names an existing regular file (a directory
    # would pass a plain existence check but cannot be analysed).
    if not os.path.isfile(file_path):
        print(f"\n❌ ERROR: File not found: {file_path}")
        return

    # Prerequisite: dependencies
    deps_ok = test_dependencies()
    if not deps_ok:
        print("\n⚠️  Cannot proceed with tests - missing dependencies")
        return

    # Test 1: OCR extraction
    username = test_ocr_extraction(file_path)

    # Test 2: Monitored check (only meaningful when a username was found)
    if username:
        test_monitored_check(username)

    # Test 3: Perceptual hash
    test_perceptual_hash(file_path)

    # Test 5: Database tracking (runs before Test 4, as in the original order)
    test_database_tracking()

    # Test 4: Full detection (only with --live, since it downloads content)
    if not dry_run:
        test_full_detection(file_path, source_username, dry_run=False)
    else:
        print("\n" + "=" * 70)
        print("SKIPPING FULL WORKFLOW TEST (DRY RUN)")
        print("=" * 70)
        print("To test full workflow with actual downloads, add --live flag")

    print("\n" + "=" * 70)
    print("TEST SUITE COMPLETE")
    print("=" * 70)


if __name__ == "__main__":
    main()