Files
media-downloader/modules/instagram_utils.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

462 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Instagram Utilities Module
Shared utility functions for Instagram downloaders (imginn, fastdl, toolzu, instaloader).
Centralizes common functionality like media ID extraction to avoid code duplication.
"""
import re
from datetime import datetime
from pathlib import Path
from typing import Optional, Set, Dict, Any
def extract_instagram_media_id(filename_or_id: str) -> str:
    """Extract the actual Instagram media ID from a filename or ID string.

    Instagram image filenames follow the pattern:
        {user_id}_{media_id}_{post_id}_n.ext
    where media_id is a 17-19 digit number starting with "18".

    For video stories with AQ... format (story keys) and for short post
    codes (e.g. 'DRkaDSFD-U2'), the whole string is used as the media ID.

    Args:
        filename_or_id: A filename like
            '591164014_18551181784006538_2284814566270897032_n'
            or just a media ID string.

    Returns:
        The extracted Instagram media ID (the "18..." numeric segment) or the
        original string if no pattern matches.

    Examples:
        >>> extract_instagram_media_id('591164014_18551181784006538_2284814566270897032_n')
        '18551181784006538'
        >>> extract_instagram_media_id('18551181784006538')
        '18551181784006538'
    """
    if not filename_or_id:
        return filename_or_id
    # Numeric Instagram media IDs start with "18" and are delimited by
    # underscores or the start/end of the string.  \b is deliberately not
    # used because it does not treat '_' as a word boundary.
    match = re.search(r'(?:^|_)(18\d{15,17})(?:_|$)', filename_or_id)
    if match:
        return match.group(1)
    # A bare numeric ID starting with "18" is already covered above (the
    # ^/$ alternatives act as boundaries), so no separate full-match check
    # is needed.  Every other recognized format -- AQ... story keys and
    # 10-15 character shortcodes -- as well as any unrecognized string is
    # returned unchanged.
    return filename_or_id
def extract_media_id_from_url(url: str) -> Optional[str]:
    """Extract an Instagram media ID from a CDN URL.

    CDN URLs embed the media ID as the second number of a triple, e.g.
    561378837_18538674661006538_479694548187839800_n.jpg -- the
    17-19 digit middle number is the Instagram media ID.

    Args:
        url: Instagram CDN URL string.

    Returns:
        The media ID string, or None when the URL is empty or does not
        contain the expected pattern.
    """
    if not url:
        return None
    # Layout: {number}_{media_id}_{number}_n.{ext}
    found = re.search(r'(\d+)_(\d{17,19})_\d+_n\.(jpg|mp4|jpeg|png)', url)
    return found.group(2) if found else None
def extract_media_ids_from_url(url: str) -> list:
    """Extract every Instagram media ID present in a URL.

    Like extract_media_id_from_url, but collects all occurrences instead of
    only the first.

    Args:
        url: URL string that may contain Instagram media IDs.

    Returns:
        List of media ID strings, in order of appearance (empty when none).
    """
    if not url:
        return []
    # Each findall tuple is (leading number, media ID, extension);
    # index 1 is the media ID we care about.
    hits = re.findall(r'(\d+)_(\d{17,19})_\d+_n\.(jpg|mp4|jpeg|png)', url)
    return [hit[1] for hit in hits]
def extract_post_shortcode(url: str) -> Optional[str]:
    """Extract an Instagram post shortcode from a URL.

    Args:
        url: Instagram URL like https://www.instagram.com/p/ABC123/

    Returns:
        Shortcode string or None if not found.
    """
    if not url:
        return None
    # Shortcodes use the URL-safe base64 alphabet.  Matching that alphabet
    # explicitly (instead of the previous greedy [^/]+) prevents query
    # strings/fragments from leaking into the result for URLs without a
    # trailing slash, e.g. .../p/ABC123?igsh=xyz -> 'ABC123'.
    match = re.search(r'/p/([A-Za-z0-9_-]+)', url)
    if match:
        return match.group(1)
    return None
def media_id_to_shortcode(media_id: str) -> str:
    """Convert a numeric Instagram media ID to its shortcode form.

    The shortcode is the base-64 representation of the media ID using
    Instagram's URL-safe alphabet (A-Z, a-z, 0-9, '-', '_').

    Args:
        media_id: Numeric media ID string.

    Returns:
        The shortcode string; non-numeric input is returned unchanged, and
        values <= 0 yield 'A' (the zero digit).
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
    try:
        value = int(media_id)
    except (ValueError, TypeError):
        # Not a number at all -- pass it through untouched.
        return media_id
    digits = []
    while value > 0:
        value, rem = divmod(value, 64)
        digits.append(alphabet[rem])
    # digits were collected least-significant first; reverse for the result.
    return ''.join(reversed(digits)) if digits else 'A'
def scan_existing_files_for_media_ids(output_dir: Path, profile_name: str = None,
                                      min_file_size: int = 0, recursive: bool = True) -> Set[str]:
    """Scan existing files and extract media IDs for duplicate detection.

    Walks image and video files under the output directory and collects both
    the full media-ID string from each filename and its normalized Instagram
    media ID (the "18..." numeric segment), so either form can be matched.

    Args:
        output_dir: Directory to scan for existing files.
        profile_name: Optional profile name; files whose first filename
            segment differs are skipped (only applied to the 4-part format).
        min_file_size: Minimum file size in bytes; smaller files are skipped
            as likely corrupted/incomplete.
        recursive: If True, search subdirectories (rglob), otherwise only the
            top level (glob).

    Returns:
        Set of media IDs (both full and normalized) found in existing files.
    """
    found: Set[str] = set()
    if not output_dir.exists():
        return found
    globber = output_dir.rglob if recursive else output_dir.glob
    for ext_pattern in ("*.jpg", "*.jpeg", "*.png", "*.heic", "*.mp4", "*.mov"):
        for path in globber(ext_pattern):
            if min_file_size > 0:
                # Unstat-able files are skipped the same as undersized ones.
                try:
                    size = path.stat().st_size
                except OSError:
                    continue
                if size < min_file_size:
                    continue
            stem = path.stem
            # Expected layout: profile_YYYYMMDD_HHMMSS_mediaid
            # (maxsplit=3 keeps any underscores inside the media ID intact).
            pieces = stem.split('_', 3)
            if len(pieces) >= 4:
                if profile_name and pieces[0] != profile_name:
                    continue
                full_id = pieces[3]
            elif len(pieces) > 1:
                full_id = pieces[-1]
            else:
                full_id = stem
            if not full_id:
                continue
            found.add(full_id)
            # Also record the normalized numeric ID when it differs, so
            # cross-module duplicate checks match either form.
            normalized = extract_instagram_media_id(full_id)
            if normalized and normalized != full_id:
                found.add(normalized)
    return found
def parse_instagram_filename(filename: str) -> dict:
    """Parse an Instagram filename into its components.

    Args:
        filename: Filename like
            'evalongoria_20251205_120406_591164014_18551181784006538_2284814566270897032_n_story1.jpg'

    Returns:
        Dictionary with parsed components:
        - username: str or None
        - date: str or None (YYYYMMDD format)
        - time: str or None (HHMMSS format)
        - media_id_full: str or None (full ID after date/time)
        - media_id: str or None (normalized "18..." Instagram media ID)
        - suffix: str or None (e.g., 'story1')
        - extension: str or None
    """
    parsed = {
        'username': None,
        'date': None,
        'time': None,
        'media_id_full': None,
        'media_id': None,
        'suffix': None,
        'extension': None,
    }
    if not filename:
        return parsed
    path = Path(filename)
    if path.suffix:
        parsed['extension'] = path.suffix.lower()
    segments = path.stem.split('_')
    # Anything shorter than profile_date_time_id cannot be decomposed.
    if len(segments) < 4:
        return parsed
    parsed['username'] = segments[0]
    # Date/time segments are only accepted when they look like YYYYMMDD/HHMMSS.
    if len(segments[1]) == 8 and segments[1].isdigit():
        parsed['date'] = segments[1]
    if len(segments[2]) == 6 and segments[2].isdigit():
        parsed['time'] = segments[2]
    # Everything after the date/time slots is the media ID (maybe + suffix).
    full_id = '_'.join(segments[3:])
    parsed['media_id_full'] = full_id
    if '_story' in full_id:
        id_part, story_num = full_id.rsplit('_story', 1)
        parsed['media_id_full'] = id_part
        parsed['suffix'] = f'story{story_num}'
    parsed['media_id'] = extract_instagram_media_id(parsed['media_id_full'])
    return parsed
def record_instagram_download(db, media_id: str, username: str, content_type: str,
                              filename: str, url: str = None, download_url: str = None,
                              post_date: datetime = None, file_path: str = None,
                              method: str = None, extra_metadata: Dict[str, Any] = None) -> bool:
    """Record an Instagram download in the database with a normalized media_id.

    Central recording helper shared by all Instagram downloader modules
    (imginn, fastdl, toolzu, instaloader); it always normalizes the media_id
    so duplicate detection works across modules.

    Args:
        db: Database instance (UnifiedDatabase or adapter with a
            record_download / mark_downloaded method).
        media_id: The media ID (normalized automatically).
        username: Instagram username.
        content_type: Type of content (posts, stories, reels, highlights).
        filename: Filename of the downloaded file.
        url: Original Instagram URL (e.g., https://instagram.com/p/ABC123/).
        download_url: Direct download (CDN) URL.
        post_date: Post date/time.
        file_path: Full file path on disk.
        method: Download method (imginn, fastdl, toolzu, instaloader).
        extra_metadata: Additional metadata to merge in.

    Returns:
        True if successfully recorded, False otherwise.
    """
    if not db:
        return False
    # Normalize for consistent cross-module duplicate detection.
    normalized = extract_instagram_media_id(media_id) if media_id else media_id
    meta = {'media_id': normalized}
    if media_id != normalized:
        # Keep the raw ID around when normalization changed it.
        meta['original_media_id'] = media_id
    if extra_metadata:
        meta.update(extra_metadata)
    # Drop None values so the stored metadata stays compact.
    meta = {key: val for key, val in meta.items() if val is not None}
    # Prefer the page URL, then the CDN URL, then a synthetic placeholder.
    record_url = url or download_url or f"instagram://{normalized}"
    # Hash the file when we have a path; failures are best-effort only.
    file_hash = None
    if file_path:
        try:
            from modules.unified_database import UnifiedDatabase
            file_hash = UnifiedDatabase.get_file_hash(file_path)
        except Exception:
            pass
    try:
        if hasattr(db, 'record_download'):
            # UnifiedDatabase-style API.
            return db.record_download(
                url=record_url,
                platform='instagram',
                source=username,
                content_type=content_type,
                filename=filename,
                file_path=file_path,
                file_hash=file_hash,
                post_date=post_date,
                metadata=meta,
                method=method
            )
        if hasattr(db, 'mark_downloaded'):
            # Adapter-style fallback API.
            return db.mark_downloaded(
                username=username,
                url=record_url,
                filename=filename,
                post_date=post_date,
                metadata=meta,
                file_path=file_path,
                content_type=content_type
            )
        return False
    except Exception:
        # Recording is best-effort; a DB error is reported as failure.
        return False
def is_instagram_downloaded(db, media_id: str, username: str = None) -> bool:
    """Check if Instagram content is already downloaded by media_id.

    Checks for both the original and normalized media_id to ensure
    cross-module duplicate detection works correctly.

    Args:
        db: Database instance (UnifiedDatabase or adapter)
        media_id: The media ID to check (will check both original and normalized)
        username: Optional username to scope the check.
            NOTE(review): currently unused by both lookup paths below; kept
            for interface compatibility.

    Returns:
        True if already downloaded, False otherwise.
        NOTE(review): when db has neither get_connection nor
        is_already_downloaded, the function falls through and implicitly
        returns None (falsy) rather than False -- confirm callers only test
        truthiness.
    """
    if not db or not media_id:
        return False
    # Normalize the media_id (extracts the "18..." numeric segment when
    # present; otherwise returns the input unchanged).
    normalized_media_id = extract_instagram_media_id(media_id)
    # Check if this looks like a shortcode (10-15 alphanumeric chars, no 18xxx
    # pattern).  Shortcodes pass through normalization unchanged, hence the
    # equality test.
    is_shortcode = (normalized_media_id == media_id and
                    re.match(r'^[A-Za-z0-9_-]{10,15}$', media_id) and
                    not re.match(r'^18\d{15,17}$', media_id))
    try:
        # Check if db has get_connection (UnifiedDatabase) - query directly
        if hasattr(db, 'get_connection'):
            with db.get_connection() as conn:
                cursor = conn.cursor()
                # Check both normalized and original media_id.
                # Also verify file_path is set (download was actually completed).
                if normalized_media_id != media_id:
                    cursor.execute('''
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND (media_id = ? OR media_id = ?)
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    ''', (normalized_media_id, media_id))
                else:
                    cursor.execute('''
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND media_id = ?
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    ''', (normalized_media_id,))
                if cursor.fetchone() is not None:
                    return True
                # For shortcodes, also check the metadata JSON column.
                # NOTE(review): this LIKE assumes the serialized metadata
                # contains exactly '"shortcode": "<code>"' (space after the
                # colon) -- verify against the writer's JSON formatting.
                if is_shortcode:
                    cursor.execute('''
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND metadata LIKE ?
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    ''', (f'%"shortcode": "{media_id}"%',))
                    if cursor.fetchone() is not None:
                        return True
                # Check recycle bin — files previously downloaded then deleted
                # should not be re-downloaded
                cursor.execute('''
                    SELECT 1 FROM recycle_bin
                    WHERE original_filename LIKE ?
                    LIMIT 1
                ''', (f'%{normalized_media_id}%',))
                if cursor.fetchone() is not None:
                    return True
                return False
        # Fallback for adapters with is_already_downloaded method
        elif hasattr(db, 'is_already_downloaded'):
            if db.is_already_downloaded(normalized_media_id):
                return True
            # Also check original if different
            if normalized_media_id != media_id and db.is_already_downloaded(media_id):
                return True
            return False
    except Exception:
        # Best-effort check: any DB error is treated as "not downloaded".
        return False