461
modules/instagram_utils.py
Normal file
461
modules/instagram_utils.py
Normal file
@@ -0,0 +1,461 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Instagram Utilities Module
|
||||
|
||||
Shared utility functions for Instagram downloaders (imginn, fastdl, toolzu, instaloader).
|
||||
Centralizes common functionality like media ID extraction to avoid code duplication.
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, Set, Dict, Any
|
||||
|
||||
|
||||
def extract_instagram_media_id(filename_or_id: str) -> str:
    """Return the canonical Instagram media ID contained in a filename or ID string.

    Instagram image filenames follow the pattern
    ``{user_id}_{media_id}_{post_id}_n.ext`` where the media ID is a
    17-18 digit number beginning with ``18``. Video-story keys
    (``AQ...`` encoded strings) and short post codes are used verbatim.

    Args:
        filename_or_id: A filename such as
            ``'591164014_18551181784006538_2284814566270897032_n'``
            or a bare media ID / story key / shortcode.

    Returns:
        The 17-18 digit Instagram media ID when one can be extracted,
        otherwise the input string unchanged.

    Examples:
        >>> extract_instagram_media_id('591164014_18551181784006538_2284814566270897032_n')
        '18551181784006538'
        >>> extract_instagram_media_id('18551181784006538')
        '18551181784006538'
    """
    if not filename_or_id:
        return filename_or_id

    # Case 1: an underscore-delimited 18xxx... number inside a composite
    # filename. Underscore (or string edge) is used as the boundary because
    # \b does not treat '_' as a word break.
    embedded = re.search(r'(?:^|_)(18\d{15,17})(?:_|$)', filename_or_id)
    if embedded is not None:
        return embedded.group(1)

    # Case 2: the string already is a bare 17-18 digit media ID.
    if re.fullmatch(r'18\d{15,17}', filename_or_id):
        return filename_or_id

    # Case 3: story key ('AQ' prefix, long encoded blob) — used as-is.
    if filename_or_id.startswith('AQ') and len(filename_or_id) > 50:
        return filename_or_id

    # Case 4: short post code (e.g. 'DRkaDSFD-U2') — used as-is.
    if re.match(r'^[A-Za-z0-9_-]{10,15}$', filename_or_id):
        return filename_or_id

    # Nothing recognized: hand the caller back exactly what came in.
    return filename_or_id
|
||||
|
||||
|
||||
def extract_media_id_from_url(url: str) -> Optional[str]:
    """Pull the Instagram media ID out of a CDN URL.

    CDN filenames embed the media ID as the second numeric field, e.g.
    ``561378837_18538674661006538_479694548187839800_n.jpg`` — the
    middle number (``18538674661006538``) is the media ID.

    Args:
        url: Instagram CDN URL string.

    Returns:
        The media ID string, or None when the URL is empty or does not
        contain the expected ``num_MEDIAID_num_n.ext`` pattern.
    """
    if not url:
        return None

    # num_MEDIAID_num_n.<ext>; the media ID is the second capture group.
    found = re.search(r'(\d+)_(\d{17,19})_\d+_n\.(jpg|mp4|jpeg|png)', url)
    return found.group(2) if found else None
|
||||
|
||||
|
||||
def extract_media_ids_from_url(url: str) -> list:
    """Collect every Instagram media ID present in a URL.

    Companion to :func:`extract_media_id_from_url`, but returns all
    matches instead of just the first.

    Args:
        url: URL string that may contain Instagram media IDs.

    Returns:
        List of media ID strings (empty when none are found).
    """
    if not url:
        return []

    # Each findall hit is a (prefix_num, media_id, extension) tuple;
    # only the media ID (index 1) is of interest.
    hits = re.findall(r'(\d+)_(\d{17,19})_\d+_n\.(jpg|mp4|jpeg|png)', url)
    return [groups[1] for groups in hits]
|
||||
|
||||
|
||||
def extract_post_shortcode(url: str) -> Optional[str]:
    """Extract the Instagram shortcode from a post, reel, or IGTV URL.

    Handles URLs of the form:
        https://www.instagram.com/p/ABC123/
        https://www.instagram.com/reel/ABC123/   (also /reels/)
        https://www.instagram.com/tv/ABC123/

    Shortcodes use only the URL-safe base64 alphabet ([A-Za-z0-9_-]),
    so trailing query strings or fragments are never included in the
    result (the previous '[^/]+' pattern swallowed them, e.g.
    '/p/ABC?img_index=1' yielded 'ABC?img_index=1').

    Args:
        url: Instagram URL string.

    Returns:
        Shortcode string, or None if the URL is empty or contains no
        recognizable shortcode path segment.
    """
    if not url:
        return None

    # Restrict to the shortcode alphabet so matching stops at '/', '?',
    # or '#'; also accept reel/reels/tv paths in addition to /p/.
    match = re.search(r'/(?:p|reels?|tv)/([A-Za-z0-9_-]+)', url)
    if match:
        return match.group(1)

    return None
|
||||
|
||||
|
||||
def media_id_to_shortcode(media_id: str) -> str:
    """Convert a numeric Instagram media ID to its shortcode form.

    The conversion is a base-64 encoding using Instagram's URL-safe
    alphabet (A-Z, a-z, 0-9, '-', '_').

    Args:
        media_id: Numeric media ID string.

    Returns:
        The shortcode string; if *media_id* is not a valid integer it is
        returned unchanged, and an ID of 0 encodes to 'A'.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'

    try:
        value = int(media_id)
    except (ValueError, TypeError):
        # Non-numeric input (e.g. already a shortcode) passes through.
        return media_id

    # Repeated divmod produces base-64 digits least-significant first;
    # reverse at the end instead of prepending inside the loop.
    digits = []
    while value > 0:
        value, remainder = divmod(value, 64)
        digits.append(alphabet[remainder])

    return ''.join(reversed(digits)) or 'A'
|
||||
|
||||
|
||||
def scan_existing_files_for_media_ids(output_dir: Path, profile_name: str = None,
                                      min_file_size: int = 0, recursive: bool = True) -> Set[str]:
    """Collect media IDs from files already on disk, for duplicate detection.

    Walks image/video files under *output_dir* and records, for each file,
    both the full media-ID portion of its name and (when different) the
    normalized 17-18 digit Instagram media ID.

    Args:
        output_dir: Directory to scan for existing files.
        profile_name: Optional profile name; files whose first filename
            segment differs are skipped.
        min_file_size: Skip files smaller than this many bytes (treated
            as corrupted/incomplete downloads).
        recursive: Search subdirectories with rglob when True; only the
            top level with glob when False.

    Returns:
        Set of media ID strings (full and normalized forms) found.
    """
    found: Set[str] = set()

    if not output_dir.exists():
        return found

    globber = output_dir.rglob if recursive else output_dir.glob

    for ext_pattern in ["*.jpg", "*.jpeg", "*.png", "*.heic", "*.mp4", "*.mov"]:
        for media_file in globber(ext_pattern):
            # Enforce the minimum-size filter; unreadable files are skipped.
            if min_file_size > 0:
                try:
                    too_small = media_file.stat().st_size < min_file_size
                except OSError:
                    continue
                if too_small:
                    continue

            stem = media_file.stem

            # Expected layout: profile_YYYYMMDD_HHMMSS_<media id...>;
            # maxsplit=3 keeps everything after the timestamp intact.
            pieces = stem.split('_', 3)

            if len(pieces) >= 4:
                # Respect the optional profile filter on the first segment.
                if profile_name and pieces[0] != profile_name:
                    continue
                full_id = pieces[3]
            elif len(pieces) > 1:
                full_id = pieces[-1]
            else:
                full_id = stem

            if full_id:
                found.add(full_id)

                # Record the normalized 17-18 digit ID as well, so lookups
                # by either form succeed.
                canonical = extract_instagram_media_id(full_id)
                if canonical and canonical != full_id:
                    found.add(canonical)

    return found
|
||||
|
||||
|
||||
def parse_instagram_filename(filename: str) -> dict:
    """Split an Instagram filename into its named components.

    Args:
        filename: Filename such as
            'evalongoria_20251205_120406_591164014_18551181784006538_2284814566270897032_n_story1.jpg'

    Returns:
        Dictionary with keys (each None when absent):
            username      -- leading profile segment
            date          -- YYYYMMDD string
            time          -- HHMMSS string
            media_id_full -- full ID portion after the timestamp
            media_id      -- normalized 17-18 digit Instagram media ID
            suffix        -- trailing marker such as 'story1'
            extension     -- lowercased file extension
    """
    parsed = dict.fromkeys(
        ('username', 'date', 'time', 'media_id_full', 'media_id', 'suffix', 'extension'))

    if not filename:
        return parsed

    as_path = Path(filename)
    parsed['extension'] = as_path.suffix.lower() if as_path.suffix else None

    segments = as_path.stem.split('_')

    # Anything with fewer than four segments cannot carry the
    # profile/date/time/id layout — only the extension is reported.
    if len(segments) < 4:
        return parsed

    parsed['username'] = segments[0]

    # Accept the date/time fields only when they look like YYYYMMDD / HHMMSS.
    maybe_date, maybe_time = segments[1], segments[2]
    if len(maybe_date) == 8 and maybe_date.isdigit():
        parsed['date'] = maybe_date
    if len(maybe_time) == 6 and maybe_time.isdigit():
        parsed['time'] = maybe_time

    # Everything past the timestamp is the media ID, possibly with a suffix.
    full_id = '_'.join(segments[3:])
    parsed['media_id_full'] = full_id

    # Peel off a trailing '_storyN' marker if present.
    if '_story' in full_id:
        base, tail = full_id.rsplit('_story', 1)
        parsed['media_id_full'] = base
        parsed['suffix'] = f'story{tail}'

    parsed['media_id'] = extract_instagram_media_id(parsed['media_id_full'])

    return parsed
|
||||
|
||||
|
||||
def record_instagram_download(db, media_id: str, username: str, content_type: str,
                              filename: str, url: str = None, download_url: str = None,
                              post_date: datetime = None, file_path: str = None,
                              method: str = None, extra_metadata: Dict[str, Any] = None) -> bool:
    """Record an Instagram download in the database with a normalized media_id.

    Central recording path shared by every Instagram downloader module
    (imginn, fastdl, toolzu, instaloader); normalizing the media_id here
    keeps cross-module duplicate detection consistent.

    Args:
        db: Database instance (UnifiedDatabase, or an adapter exposing
            record_download / mark_downloaded).
        media_id: Media ID; normalized automatically before storage.
        username: Instagram username.
        content_type: Content category (posts, stories, reels, highlights).
        filename: Filename of the downloaded file.
        url: Original Instagram URL (e.g. https://instagram.com/p/ABC123/).
        download_url: Direct CDN download URL.
        post_date: Post date/time.
        file_path: Full on-disk path of the file.
        method: Download method name (imginn, fastdl, toolzu, instaloader).
        extra_metadata: Additional metadata merged into the stored record.

    Returns:
        True when the record was written, False otherwise.
    """
    if not db:
        return False

    # Canonical ID for cross-module duplicate detection.
    normalized_media_id = extract_instagram_media_id(media_id) if media_id else media_id

    # Assemble metadata; keep the pre-normalization ID only when it differs.
    metadata = {'media_id': normalized_media_id}
    if media_id != normalized_media_id:
        metadata['original_media_id'] = media_id
    if extra_metadata:
        metadata.update(extra_metadata)
    metadata = {key: value for key, value in metadata.items() if value is not None}

    # Database URL: prefer the real URLs, else a synthetic instagram:// key.
    db_url = url or download_url or f"instagram://{normalized_media_id}"

    # Hashing is best-effort; a missing helper or unreadable file is ignored.
    file_hash = None
    if file_path:
        try:
            from modules.unified_database import UnifiedDatabase
            file_hash = UnifiedDatabase.get_file_hash(file_path)
        except Exception:
            pass

    try:
        if hasattr(db, 'record_download'):
            # Native UnifiedDatabase-style API.
            return db.record_download(
                url=db_url,
                platform='instagram',
                source=username,
                content_type=content_type,
                filename=filename,
                file_path=file_path,
                file_hash=file_hash,
                post_date=post_date,
                metadata=metadata,
                method=method
            )
        elif hasattr(db, 'mark_downloaded'):
            # Adapter-style fallback API.
            return db.mark_downloaded(
                username=username,
                url=db_url,
                filename=filename,
                post_date=post_date,
                metadata=metadata,
                file_path=file_path,
                content_type=content_type
            )
        else:
            return False
    except Exception:
        # Recording is best-effort: a failed write reports False rather
        # than aborting the download pipeline.
        return False
|
||||
|
||||
|
||||
def is_instagram_downloaded(db, media_id: str, username: str = None) -> bool:
    """Check if Instagram content is already downloaded by media_id.

    Checks for both the original and normalized media_id so that
    cross-module duplicate detection works regardless of which form a
    downloader recorded. Also consults the recycle bin so that content a
    user deliberately deleted is not re-downloaded.

    NOTE(review): *username* is accepted but never used in the body —
    the check is not scoped to a profile. Confirm whether callers expect
    per-user scoping before relying on it.

    Args:
        db: Database instance (UnifiedDatabase or adapter)
        media_id: The media ID to check (will check both original and normalized)
        username: Optional username to scope the check (currently unused)

    Returns:
        True if already downloaded, False otherwise (including on any
        database error — the broad except below deliberately treats
        failures as "not downloaded").
    """
    if not db or not media_id:
        return False

    # Normalize the media_id (17-18 digit form when extractable).
    normalized_media_id = extract_instagram_media_id(media_id)

    # Check if this looks like a shortcode (10-15 alphanumeric chars, no 18xxx pattern).
    # A shortcode survives normalization unchanged, so all three conditions hold.
    is_shortcode = (normalized_media_id == media_id and
                    re.match(r'^[A-Za-z0-9_-]{10,15}$', media_id) and
                    not re.match(r'^18\d{15,17}$', media_id))

    try:
        # Check if db has get_connection (UnifiedDatabase) - query directly
        if hasattr(db, 'get_connection'):
            with db.get_connection() as conn:
                cursor = conn.cursor()
                # Check both normalized and original media_id.
                # Also verify file_path is set (download was actually completed);
                # rows without a path are treated as incomplete attempts.
                if normalized_media_id != media_id:
                    cursor.execute('''
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND (media_id = ? OR media_id = ?)
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    ''', (normalized_media_id, media_id))
                else:
                    cursor.execute('''
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND media_id = ?
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    ''', (normalized_media_id,))
                if cursor.fetchone() is not None:
                    return True

                # For shortcodes, also check the metadata JSON column.
                # NOTE(review): this LIKE match assumes the stored JSON is
                # serialized with a space after the colon ('"shortcode": "X"')
                # — verify against the writer's json.dumps settings.
                if is_shortcode:
                    cursor.execute('''
                        SELECT 1 FROM downloads
                        WHERE platform = 'instagram'
                        AND metadata LIKE ?
                        AND file_path IS NOT NULL AND file_path != ''
                        LIMIT 1
                    ''', (f'%"shortcode": "{media_id}"%',))
                    if cursor.fetchone() is not None:
                        return True

                # Check recycle bin — files previously downloaded then deleted
                # should not be re-downloaded.
                cursor.execute('''
                    SELECT 1 FROM recycle_bin
                    WHERE original_filename LIKE ?
                    LIMIT 1
                ''', (f'%{normalized_media_id}%',))
                if cursor.fetchone() is not None:
                    return True

                return False

        # Fallback for adapters with is_already_downloaded method
        elif hasattr(db, 'is_already_downloaded'):
            if db.is_already_downloaded(normalized_media_id):
                return True
            # Also check original if different
            if normalized_media_id != media_id and db.is_already_downloaded(media_id):
                return True

        return False
    except Exception:
        # Any database/connection error is treated as "not downloaded";
        # callers will simply attempt the download again.
        return False
|
||||
Reference in New Issue
Block a user