Files
media-downloader/modules/instaloader_module.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

1259 lines
56 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Instaloader Module for Instagram Downloads
Based on FastDL module architecture with detection safeguards
Bypasses Cloudflare by using Instagram directly
"""
from pathlib import Path
from datetime import datetime, timedelta
import os
import sys
import time
import random
import json
from typing import Optional, Dict
import pickle
from modules.base_module import LoggingMixin
from modules.instagram_utils import (
extract_instagram_media_id,
record_instagram_download,
is_instagram_downloaded
)
class InstaLoaderModule(LoggingMixin):
"""
Instagram downloader using Instaloader with safeguards
Features:
- Rate limiting to avoid detection
- Session persistence and rotation
- Database tracking to avoid re-downloads
- Automatic retry with exponential backoff
- Detection avoidance strategies
"""
def __init__(self,
             username: Optional[str] = None,
             password: Optional[str] = None,
             session_file: Optional[str] = None,
             totp_secret: Optional[str] = None,
             use_database: bool = True,
             log_callback=None,
             show_progress: bool = True,
             max_rate: int = 100,  # Max requests per hour
             unified_db=None,
             require_valid_session: bool = False):
    """
    Initialize the Instaloader module

    Args:
        username: Instagram username for login (optional)
        password: Instagram password for reauthorization (optional)
        session_file: Path to saved session file
        totp_secret: TOTP secret key for 2FA (optional)
        use_database: Track downloads in database
        log_callback: Callback for logging (tag, level, message)
        show_progress: Print progress messages
        max_rate: Maximum requests per hour (rate limiting)
        unified_db: Unified database instance
        require_valid_session: If True, skip downloads if session is invalid
    """
    # Initialize logging via mixin
    self._init_logger('Instagram', log_callback, default_module='Download')
    self.username = username
    self.password = password
    self.totp_secret = totp_secret
    self.session_file = session_file
    self.use_database = use_database
    self.show_progress = show_progress
    self.max_rate = max_rate
    self.unified_db = unified_db
    self.require_valid_session = require_valid_session
    self.session_is_valid = False  # Track session validity
    # FIX: initialize here so _download_posts/_download_stories cannot hit an
    # AttributeError when they read self.defer_database before download()
    # (which is where it was previously first assigned) has run.
    self.defer_database = False
    # Rate limiting state: timestamps of requests made in the last hour
    self.request_times = []
    self.last_request_time = 0
    # Session management - use script directory
    script_dir = Path(__file__).parent.parent  # Go up from modules/ to script root
    self.session_dir = script_dir / "sessions"
    self.session_dir.mkdir(parents=True, exist_ok=True)
    # Initialize Instaloader
    self.loader = None
    self._init_loader()
    # Debug: Check what credentials we have (without exposing sensitive data)
    self.log(f"Module initialized with username: {self.username}", "debug")
    self.log(f"Password provided: {self.password is not None}", "debug")
    # No separate database initialization needed - using unified database only
    # Initialize activity status manager for real-time updates
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)
    # Detection avoidance settings - increased to avoid Instagram detection
    # Based on GitHub issue #2391 recommendations
    self.min_delay = 5  # Minimum seconds between requests (increased from 3)
    self.max_delay = 15  # Maximum seconds between requests (increased from 10)
    self.error_delay = 120  # Delay after error (increased from 60)
    self.max_retries = 3
    self.download_batch_size = 10  # Download in smaller batches
    self.batch_delay = 30  # Delay between batches (seconds)
    self.pending_downloads = []  # Track downloads for deferred database recording
def _init_loader(self):
    """Initialize Instaloader with conservative, detection-avoiding settings.

    Installs instaloader on demand, constructs the Instaloader instance,
    and (when show_progress is enabled) swaps sys.stderr for a file-like
    adapter that routes instaloader's raw output through our logger.
    """
    try:
        import instaloader
    except ImportError:
        self.log("Installing instaloader...", "info")
        import subprocess
        # FIX: invoke pip via the running interpreter (sys.executable -m pip),
        # matching the install pattern used elsewhere in this module and
        # working even when a bare `pip` is absent from PATH or belongs to a
        # different interpreter.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "--quiet", "--break-system-packages", "instaloader"],
            capture_output=True,
            check=False
        )
        import instaloader

    # Suppress instaloader's direct output and redirect to our logger
    class LoggerAdapter:
        """File-like shim: forwards instaloader's stderr writes to our log."""

        def __init__(self, parent_log_func):
            self.parent_log = parent_log_func

        def write(self, message):
            if message.strip():
                # Filter and format instaloader messages
                msg = message.strip()
                if 'JSON Query' in msg or '401 Unauthorized' in msg or '403 Forbidden' in msg:
                    # Convert to our format
                    if '401 Unauthorized' in msg:
                        self.parent_log("Session authentication issue - retrying", "warning")
                    elif '403 Forbidden' in msg:
                        self.parent_log("Access forbidden - rate limited", "warning")
                elif 'Error when checking' in msg:
                    self.parent_log("Session validation failed", "debug")
                elif msg and not msg.startswith('['):
                    self.parent_log(msg, "debug")

        def flush(self):
            # No buffering; present only to satisfy the file-like protocol
            pass

    # Configure Instaloader with conservative settings
    self.loader = instaloader.Instaloader(
        quiet=True,  # Always quiet to suppress direct output
        download_videos=True,
        download_video_thumbnails=False,
        download_geotags=False,
        download_comments=False,
        save_metadata=True,  # Need JSON to get media IDs
        compress_json=True,  # Save space with compression
        post_metadata_txt_pattern="",  # Don't save txt files
        storyitem_metadata_txt_pattern="",
        max_connection_attempts=5,  # More retries
        request_timeout=300,
        # Don't treat 403 as fatal - Instagram returns this often
        fatal_status_codes=[429],  # Only stop on rate limit
        # Use a desktop user agent to avoid mobile restrictions
        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    )
    # Redirect stderr to capture instaloader errors; keep the original so a
    # caller can restore it later
    if self.show_progress:
        self._original_stderr = sys.stderr
        sys.stderr = LoggerAdapter(self.log)
    # Don't refresh session at startup - will do it on each download
    self.session_is_valid = False
def _rate_limit(self):
    """Throttle outgoing requests: enforce the hourly cap and insert a
    randomized gap between consecutive requests to look less bot-like."""
    now = time.time()
    # Sliding one-hour window: discard timestamps older than 3600s
    self.request_times = [t for t in self.request_times if now - t < 3600]
    # Hourly cap reached -> sleep until the oldest request ages out (plus jitter)
    if len(self.request_times) >= self.max_rate:
        oldest = min(self.request_times)
        pause = 3600 - (now - oldest) + random.uniform(5, 15)
        if pause > 0:
            self.log(f"Rate limit reached, waiting {pause:.0f} seconds", "warning")
            time.sleep(pause)
    # Randomized inter-request gap (delays doubled to reduce 403 responses)
    if self.last_request_time > 0:
        since_last = now - self.last_request_time
        target_gap = random.uniform(self.min_delay * 2, self.max_delay * 2)
        if since_last < target_gap:
            remaining = target_gap - since_last
            self.log(f"Waiting {remaining:.1f}s between requests", "debug")
            time.sleep(remaining)
    # Account for this request in the window
    self.request_times.append(now)
    self.last_request_time = now
def is_ready(self) -> bool:
    """Report readiness to download.

    The session is (re)validated at the start of every download, so this
    module is always considered ready.
    """
    return True
def _load_session(self):
    """Load a previously saved session (pickled cookies) into the loader.

    Tries, in order: the explicitly configured session file, then the
    per-user file in the script's sessions directory. Falls back to
    anonymous access when neither loads.

    Note: the loaded session is not validated here; validation happens
    after this returns.
    """
    session_loaded = False
    # 1. Explicitly configured session file (pickle format)
    if self.session_file:
        session_path = Path(self.session_file).expanduser()
        if session_path.exists():
            try:
                session_loaded = self._apply_pickle_session(
                    session_path, f"Session loaded from {self.session_file}")
            except Exception as e:
                self.log(f"Could not load session file: {e}", "warning")
    # 2. Per-user session saved in the script's sessions directory.
    # FIX: the original repeated this exact lookup twice (steps 2 and 3 built
    # the same session_dir / f"session-{username}" path); the duplicate
    # attempt has been removed.
    if not session_loaded and self.username:
        session_path = self.session_dir / f"session-{self.username}"
        if session_path.exists():
            try:
                session_loaded = self._apply_pickle_session(
                    session_path, f"Session loaded for {self.username}")
            except Exception as e:
                self.log(f"Could not load saved session: {e}", "warning")
    if not session_loaded:
        self.log("No session loaded - anonymous access only", "warning")
        self.log("Some features may be limited without login", "warning")

def _apply_pickle_session(self, session_path: Path, success_message: str) -> bool:
    """Apply cookies from a pickled session file to the active loader.

    Args:
        session_path: Pickle file containing 'sessionid' (and optionally
            'csrftoken').
        success_message: Log line emitted once the cookies are applied.
    Returns:
        True when the session cookies were applied.
    Raises:
        Any exception from unpickling or cookie application; the caller
        logs it and tries the next session source.
    """
    import pickle
    with open(session_path, 'rb') as f:
        session_data = pickle.load(f)
    # Set cookies directly on instaloader's underlying requests session
    self.loader.context._session.cookies.set(
        'sessionid', session_data['sessionid'], domain='.instagram.com', path='/')
    if session_data.get('csrftoken'):
        self.loader.context._session.cookies.set(
            'csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/')
    # Mark the context as logged-in under our configured username
    self.loader.context.username = self.username
    self.log(success_message, "success")
    return True
def reauthorize_session(self, force_new: bool = False) -> bool:
    """
    Reauthorize Instagram session using stored credentials.

    Flow: when a TOTP secret is configured, try the instaloader CLI first
    (it handles 2FA more reliably); on any failure fall back to the Python
    API login, which itself re-invokes the CLI if Instagram raises a
    two-factor challenge.

    Args:
        force_new: Force a completely new login even if session exists
    Returns:
        True if reauthorization successful, False otherwise
    """
    if not self.username or not self.password:
        self.log("Cannot reauthorize - no credentials available", "error")
        self.log("Please provide username and password in config", "info")
        return False
    # Always create fresh session for each download
    if self.totp_secret:
        self.log("Using CLI method for login with 2FA", "info")
        try:
            try:
                import pyotp
            except ImportError:
                # Best-effort auto-install of the TOTP dependency
                self.log("pyotp not installed, attempting to install...", "warning")
                import subprocess
                import sys
                subprocess.check_call([sys.executable, "-m", "pip", "install", "pyotp"])
                import pyotp
            import subprocess
            import pickle
            # Generate the current time-based 2FA code
            totp = pyotp.TOTP(self.totp_secret)
            two_factor_code = totp.now()
            # NOTE(review): this logs the live 2FA code in plain text
            self.log(f"Generated 2FA code: {two_factor_code}", "info")
            # Use instaloader CLI with the 2FA code
            # Use configured session file path or default
            if self.session_file:
                session_file = Path(self.session_file).expanduser()
                session_file.parent.mkdir(parents=True, exist_ok=True)
            else:
                session_file = self.session_dir / f"session-{self.username}"
            # Pass password as separate argument to avoid shell escaping issues
            cmd = [
                'instaloader',
                '--login', self.username,
                '--password', self.password,
                '--sessionfile', str(session_file)
            ]
            self.log("Using instaloader CLI for login...", "info")
            self.log(f"Debug - Command: instaloader --login {self.username} --password [HIDDEN] --sessionfile {session_file}", "debug")
            self.log(f"Debug - Password length being passed: {len(self.password)}", "debug")
            # Run with 2FA code piped via stdin (avoids shell=True security risk)
            self.log(f"Running command with 2FA code via stdin", "debug")
            result = subprocess.run(
                cmd,
                input=two_factor_code + '\n',
                capture_output=True,
                text=True,
                timeout=30
            )
            # Check if login was successful by looking for success messages and session file
            login_success = ("Logged in as" in result.stdout and
                             "Saved session to" in result.stdout and
                             session_file.exists())
            if login_success:
                self.log("Successfully logged in via CLI", "success")
                # Wait a moment for file to be fully written
                time.sleep(1)
                # Load the new session written by the CLI
                with open(session_file, 'rb') as f:
                    session_data = pickle.load(f)
                # Apply session cookies to our own loader instance
                self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/')
                if session_data.get('csrftoken'):
                    self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/')
                # Set username in context
                self.loader.context.username = self.username
                self.session_is_valid = True
                return True
            else:
                # Log details about the failure
                if not session_file.exists():
                    self.log("Session file was not created", "error")
                self.log(f"CLI stdout: {result.stdout[:300]}", "info")
                if result.stderr:
                    self.log(f"CLI stderr: {result.stderr[:200]}", "info")
                return False
        except Exception as e:
            self.log(f"CLI login error: {str(e)[:100]}", "error")
            # Fall back to Python API method below
            pass
    # Fallback to Python API login
    try:
        import instaloader
        import pickle
        # Clear existing session if force_new
        if force_new:
            self.log("Clearing existing session for fresh login", "info")
            # Clear all cookies properly to avoid duplicate sessionid issue
            try:
                self.loader.context._session.cookies.clear()
            except Exception:
                pass
            # Create a fresh loader instance to avoid cookie conflicts
            self.loader = instaloader.Instaloader(
                download_pictures=False,
                download_videos=False,
                download_video_thumbnails=False,
                compress_json=False,
                save_metadata=False,
                post_metadata_txt_pattern="",
                quiet=True,
                fatal_status_codes=[],
                max_connection_attempts=3
            )
        # Add delay before login attempt to avoid rate limiting
        import random
        delay = random.uniform(3, 5)
        self.log(f"Waiting {delay:.1f}s before login attempt", "debug")
        time.sleep(delay)
        # Attempt login via the Python API
        self.log(f"Logging in as {self.username}...", "info")
        try:
            self.loader.login(self.username, self.password)
            # Save the new session as a pickle of the relevant cookies
            # Use configured session file path or default
            if self.session_file:
                session_file = Path(self.session_file).expanduser()
                session_file.parent.mkdir(parents=True, exist_ok=True)
            else:
                session_file = self.session_dir / f"session-{self.username}"
            session_data = {
                'sessionid': self.loader.context._session.cookies.get('sessionid'),
                'csrftoken': self.loader.context._session.cookies.get('csrftoken'),
                'username': self.username,
                'timestamp': datetime.now().isoformat()
            }
            with open(session_file, 'wb') as f:
                pickle.dump(session_data, f)
            self.log(f"Session saved to {session_file}", "success")
            self.session_is_valid = True
            self.log("Successfully reauthorized session", "success")
            return True
        except instaloader.exceptions.BadCredentialsException:
            self.log("Invalid username or password", "error")
            self.log("Please check your Instagram credentials in the config file", "info")
            self.log("The password may have been changed or the account may be locked", "info")
            return False
        except instaloader.exceptions.TwoFactorAuthRequiredException:
            self.log("Two-factor authentication required", "info")
            # Use subprocess to call instaloader CLI which handles 2FA better
            if self.totp_secret:
                try:
                    try:
                        import pyotp
                    except ImportError:
                        self.log("pyotp not installed, attempting to install...", "warning")
                        import sys
                        # NOTE(review): `subprocess` is not imported in this scope
                        # until below; this line relies on the CLI branch above
                        # (which runs whenever totp_secret is set) having already
                        # bound it as a function local — confirm
                        subprocess.check_call([sys.executable, "-m", "pip", "install", "pyotp"])
                        import pyotp
                    import subprocess
                    # Generate 2FA code
                    totp = pyotp.TOTP(self.totp_secret)
                    two_factor_code = totp.now()
                    # NOTE(review): logs the live 2FA code in plain text
                    self.log(f"Generated 2FA code: {two_factor_code}", "info")
                    # Use instaloader CLI with the 2FA code
                    # Use configured session file path or default
                    if self.session_file:
                        session_file = Path(self.session_file).expanduser()
                        session_file.parent.mkdir(parents=True, exist_ok=True)
                    else:
                        session_file = self.session_dir / f"session-{self.username}"
                    # Pass password as separate argument to avoid shell escaping issues
                    cmd = [
                        'instaloader',
                        '--login', self.username,
                        '--password', self.password,
                        '--sessionfile', str(session_file)
                    ]
                    self.log("Using instaloader CLI for 2FA login...", "info")
                    # Run with 2FA code as input
                    result = subprocess.run(
                        cmd,
                        input=f"{two_factor_code}\n",
                        capture_output=True,
                        text=True,
                        timeout=30
                    )
                    # Check if login was successful by looking for success messages and session file
                    login_success = ("Logged in as" in result.stdout and
                                     "Saved session to" in result.stdout and
                                     session_file.exists())
                    if login_success:
                        self.log("Successfully logged in with 2FA via CLI", "success")
                        # Wait a moment for file to be fully written
                        time.sleep(1)
                        # Load the new session
                        import pickle
                        with open(session_file, 'rb') as f:
                            session_data = pickle.load(f)
                        # Apply session to our loader
                        self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/')
                        if session_data.get('csrftoken'):
                            self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/')
                        # Set username in context
                        self.loader.context.username = self.username
                        self.session_is_valid = True
                        return True
                    else:
                        # Log details about the failure
                        if not session_file.exists():
                            self.log("Session file was not created", "error")
                        else:
                            self.log("Login output unclear, treating as failure", "error")
                        self.log(f"CLI output: {result.stdout[:300]}", "debug")
                        return False
                except Exception as e:
                    self.log(f"2FA login failed: {str(e)[:100]}", "error")
                    return False
            else:
                self.log("No 2FA code available - login failed", "error")
                self.log("Options:", "info")
                self.log("1. Add 'totp_secret' to config with your 2FA secret key", "info")
                self.log(f"2. Create file: {self.session_dir}/2fa_code_USERNAME.txt with code", "info")
                self.log("3. Run interactively to enter code when prompted", "info")
                return False
    except instaloader.exceptions.ConnectionException as e:
        if "checkpoint" in str(e).lower():
            self.log("Instagram checkpoint required - please verify account in browser", "error")
        elif "429" in str(e):
            self.log("Rate limited during login - please wait before retrying", "error")
        else:
            self.log(f"Connection error during login: {str(e)[:100]}", "error")
        return False
    except Exception as e:
        self.log(f"Reauthorization failed: {str(e)[:100]}", "error")
        return False
def _is_already_downloaded(self, post_id: str) -> bool:
    """Return True when this post is already recorded, checking the canonical
    URL first and then the shared cross-module media-id lookup."""
    if not (self.use_database and self.unified_db):
        return False
    # Fast path: exact canonical post URL
    if self.unified_db.is_downloaded(f"https://www.instagram.com/p/{post_id}/"):
        return True
    # Shared lookup keeps detection consistent across Instagram modules
    return is_instagram_downloaded(self.unified_db, str(post_id))
def _record_download(self, post_id: str, username: str, content_type: str,
                     filename: str = None, post_date: datetime = None,
                     caption: str = None, likes: int = None, comments: int = None,
                     deferred: bool = False):
    """Record a successful download via the centralized helper.

    Args:
        deferred: If True, don't record to database now - add to pending_downloads list
            for later recording after file move is complete
    """
    canonical_url = f"https://www.instagram.com/p/{post_id}/"
    meta = {
        'username': username,
        'caption': None if not caption else caption[:500],
        'likes': likes,
        'comments': comments
    }
    if deferred:
        # Queue the record; the caller persists it once the file move is done.
        entry = {
            'media_id': str(post_id),
            'username': username,
            'filename': filename,
            'url': canonical_url,
            'post_date': None if post_date is None else post_date.isoformat(),
            'content_type': content_type,
            'metadata': meta
        }
        self.pending_downloads.append(entry)
        self.log(f"Deferred recording for {post_id}", "debug")
        return True
    if not (self.use_database and self.unified_db):
        return
    # Centralized helper normalizes the media_id for cross-module storage
    record_instagram_download(
        db=self.unified_db,
        media_id=str(post_id),
        username=username,
        content_type=content_type,
        filename=filename,
        url=canonical_url,
        post_date=post_date,
        method='instaloader',
        extra_metadata=meta
    )
def get_pending_downloads(self):
    """Return a shallow copy of the queued (deferred) download records."""
    return list(self.pending_downloads)
def clear_pending_downloads(self):
    """Reset the deferred-download queue.

    Call after the pending records have been persisted to the database.
    """
    # Rebinds a fresh list (rather than mutating in place), so copies handed
    # out earlier by get_pending_downloads() are unaffected either way.
    self.pending_downloads = []
def download(self, username: str, output_dir: str = "downloads",
             content_type: str = "posts", max_downloads: int = None,
             days_back: int = None, date_from: datetime = None,
             date_to: datetime = None, defer_database: bool = False) -> int:
    """
    Download content from Instagram user

    Args:
        username: Instagram username to download from
        output_dir: Directory to save downloads
        content_type: Type of content (posts, stories, reels, all)
        max_downloads: Maximum number to download
        days_back: Download content from last N days
        date_from: Start date for downloads
        date_to: End date for downloads
        defer_database: If True, don't record to database immediately - store in
            pending_downloads for later recording after file move is complete
    Returns:
        Number of items downloaded
    """
    self.defer_database = defer_database  # Store for use in _record_download
    # Refresh session before each download
    if self.username and self.password:
        self.log("Refreshing session for download...", "info")
        if not self.reauthorize_session():
            self.log("Failed to refresh session", "error")
            if self.require_valid_session:
                self.log(f"Skipping download for @{username} - session refresh failed and require_valid_session is True", "warning")
                return 0
            self.session_is_valid = False
        else:
            self.session_is_valid = True
            self.log(f"Session ready for @{username}", "success")
    elif self.require_valid_session:
        self.log(f"Skipping download for @{username} - no credentials and require_valid_session is True", "warning")
        return 0
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    # Setup date filtering
    if days_back and not date_from:
        date_from = datetime.now() - timedelta(days=days_back)
    if not date_to:
        date_to = datetime.now()
    self.log(f"Downloading {content_type} for @{username}", "info")
    if date_from:
        self.log(f"Date range: {date_from.strftime('%Y-%m-%d')} to {date_to.strftime('%Y-%m-%d')}", "info")
    # FIX: removed unused skipped_count / error_count locals (never read)
    downloaded_count = 0
    try:
        # Get profile with retry
        profile = self._get_profile_with_retry(username)
        if not profile:
            return 0
        # Download based on content type
        if content_type in ["posts", "all"]:
            downloaded_count += self._download_posts(profile, output_path, max_downloads,
                                                    date_from, date_to)
        if content_type in ["stories", "all"]:
            downloaded_count += self._download_stories(profile, output_path)
        if content_type in ["reels", "all"]:
            downloaded_count += self._download_reels(profile, output_path, max_downloads,
                                                    date_from, date_to)
    except Exception as e:
        self.log(f"Download error: {e}", "error")
        # Handle specific errors with targeted guidance
        if "429" in str(e) or "rate" in str(e).lower():
            self.log("Rate limited by Instagram! Try again later.", "error")
            self.log(f"Waiting {self.error_delay} seconds...", "warning")
            time.sleep(self.error_delay)
        elif "login" in str(e).lower():
            self.log("Login required for this content!", "error")
            self.log("Create a session file first (see documentation)", "info")
        elif "not found" in str(e).lower():
            self.log(f"User {username} not found or private", "error")
    self.log(f"Download complete: {downloaded_count} downloaded", "success")
    return downloaded_count
def _get_profile_with_retry(self, username: str):
    """Fetch an Instagram profile, retrying with linearly increasing backoff.

    Raises the last exception when all attempts fail; returns None only if
    max_retries is not positive.
    """
    import instaloader
    # Ensure loader is initialized
    if not self.loader:
        self._init_loader()
    attempt = 0
    while attempt < self.max_retries:
        try:
            self.log(f"Fetching profile: {username}", "info")
            # Re-create the loader if its context has gone missing
            if not hasattr(self.loader, 'context') or self.loader.context is None:
                self.log("Reinitializing loader context...", "debug")
                self._init_loader()
            profile = instaloader.Profile.from_username(self.loader.context, username)
            # Log profile info
            self.log(f"Profile found: {profile.full_name} ({profile.mediacount} posts)", "success")
            if profile.is_private and not profile.followed_by_viewer:
                self.log("Profile is private and not followed", "warning")
            return profile
        except Exception as e:
            # Guard clause: out of retries -> propagate the failure
            if attempt >= self.max_retries - 1:
                self.log(f"Failed to get profile after {self.max_retries} attempts", "error")
                raise
            wait = self.error_delay * (attempt + 1)
            self.log(f"Error getting profile (attempt {attempt + 1}): {e}", "warning")
            self.log(f"Retrying in {wait} seconds...", "info")
            time.sleep(wait)
        attempt += 1
    return None
def _download_posts(self, profile, output_path: Path, max_downloads: int,
                    date_from: datetime, date_to: datetime) -> int:
    """Download feed posts from a profile (newest first).

    Args:
        profile: instaloader Profile to iterate.
        output_path: Final destination directory for renamed files.
        max_downloads: Stop after this many successful downloads (None = no cap).
        date_from: Skip-and-stop boundary: posts older than this end iteration.
        date_to: Posts newer than this are skipped (iteration continues).
    Returns:
        Number of posts downloaded; 0 immediately if Instagram blocks the session.
    """
    import shutil  # FIX: hoisted out of the per-post loop body
    downloaded = 0
    skipped = 0
    self.log(f"Downloading posts...", "info")
    self.activity_manager.update_status("Checking posts")
    # FIX: default to False when download() hasn't set defer_database yet,
    # so a direct call cannot raise AttributeError
    defer = getattr(self, 'defer_database', False)
    try:
        posts = profile.get_posts()
        for post in posts:
            # Posts arrive newest-first, so the first too-old post ends the scan
            if date_from and post.date < date_from:
                self.log(f"Reached posts older than date range, stopping", "info")
                break
            if date_to and post.date > date_to:
                continue
            # Check if already downloaded
            media_id = str(post.mediaid)
            shortcode = post.shortcode
            if self._is_already_downloaded(media_id):
                self.log(f"Skipping already downloaded: {shortcode}", "debug")
                skipped += 1
                continue
            # Download post
            try:
                self.log(f"Downloading post {shortcode} from {post.date.strftime('%Y-%m-%d')}", "info")
                # Instaloader writes into a per-post temp directory
                temp_dir = output_path / f"temp_{shortcode}"
                temp_dir.mkdir(parents=True, exist_ok=True)
                self.loader.download_post(post, target=temp_dir)
                # Move and rename files to match FastDL format
                self._process_downloaded_files(temp_dir, output_path, post.owner_username, media_id, post.date)
                # Clean up temp directory
                shutil.rmtree(temp_dir, ignore_errors=True)
                # Record in database (or queue, when deferring)
                self._record_download(
                    post_id=media_id,
                    username=post.owner_username,
                    content_type="post",
                    post_date=post.date,
                    caption=post.caption[:500] if post.caption else None,
                    likes=post.likes,
                    comments=post.comments,
                    deferred=defer
                )
                downloaded += 1
                # Update status
                self.activity_manager.update_status(
                    "Downloading posts",
                    progress_current=downloaded,
                    progress_total=max_downloads
                )
                # Check max downloads
                if max_downloads and downloaded >= max_downloads:
                    self.log(f"Reached max downloads ({max_downloads})", "info")
                    break
                # Random delay to avoid detection
                self._smart_delay(downloaded)
            except Exception as e:
                self.log(f"Error downloading post {media_id}: {e}", "error")
                if "429" in str(e):
                    self.log("Rate limited! Stopping downloads.", "error")
                    break
    except Exception as e:
        error_msg = str(e)
        self.log(f"Error iterating posts: {e}", "error")
        # Check if Instagram is blocking us
        if "401" in error_msg or "Please wait a few minutes" in error_msg:
            self.log("Instagram is blocking requests - session may be compromised", "error")
            self.log("Aborting all downloads to prevent further issues", "error")
            # Mark session as invalid to prevent further attempts
            self.session_is_valid = False
            return 0
        elif "403" in error_msg or "forbidden" in error_msg.lower():
            self.log("Access forbidden - Instagram has blocked this session", "error")
            self.session_is_valid = False
            return 0
    self.log(f"Posts: {downloaded} downloaded, {skipped} skipped", "info")
    return downloaded
def _process_downloaded_files(self, temp_dir: Path, output_path: Path, username: str, fallback_id: str, post_date: datetime):
    """Process downloaded files to match FastDL naming and timestamps.

    Renames every media file in temp_dir to
    ``{username}_{YYYYmmdd_HHMMSS}_{media_id}{ext}`` (date shifted -4 hours),
    moves it into output_path, deletes hash-duplicate files, and stamps the
    remaining files' times with the post date.

    Args:
        temp_dir: Directory instaloader downloaded into (files are moved out).
        output_path: Final destination directory.
        username: Post owner, used as the filename prefix.
        fallback_id: Media id to use when none can be parsed from the JSON.
        post_date: Post timestamp used for the filename and file times.
    Returns:
        list: List of processed filenames, or empty list if no files processed
    """
    import shutil
    import re
    import json
    import lzma
    from datetime import timedelta
    processed_files = []
    # Format date for filename - subtract 4 hours for timezone adjustment
    # NOTE(review): fixed -4h offset looks like a US Eastern (EDT) assumption
    # and ignores DST — confirm
    adjusted_date_for_filename = post_date - timedelta(hours=4)
    date_str = adjusted_date_for_filename.strftime('%Y%m%d_%H%M%S')
    # Build a mapping of original filenames to media IDs from JSON
    media_id_map = {}
    # Load JSON file to get media IDs from URLs (instaloader may xz-compress it)
    json_files = list(temp_dir.glob('*.json.xz'))
    if not json_files:
        json_files = list(temp_dir.glob('*.json'))
    if json_files:
        try:
            json_file = json_files[0]
            if json_file.suffix == '.xz':
                with lzma.open(json_file, 'rt') as f:
                    data = json.load(f)
            else:
                with open(json_file, 'r') as f:
                    data = json.load(f)
            # Extract media IDs from URLs in carousel or single image
            if 'node' in data:
                node = data['node']
                # Check for carousel in iphone_struct
                if 'iphone_struct' in node and 'carousel_media' in node['iphone_struct']:
                    # Carousel post - each image has its own media ID
                    for idx, item in enumerate(node['iphone_struct']['carousel_media'], 1):
                        if 'image_versions2' in item and 'candidates' in item['image_versions2']:
                            url = item['image_versions2']['candidates'][0]['url']
                            # Extract media ID from the CDN URL's filename segment
                            parts = url.split('/')
                            for part in parts:
                                if '.jpg' in part or '.mp4' in part:
                                    filename = part.split('?')[0]
                                    # Remove extension and _n suffix
                                    media_id = filename.replace('.jpg', '').replace('.mp4', '').replace('_n', '')
                                    # Map the 1-based carousel index to its media ID
                                    media_id_map[str(idx)] = media_id
                                    break
                # Check for single image/video
                elif 'display_url' in node or ('iphone_struct' in node and 'image_versions2' in node['iphone_struct']):
                    # Single post
                    url = node.get('display_url', '')
                    if not url and 'iphone_struct' in node and 'image_versions2' in node['iphone_struct']:
                        url = node['iphone_struct']['image_versions2']['candidates'][0]['url']
                    if url:
                        parts = url.split('/')
                        for part in parts:
                            if '.jpg' in part or '.mp4' in part:
                                filename = part.split('?')[0]
                                media_id = filename.replace('.jpg', '').replace('.mp4', '').replace('_n', '')
                                media_id_map['single'] = media_id
                                break
        except Exception as e:
            # Non-fatal: fall back to fallback_id for every file below
            self.log(f"Could not extract media IDs from JSON: {e}", "debug")
    # Process all downloaded files
    for file_path in temp_dir.iterdir():
        if file_path.is_file():
            # Skip JSON metadata files
            if file_path.suffix.lower() in ['.json', '.xz', '.txt']:
                continue
            # Get file extension
            ext = file_path.suffix.lower()
            # Check if it's a multi-image post (has _1, _2, etc. in filename)
            match = re.search(r'_(\d+)\.(jpg|jpeg|png|mp4|mov)', file_path.name, re.IGNORECASE)
            if match:
                index = match.group(1)
                # Use the media ID for this specific carousel index
                media_id = media_id_map.get(index, fallback_id)
                new_filename = f"{username}_{date_str}_{media_id}{ext}"
            else:
                # Single image/video
                media_id = media_id_map.get('single', fallback_id)
                new_filename = f"{username}_{date_str}_{media_id}{ext}"
            # Move and rename file
            new_path = output_path / new_filename
            shutil.move(str(file_path), str(new_path))
            # Check for duplicate hash before finalizing (hash blacklist persists even if original deleted)
            file_hash = self.unified_db.get_file_hash(str(new_path)) if self.unified_db else None
            if file_hash:
                existing = self.unified_db.get_download_by_file_hash(file_hash)
                if existing and existing.get('file_path') and str(new_path) != existing.get('file_path'):
                    # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
                    self.log(f"⚠ Duplicate content detected (hash match): {new_filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                    # Delete the duplicate regardless of whether original file still exists
                    try:
                        new_path.unlink()
                        self.log(f"Deleted duplicate (hash blacklist): {new_filename}", "debug")
                        continue
                    except Exception as e:
                        # Deletion failed: fall through and keep the duplicate on disk
                        self.log(f"Failed to delete duplicate {new_filename}: {e}", "warning")
            # Set file timestamps to post date
            self._update_file_timestamp(new_path, post_date)
            # Add to processed files list
            processed_files.append(new_filename)
    return processed_files
def _update_file_timestamp(self, filepath: Path, post_date: datetime):
    """Stamp the file's access and modification times with the post date,
    shifted back 4 hours to match the filename's timezone adjustment."""
    import os
    from datetime import timedelta
    try:
        ts = (post_date - timedelta(hours=4)).timestamp()
        os.utime(filepath, (ts, ts))
        self.log(f"Updated timestamp for {filepath.name} (adjusted -4 hours)", "debug")
    except Exception as e:
        # Best-effort: a failed utime shouldn't abort the download pipeline
        self.log(f"Failed to update timestamp: {e}", "warning")
def _download_stories(self, profile, output_path: Path) -> int:
"""Download stories from profile"""
downloaded = 0
if not self.username:
self.log("Login required to download stories", "warning")
return 0
self.log(f"Downloading stories...", "info")
self.activity_manager.update_status("Checking stories")
try:
import instaloader
import shutil
# Get user ID for stories
user_id = profile.userid
# Download stories
for story in self.loader.get_stories([user_id]):
for item in story.get_items():
media_id = str(item.mediaid)
if self._is_already_downloaded(media_id):
self.log(f"Skipping already downloaded story: {media_id}", "debug")
continue
try:
self.log(f"Downloading story {media_id}", "info")
# Download story to temp dir
temp_dir = output_path / f"temp_story_{media_id}"
temp_dir.mkdir(parents=True, exist_ok=True)
self.loader.download_storyitem(item, target=temp_dir)
# Process and move files to match FastDL format
processed_files = self._process_downloaded_files(temp_dir, output_path, profile.username, media_id, item.date)
# Clean up temp directory
shutil.rmtree(temp_dir, ignore_errors=True)
# Only record in database if files were successfully processed
if processed_files:
# Get the first processed filename for database record
filename = processed_files[0] if isinstance(processed_files, list) else None
self._record_download(
post_id=media_id,
username=profile.username,
content_type="story",
filename=filename,
post_date=item.date,
deferred=self.defer_database
)
downloaded += 1
self.activity_manager.update_status(
"Downloading stories",
progress_current=downloaded,
progress_total=max_downloads
)
else:
self.log(f"No files processed for story {media_id}, not recording in database", "warning")
self._smart_delay()
except Exception as e:
self.log(f"Error downloading story {media_id}: {e}", "error")
except Exception as e:
self.log(f"Error downloading stories: {e}", "error")
if "login" in str(e).lower():
self.log("Stories require login!", "warning")
self.log(f"Stories: {downloaded} downloaded", "info")
return downloaded
def _download_reels(self, profile, output_path: Path, max_downloads: int,
date_from: datetime, date_to: datetime) -> int:
"""Download reels from profile"""
downloaded = 0
self.log(f"Downloading reels...", "info")
self.activity_manager.update_status("Checking reels")
try:
# Reels are part of posts, filter for videos
posts = profile.get_posts()
for post in posts:
# Check if it's a reel (video post)
if not post.is_video:
continue
# Check date range
if date_from and post.date < date_from:
break
if date_to and post.date > date_to:
continue
# Check if already downloaded
media_id = str(post.mediaid)
shortcode = post.shortcode
if self._is_already_downloaded(media_id):
self.log(f"Skipping already downloaded reel: {shortcode}", "debug")
continue
try:
self.log(f"Downloading reel {shortcode}", "info")
# Download reel to temp dir
temp_dir = output_path / f"temp_reel_{shortcode}"
temp_dir.mkdir(parents=True, exist_ok=True)
self.loader.download_post(post, target=temp_dir)
# Process and move files to match FastDL format
self._process_downloaded_files(temp_dir, output_path, post.owner_username, media_id, post.date)
# Clean up temp directory
import shutil
shutil.rmtree(temp_dir, ignore_errors=True)
# Record in database
self._record_download(
post_id=media_id,
username=post.owner_username,
content_type="reel",
post_date=post.date,
likes=post.likes,
comments=post.comments,
deferred=self.defer_database
)
downloaded += 1
# Update status
self.activity_manager.update_status(
"Downloading reels",
progress_current=downloaded,
progress_total=max_downloads
)
if max_downloads and downloaded >= max_downloads:
break
self._smart_delay()
except Exception as e:
self.log(f"Error downloading reel {media_id}: {e}", "error")
except Exception as e:
self.log(f"Error downloading reels: {e}", "error")
self.log(f"Reels: {downloaded} downloaded", "info")
return downloaded
def _smart_delay(self, batch_count=0):
"""Smart delay between downloads to avoid detection"""
# Random delay with exponential backoff if needed
base_delay = random.uniform(self.min_delay, self.max_delay)
# Add batch delay if we've downloaded a batch
if batch_count > 0 and batch_count % self.download_batch_size == 0:
self.log(f"Batch limit reached ({self.download_batch_size} items), taking a longer break", "info")
base_delay = self.batch_delay + random.uniform(0, 10)
# Add extra delay if we're downloading fast
elif len(self.request_times) > 10:
recent_requests = self.request_times[-10:]
avg_interval = (recent_requests[-1] - recent_requests[0]) / 9
if avg_interval < 5: # Too fast
base_delay += random.uniform(5, 10)
self.log("Slowing down to avoid detection", "debug")
time.sleep(base_delay)
def login(self, username: str, password: str = None) -> bool:
"""
Login to Instagram and save session
Args:
username: Instagram username
password: Instagram password (will prompt if not provided)
Returns:
True if login successful
"""
try:
if not password:
import getpass
password = getpass.getpass(f"Password for {username}: ")
self.log(f"Logging in as {username}...", "info")
self.loader.login(username, password)
# Save session
# Use configured session file path or default
if self.session_file:
session_file = Path(self.session_file).expanduser()
session_file.parent.mkdir(parents=True, exist_ok=True)
else:
session_file = self.session_dir / f"session-{username}"
self.loader.save_session_to_file(session_file)
self.log(f"Session saved to {session_file}", "success")
self.username = username
return True
except Exception as e:
self.log(f"Login failed: {e}", "error")
if "checkpoint" in str(e).lower():
self.log("Instagram requires verification (checkpoint)", "warning")
self.log("Complete verification in browser, then export session", "info")
elif "bad password" in str(e).lower():
self.log("Invalid username or password", "error")
elif "429" in str(e):
self.log("Too many login attempts, try again later", "error")
return False
def get_database_stats(self) -> Dict:
"""Get database statistics"""
if not self.use_database or not self.unified_db:
return {"enabled": False}
# Use unified database statistics
return self.unified_db.get_statistics(platform='instagram')
# Test function
def test_module():
    """Smoke-test the InstaLoader module with a tiny live download.

    Returns:
        True when at least one item was downloaded.
    """
    print("Testing InstaLoader Module")
    print("=" * 60)
    # BUGFIX: InstaLoaderModule.__init__ has no db_path parameter, so the
    # old `db_path=` keyword raised TypeError before the test could run;
    # database wiring is handled via the unified_db parameter instead.
    module = InstaLoaderModule(
        show_progress=True,
        use_database=True
    )
    # Limited live download so the smoke test stays small
    count = module.download(
        username="evalongoria",
        output_dir="/opt/temp/test/instagram/posts",
        content_type="posts",
        max_downloads=2,
        days_back=30
    )
    print(f"\nDownloaded {count} items")
    # Show stats
    stats = module.get_database_stats()
    print(f"\nDatabase stats:")
    print(f"  Total: {stats.get('total_downloads', 0)}")
    print(f"  By type: {stats.get('by_type', {})}")
    return count > 0
if __name__ == "__main__":
    # Run the smoke test and surface success as the process exit code
    raise SystemExit(0 if test_module() else 1)