#!/usr/bin/env python3 """ Instaloader Module for Instagram Downloads Based on FastDL module architecture with detection safeguards Bypasses Cloudflare by using Instagram directly """ from pathlib import Path from datetime import datetime, timedelta import os import sys import time import random import json from typing import Optional, Dict import pickle from modules.base_module import LoggingMixin from modules.instagram_utils import ( extract_instagram_media_id, record_instagram_download, is_instagram_downloaded ) class InstaLoaderModule(LoggingMixin): """ Instagram downloader using Instaloader with safeguards Features: - Rate limiting to avoid detection - Session persistence and rotation - Database tracking to avoid re-downloads - Automatic retry with exponential backoff - Detection avoidance strategies """ def __init__(self, username: Optional[str] = None, password: Optional[str] = None, session_file: Optional[str] = None, totp_secret: Optional[str] = None, use_database: bool = True, log_callback=None, show_progress: bool = True, max_rate: int = 100, # Max requests per hour unified_db=None, require_valid_session: bool = False): """ Initialize the Instaloader module Args: username: Instagram username for login (optional) password: Instagram password for reauthorization (optional) session_file: Path to saved session file totp_secret: TOTP secret key for 2FA (optional) use_database: Track downloads in database log_callback: Callback for logging (tag, level, message) show_progress: Print progress messages max_rate: Maximum requests per hour (rate limiting) unified_db: Unified database instance require_valid_session: If True, skip downloads if session is invalid """ # Initialize logging via mixin self._init_logger('Instagram', log_callback, default_module='Download') self.username = username self.password = password self.totp_secret = totp_secret self.session_file = session_file self.use_database = use_database self.show_progress = show_progress self.max_rate = 
max_rate self.unified_db = unified_db self.require_valid_session = require_valid_session self.session_is_valid = False # Track session validity # Rate limiting self.request_times = [] self.last_request_time = 0 # Session management - use script directory script_dir = Path(__file__).parent.parent # Go up from modules/ to script root self.session_dir = script_dir / "sessions" self.session_dir.mkdir(parents=True, exist_ok=True) # Initialize Instaloader self.loader = None self._init_loader() # Debug: Check what credentials we have (without exposing sensitive data) self.log(f"Module initialized with username: {self.username}", "debug") self.log(f"Password provided: {self.password is not None}", "debug") # No separate database initialization needed - using unified database only # Initialize activity status manager for real-time updates from modules.activity_status import get_activity_manager self.activity_manager = get_activity_manager(unified_db) # Detection avoidance settings - increased to avoid Instagram detection # Based on GitHub issue #2391 recommendations self.min_delay = 5 # Minimum seconds between requests (increased from 3) self.max_delay = 15 # Maximum seconds between requests (increased from 10) self.error_delay = 120 # Delay after error (increased from 60) self.max_retries = 3 self.download_batch_size = 10 # Download in smaller batches self.batch_delay = 30 # Delay between batches (seconds) self.pending_downloads = [] # Track downloads for deferred database recording def _init_loader(self): """Initialize Instaloader with safeguards""" try: import instaloader except ImportError: self.log("Installing instaloader...", "info") import subprocess subprocess.run( ["pip", "install", "--quiet", "--break-system-packages", "instaloader"], capture_output=True, check=False ) import instaloader # Suppress instaloader's direct output and redirect to our logger class LoggerAdapter: def __init__(self, parent_log_func): self.parent_log = parent_log_func def write(self, 
message): if message.strip(): # Filter and format instaloader messages msg = message.strip() if 'JSON Query' in msg or '401 Unauthorized' in msg or '403 Forbidden' in msg: # Convert to our format if '401 Unauthorized' in msg: self.parent_log("Session authentication issue - retrying", "warning") elif '403 Forbidden' in msg: self.parent_log("Access forbidden - rate limited", "warning") elif 'Error when checking' in msg: self.parent_log("Session validation failed", "debug") elif msg and not msg.startswith('['): self.parent_log(msg, "debug") def flush(self): pass # Configure Instaloader with conservative settings self.loader = instaloader.Instaloader( quiet=True, # Always quiet to suppress direct output download_videos=True, download_video_thumbnails=False, download_geotags=False, download_comments=False, save_metadata=True, # Need JSON to get media IDs compress_json=True, # Save space with compression post_metadata_txt_pattern="", # Don't save txt files storyitem_metadata_txt_pattern="", max_connection_attempts=5, # More retries request_timeout=300, # Don't treat 403 as fatal - Instagram returns this often fatal_status_codes=[429], # Only stop on rate limit # Use a desktop user agent to avoid mobile restrictions user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36' ) # Redirect stderr to capture instaloader errors if self.show_progress: self._original_stderr = sys.stderr sys.stderr = LoggerAdapter(self.log) # Don't refresh session at startup - will do it on each download self.session_is_valid = False def _rate_limit(self): """Rate limiting to avoid detection""" current_time = time.time() # Clean old request times (older than 1 hour) self.request_times = [t for t in self.request_times if current_time - t < 3600] # Check rate limit (be more conservative) if len(self.request_times) >= self.max_rate: # Calculate wait time oldest_request = min(self.request_times) wait_time = 3600 - (current_time - 
oldest_request) + random.uniform(5, 15) if wait_time > 0: self.log(f"Rate limit reached, waiting {wait_time:.0f} seconds", "warning") time.sleep(wait_time) # Add longer random delay between requests to avoid 403 if self.last_request_time > 0: elapsed = current_time - self.last_request_time # Increase delays to avoid detection min_wait = random.uniform(self.min_delay * 2, self.max_delay * 2) if elapsed < min_wait: wait = min_wait - elapsed self.log(f"Waiting {wait:.1f}s between requests", "debug") time.sleep(wait) # Record request self.request_times.append(current_time) self.last_request_time = current_time def is_ready(self) -> bool: """Check if the module is ready to download (will refresh session on download)""" # Always return True since we refresh session on each download return True def _load_session(self): """Load saved session""" import pickle # Try multiple session sources session_loaded = False # 1. Try provided session file (pickle format) if self.session_file: session_path = Path(self.session_file).expanduser() if session_path.exists(): try: with open(session_path, 'rb') as f: session_data = pickle.load(f) # Set cookies directly self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/') if session_data.get('csrftoken'): self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/') # Just set the username and mark as loaded try: self.loader.context.username = self.username self.log(f"Session loaded from {self.session_file}", "success") session_loaded = True except Exception as e: self.log(f"Error setting session username: {e}", "warning") session_loaded = False except Exception as e: self.log(f"Could not load session file: {e}", "warning") # 2. 
Try saved sessions directory (pickle format) if not session_loaded and self.username: session_path = self.session_dir / f"session-{self.username}" if session_path.exists(): try: with open(session_path, 'rb') as f: session_data = pickle.load(f) self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/') if session_data.get('csrftoken'): self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/') # Set the username on the context self.loader.context.username = self.username self.log(f"Session loaded for {self.username}", "success") session_loaded = True except Exception as e: self.log(f"Could not load saved session: {e}", "warning") # 3. Try script sessions directory (pickle format) if not session_loaded and self.username: script_session = self.session_dir / f"session-{self.username}" if script_session.exists(): try: with open(script_session, 'rb') as f: session_data = pickle.load(f) self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/') if session_data.get('csrftoken'): self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/') # Set the username on the context self.loader.context.username = self.username self.log(f"Session loaded from {script_session}", "success") session_loaded = True except Exception as e: self.log(f"Could not load session: {e}", "warning") if not session_loaded: self.log("No session loaded - anonymous access only", "warning") self.log("Some features may be limited without login", "warning") # Don't validate here - it will be done after _load_session returns def reauthorize_session(self, force_new: bool = False) -> bool: """ Reauthorize Instagram session using stored credentials Args: force_new: Force a completely new login even if session exists Returns: True if reauthorization successful, False otherwise """ if not 
self.username or not self.password: self.log("Cannot reauthorize - no credentials available", "error") self.log("Please provide username and password in config", "info") return False # Always create fresh session for each download if self.totp_secret: self.log("Using CLI method for login with 2FA", "info") try: try: import pyotp except ImportError: self.log("pyotp not installed, attempting to install...", "warning") import subprocess import sys subprocess.check_call([sys.executable, "-m", "pip", "install", "pyotp"]) import pyotp import subprocess import pickle # Generate 2FA code totp = pyotp.TOTP(self.totp_secret) two_factor_code = totp.now() self.log(f"Generated 2FA code: {two_factor_code}", "info") # Use instaloader CLI with the 2FA code # Use configured session file path or default if self.session_file: session_file = Path(self.session_file).expanduser() session_file.parent.mkdir(parents=True, exist_ok=True) else: session_file = self.session_dir / f"session-{self.username}" # Pass password as separate argument to avoid shell escaping issues cmd = [ 'instaloader', '--login', self.username, '--password', self.password, '--sessionfile', str(session_file) ] self.log("Using instaloader CLI for login...", "info") self.log(f"Debug - Command: instaloader --login {self.username} --password [HIDDEN] --sessionfile {session_file}", "debug") self.log(f"Debug - Password length being passed: {len(self.password)}", "debug") # Run with 2FA code piped via stdin (avoids shell=True security risk) self.log(f"Running command with 2FA code via stdin", "debug") result = subprocess.run( cmd, input=two_factor_code + '\n', capture_output=True, text=True, timeout=30 ) # Check if login was successful by looking for success messages and session file login_success = ("Logged in as" in result.stdout and "Saved session to" in result.stdout and session_file.exists()) if login_success: self.log("Successfully logged in via CLI", "success") # Wait a moment for file to be fully written 
time.sleep(1) # Load the new session with open(session_file, 'rb') as f: session_data = pickle.load(f) # Apply session to our loader self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/') if session_data.get('csrftoken'): self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/') # Set username in context self.loader.context.username = self.username self.session_is_valid = True return True else: # Log details about the failure if not session_file.exists(): self.log("Session file was not created", "error") self.log(f"CLI stdout: {result.stdout[:300]}", "info") if result.stderr: self.log(f"CLI stderr: {result.stderr[:200]}", "info") return False except Exception as e: self.log(f"CLI login error: {str(e)[:100]}", "error") # Fall back to Python API method pass # Fallback to Python API login try: import instaloader import pickle # Clear existing session if force_new if force_new: self.log("Clearing existing session for fresh login", "info") # Clear all cookies properly to avoid duplicate sessionid issue try: self.loader.context._session.cookies.clear() except Exception: pass # Create a fresh loader instance to avoid cookie conflicts self.loader = instaloader.Instaloader( download_pictures=False, download_videos=False, download_video_thumbnails=False, compress_json=False, save_metadata=False, post_metadata_txt_pattern="", quiet=True, fatal_status_codes=[], max_connection_attempts=3 ) # Add delay before login attempt to avoid rate limiting import random delay = random.uniform(3, 5) self.log(f"Waiting {delay:.1f}s before login attempt", "debug") time.sleep(delay) # Attempt login self.log(f"Logging in as {self.username}...", "info") try: self.loader.login(self.username, self.password) # Save the new session # Use configured session file path or default if self.session_file: session_file = Path(self.session_file).expanduser() 
session_file.parent.mkdir(parents=True, exist_ok=True) else: session_file = self.session_dir / f"session-{self.username}" session_data = { 'sessionid': self.loader.context._session.cookies.get('sessionid'), 'csrftoken': self.loader.context._session.cookies.get('csrftoken'), 'username': self.username, 'timestamp': datetime.now().isoformat() } with open(session_file, 'wb') as f: pickle.dump(session_data, f) self.log(f"Session saved to {session_file}", "success") self.session_is_valid = True self.log("Successfully reauthorized session", "success") return True except instaloader.exceptions.BadCredentialsException: self.log("Invalid username or password", "error") self.log("Please check your Instagram credentials in the config file", "info") self.log("The password may have been changed or the account may be locked", "info") return False except instaloader.exceptions.TwoFactorAuthRequiredException: self.log("Two-factor authentication required", "info") # Use subprocess to call instaloader CLI which handles 2FA better if self.totp_secret: try: try: import pyotp except ImportError: self.log("pyotp not installed, attempting to install...", "warning") import sys subprocess.check_call([sys.executable, "-m", "pip", "install", "pyotp"]) import pyotp import subprocess # Generate 2FA code totp = pyotp.TOTP(self.totp_secret) two_factor_code = totp.now() self.log(f"Generated 2FA code: {two_factor_code}", "info") # Use instaloader CLI with the 2FA code # Use configured session file path or default if self.session_file: session_file = Path(self.session_file).expanduser() session_file.parent.mkdir(parents=True, exist_ok=True) else: session_file = self.session_dir / f"session-{self.username}" # Pass password as separate argument to avoid shell escaping issues cmd = [ 'instaloader', '--login', self.username, '--password', self.password, '--sessionfile', str(session_file) ] self.log("Using instaloader CLI for 2FA login...", "info") # Run with 2FA code as input result = subprocess.run( 
cmd, input=f"{two_factor_code}\n", capture_output=True, text=True, timeout=30 ) # Check if login was successful by looking for success messages and session file login_success = ("Logged in as" in result.stdout and "Saved session to" in result.stdout and session_file.exists()) if login_success: self.log("Successfully logged in with 2FA via CLI", "success") # Wait a moment for file to be fully written time.sleep(1) # Load the new session import pickle with open(session_file, 'rb') as f: session_data = pickle.load(f) # Apply session to our loader self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/') if session_data.get('csrftoken'): self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/') # Set username in context self.loader.context.username = self.username self.session_is_valid = True return True else: # Log details about the failure if not session_file.exists(): self.log("Session file was not created", "error") else: self.log("Login output unclear, treating as failure", "error") self.log(f"CLI output: {result.stdout[:300]}", "debug") return False except Exception as e: self.log(f"2FA login failed: {str(e)[:100]}", "error") return False else: self.log("No 2FA code available - login failed", "error") self.log("Options:", "info") self.log("1. Add 'totp_secret' to config with your 2FA secret key", "info") self.log(f"2. Create file: {self.session_dir}/2fa_code_USERNAME.txt with code", "info") self.log("3. 
Run interactively to enter code when prompted", "info") return False except instaloader.exceptions.ConnectionException as e: if "checkpoint" in str(e).lower(): self.log("Instagram checkpoint required - please verify account in browser", "error") elif "429" in str(e): self.log("Rate limited during login - please wait before retrying", "error") else: self.log(f"Connection error during login: {str(e)[:100]}", "error") return False except Exception as e: self.log(f"Reauthorization failed: {str(e)[:100]}", "error") return False def _is_already_downloaded(self, post_id: str) -> bool: """Check if post has been downloaded (uses centralized function for cross-module detection)""" if not self.use_database or not self.unified_db: return False # Check by URL first url = f"https://www.instagram.com/p/{post_id}/" if self.unified_db.is_downloaded(url): return True # Use centralized function for consistent cross-module detection return is_instagram_downloaded(self.unified_db, str(post_id)) def _record_download(self, post_id: str, username: str, content_type: str, filename: str = None, post_date: datetime = None, caption: str = None, likes: int = None, comments: int = None, deferred: bool = False): """Record successful download (uses centralized function for normalized media_id) Args: deferred: If True, don't record to database now - add to pending_downloads list for later recording after file move is complete """ # Use centralized function for consistent cross-module storage url = f"https://www.instagram.com/p/{post_id}/" extra_metadata = { 'username': username, 'caption': caption[:500] if caption else None, 'likes': likes, 'comments': comments } # If deferred, store for later recording instead of recording now if deferred: self.pending_downloads.append({ 'media_id': str(post_id), 'username': username, 'filename': filename, 'url': url, 'post_date': post_date.isoformat() if post_date else None, 'content_type': content_type, 'metadata': extra_metadata }) self.log(f"Deferred 
recording for {post_id}", "debug") return True if not self.use_database or not self.unified_db: return record_instagram_download( db=self.unified_db, media_id=str(post_id), username=username, content_type=content_type, filename=filename, url=url, post_date=post_date, method='instaloader', extra_metadata=extra_metadata ) def get_pending_downloads(self): """Get list of downloads that were deferred for later recording""" return self.pending_downloads.copy() def clear_pending_downloads(self): """Clear the pending downloads list after they've been recorded""" self.pending_downloads = [] def download(self, username: str, output_dir: str = "downloads", content_type: str = "posts", max_downloads: int = None, days_back: int = None, date_from: datetime = None, date_to: datetime = None, defer_database: bool = False) -> int: """ Download content from Instagram user Args: username: Instagram username to download from output_dir: Directory to save downloads content_type: Type of content (posts, stories, reels, all) max_downloads: Maximum number to download days_back: Download content from last N days date_from: Start date for downloads date_to: End date for downloads defer_database: If True, don't record to database immediately - store in pending_downloads for later recording after file move is complete Returns: Number of items downloaded """ self.defer_database = defer_database # Store for use in _record_download # Refresh session before each download if self.username and self.password: self.log("Refreshing session for download...", "info") if not self.reauthorize_session(): self.log("Failed to refresh session", "error") if self.require_valid_session: self.log(f"Skipping download for @{username} - session refresh failed and require_valid_session is True", "warning") return 0 self.session_is_valid = False else: self.session_is_valid = True self.log(f"Session ready for @{username}", "success") elif self.require_valid_session: self.log(f"Skipping download for @{username} - no 
credentials and require_valid_session is True", "warning") return 0 output_path = Path(output_dir) output_path.mkdir(parents=True, exist_ok=True) # Setup date filtering if days_back and not date_from: date_from = datetime.now() - timedelta(days=days_back) if not date_to: date_to = datetime.now() self.log(f"Downloading {content_type} for @{username}", "info") if date_from: self.log(f"Date range: {date_from.strftime('%Y-%m-%d')} to {date_to.strftime('%Y-%m-%d')}", "info") downloaded_count = 0 skipped_count = 0 error_count = 0 try: # Get profile with retry profile = self._get_profile_with_retry(username) if not profile: return 0 # Download based on content type if content_type in ["posts", "all"]: count = self._download_posts(profile, output_path, max_downloads, date_from, date_to) downloaded_count += count if content_type in ["stories", "all"]: count = self._download_stories(profile, output_path) downloaded_count += count if content_type in ["reels", "all"]: count = self._download_reels(profile, output_path, max_downloads, date_from, date_to) downloaded_count += count except Exception as e: self.log(f"Download error: {e}", "error") # Handle specific errors if "429" in str(e) or "rate" in str(e).lower(): self.log("Rate limited by Instagram! 
Try again later.", "error") self.log(f"Waiting {self.error_delay} seconds...", "warning") time.sleep(self.error_delay) elif "login" in str(e).lower(): self.log("Login required for this content!", "error") self.log("Create a session file first (see documentation)", "info") elif "not found" in str(e).lower(): self.log(f"User {username} not found or private", "error") self.log(f"Download complete: {downloaded_count} downloaded", "success") return downloaded_count def _get_profile_with_retry(self, username: str): """Get Instagram profile with retry logic""" import instaloader # Ensure loader is initialized if not self.loader: self._init_loader() for attempt in range(self.max_retries): try: self.log(f"Fetching profile: {username}", "info") # Check if context is available if not hasattr(self.loader, 'context') or self.loader.context is None: self.log("Reinitializing loader context...", "debug") self._init_loader() profile = instaloader.Profile.from_username(self.loader.context, username) # Log profile info self.log(f"Profile found: {profile.full_name} ({profile.mediacount} posts)", "success") if profile.is_private and not profile.followed_by_viewer: self.log("Profile is private and not followed", "warning") return profile except Exception as e: if attempt < self.max_retries - 1: wait = self.error_delay * (attempt + 1) self.log(f"Error getting profile (attempt {attempt + 1}): {e}", "warning") self.log(f"Retrying in {wait} seconds...", "info") time.sleep(wait) else: self.log(f"Failed to get profile after {self.max_retries} attempts", "error") raise return None def _download_posts(self, profile, output_path: Path, max_downloads: int, date_from: datetime, date_to: datetime) -> int: """Download posts from profile""" downloaded = 0 skipped = 0 self.log(f"Downloading posts...", "info") self.activity_manager.update_status("Checking posts") try: posts = profile.get_posts() for post in posts: # Check date range if date_from and post.date < date_from: self.log(f"Reached posts older 
than date range, stopping", "info") break if date_to and post.date > date_to: continue # Check if already downloaded media_id = str(post.mediaid) shortcode = post.shortcode if self._is_already_downloaded(media_id): self.log(f"Skipping already downloaded: {shortcode}", "debug") skipped += 1 continue # Download post try: self.log(f"Downloading post {shortcode} from {post.date.strftime('%Y-%m-%d')}", "info") # Create temp directory for instaloader temp_dir = output_path / f"temp_{shortcode}" temp_dir.mkdir(parents=True, exist_ok=True) # Download with Instaloader to temp dir self.loader.download_post(post, target=temp_dir) # Move and rename files to match FastDL format self._process_downloaded_files(temp_dir, output_path, post.owner_username, media_id, post.date) # Clean up temp directory import shutil shutil.rmtree(temp_dir, ignore_errors=True) # Record in database self._record_download( post_id=media_id, username=post.owner_username, content_type="post", post_date=post.date, caption=post.caption[:500] if post.caption else None, likes=post.likes, comments=post.comments, deferred=self.defer_database ) downloaded += 1 # Update status self.activity_manager.update_status( "Downloading posts", progress_current=downloaded, progress_total=max_downloads ) # Check max downloads if max_downloads and downloaded >= max_downloads: self.log(f"Reached max downloads ({max_downloads})", "info") break # Random delay to avoid detection self._smart_delay(downloaded) except Exception as e: self.log(f"Error downloading post {media_id}: {e}", "error") if "429" in str(e): self.log("Rate limited! 
Stopping downloads.", "error") break except Exception as e: error_msg = str(e) self.log(f"Error iterating posts: {e}", "error") # Check if Instagram is blocking us if "401" in error_msg or "Please wait a few minutes" in error_msg: self.log("Instagram is blocking requests - session may be compromised", "error") self.log("Aborting all downloads to prevent further issues", "error") # Mark session as invalid to prevent further attempts self.session_is_valid = False return 0 elif "403" in error_msg or "forbidden" in error_msg.lower(): self.log("Access forbidden - Instagram has blocked this session", "error") self.session_is_valid = False return 0 self.log(f"Posts: {downloaded} downloaded, {skipped} skipped", "info") return downloaded def _process_downloaded_files(self, temp_dir: Path, output_path: Path, username: str, fallback_id: str, post_date: datetime): """Process downloaded files to match FastDL naming and timestamps Returns: list: List of processed filenames, or empty list if no files processed """ import shutil import re import json import lzma from datetime import timedelta processed_files = [] # Format date for filename - subtract 4 hours for timezone adjustment adjusted_date_for_filename = post_date - timedelta(hours=4) date_str = adjusted_date_for_filename.strftime('%Y%m%d_%H%M%S') # Build a mapping of original filenames to media IDs from JSON media_id_map = {} # Load JSON file to get media IDs from URLs json_files = list(temp_dir.glob('*.json.xz')) if not json_files: json_files = list(temp_dir.glob('*.json')) if json_files: try: json_file = json_files[0] if json_file.suffix == '.xz': with lzma.open(json_file, 'rt') as f: data = json.load(f) else: with open(json_file, 'r') as f: data = json.load(f) # Extract media IDs from URLs in carousel or single image if 'node' in data: node = data['node'] # Check for carousel in iphone_struct if 'iphone_struct' in node and 'carousel_media' in node['iphone_struct']: # Carousel post - each image has its own media ID for 
idx, item in enumerate(node['iphone_struct']['carousel_media'], 1): if 'image_versions2' in item and 'candidates' in item['image_versions2']: url = item['image_versions2']['candidates'][0]['url'] # Extract media ID from URL parts = url.split('/') for part in parts: if '.jpg' in part or '.mp4' in part: filename = part.split('?')[0] # Remove extension and _n suffix media_id = filename.replace('.jpg', '').replace('.mp4', '').replace('_n', '') # Map the index to media ID media_id_map[str(idx)] = media_id break # Check for single image/video elif 'display_url' in node or ('iphone_struct' in node and 'image_versions2' in node['iphone_struct']): # Single post url = node.get('display_url', '') if not url and 'iphone_struct' in node and 'image_versions2' in node['iphone_struct']: url = node['iphone_struct']['image_versions2']['candidates'][0]['url'] if url: parts = url.split('/') for part in parts: if '.jpg' in part or '.mp4' in part: filename = part.split('?')[0] media_id = filename.replace('.jpg', '').replace('.mp4', '').replace('_n', '') media_id_map['single'] = media_id break except Exception as e: self.log(f"Could not extract media IDs from JSON: {e}", "debug") # Process all downloaded files for file_path in temp_dir.iterdir(): if file_path.is_file(): # Skip JSON metadata files if file_path.suffix.lower() in ['.json', '.xz', '.txt']: continue # Get file extension ext = file_path.suffix.lower() # Check if it's a multi-image post (has _1, _2, etc. 
in filename) match = re.search(r'_(\d+)\.(jpg|jpeg|png|mp4|mov)', file_path.name, re.IGNORECASE) if match: index = match.group(1) # Use the media ID for this specific index media_id = media_id_map.get(index, fallback_id) new_filename = f"{username}_{date_str}_{media_id}{ext}" else: # Single image/video media_id = media_id_map.get('single', fallback_id) new_filename = f"{username}_{date_str}_{media_id}{ext}" # Move and rename file new_path = output_path / new_filename shutil.move(str(file_path), str(new_path)) # Check for duplicate hash before finalizing (hash blacklist persists even if original deleted) file_hash = self.unified_db.get_file_hash(str(new_path)) if self.unified_db else None if file_hash: existing = self.unified_db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(new_path) != existing.get('file_path'): # Duplicate hash found - content was already downloaded (prevents redownload of deleted content) self.log(f"⚠ Duplicate content detected (hash match): {new_filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") # Delete the duplicate regardless of whether original file still exists try: new_path.unlink() self.log(f"Deleted duplicate (hash blacklist): {new_filename}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {new_filename}: {e}", "warning") # Set file timestamps to post date self._update_file_timestamp(new_path, post_date) # Add to processed files list processed_files.append(new_filename) return processed_files def _update_file_timestamp(self, filepath: Path, post_date: datetime): """Update file timestamps to match post date""" try: import os # Convert datetime to timestamp with 4-hour adjustment from datetime import timedelta adjusted_date = post_date - timedelta(hours=4) timestamp = adjusted_date.timestamp() # Set both access and modification time os.utime(filepath, (timestamp, timestamp)) self.log(f"Updated timestamp for 
{filepath.name} (adjusted -4 hours)", "debug") except Exception as e: self.log(f"Failed to update timestamp: {e}", "warning") def _download_stories(self, profile, output_path: Path) -> int: """Download stories from profile""" downloaded = 0 if not self.username: self.log("Login required to download stories", "warning") return 0 self.log(f"Downloading stories...", "info") self.activity_manager.update_status("Checking stories") try: import instaloader import shutil # Get user ID for stories user_id = profile.userid # Download stories for story in self.loader.get_stories([user_id]): for item in story.get_items(): media_id = str(item.mediaid) if self._is_already_downloaded(media_id): self.log(f"Skipping already downloaded story: {media_id}", "debug") continue try: self.log(f"Downloading story {media_id}", "info") # Download story to temp dir temp_dir = output_path / f"temp_story_{media_id}" temp_dir.mkdir(parents=True, exist_ok=True) self.loader.download_storyitem(item, target=temp_dir) # Process and move files to match FastDL format processed_files = self._process_downloaded_files(temp_dir, output_path, profile.username, media_id, item.date) # Clean up temp directory shutil.rmtree(temp_dir, ignore_errors=True) # Only record in database if files were successfully processed if processed_files: # Get the first processed filename for database record filename = processed_files[0] if isinstance(processed_files, list) else None self._record_download( post_id=media_id, username=profile.username, content_type="story", filename=filename, post_date=item.date, deferred=self.defer_database ) downloaded += 1 self.activity_manager.update_status( "Downloading stories", progress_current=downloaded, progress_total=max_downloads ) else: self.log(f"No files processed for story {media_id}, not recording in database", "warning") self._smart_delay() except Exception as e: self.log(f"Error downloading story {media_id}: {e}", "error") except Exception as e: self.log(f"Error downloading 
def _download_reels(self, profile, output_path: Path, max_downloads: int,
                    date_from: datetime, date_to: datetime) -> int:
    """Download the video posts (reels) of a profile.

    Reels are taken to be the video entries yielded by
    ``profile.get_posts()``. Each one is downloaded into its own temporary
    directory, renamed and moved by ``_process_downloaded_files``, and
    recorded in the database only when at least one file came out of that
    step - the same rule ``_download_stories`` applies.

    Args:
        profile: Profile object providing ``get_posts()``.
        output_path: Directory that receives the processed files.
        max_downloads: Stop after this many reels; a falsy value means no cap.
        date_from: Iteration stops at the first video post older than this
            (falsy disables the check).
        date_to: Video posts newer than this are skipped (falsy disables
            the check).

    Returns:
        Number of reels downloaded and recorded.
    """
    import shutil

    downloaded = 0

    self.log("Downloading reels...", "info")
    self.activity_manager.update_status("Checking reels")

    try:
        for post in profile.get_posts():
            # Only video posts are treated as reels.
            if not post.is_video:
                continue

            # Breaking here relies on posts arriving newest first.
            if date_from and post.date < date_from:
                break
            if date_to and post.date > date_to:
                continue

            media_id = str(post.mediaid)
            shortcode = post.shortcode

            if self._is_already_downloaded(media_id):
                self.log(f"Skipping already downloaded reel: {shortcode}", "debug")
                continue

            try:
                self.log(f"Downloading reel {shortcode}", "info")

                temp_dir = output_path / f"temp_reel_{shortcode}"
                temp_dir.mkdir(parents=True, exist_ok=True)
                try:
                    self.loader.download_post(post, target=temp_dir)

                    # Rename/move the files to match the FastDL format.
                    processed_files = self._process_downloaded_files(
                        temp_dir, output_path, post.owner_username, media_id, post.date
                    )
                finally:
                    # Remove the temp directory even when the download fails.
                    shutil.rmtree(temp_dir, ignore_errors=True)

                if processed_files:
                    # The first processed file name goes into the database record.
                    filename = processed_files[0] if isinstance(processed_files, list) else None

                    self._record_download(
                        post_id=media_id,
                        username=post.owner_username,
                        content_type="reel",
                        filename=filename,
                        post_date=post.date,
                        likes=post.likes,
                        comments=post.comments,
                        deferred=self.defer_database
                    )
                    downloaded += 1

                    self.activity_manager.update_status(
                        "Downloading reels",
                        progress_current=downloaded,
                        progress_total=max_downloads
                    )

                    if max_downloads and downloaded >= max_downloads:
                        break
                else:
                    # Nothing usable was produced, so the reel is neither
                    # counted nor recorded.
                    self.log(f"No files processed for reel {shortcode}, not recording in database", "warning")

                self._smart_delay()

            except Exception as e:
                self.log(f"Error downloading reel {media_id}: {e}", "error")

    except Exception as e:
        self.log(f"Error downloading reels: {e}", "error")

    self.log(f"Reels: {downloaded} downloaded", "info")
    return downloaded
def _smart_delay(self, batch_count=0):
    """Sleep for a randomised interval between downloads.

    The delay is chosen as follows:
    - normally a random value between ``min_delay`` and ``max_delay``;
    - when ``batch_count`` is a positive multiple of ``download_batch_size``,
      ``batch_delay`` plus up to 10 extra seconds instead;
    - otherwise, when more than 10 request times are recorded and the last
      10 average less than 5 seconds apart, an extra 5-10 seconds is added.

    Args:
        batch_count: Number of items downloaded so far in the current run;
            ``0`` (the default) disables the batch pause.
    """
    base_delay = random.uniform(self.min_delay, self.max_delay)

    if batch_count > 0 and batch_count % self.download_batch_size == 0:
        self.log(f"Batch limit reached ({self.download_batch_size} items), taking a longer break", "info")
        base_delay = self.batch_delay + random.uniform(0, 10)
    elif len(self.request_times) > 10:
        recent_requests = self.request_times[-10:]
        # Ten timestamps span nine intervals.
        avg_interval = (recent_requests[-1] - recent_requests[0]) / 9
        if avg_interval < 5:  # Too fast
            base_delay += random.uniform(5, 10)
            self.log("Slowing down to avoid detection", "debug")

    time.sleep(base_delay)


def login(self, username: str, password: Optional[str] = None) -> bool:
    """Log in to Instagram and try to persist the session.

    Args:
        username: Instagram username.
        password: Instagram password; prompted for with ``getpass`` when
            empty or not provided.

    Returns:
        True if the login succeeded, False otherwise. A failure to write
        the session file is logged as a warning but does not turn a
        successful login into False.
    """
    try:
        if not password:
            import getpass
            password = getpass.getpass(f"Password for {username}: ")

        self.log(f"Logging in as {username}...", "info")
        self.loader.login(username, password)
    except Exception as e:
        self.log(f"Login failed: {e}", "error")

        message = str(e)
        lowered = message.lower()
        if "checkpoint" in lowered:
            self.log("Instagram requires verification (checkpoint)", "warning")
            self.log("Complete verification in browser, then export session", "info")
        elif "bad password" in lowered:
            self.log("Invalid username or password", "error")
        elif "429" in message:
            self.log("Too many login attempts, try again later", "error")

        return False

    # The loader is authenticated from here on, whatever happens to the file.
    self.username = username

    # Saving is handled separately so an unwritable session path is not
    # reported as a failed login.
    session_file = None
    try:
        # Use the configured session file path, or the default location.
        if self.session_file:
            session_file = Path(self.session_file).expanduser()
            session_file.parent.mkdir(parents=True, exist_ok=True)
        else:
            session_file = self.session_dir / f"session-{username}"

        self.loader.save_session_to_file(session_file)
        self.log(f"Session saved to {session_file}", "success")
    except Exception as e:
        self.log(f"Logged in, but could not save session to {session_file}: {e}", "warning")

    return True
def get_database_stats(self) -> Dict:
    """Return download statistics for the Instagram platform.

    Returns:
        ``{"enabled": False}`` when database tracking is disabled or no
        unified database was supplied; otherwise whatever
        ``unified_db.get_statistics(platform='instagram')`` returns.
    """
    if not self.use_database or not self.unified_db:
        return {"enabled": False}

    # Statistics come from the unified database.
    return self.unified_db.get_statistics(platform='instagram')


# Test function
def test_module():
    """Run a small manual download as a smoke test.

    Returns:
        True if at least one item was downloaded.
    """
    print("Testing InstaLoader Module")
    print("=" * 60)

    # InstaLoaderModule.__init__ has no ``db_path`` parameter; passing one
    # raised TypeError before the test could run. Without a unified_db the
    # statistics call below reports {"enabled": False}.
    module = InstaLoaderModule(
        show_progress=True,
        use_database=True
    )

    # Test download (limited)
    count = module.download(
        username="evalongoria",
        output_dir="/opt/temp/test/instagram/posts",
        content_type="posts",
        max_downloads=2,
        days_back=30
    )

    print(f"\nDownloaded {count} items")

    # Show stats
    stats = module.get_database_stats()
    print("\nDatabase stats:")
    print(f" Total: {stats.get('total_downloads', 0)}")
    print(f" By type: {stats.get('by_type', {})}")

    return count > 0


if __name__ == "__main__":
    success = test_module()
    sys.exit(0 if success else 1)