Files
media-downloader/modules/instaloader_module.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

1259 lines
56 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Instaloader Module for Instagram Downloads
Based on FastDL module architecture with detection safeguards
Bypasses Cloudflare by using Instagram directly
"""
from pathlib import Path
from datetime import datetime, timedelta
import os
import sys
import time
import random
import json
from typing import Optional, Dict
import pickle
from modules.base_module import LoggingMixin
from modules.instagram_utils import (
extract_instagram_media_id,
record_instagram_download,
is_instagram_downloaded
)
class InstaLoaderModule(LoggingMixin):
"""
Instagram downloader using Instaloader with safeguards
Features:
- Rate limiting to avoid detection
- Session persistence and rotation
- Database tracking to avoid re-downloads
- Automatic retry with exponential backoff
- Detection avoidance strategies
"""
def __init__(self,
             username: Optional[str] = None,
             password: Optional[str] = None,
             session_file: Optional[str] = None,
             totp_secret: Optional[str] = None,
             use_database: bool = True,
             log_callback=None,
             show_progress: bool = True,
             max_rate: int = 100,  # Max requests per hour
             unified_db=None,
             require_valid_session: bool = False):
    """
    Initialize the Instaloader module

    Args:
        username: Instagram username for login (optional)
        password: Instagram password for reauthorization (optional)
        session_file: Path to saved session file
        totp_secret: TOTP secret key for 2FA (optional)
        use_database: Track downloads in database
        log_callback: Callback for logging (tag, level, message)
        show_progress: Print progress messages
        max_rate: Maximum requests per hour (rate limiting)
        unified_db: Unified database instance
        require_valid_session: If True, skip downloads if session is invalid
    """
    # Initialize logging via mixin
    self._init_logger('Instagram', log_callback, default_module='Download')
    self.username = username
    self.password = password
    self.totp_secret = totp_secret
    self.session_file = session_file
    self.use_database = use_database
    self.show_progress = show_progress
    self.max_rate = max_rate
    self.unified_db = unified_db
    self.require_valid_session = require_valid_session
    self.session_is_valid = False  # Track session validity
    # FIX: initialize here so _download_posts/_download_stories cannot hit an
    # AttributeError when they read self.defer_database before download()
    # (which is where it was previously first assigned) has run.
    self.defer_database = False
    # Rate limiting state: timestamps of requests made in the last hour
    self.request_times = []
    self.last_request_time = 0
    # Session management - use script directory
    script_dir = Path(__file__).parent.parent  # Go up from modules/ to script root
    self.session_dir = script_dir / "sessions"
    self.session_dir.mkdir(parents=True, exist_ok=True)
    # Initialize Instaloader
    self.loader = None
    self._init_loader()
    # Debug: Check what credentials we have (without exposing sensitive data)
    self.log(f"Module initialized with username: {self.username}", "debug")
    self.log(f"Password provided: {self.password is not None}", "debug")
    # No separate database initialization needed - using unified database only
    # Initialize activity status manager for real-time updates
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)
    # Detection avoidance settings - increased to avoid Instagram detection
    # Based on GitHub issue #2391 recommendations
    self.min_delay = 5  # Minimum seconds between requests (increased from 3)
    self.max_delay = 15  # Maximum seconds between requests (increased from 10)
    self.error_delay = 120  # Delay after error (increased from 60)
    self.max_retries = 3
    self.download_batch_size = 10  # Download in smaller batches
    self.batch_delay = 30  # Delay between batches (seconds)
    self.pending_downloads = []  # Track downloads for deferred database recording
def _init_loader(self):
    """Initialize Instaloader with conservative, detection-avoiding settings.

    Installs instaloader on demand, constructs the Instaloader instance,
    and (when show_progress is enabled) swaps sys.stderr for a file-like
    adapter that routes instaloader's raw output through our logger.
    """
    try:
        import instaloader
    except ImportError:
        self.log("Installing instaloader...", "info")
        import subprocess
        # FIX: invoke pip via the running interpreter (sys.executable -m pip),
        # matching the install pattern used elsewhere in this module and
        # working even when a bare `pip` is absent from PATH or belongs to a
        # different interpreter.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "--quiet", "--break-system-packages", "instaloader"],
            capture_output=True,
            check=False
        )
        import instaloader

    # Suppress instaloader's direct output and redirect to our logger
    class LoggerAdapter:
        """File-like shim: forwards instaloader's stderr writes to our log."""

        def __init__(self, parent_log_func):
            self.parent_log = parent_log_func

        def write(self, message):
            if message.strip():
                # Filter and format instaloader messages
                msg = message.strip()
                if 'JSON Query' in msg or '401 Unauthorized' in msg or '403 Forbidden' in msg:
                    # Convert to our format
                    if '401 Unauthorized' in msg:
                        self.parent_log("Session authentication issue - retrying", "warning")
                    elif '403 Forbidden' in msg:
                        self.parent_log("Access forbidden - rate limited", "warning")
                elif 'Error when checking' in msg:
                    self.parent_log("Session validation failed", "debug")
                elif msg and not msg.startswith('['):
                    self.parent_log(msg, "debug")

        def flush(self):
            # No buffering; present only to satisfy the file-like protocol
            pass

    # Configure Instaloader with conservative settings
    self.loader = instaloader.Instaloader(
        quiet=True,  # Always quiet to suppress direct output
        download_videos=True,
        download_video_thumbnails=False,
        download_geotags=False,
        download_comments=False,
        save_metadata=True,  # Need JSON to get media IDs
        compress_json=True,  # Save space with compression
        post_metadata_txt_pattern="",  # Don't save txt files
        storyitem_metadata_txt_pattern="",
        max_connection_attempts=5,  # More retries
        request_timeout=300,
        # Don't treat 403 as fatal - Instagram returns this often
        fatal_status_codes=[429],  # Only stop on rate limit
        # Use a desktop user agent to avoid mobile restrictions
        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    )
    # Redirect stderr to capture instaloader errors; keep the original so a
    # caller can restore it later
    if self.show_progress:
        self._original_stderr = sys.stderr
        sys.stderr = LoggerAdapter(self.log)
    # Don't refresh session at startup - will do it on each download
    self.session_is_valid = False
def _rate_limit(self):
    """Throttle outgoing requests: enforce the hourly cap and insert a
    randomized gap between consecutive requests to look less bot-like."""
    now = time.time()
    # Sliding one-hour window: discard timestamps older than 3600s
    self.request_times = [t for t in self.request_times if now - t < 3600]
    # Hourly cap reached -> sleep until the oldest request ages out (plus jitter)
    if len(self.request_times) >= self.max_rate:
        oldest = min(self.request_times)
        pause = 3600 - (now - oldest) + random.uniform(5, 15)
        if pause > 0:
            self.log(f"Rate limit reached, waiting {pause:.0f} seconds", "warning")
            time.sleep(pause)
    # Randomized inter-request gap (delays doubled to reduce 403 responses)
    if self.last_request_time > 0:
        since_last = now - self.last_request_time
        target_gap = random.uniform(self.min_delay * 2, self.max_delay * 2)
        if since_last < target_gap:
            remaining = target_gap - since_last
            self.log(f"Waiting {remaining:.1f}s between requests", "debug")
            time.sleep(remaining)
    # Account for this request in the window
    self.request_times.append(now)
    self.last_request_time = now
def is_ready(self) -> bool:
    """Report readiness to download.

    The session is (re)validated at the start of every download, so this
    module is always considered ready.
    """
    return True
def _load_session(self):
    """Load a previously saved session (pickled cookies) into the loader.

    Tries, in order: the explicitly configured session file, then the
    per-user file in the script's sessions directory. Falls back to
    anonymous access when neither loads.

    Note: the loaded session is not validated here; validation happens
    after this returns.
    """
    session_loaded = False
    # 1. Explicitly configured session file (pickle format)
    if self.session_file:
        session_path = Path(self.session_file).expanduser()
        if session_path.exists():
            try:
                session_loaded = self._apply_pickle_session(
                    session_path, f"Session loaded from {self.session_file}")
            except Exception as e:
                self.log(f"Could not load session file: {e}", "warning")
    # 2. Per-user session saved in the script's sessions directory.
    # FIX: the original repeated this exact lookup twice (steps 2 and 3 built
    # the same session_dir / f"session-{username}" path); the duplicate
    # attempt has been removed.
    if not session_loaded and self.username:
        session_path = self.session_dir / f"session-{self.username}"
        if session_path.exists():
            try:
                session_loaded = self._apply_pickle_session(
                    session_path, f"Session loaded for {self.username}")
            except Exception as e:
                self.log(f"Could not load saved session: {e}", "warning")
    if not session_loaded:
        self.log("No session loaded - anonymous access only", "warning")
        self.log("Some features may be limited without login", "warning")

def _apply_pickle_session(self, session_path: Path, success_message: str) -> bool:
    """Apply cookies from a pickled session file to the active loader.

    Args:
        session_path: Pickle file containing 'sessionid' (and optionally
            'csrftoken').
        success_message: Log line emitted once the cookies are applied.
    Returns:
        True when the session cookies were applied.
    Raises:
        Any exception from unpickling or cookie application; the caller
        logs it and tries the next session source.
    """
    import pickle
    with open(session_path, 'rb') as f:
        session_data = pickle.load(f)
    # Set cookies directly on instaloader's underlying requests session
    self.loader.context._session.cookies.set(
        'sessionid', session_data['sessionid'], domain='.instagram.com', path='/')
    if session_data.get('csrftoken'):
        self.loader.context._session.cookies.set(
            'csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/')
    # Mark the context as logged-in under our configured username
    self.loader.context.username = self.username
    self.log(success_message, "success")
    return True
def reauthorize_session(self, force_new: bool = False) -> bool:
    """
    Reauthorize Instagram session using stored credentials.

    Flow: when a TOTP secret is configured, try the instaloader CLI first
    (it handles 2FA more reliably); on any failure fall back to the Python
    API login, which itself re-invokes the CLI if Instagram raises a
    two-factor challenge.

    Args:
        force_new: Force a completely new login even if session exists
    Returns:
        True if reauthorization successful, False otherwise
    """
    if not self.username or not self.password:
        self.log("Cannot reauthorize - no credentials available", "error")
        self.log("Please provide username and password in config", "info")
        return False
    # Always create fresh session for each download
    if self.totp_secret:
        self.log("Using CLI method for login with 2FA", "info")
        try:
            try:
                import pyotp
            except ImportError:
                # Best-effort auto-install of the TOTP dependency
                self.log("pyotp not installed, attempting to install...", "warning")
                import subprocess
                import sys
                subprocess.check_call([sys.executable, "-m", "pip", "install", "pyotp"])
                import pyotp
            import subprocess
            import pickle
            # Generate the current time-based 2FA code
            totp = pyotp.TOTP(self.totp_secret)
            two_factor_code = totp.now()
            # NOTE(review): this logs the live 2FA code in plain text
            self.log(f"Generated 2FA code: {two_factor_code}", "info")
            # Use instaloader CLI with the 2FA code
            # Use configured session file path or default
            if self.session_file:
                session_file = Path(self.session_file).expanduser()
                session_file.parent.mkdir(parents=True, exist_ok=True)
            else:
                session_file = self.session_dir / f"session-{self.username}"
            # Pass password as separate argument to avoid shell escaping issues
            cmd = [
                'instaloader',
                '--login', self.username,
                '--password', self.password,
                '--sessionfile', str(session_file)
            ]
            self.log("Using instaloader CLI for login...", "info")
            self.log(f"Debug - Command: instaloader --login {self.username} --password [HIDDEN] --sessionfile {session_file}", "debug")
            self.log(f"Debug - Password length being passed: {len(self.password)}", "debug")
            # Run with 2FA code piped via stdin (avoids shell=True security risk)
            self.log(f"Running command with 2FA code via stdin", "debug")
            result = subprocess.run(
                cmd,
                input=two_factor_code + '\n',
                capture_output=True,
                text=True,
                timeout=30
            )
            # Check if login was successful by looking for success messages and session file
            login_success = ("Logged in as" in result.stdout and
                             "Saved session to" in result.stdout and
                             session_file.exists())
            if login_success:
                self.log("Successfully logged in via CLI", "success")
                # Wait a moment for file to be fully written
                time.sleep(1)
                # Load the new session written by the CLI
                with open(session_file, 'rb') as f:
                    session_data = pickle.load(f)
                # Apply session cookies to our own loader instance
                self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/')
                if session_data.get('csrftoken'):
                    self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/')
                # Set username in context
                self.loader.context.username = self.username
                self.session_is_valid = True
                return True
            else:
                # Log details about the failure
                if not session_file.exists():
                    self.log("Session file was not created", "error")
                self.log(f"CLI stdout: {result.stdout[:300]}", "info")
                if result.stderr:
                    self.log(f"CLI stderr: {result.stderr[:200]}", "info")
                return False
        except Exception as e:
            self.log(f"CLI login error: {str(e)[:100]}", "error")
            # Fall back to Python API method below
            pass
    # Fallback to Python API login
    try:
        import instaloader
        import pickle
        # Clear existing session if force_new
        if force_new:
            self.log("Clearing existing session for fresh login", "info")
            # Clear all cookies properly to avoid duplicate sessionid issue
            try:
                self.loader.context._session.cookies.clear()
            except Exception:
                pass
            # Create a fresh loader instance to avoid cookie conflicts
            self.loader = instaloader.Instaloader(
                download_pictures=False,
                download_videos=False,
                download_video_thumbnails=False,
                compress_json=False,
                save_metadata=False,
                post_metadata_txt_pattern="",
                quiet=True,
                fatal_status_codes=[],
                max_connection_attempts=3
            )
        # Add delay before login attempt to avoid rate limiting
        import random
        delay = random.uniform(3, 5)
        self.log(f"Waiting {delay:.1f}s before login attempt", "debug")
        time.sleep(delay)
        # Attempt login via the Python API
        self.log(f"Logging in as {self.username}...", "info")
        try:
            self.loader.login(self.username, self.password)
            # Save the new session as a pickle of the relevant cookies
            # Use configured session file path or default
            if self.session_file:
                session_file = Path(self.session_file).expanduser()
                session_file.parent.mkdir(parents=True, exist_ok=True)
            else:
                session_file = self.session_dir / f"session-{self.username}"
            session_data = {
                'sessionid': self.loader.context._session.cookies.get('sessionid'),
                'csrftoken': self.loader.context._session.cookies.get('csrftoken'),
                'username': self.username,
                'timestamp': datetime.now().isoformat()
            }
            with open(session_file, 'wb') as f:
                pickle.dump(session_data, f)
            self.log(f"Session saved to {session_file}", "success")
            self.session_is_valid = True
            self.log("Successfully reauthorized session", "success")
            return True
        except instaloader.exceptions.BadCredentialsException:
            self.log("Invalid username or password", "error")
            self.log("Please check your Instagram credentials in the config file", "info")
            self.log("The password may have been changed or the account may be locked", "info")
            return False
        except instaloader.exceptions.TwoFactorAuthRequiredException:
            self.log("Two-factor authentication required", "info")
            # Use subprocess to call instaloader CLI which handles 2FA better
            if self.totp_secret:
                try:
                    try:
                        import pyotp
                    except ImportError:
                        self.log("pyotp not installed, attempting to install...", "warning")
                        import sys
                        # NOTE(review): `subprocess` is not imported in this scope
                        # until below; this line relies on the CLI branch above
                        # (which runs whenever totp_secret is set) having already
                        # bound it as a function local — confirm
                        subprocess.check_call([sys.executable, "-m", "pip", "install", "pyotp"])
                        import pyotp
                    import subprocess
                    # Generate 2FA code
                    totp = pyotp.TOTP(self.totp_secret)
                    two_factor_code = totp.now()
                    # NOTE(review): logs the live 2FA code in plain text
                    self.log(f"Generated 2FA code: {two_factor_code}", "info")
                    # Use instaloader CLI with the 2FA code
                    # Use configured session file path or default
                    if self.session_file:
                        session_file = Path(self.session_file).expanduser()
                        session_file.parent.mkdir(parents=True, exist_ok=True)
                    else:
                        session_file = self.session_dir / f"session-{self.username}"
                    # Pass password as separate argument to avoid shell escaping issues
                    cmd = [
                        'instaloader',
                        '--login', self.username,
                        '--password', self.password,
                        '--sessionfile', str(session_file)
                    ]
                    self.log("Using instaloader CLI for 2FA login...", "info")
                    # Run with 2FA code as input
                    result = subprocess.run(
                        cmd,
                        input=f"{two_factor_code}\n",
                        capture_output=True,
                        text=True,
                        timeout=30
                    )
                    # Check if login was successful by looking for success messages and session file
                    login_success = ("Logged in as" in result.stdout and
                                     "Saved session to" in result.stdout and
                                     session_file.exists())
                    if login_success:
                        self.log("Successfully logged in with 2FA via CLI", "success")
                        # Wait a moment for file to be fully written
                        time.sleep(1)
                        # Load the new session
                        import pickle
                        with open(session_file, 'rb') as f:
                            session_data = pickle.load(f)
                        # Apply session to our loader
                        self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/')
                        if session_data.get('csrftoken'):
                            self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/')
                        # Set username in context
                        self.loader.context.username = self.username
                        self.session_is_valid = True
                        return True
                    else:
                        # Log details about the failure
                        if not session_file.exists():
                            self.log("Session file was not created", "error")
                        else:
                            self.log("Login output unclear, treating as failure", "error")
                        self.log(f"CLI output: {result.stdout[:300]}", "debug")
                        return False
                except Exception as e:
                    self.log(f"2FA login failed: {str(e)[:100]}", "error")
                    return False
            else:
                self.log("No 2FA code available - login failed", "error")
                self.log("Options:", "info")
                self.log("1. Add 'totp_secret' to config with your 2FA secret key", "info")
                self.log(f"2. Create file: {self.session_dir}/2fa_code_USERNAME.txt with code", "info")
                self.log("3. Run interactively to enter code when prompted", "info")
                return False
    except instaloader.exceptions.ConnectionException as e:
        if "checkpoint" in str(e).lower():
            self.log("Instagram checkpoint required - please verify account in browser", "error")
        elif "429" in str(e):
            self.log("Rate limited during login - please wait before retrying", "error")
        else:
            self.log(f"Connection error during login: {str(e)[:100]}", "error")
        return False
    except Exception as e:
        self.log(f"Reauthorization failed: {str(e)[:100]}", "error")
        return False
def _is_already_downloaded(self, post_id: str) -> bool:
    """Return True when this post is already recorded, checking the canonical
    URL first and then the shared cross-module media-id lookup."""
    if not (self.use_database and self.unified_db):
        return False
    # Fast path: exact canonical post URL
    if self.unified_db.is_downloaded(f"https://www.instagram.com/p/{post_id}/"):
        return True
    # Shared lookup keeps detection consistent across Instagram modules
    return is_instagram_downloaded(self.unified_db, str(post_id))
def _record_download(self, post_id: str, username: str, content_type: str,
                     filename: str = None, post_date: datetime = None,
                     caption: str = None, likes: int = None, comments: int = None,
                     deferred: bool = False):
    """Record a successful download via the centralized helper.

    Args:
        deferred: If True, don't record to database now - add to pending_downloads list
            for later recording after file move is complete
    """
    canonical_url = f"https://www.instagram.com/p/{post_id}/"
    meta = {
        'username': username,
        'caption': None if not caption else caption[:500],
        'likes': likes,
        'comments': comments
    }
    if deferred:
        # Queue the record; the caller persists it once the file move is done.
        entry = {
            'media_id': str(post_id),
            'username': username,
            'filename': filename,
            'url': canonical_url,
            'post_date': None if post_date is None else post_date.isoformat(),
            'content_type': content_type,
            'metadata': meta
        }
        self.pending_downloads.append(entry)
        self.log(f"Deferred recording for {post_id}", "debug")
        return True
    if not (self.use_database and self.unified_db):
        return
    # Centralized helper normalizes the media_id for cross-module storage
    record_instagram_download(
        db=self.unified_db,
        media_id=str(post_id),
        username=username,
        content_type=content_type,
        filename=filename,
        url=canonical_url,
        post_date=post_date,
        method='instaloader',
        extra_metadata=meta
    )
def get_pending_downloads(self):
    """Return a shallow copy of the queued (deferred) download records."""
    return list(self.pending_downloads)
def clear_pending_downloads(self):
    """Reset the deferred-download queue.

    Call after the pending records have been persisted to the database.
    """
    # Rebinds a fresh list (rather than mutating in place), so copies handed
    # out earlier by get_pending_downloads() are unaffected either way.
    self.pending_downloads = []
def download(self, username: str, output_dir: str = "downloads",
             content_type: str = "posts", max_downloads: int = None,
             days_back: int = None, date_from: datetime = None,
             date_to: datetime = None, defer_database: bool = False) -> int:
    """
    Download content from Instagram user

    Args:
        username: Instagram username to download from
        output_dir: Directory to save downloads
        content_type: Type of content (posts, stories, reels, all)
        max_downloads: Maximum number to download
        days_back: Download content from last N days
        date_from: Start date for downloads
        date_to: End date for downloads
        defer_database: If True, don't record to database immediately - store in
            pending_downloads for later recording after file move is complete
    Returns:
        Number of items downloaded
    """
    self.defer_database = defer_database  # Store for use in _record_download
    # Refresh session before each download
    if self.username and self.password:
        self.log("Refreshing session for download...", "info")
        if not self.reauthorize_session():
            self.log("Failed to refresh session", "error")
            if self.require_valid_session:
                self.log(f"Skipping download for @{username} - session refresh failed and require_valid_session is True", "warning")
                return 0
            self.session_is_valid = False
        else:
            self.session_is_valid = True
            self.log(f"Session ready for @{username}", "success")
    elif self.require_valid_session:
        self.log(f"Skipping download for @{username} - no credentials and require_valid_session is True", "warning")
        return 0
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    # Setup date filtering
    if days_back and not date_from:
        date_from = datetime.now() - timedelta(days=days_back)
    if not date_to:
        date_to = datetime.now()
    self.log(f"Downloading {content_type} for @{username}", "info")
    if date_from:
        self.log(f"Date range: {date_from.strftime('%Y-%m-%d')} to {date_to.strftime('%Y-%m-%d')}", "info")
    # FIX: removed unused skipped_count / error_count locals (never read)
    downloaded_count = 0
    try:
        # Get profile with retry
        profile = self._get_profile_with_retry(username)
        if not profile:
            return 0
        # Download based on content type
        if content_type in ["posts", "all"]:
            downloaded_count += self._download_posts(profile, output_path, max_downloads,
                                                    date_from, date_to)
        if content_type in ["stories", "all"]:
            downloaded_count += self._download_stories(profile, output_path)
        if content_type in ["reels", "all"]:
            downloaded_count += self._download_reels(profile, output_path, max_downloads,
                                                    date_from, date_to)
    except Exception as e:
        self.log(f"Download error: {e}", "error")
        # Handle specific errors with targeted guidance
        if "429" in str(e) or "rate" in str(e).lower():
            self.log("Rate limited by Instagram! Try again later.", "error")
            self.log(f"Waiting {self.error_delay} seconds...", "warning")
            time.sleep(self.error_delay)
        elif "login" in str(e).lower():
            self.log("Login required for this content!", "error")
            self.log("Create a session file first (see documentation)", "info")
        elif "not found" in str(e).lower():
            self.log(f"User {username} not found or private", "error")
    self.log(f"Download complete: {downloaded_count} downloaded", "success")
    return downloaded_count
def _get_profile_with_retry(self, username: str):
    """Fetch an Instagram profile, retrying with linearly increasing backoff.

    Raises the last exception when all attempts fail; returns None only if
    max_retries is not positive.
    """
    import instaloader
    # Ensure loader is initialized
    if not self.loader:
        self._init_loader()
    attempt = 0
    while attempt < self.max_retries:
        try:
            self.log(f"Fetching profile: {username}", "info")
            # Re-create the loader if its context has gone missing
            if not hasattr(self.loader, 'context') or self.loader.context is None:
                self.log("Reinitializing loader context...", "debug")
                self._init_loader()
            profile = instaloader.Profile.from_username(self.loader.context, username)
            # Log profile info
            self.log(f"Profile found: {profile.full_name} ({profile.mediacount} posts)", "success")
            if profile.is_private and not profile.followed_by_viewer:
                self.log("Profile is private and not followed", "warning")
            return profile
        except Exception as e:
            # Guard clause: out of retries -> propagate the failure
            if attempt >= self.max_retries - 1:
                self.log(f"Failed to get profile after {self.max_retries} attempts", "error")
                raise
            wait = self.error_delay * (attempt + 1)
            self.log(f"Error getting profile (attempt {attempt + 1}): {e}", "warning")
            self.log(f"Retrying in {wait} seconds...", "info")
            time.sleep(wait)
        attempt += 1
    return None
def _download_posts(self, profile, output_path: Path, max_downloads: int,
                    date_from: datetime, date_to: datetime) -> int:
    """Download feed posts from a profile (newest first).

    Args:
        profile: instaloader Profile to iterate.
        output_path: Final destination directory for renamed files.
        max_downloads: Stop after this many successful downloads (None = no cap).
        date_from: Skip-and-stop boundary: posts older than this end iteration.
        date_to: Posts newer than this are skipped (iteration continues).
    Returns:
        Number of posts downloaded; 0 immediately if Instagram blocks the session.
    """
    import shutil  # FIX: hoisted out of the per-post loop body
    downloaded = 0
    skipped = 0
    self.log(f"Downloading posts...", "info")
    self.activity_manager.update_status("Checking posts")
    # FIX: default to False when download() hasn't set defer_database yet,
    # so a direct call cannot raise AttributeError
    defer = getattr(self, 'defer_database', False)
    try:
        posts = profile.get_posts()
        for post in posts:
            # Posts arrive newest-first, so the first too-old post ends the scan
            if date_from and post.date < date_from:
                self.log(f"Reached posts older than date range, stopping", "info")
                break
            if date_to and post.date > date_to:
                continue
            # Check if already downloaded
            media_id = str(post.mediaid)
            shortcode = post.shortcode
            if self._is_already_downloaded(media_id):
                self.log(f"Skipping already downloaded: {shortcode}", "debug")
                skipped += 1
                continue
            # Download post
            try:
                self.log(f"Downloading post {shortcode} from {post.date.strftime('%Y-%m-%d')}", "info")
                # Instaloader writes into a per-post temp directory
                temp_dir = output_path / f"temp_{shortcode}"
                temp_dir.mkdir(parents=True, exist_ok=True)
                self.loader.download_post(post, target=temp_dir)
                # Move and rename files to match FastDL format
                self._process_downloaded_files(temp_dir, output_path, post.owner_username, media_id, post.date)
                # Clean up temp directory
                shutil.rmtree(temp_dir, ignore_errors=True)
                # Record in database (or queue, when deferring)
                self._record_download(
                    post_id=media_id,
                    username=post.owner_username,
                    content_type="post",
                    post_date=post.date,
                    caption=post.caption[:500] if post.caption else None,
                    likes=post.likes,
                    comments=post.comments,
                    deferred=defer
                )
                downloaded += 1
                # Update status
                self.activity_manager.update_status(
                    "Downloading posts",
                    progress_current=downloaded,
                    progress_total=max_downloads
                )
                # Check max downloads
                if max_downloads and downloaded >= max_downloads:
                    self.log(f"Reached max downloads ({max_downloads})", "info")
                    break
                # Random delay to avoid detection
                self._smart_delay(downloaded)
            except Exception as e:
                self.log(f"Error downloading post {media_id}: {e}", "error")
                if "429" in str(e):
                    self.log("Rate limited! Stopping downloads.", "error")
                    break
    except Exception as e:
        error_msg = str(e)
        self.log(f"Error iterating posts: {e}", "error")
        # Check if Instagram is blocking us
        if "401" in error_msg or "Please wait a few minutes" in error_msg:
            self.log("Instagram is blocking requests - session may be compromised", "error")
            self.log("Aborting all downloads to prevent further issues", "error")
            # Mark session as invalid to prevent further attempts
            self.session_is_valid = False
            return 0
        elif "403" in error_msg or "forbidden" in error_msg.lower():
            self.log("Access forbidden - Instagram has blocked this session", "error")
            self.session_is_valid = False
            return 0
    self.log(f"Posts: {downloaded} downloaded, {skipped} skipped", "info")
    return downloaded
def _process_downloaded_files(self, temp_dir: Path, output_path: Path, username: str, fallback_id: str, post_date: datetime):
    """Process downloaded files to match FastDL naming and timestamps.

    Renames every media file in temp_dir to
    ``{username}_{YYYYmmdd_HHMMSS}_{media_id}{ext}`` (date shifted -4 hours),
    moves it into output_path, deletes hash-duplicate files, and stamps the
    remaining files' times with the post date.

    Args:
        temp_dir: Directory instaloader downloaded into (files are moved out).
        output_path: Final destination directory.
        username: Post owner, used as the filename prefix.
        fallback_id: Media id to use when none can be parsed from the JSON.
        post_date: Post timestamp used for the filename and file times.
    Returns:
        list: List of processed filenames, or empty list if no files processed
    """
    import shutil
    import re
    import json
    import lzma
    from datetime import timedelta
    processed_files = []
    # Format date for filename - subtract 4 hours for timezone adjustment
    # NOTE(review): fixed -4h offset looks like a US Eastern (EDT) assumption
    # and ignores DST — confirm
    adjusted_date_for_filename = post_date - timedelta(hours=4)
    date_str = adjusted_date_for_filename.strftime('%Y%m%d_%H%M%S')
    # Build a mapping of original filenames to media IDs from JSON
    media_id_map = {}
    # Load JSON file to get media IDs from URLs (instaloader may xz-compress it)
    json_files = list(temp_dir.glob('*.json.xz'))
    if not json_files:
        json_files = list(temp_dir.glob('*.json'))
    if json_files:
        try:
            json_file = json_files[0]
            if json_file.suffix == '.xz':
                with lzma.open(json_file, 'rt') as f:
                    data = json.load(f)
            else:
                with open(json_file, 'r') as f:
                    data = json.load(f)
            # Extract media IDs from URLs in carousel or single image
            if 'node' in data:
                node = data['node']
                # Check for carousel in iphone_struct
                if 'iphone_struct' in node and 'carousel_media' in node['iphone_struct']:
                    # Carousel post - each image has its own media ID
                    for idx, item in enumerate(node['iphone_struct']['carousel_media'], 1):
                        if 'image_versions2' in item and 'candidates' in item['image_versions2']:
                            url = item['image_versions2']['candidates'][0]['url']
                            # Extract media ID from the CDN URL's filename segment
                            parts = url.split('/')
                            for part in parts:
                                if '.jpg' in part or '.mp4' in part:
                                    filename = part.split('?')[0]
                                    # Remove extension and _n suffix
                                    media_id = filename.replace('.jpg', '').replace('.mp4', '').replace('_n', '')
                                    # Map the 1-based carousel index to its media ID
                                    media_id_map[str(idx)] = media_id
                                    break
                # Check for single image/video
                elif 'display_url' in node or ('iphone_struct' in node and 'image_versions2' in node['iphone_struct']):
                    # Single post
                    url = node.get('display_url', '')
                    if not url and 'iphone_struct' in node and 'image_versions2' in node['iphone_struct']:
                        url = node['iphone_struct']['image_versions2']['candidates'][0]['url']
                    if url:
                        parts = url.split('/')
                        for part in parts:
                            if '.jpg' in part or '.mp4' in part:
                                filename = part.split('?')[0]
                                media_id = filename.replace('.jpg', '').replace('.mp4', '').replace('_n', '')
                                media_id_map['single'] = media_id
                                break
        except Exception as e:
            # Non-fatal: fall back to fallback_id for every file below
            self.log(f"Could not extract media IDs from JSON: {e}", "debug")
    # Process all downloaded files
    for file_path in temp_dir.iterdir():
        if file_path.is_file():
            # Skip JSON metadata files
            if file_path.suffix.lower() in ['.json', '.xz', '.txt']:
                continue
            # Get file extension
            ext = file_path.suffix.lower()
            # Check if it's a multi-image post (has _1, _2, etc. in filename)
            match = re.search(r'_(\d+)\.(jpg|jpeg|png|mp4|mov)', file_path.name, re.IGNORECASE)
            if match:
                index = match.group(1)
                # Use the media ID for this specific carousel index
                media_id = media_id_map.get(index, fallback_id)
                new_filename = f"{username}_{date_str}_{media_id}{ext}"
            else:
                # Single image/video
                media_id = media_id_map.get('single', fallback_id)
                new_filename = f"{username}_{date_str}_{media_id}{ext}"
            # Move and rename file
            new_path = output_path / new_filename
            shutil.move(str(file_path), str(new_path))
            # Check for duplicate hash before finalizing (hash blacklist persists even if original deleted)
            file_hash = self.unified_db.get_file_hash(str(new_path)) if self.unified_db else None
            if file_hash:
                existing = self.unified_db.get_download_by_file_hash(file_hash)
                if existing and existing.get('file_path') and str(new_path) != existing.get('file_path'):
                    # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
                    self.log(f"⚠ Duplicate content detected (hash match): {new_filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                    # Delete the duplicate regardless of whether original file still exists
                    try:
                        new_path.unlink()
                        self.log(f"Deleted duplicate (hash blacklist): {new_filename}", "debug")
                        continue
                    except Exception as e:
                        # Deletion failed: fall through and keep the duplicate on disk
                        self.log(f"Failed to delete duplicate {new_filename}: {e}", "warning")
            # Set file timestamps to post date
            self._update_file_timestamp(new_path, post_date)
            # Add to processed files list
            processed_files.append(new_filename)
    return processed_files
def _update_file_timestamp(self, filepath: Path, post_date: datetime):
    """Stamp the file's access and modification times with the post date,
    shifted back 4 hours to match the filename's timezone adjustment."""
    import os
    from datetime import timedelta
    try:
        ts = (post_date - timedelta(hours=4)).timestamp()
        os.utime(filepath, (ts, ts))
        self.log(f"Updated timestamp for {filepath.name} (adjusted -4 hours)", "debug")
    except Exception as e:
        # Best-effort: a failed utime shouldn't abort the download pipeline
        self.log(f"Failed to update timestamp: {e}", "warning")
def _download_stories(self, profile, output_path: Path) -> int:
"""Download stories from profile"""
downloaded = 0
if not self.username:
self.log("Login required to download stories", "warning")
return 0
self.log(f"Downloading stories...", "info")
self.activity_manager.update_status("Checking stories")
try:
import instaloader
import shutil
# Get user ID for stories
user_id = profile.userid
# Download stories
for story in self.loader.get_stories([user_id]):
for item in story.get_items():
media_id = str(item.mediaid)
if self._is_already_downloaded(media_id):
self.log(f"Skipping already downloaded story: {media_id}", "debug")
continue
try:
self.log(f"Downloading story {media_id}", "info")
# Download story to temp dir
temp_dir = output_path / f"temp_story_{media_id}"
temp_dir.mkdir(parents=True, exist_ok=True)
self.loader.download_storyitem(item, target=temp_dir)
# Process and move files to match FastDL format
processed_files = self._process_downloaded_files(temp_dir, output_path, profile.username, media_id, item.date)
# Clean up temp directory
shutil.rmtree(temp_dir, ignore_errors=True)
# Only record in database if files were successfully processed
if processed_files:
# Get the first processed filename for database record
filename = processed_files[0] if isinstance(processed_files, list) else None
self._record_download(
post_id=media_id,
username=profile.username,
content_type="story",
filename=filename,
post_date=item.date,
deferred=self.defer_database
)
downloaded += 1
self.activity_manager.update_status(
"Downloading stories",
progress_current=downloaded,
progress_total=max_downloads
)
else:
self.log(f"No files processed for story {media_id}, not recording in database", "warning")
self._smart_delay()
except Exception as e:
self.log(f"Error downloading story {media_id}: {e}", "error")
except Exception as e:
self.log(f"Error downloading stories: {e}", "error")
if "login" in str(e).lower():
self.log("Stories require login!", "warning")
self.log(f"Stories: {downloaded} downloaded", "info")
return downloaded
def _download_reels(self, profile, output_path: Path, max_downloads: int,
date_from: datetime, date_to: datetime) -> int:
"""Download reels from profile"""
downloaded = 0
self.log(f"Downloading reels...", "info")
self.activity_manager.update_status("Checking reels")
try:
# Reels are part of posts, filter for videos
posts = profile.get_posts()
for post in posts:
# Check if it's a reel (video post)
if not post.is_video:
continue
# Check date range
if date_from and post.date < date_from:
break
if date_to and post.date > date_to:
continue
# Check if already downloaded
media_id = str(post.mediaid)
shortcode = post.shortcode
if self._is_already_downloaded(media_id):
self.log(f"Skipping already downloaded reel: {shortcode}", "debug")
continue
try:
self.log(f"Downloading reel {shortcode}", "info")
# Download reel to temp dir
temp_dir = output_path / f"temp_reel_{shortcode}"
temp_dir.mkdir(parents=True, exist_ok=True)
self.loader.download_post(post, target=temp_dir)
# Process and move files to match FastDL format
self._process_downloaded_files(temp_dir, output_path, post.owner_username, media_id, post.date)
# Clean up temp directory
import shutil
shutil.rmtree(temp_dir, ignore_errors=True)
# Record in database
self._record_download(
post_id=media_id,
username=post.owner_username,
content_type="reel",
post_date=post.date,
likes=post.likes,
comments=post.comments,
deferred=self.defer_database
)
downloaded += 1
# Update status
self.activity_manager.update_status(
"Downloading reels",
progress_current=downloaded,
progress_total=max_downloads
)
if max_downloads and downloaded >= max_downloads:
break
self._smart_delay()
except Exception as e:
self.log(f"Error downloading reel {media_id}: {e}", "error")
except Exception as e:
self.log(f"Error downloading reels: {e}", "error")
self.log(f"Reels: {downloaded} downloaded", "info")
return downloaded
def _smart_delay(self, batch_count=0):
"""Smart delay between downloads to avoid detection"""
# Random delay with exponential backoff if needed
base_delay = random.uniform(self.min_delay, self.max_delay)
# Add batch delay if we've downloaded a batch
if batch_count > 0 and batch_count % self.download_batch_size == 0:
self.log(f"Batch limit reached ({self.download_batch_size} items), taking a longer break", "info")
base_delay = self.batch_delay + random.uniform(0, 10)
# Add extra delay if we're downloading fast
elif len(self.request_times) > 10:
recent_requests = self.request_times[-10:]
avg_interval = (recent_requests[-1] - recent_requests[0]) / 9
if avg_interval < 5: # Too fast
base_delay += random.uniform(5, 10)
self.log("Slowing down to avoid detection", "debug")
time.sleep(base_delay)
def login(self, username: str, password: str = None) -> bool:
"""
Login to Instagram and save session
Args:
username: Instagram username
password: Instagram password (will prompt if not provided)
Returns:
True if login successful
"""
try:
if not password:
import getpass
password = getpass.getpass(f"Password for {username}: ")
self.log(f"Logging in as {username}...", "info")
self.loader.login(username, password)
# Save session
# Use configured session file path or default
if self.session_file:
session_file = Path(self.session_file).expanduser()
session_file.parent.mkdir(parents=True, exist_ok=True)
else:
session_file = self.session_dir / f"session-{username}"
self.loader.save_session_to_file(session_file)
self.log(f"Session saved to {session_file}", "success")
self.username = username
return True
except Exception as e:
self.log(f"Login failed: {e}", "error")
if "checkpoint" in str(e).lower():
self.log("Instagram requires verification (checkpoint)", "warning")
self.log("Complete verification in browser, then export session", "info")
elif "bad password" in str(e).lower():
self.log("Invalid username or password", "error")
elif "429" in str(e):
self.log("Too many login attempts, try again later", "error")
return False
def get_database_stats(self) -> Dict:
"""Get database statistics"""
if not self.use_database or not self.unified_db:
return {"enabled": False}
# Use unified database statistics
return self.unified_db.get_statistics(platform='instagram')
# Test function
def test_module():
    """Smoke-test the InstaLoader module with a tiny live download.

    Returns:
        True when at least one item was downloaded.
    """
    print("Testing InstaLoader Module")
    print("=" * 60)
    # BUGFIX: InstaLoaderModule.__init__ has no db_path parameter, so the
    # old `db_path=` keyword raised TypeError before the test could run;
    # database wiring is handled via the unified_db parameter instead.
    module = InstaLoaderModule(
        show_progress=True,
        use_database=True
    )
    # Limited live download so the smoke test stays small
    count = module.download(
        username="evalongoria",
        output_dir="/opt/temp/test/instagram/posts",
        content_type="posts",
        max_downloads=2,
        days_back=30
    )
    print(f"\nDownloaded {count} items")
    # Show stats
    stats = module.get_database_stats()
    print(f"\nDatabase stats:")
    print(f"  Total: {stats.get('total_downloads', 0)}")
    print(f"  By type: {stats.get('by_type', {})}")
    return count > 0
if __name__ == "__main__":
    # Run the smoke test and surface success as the process exit code
    raise SystemExit(0 if test_module() else 1)