1259 lines
56 KiB
Python
Executable File
1259 lines
56 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Instaloader Module for Instagram Downloads
|
|
Based on FastDL module architecture with detection safeguards
|
|
Bypasses Cloudflare by using Instagram directly
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
import os
|
|
import sys
|
|
import time
|
|
import random
|
|
import json
|
|
from typing import Optional, Dict
|
|
import pickle
|
|
from modules.base_module import LoggingMixin
|
|
from modules.instagram_utils import (
|
|
extract_instagram_media_id,
|
|
record_instagram_download,
|
|
is_instagram_downloaded
|
|
)
|
|
|
|
class InstaLoaderModule(LoggingMixin):
|
|
"""
|
|
Instagram downloader using Instaloader with safeguards
|
|
|
|
Features:
|
|
- Rate limiting to avoid detection
|
|
- Session persistence and rotation
|
|
- Database tracking to avoid re-downloads
|
|
- Automatic retry with exponential backoff
|
|
- Detection avoidance strategies
|
|
"""
|
|
|
|
def __init__(self,
             username: Optional[str] = None,
             password: Optional[str] = None,
             session_file: Optional[str] = None,
             totp_secret: Optional[str] = None,
             use_database: bool = True,
             log_callback=None,
             show_progress: bool = True,
             max_rate: int = 100,  # Max requests per hour
             unified_db=None,
             require_valid_session: bool = False):
    """
    Initialize the Instaloader module.

    Args:
        username: Instagram username for login (optional)
        password: Instagram password for reauthorization (optional)
        session_file: Path to saved session file
        totp_secret: TOTP secret key for 2FA (optional)
        use_database: Track downloads in database
        log_callback: Callback for logging (tag, level, message)
        show_progress: Print progress messages (also redirects stderr to the logger)
        max_rate: Maximum requests per hour (rate limiting)
        unified_db: Unified database instance
        require_valid_session: If True, skip downloads if session is invalid

    Side effects: creates the sessions/ directory next to the script root,
    constructs the Instaloader instance (installing the package on demand),
    and may replace sys.stderr (see _init_loader).
    """
    # Initialize logging via mixin FIRST - everything below logs through self.log
    self._init_logger('Instagram', log_callback, default_module='Download')

    # Credentials / configuration
    self.username = username
    self.password = password
    self.totp_secret = totp_secret
    self.session_file = session_file
    self.use_database = use_database
    self.show_progress = show_progress
    self.max_rate = max_rate
    self.unified_db = unified_db
    self.require_valid_session = require_valid_session
    self.session_is_valid = False  # Track session validity; set by reauthorize_session()

    # Rate limiting state: rolling 1-hour window of request timestamps
    self.request_times = []
    self.last_request_time = 0

    # Session management - sessions are stored next to the script root,
    # not the working directory
    script_dir = Path(__file__).parent.parent  # Go up from modules/ to script root
    self.session_dir = script_dir / "sessions"
    self.session_dir.mkdir(parents=True, exist_ok=True)

    # Initialize Instaloader (must come after session_dir/logger setup)
    self.loader = None
    self._init_loader()

    # Debug: Check what credentials we have (without exposing sensitive data)
    self.log(f"Module initialized with username: {self.username}", "debug")
    self.log(f"Password provided: {self.password is not None}", "debug")

    # No separate database initialization needed - using unified database only

    # Initialize activity status manager for real-time updates
    # (imported lazily to avoid a hard dependency at module import time)
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)

    # Detection avoidance settings - increased to avoid Instagram detection
    # Based on GitHub issue #2391 recommendations
    self.min_delay = 5    # Minimum seconds between requests (increased from 3)
    self.max_delay = 15   # Maximum seconds between requests (increased from 10)
    self.error_delay = 120  # Delay after error (increased from 60)
    self.max_retries = 3
    self.download_batch_size = 10  # Download in smaller batches
    self.batch_delay = 30  # Delay between batches (seconds)

    # Downloads recorded with deferred=True accumulate here until the
    # caller persists them (see get_pending_downloads/clear_pending_downloads)
    self.pending_downloads = []
|
|
|
|
def _init_loader(self):
    """Initialize the Instaloader instance with detection safeguards.

    Installs the instaloader package on demand, configures it with
    conservative settings (quiet mode, desktop UA, only 429 fatal), and -
    when show_progress is enabled - replaces sys.stderr with an adapter
    that routes instaloader's output through self.log.
    """
    try:
        import instaloader
    except ImportError:
        # Best-effort self-install; check=False means we retry the import
        # regardless and let it raise if installation failed
        self.log("Installing instaloader...", "info")
        import subprocess
        subprocess.run(
            ["pip", "install", "--quiet", "--break-system-packages", "instaloader"],
            capture_output=True,
            check=False
        )
        import instaloader

    # Suppress instaloader's direct output and redirect to our logger.
    # Implements just enough of the file protocol (write/flush) to be
    # substituted for sys.stderr.
    class LoggerAdapter:
        def __init__(self, parent_log_func):
            self.parent_log = parent_log_func

        def write(self, message):
            if message.strip():
                # Filter and format instaloader messages
                msg = message.strip()
                if 'JSON Query' in msg or '401 Unauthorized' in msg or '403 Forbidden' in msg:
                    # Convert to our format
                    if '401 Unauthorized' in msg:
                        self.parent_log("Session authentication issue - retrying", "warning")
                    elif '403 Forbidden' in msg:
                        self.parent_log("Access forbidden - rate limited", "warning")
                elif 'Error when checking' in msg:
                    self.parent_log("Session validation failed", "debug")
                elif msg and not msg.startswith('['):
                    # Anything else that doesn't look like a progress tag
                    self.parent_log(msg, "debug")

        def flush(self):
            # No buffering - nothing to flush
            pass

    # Configure Instaloader with conservative settings
    self.loader = instaloader.Instaloader(
        quiet=True,  # Always quiet to suppress direct output
        download_videos=True,
        download_video_thumbnails=False,
        download_geotags=False,
        download_comments=False,
        save_metadata=True,  # Need JSON to get media IDs
        compress_json=True,  # Save space with compression
        post_metadata_txt_pattern="",  # Don't save txt files
        storyitem_metadata_txt_pattern="",
        max_connection_attempts=5,  # More retries
        request_timeout=300,
        # Don't treat 403 as fatal - Instagram returns this often
        fatal_status_codes=[429],  # Only stop on rate limit
        # Use a desktop user agent to avoid mobile restrictions
        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    )

    # Redirect stderr to capture instaloader errors.
    # NOTE(review): the original stderr is saved but never restored in this
    # view of the file - confirm teardown handling elsewhere.
    if self.show_progress:
        self._original_stderr = sys.stderr
        sys.stderr = LoggerAdapter(self.log)

    # Don't refresh session at startup - will do it on each download
    self.session_is_valid = False
|
|
|
|
def _rate_limit(self):
|
|
"""Rate limiting to avoid detection"""
|
|
current_time = time.time()
|
|
|
|
# Clean old request times (older than 1 hour)
|
|
self.request_times = [t for t in self.request_times if current_time - t < 3600]
|
|
|
|
# Check rate limit (be more conservative)
|
|
if len(self.request_times) >= self.max_rate:
|
|
# Calculate wait time
|
|
oldest_request = min(self.request_times)
|
|
wait_time = 3600 - (current_time - oldest_request) + random.uniform(5, 15)
|
|
if wait_time > 0:
|
|
self.log(f"Rate limit reached, waiting {wait_time:.0f} seconds", "warning")
|
|
time.sleep(wait_time)
|
|
|
|
# Add longer random delay between requests to avoid 403
|
|
if self.last_request_time > 0:
|
|
elapsed = current_time - self.last_request_time
|
|
# Increase delays to avoid detection
|
|
min_wait = random.uniform(self.min_delay * 2, self.max_delay * 2)
|
|
if elapsed < min_wait:
|
|
wait = min_wait - elapsed
|
|
self.log(f"Waiting {wait:.1f}s between requests", "debug")
|
|
time.sleep(wait)
|
|
|
|
# Record request
|
|
self.request_times.append(current_time)
|
|
self.last_request_time = current_time
|
|
|
|
def is_ready(self) -> bool:
|
|
"""Check if the module is ready to download (will refresh session on download)"""
|
|
# Always return True since we refresh session on each download
|
|
return True
|
|
|
|
|
|
def _load_session(self):
|
|
"""Load saved session"""
|
|
import pickle
|
|
# Try multiple session sources
|
|
session_loaded = False
|
|
|
|
# 1. Try provided session file (pickle format)
|
|
if self.session_file:
|
|
session_path = Path(self.session_file).expanduser()
|
|
if session_path.exists():
|
|
try:
|
|
with open(session_path, 'rb') as f:
|
|
session_data = pickle.load(f)
|
|
# Set cookies directly
|
|
self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/')
|
|
if session_data.get('csrftoken'):
|
|
self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/')
|
|
# Just set the username and mark as loaded
|
|
try:
|
|
self.loader.context.username = self.username
|
|
self.log(f"Session loaded from {self.session_file}", "success")
|
|
session_loaded = True
|
|
except Exception as e:
|
|
self.log(f"Error setting session username: {e}", "warning")
|
|
session_loaded = False
|
|
except Exception as e:
|
|
self.log(f"Could not load session file: {e}", "warning")
|
|
|
|
# 2. Try saved sessions directory (pickle format)
|
|
if not session_loaded and self.username:
|
|
session_path = self.session_dir / f"session-{self.username}"
|
|
if session_path.exists():
|
|
try:
|
|
with open(session_path, 'rb') as f:
|
|
session_data = pickle.load(f)
|
|
self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/')
|
|
if session_data.get('csrftoken'):
|
|
self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/')
|
|
# Set the username on the context
|
|
self.loader.context.username = self.username
|
|
self.log(f"Session loaded for {self.username}", "success")
|
|
session_loaded = True
|
|
except Exception as e:
|
|
self.log(f"Could not load saved session: {e}", "warning")
|
|
|
|
# 3. Try script sessions directory (pickle format)
|
|
if not session_loaded and self.username:
|
|
script_session = self.session_dir / f"session-{self.username}"
|
|
if script_session.exists():
|
|
try:
|
|
with open(script_session, 'rb') as f:
|
|
session_data = pickle.load(f)
|
|
self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/')
|
|
if session_data.get('csrftoken'):
|
|
self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/')
|
|
# Set the username on the context
|
|
self.loader.context.username = self.username
|
|
self.log(f"Session loaded from {script_session}", "success")
|
|
session_loaded = True
|
|
except Exception as e:
|
|
self.log(f"Could not load session: {e}", "warning")
|
|
|
|
if not session_loaded:
|
|
self.log("No session loaded - anonymous access only", "warning")
|
|
self.log("Some features may be limited without login", "warning")
|
|
# Don't validate here - it will be done after _load_session returns
|
|
|
|
def reauthorize_session(self, force_new: bool = False) -> bool:
|
|
"""
|
|
Reauthorize Instagram session using stored credentials
|
|
|
|
Args:
|
|
force_new: Force a completely new login even if session exists
|
|
|
|
Returns:
|
|
True if reauthorization successful, False otherwise
|
|
"""
|
|
if not self.username or not self.password:
|
|
self.log("Cannot reauthorize - no credentials available", "error")
|
|
self.log("Please provide username and password in config", "info")
|
|
return False
|
|
|
|
# Always create fresh session for each download
|
|
if self.totp_secret:
|
|
self.log("Using CLI method for login with 2FA", "info")
|
|
try:
|
|
try:
|
|
import pyotp
|
|
except ImportError:
|
|
self.log("pyotp not installed, attempting to install...", "warning")
|
|
import subprocess
|
|
import sys
|
|
subprocess.check_call([sys.executable, "-m", "pip", "install", "pyotp"])
|
|
import pyotp
|
|
import subprocess
|
|
import pickle
|
|
|
|
# Generate 2FA code
|
|
totp = pyotp.TOTP(self.totp_secret)
|
|
two_factor_code = totp.now()
|
|
self.log(f"Generated 2FA code: {two_factor_code}", "info")
|
|
|
|
# Use instaloader CLI with the 2FA code
|
|
# Use configured session file path or default
|
|
if self.session_file:
|
|
session_file = Path(self.session_file).expanduser()
|
|
session_file.parent.mkdir(parents=True, exist_ok=True)
|
|
else:
|
|
session_file = self.session_dir / f"session-{self.username}"
|
|
# Pass password as separate argument to avoid shell escaping issues
|
|
cmd = [
|
|
'instaloader',
|
|
'--login', self.username,
|
|
'--password', self.password,
|
|
'--sessionfile', str(session_file)
|
|
]
|
|
|
|
self.log("Using instaloader CLI for login...", "info")
|
|
self.log(f"Debug - Command: instaloader --login {self.username} --password [HIDDEN] --sessionfile {session_file}", "debug")
|
|
self.log(f"Debug - Password length being passed: {len(self.password)}", "debug")
|
|
|
|
# Run with 2FA code piped via stdin (avoids shell=True security risk)
|
|
self.log(f"Running command with 2FA code via stdin", "debug")
|
|
|
|
result = subprocess.run(
|
|
cmd,
|
|
input=two_factor_code + '\n',
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=30
|
|
)
|
|
|
|
# Check if login was successful by looking for success messages and session file
|
|
login_success = ("Logged in as" in result.stdout and
|
|
"Saved session to" in result.stdout and
|
|
session_file.exists())
|
|
|
|
if login_success:
|
|
self.log("Successfully logged in via CLI", "success")
|
|
|
|
# Wait a moment for file to be fully written
|
|
time.sleep(1)
|
|
|
|
# Load the new session
|
|
with open(session_file, 'rb') as f:
|
|
session_data = pickle.load(f)
|
|
|
|
# Apply session to our loader
|
|
self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/')
|
|
if session_data.get('csrftoken'):
|
|
self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/')
|
|
|
|
# Set username in context
|
|
self.loader.context.username = self.username
|
|
|
|
self.session_is_valid = True
|
|
return True
|
|
else:
|
|
# Log details about the failure
|
|
if not session_file.exists():
|
|
self.log("Session file was not created", "error")
|
|
self.log(f"CLI stdout: {result.stdout[:300]}", "info")
|
|
if result.stderr:
|
|
self.log(f"CLI stderr: {result.stderr[:200]}", "info")
|
|
return False
|
|
except Exception as e:
|
|
self.log(f"CLI login error: {str(e)[:100]}", "error")
|
|
# Fall back to Python API method
|
|
pass
|
|
|
|
# Fallback to Python API login
|
|
try:
|
|
import instaloader
|
|
import pickle
|
|
|
|
# Clear existing session if force_new
|
|
if force_new:
|
|
self.log("Clearing existing session for fresh login", "info")
|
|
# Clear all cookies properly to avoid duplicate sessionid issue
|
|
try:
|
|
self.loader.context._session.cookies.clear()
|
|
except Exception:
|
|
pass
|
|
# Create a fresh loader instance to avoid cookie conflicts
|
|
self.loader = instaloader.Instaloader(
|
|
download_pictures=False,
|
|
download_videos=False,
|
|
download_video_thumbnails=False,
|
|
compress_json=False,
|
|
save_metadata=False,
|
|
post_metadata_txt_pattern="",
|
|
quiet=True,
|
|
fatal_status_codes=[],
|
|
max_connection_attempts=3
|
|
)
|
|
|
|
# Add delay before login attempt to avoid rate limiting
|
|
import random
|
|
delay = random.uniform(3, 5)
|
|
self.log(f"Waiting {delay:.1f}s before login attempt", "debug")
|
|
time.sleep(delay)
|
|
|
|
# Attempt login
|
|
self.log(f"Logging in as {self.username}...", "info")
|
|
try:
|
|
self.loader.login(self.username, self.password)
|
|
|
|
# Save the new session
|
|
# Use configured session file path or default
|
|
if self.session_file:
|
|
session_file = Path(self.session_file).expanduser()
|
|
session_file.parent.mkdir(parents=True, exist_ok=True)
|
|
else:
|
|
session_file = self.session_dir / f"session-{self.username}"
|
|
session_data = {
|
|
'sessionid': self.loader.context._session.cookies.get('sessionid'),
|
|
'csrftoken': self.loader.context._session.cookies.get('csrftoken'),
|
|
'username': self.username,
|
|
'timestamp': datetime.now().isoformat()
|
|
}
|
|
|
|
with open(session_file, 'wb') as f:
|
|
pickle.dump(session_data, f)
|
|
|
|
self.log(f"Session saved to {session_file}", "success")
|
|
|
|
self.session_is_valid = True
|
|
self.log("Successfully reauthorized session", "success")
|
|
return True
|
|
|
|
except instaloader.exceptions.BadCredentialsException:
|
|
self.log("Invalid username or password", "error")
|
|
self.log("Please check your Instagram credentials in the config file", "info")
|
|
self.log("The password may have been changed or the account may be locked", "info")
|
|
return False
|
|
except instaloader.exceptions.TwoFactorAuthRequiredException:
|
|
self.log("Two-factor authentication required", "info")
|
|
|
|
# Use subprocess to call instaloader CLI which handles 2FA better
|
|
if self.totp_secret:
|
|
try:
|
|
try:
|
|
import pyotp
|
|
except ImportError:
|
|
self.log("pyotp not installed, attempting to install...", "warning")
|
|
import sys
|
|
subprocess.check_call([sys.executable, "-m", "pip", "install", "pyotp"])
|
|
import pyotp
|
|
import subprocess
|
|
|
|
# Generate 2FA code
|
|
totp = pyotp.TOTP(self.totp_secret)
|
|
two_factor_code = totp.now()
|
|
self.log(f"Generated 2FA code: {two_factor_code}", "info")
|
|
|
|
# Use instaloader CLI with the 2FA code
|
|
# Use configured session file path or default
|
|
if self.session_file:
|
|
session_file = Path(self.session_file).expanduser()
|
|
session_file.parent.mkdir(parents=True, exist_ok=True)
|
|
else:
|
|
session_file = self.session_dir / f"session-{self.username}"
|
|
# Pass password as separate argument to avoid shell escaping issues
|
|
cmd = [
|
|
'instaloader',
|
|
'--login', self.username,
|
|
'--password', self.password,
|
|
'--sessionfile', str(session_file)
|
|
]
|
|
|
|
self.log("Using instaloader CLI for 2FA login...", "info")
|
|
|
|
# Run with 2FA code as input
|
|
result = subprocess.run(
|
|
cmd,
|
|
input=f"{two_factor_code}\n",
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=30
|
|
)
|
|
|
|
# Check if login was successful by looking for success messages and session file
|
|
login_success = ("Logged in as" in result.stdout and
|
|
"Saved session to" in result.stdout and
|
|
session_file.exists())
|
|
|
|
if login_success:
|
|
self.log("Successfully logged in with 2FA via CLI", "success")
|
|
|
|
# Wait a moment for file to be fully written
|
|
time.sleep(1)
|
|
|
|
# Load the new session
|
|
import pickle
|
|
with open(session_file, 'rb') as f:
|
|
session_data = pickle.load(f)
|
|
|
|
# Apply session to our loader
|
|
self.loader.context._session.cookies.set('sessionid', session_data['sessionid'], domain='.instagram.com', path='/')
|
|
if session_data.get('csrftoken'):
|
|
self.loader.context._session.cookies.set('csrftoken', session_data['csrftoken'], domain='.instagram.com', path='/')
|
|
|
|
# Set username in context
|
|
self.loader.context.username = self.username
|
|
|
|
self.session_is_valid = True
|
|
return True
|
|
else:
|
|
# Log details about the failure
|
|
if not session_file.exists():
|
|
self.log("Session file was not created", "error")
|
|
else:
|
|
self.log("Login output unclear, treating as failure", "error")
|
|
self.log(f"CLI output: {result.stdout[:300]}", "debug")
|
|
return False
|
|
|
|
except Exception as e:
|
|
self.log(f"2FA login failed: {str(e)[:100]}", "error")
|
|
return False
|
|
else:
|
|
self.log("No 2FA code available - login failed", "error")
|
|
self.log("Options:", "info")
|
|
self.log("1. Add 'totp_secret' to config with your 2FA secret key", "info")
|
|
self.log(f"2. Create file: {self.session_dir}/2fa_code_USERNAME.txt with code", "info")
|
|
self.log("3. Run interactively to enter code when prompted", "info")
|
|
return False
|
|
except instaloader.exceptions.ConnectionException as e:
|
|
if "checkpoint" in str(e).lower():
|
|
self.log("Instagram checkpoint required - please verify account in browser", "error")
|
|
elif "429" in str(e):
|
|
self.log("Rate limited during login - please wait before retrying", "error")
|
|
else:
|
|
self.log(f"Connection error during login: {str(e)[:100]}", "error")
|
|
return False
|
|
|
|
except Exception as e:
|
|
self.log(f"Reauthorization failed: {str(e)[:100]}", "error")
|
|
return False
|
|
|
|
|
|
def _is_already_downloaded(self, post_id: str) -> bool:
|
|
"""Check if post has been downloaded (uses centralized function for cross-module detection)"""
|
|
if not self.use_database or not self.unified_db:
|
|
return False
|
|
|
|
# Check by URL first
|
|
url = f"https://www.instagram.com/p/{post_id}/"
|
|
if self.unified_db.is_downloaded(url):
|
|
return True
|
|
|
|
# Use centralized function for consistent cross-module detection
|
|
return is_instagram_downloaded(self.unified_db, str(post_id))
|
|
|
|
def _record_download(self, post_id: str, username: str, content_type: str,
|
|
filename: str = None, post_date: datetime = None,
|
|
caption: str = None, likes: int = None, comments: int = None,
|
|
deferred: bool = False):
|
|
"""Record successful download (uses centralized function for normalized media_id)
|
|
|
|
Args:
|
|
deferred: If True, don't record to database now - add to pending_downloads list
|
|
for later recording after file move is complete
|
|
"""
|
|
# Use centralized function for consistent cross-module storage
|
|
url = f"https://www.instagram.com/p/{post_id}/"
|
|
extra_metadata = {
|
|
'username': username,
|
|
'caption': caption[:500] if caption else None,
|
|
'likes': likes,
|
|
'comments': comments
|
|
}
|
|
|
|
# If deferred, store for later recording instead of recording now
|
|
if deferred:
|
|
self.pending_downloads.append({
|
|
'media_id': str(post_id),
|
|
'username': username,
|
|
'filename': filename,
|
|
'url': url,
|
|
'post_date': post_date.isoformat() if post_date else None,
|
|
'content_type': content_type,
|
|
'metadata': extra_metadata
|
|
})
|
|
self.log(f"Deferred recording for {post_id}", "debug")
|
|
return True
|
|
|
|
if not self.use_database or not self.unified_db:
|
|
return
|
|
|
|
record_instagram_download(
|
|
db=self.unified_db,
|
|
media_id=str(post_id),
|
|
username=username,
|
|
content_type=content_type,
|
|
filename=filename,
|
|
url=url,
|
|
post_date=post_date,
|
|
method='instaloader',
|
|
extra_metadata=extra_metadata
|
|
)
|
|
|
|
def get_pending_downloads(self):
|
|
"""Get list of downloads that were deferred for later recording"""
|
|
return self.pending_downloads.copy()
|
|
|
|
def clear_pending_downloads(self):
    """Clear the pending downloads list after they've been recorded.

    Rebinds (rather than mutates) the list, so snapshots previously
    returned by get_pending_downloads() are unaffected.
    """
    self.pending_downloads = []
|
|
|
|
def download(self, username: str, output_dir: str = "downloads",
|
|
content_type: str = "posts", max_downloads: int = None,
|
|
days_back: int = None, date_from: datetime = None,
|
|
date_to: datetime = None, defer_database: bool = False) -> int:
|
|
"""
|
|
Download content from Instagram user
|
|
|
|
Args:
|
|
username: Instagram username to download from
|
|
output_dir: Directory to save downloads
|
|
content_type: Type of content (posts, stories, reels, all)
|
|
max_downloads: Maximum number to download
|
|
days_back: Download content from last N days
|
|
date_from: Start date for downloads
|
|
date_to: End date for downloads
|
|
defer_database: If True, don't record to database immediately - store in
|
|
pending_downloads for later recording after file move is complete
|
|
|
|
Returns:
|
|
Number of items downloaded
|
|
"""
|
|
self.defer_database = defer_database # Store for use in _record_download
|
|
# Refresh session before each download
|
|
if self.username and self.password:
|
|
self.log("Refreshing session for download...", "info")
|
|
if not self.reauthorize_session():
|
|
self.log("Failed to refresh session", "error")
|
|
if self.require_valid_session:
|
|
self.log(f"Skipping download for @{username} - session refresh failed and require_valid_session is True", "warning")
|
|
return 0
|
|
self.session_is_valid = False
|
|
else:
|
|
self.session_is_valid = True
|
|
self.log(f"Session ready for @{username}", "success")
|
|
elif self.require_valid_session:
|
|
self.log(f"Skipping download for @{username} - no credentials and require_valid_session is True", "warning")
|
|
return 0
|
|
|
|
output_path = Path(output_dir)
|
|
output_path.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Setup date filtering
|
|
if days_back and not date_from:
|
|
date_from = datetime.now() - timedelta(days=days_back)
|
|
|
|
if not date_to:
|
|
date_to = datetime.now()
|
|
|
|
self.log(f"Downloading {content_type} for @{username}", "info")
|
|
if date_from:
|
|
self.log(f"Date range: {date_from.strftime('%Y-%m-%d')} to {date_to.strftime('%Y-%m-%d')}", "info")
|
|
|
|
downloaded_count = 0
|
|
skipped_count = 0
|
|
error_count = 0
|
|
|
|
try:
|
|
# Get profile with retry
|
|
profile = self._get_profile_with_retry(username)
|
|
if not profile:
|
|
return 0
|
|
|
|
# Download based on content type
|
|
if content_type in ["posts", "all"]:
|
|
count = self._download_posts(profile, output_path, max_downloads,
|
|
date_from, date_to)
|
|
downloaded_count += count
|
|
|
|
if content_type in ["stories", "all"]:
|
|
count = self._download_stories(profile, output_path)
|
|
downloaded_count += count
|
|
|
|
if content_type in ["reels", "all"]:
|
|
count = self._download_reels(profile, output_path, max_downloads,
|
|
date_from, date_to)
|
|
downloaded_count += count
|
|
|
|
except Exception as e:
|
|
self.log(f"Download error: {e}", "error")
|
|
|
|
# Handle specific errors
|
|
if "429" in str(e) or "rate" in str(e).lower():
|
|
self.log("Rate limited by Instagram! Try again later.", "error")
|
|
self.log(f"Waiting {self.error_delay} seconds...", "warning")
|
|
time.sleep(self.error_delay)
|
|
elif "login" in str(e).lower():
|
|
self.log("Login required for this content!", "error")
|
|
self.log("Create a session file first (see documentation)", "info")
|
|
elif "not found" in str(e).lower():
|
|
self.log(f"User {username} not found or private", "error")
|
|
|
|
self.log(f"Download complete: {downloaded_count} downloaded", "success")
|
|
return downloaded_count
|
|
|
|
def _get_profile_with_retry(self, username: str):
    """Fetch an Instagram profile, retrying with increasing backoff.

    Retries up to self.max_retries times, waiting error_delay * attempt
    between tries; the final failure re-raises to the caller.
    """
    import instaloader

    # Make sure a loader exists before touching its context
    if not self.loader:
        self._init_loader()

    last_attempt = self.max_retries - 1
    for attempt in range(self.max_retries):
        try:
            self.log(f"Fetching profile: {username}", "info")

            # The loader context can be lost; rebuild it if needed
            context_missing = (not hasattr(self.loader, 'context')
                               or self.loader.context is None)
            if context_missing:
                self.log("Reinitializing loader context...", "debug")
                self._init_loader()

            profile = instaloader.Profile.from_username(self.loader.context, username)

            self.log(f"Profile found: {profile.full_name} ({profile.mediacount} posts)", "success")

            # Warn early - private unfollowed profiles yield little content
            if profile.is_private and not profile.followed_by_viewer:
                self.log("Profile is private and not followed", "warning")

            return profile

        except Exception as e:
            if attempt == last_attempt:
                self.log(f"Failed to get profile after {self.max_retries} attempts", "error")
                raise
            backoff = self.error_delay * (attempt + 1)
            self.log(f"Error getting profile (attempt {attempt + 1}): {e}", "warning")
            self.log(f"Retrying in {backoff} seconds...", "info")
            time.sleep(backoff)

    return None
|
|
|
|
def _download_posts(self, profile, output_path: Path, max_downloads: int,
                    date_from: datetime, date_to: datetime) -> int:
    """Download posts from a profile into output_path.

    Iterates newest-to-oldest (assumes profile.get_posts() order - confirm
    against instaloader docs), stopping once posts fall before date_from.
    Each post is downloaded to a temp dir, post-processed into FastDL
    naming, recorded in the database, then the temp dir is removed.

    Returns the number of posts downloaded. Returns 0 immediately (and
    invalidates the session) when Instagram responds with 401/403-style
    blocks while iterating.

    NOTE(review): relies on self.defer_database, which is set by
    download() - calling this directly without download() first would
    raise AttributeError.
    """
    downloaded = 0
    skipped = 0

    self.log(f"Downloading posts...", "info")
    self.activity_manager.update_status("Checking posts")

    try:
        posts = profile.get_posts()

        for post in posts:
            # Check date range: posts older than date_from end the scan
            # (posts are assumed ordered newest first)
            if date_from and post.date < date_from:
                self.log(f"Reached posts older than date range, stopping", "info")
                break

            # Posts newer than date_to are skipped but iteration continues
            if date_to and post.date > date_to:
                continue

            # Check if already downloaded
            media_id = str(post.mediaid)
            shortcode = post.shortcode
            if self._is_already_downloaded(media_id):
                self.log(f"Skipping already downloaded: {shortcode}", "debug")
                skipped += 1
                continue

            # Download post
            try:
                self.log(f"Downloading post {shortcode} from {post.date.strftime('%Y-%m-%d')}", "info")

                # Create temp directory for instaloader so partial files
                # never land in the final output directory
                temp_dir = output_path / f"temp_{shortcode}"
                temp_dir.mkdir(parents=True, exist_ok=True)

                # Download with Instaloader to temp dir
                self.loader.download_post(post, target=temp_dir)

                # Move and rename files to match FastDL format
                self._process_downloaded_files(temp_dir, output_path, post.owner_username, media_id, post.date)

                # Clean up temp directory
                import shutil
                shutil.rmtree(temp_dir, ignore_errors=True)

                # Record in database (deferred when download() was called
                # with defer_database=True)
                self._record_download(
                    post_id=media_id,
                    username=post.owner_username,
                    content_type="post",
                    post_date=post.date,
                    caption=post.caption[:500] if post.caption else None,
                    likes=post.likes,
                    comments=post.comments,
                    deferred=self.defer_database
                )

                downloaded += 1

                # Update status for real-time progress display
                self.activity_manager.update_status(
                    "Downloading posts",
                    progress_current=downloaded,
                    progress_total=max_downloads
                )

                # Check max downloads
                if max_downloads and downloaded >= max_downloads:
                    self.log(f"Reached max downloads ({max_downloads})", "info")
                    break

                # Random delay to avoid detection (defined elsewhere in class)
                self._smart_delay(downloaded)

            except Exception as e:
                # Per-post failures don't abort the run unless rate limited
                self.log(f"Error downloading post {media_id}: {e}", "error")
                if "429" in str(e):
                    self.log("Rate limited! Stopping downloads.", "error")
                    break

    except Exception as e:
        error_msg = str(e)
        self.log(f"Error iterating posts: {e}", "error")

        # Check if Instagram is blocking us
        if "401" in error_msg or "Please wait a few minutes" in error_msg:
            self.log("Instagram is blocking requests - session may be compromised", "error")
            self.log("Aborting all downloads to prevent further issues", "error")
            # Mark session as invalid to prevent further attempts
            self.session_is_valid = False
            return 0
        elif "403" in error_msg or "forbidden" in error_msg.lower():
            self.log("Access forbidden - Instagram has blocked this session", "error")
            self.session_is_valid = False
            return 0

    self.log(f"Posts: {downloaded} downloaded, {skipped} skipped", "info")
    return downloaded
|
|
|
|
    def _process_downloaded_files(self, temp_dir: Path, output_path: Path, username: str, fallback_id: str, post_date: datetime):
        """Process downloaded files to match FastDL naming and timestamps

        Renames each media file in ``temp_dir`` to the
        ``{username}_{YYYYmmdd_HHMMSS}_{media_id}{ext}`` convention, moves it
        into ``output_path``, drops hash-duplicate files, and stamps the
        post date onto the surviving files.

        Args:
            temp_dir: Directory Instaloader downloaded into (media + JSON sidecars).
            output_path: Final destination directory for renamed files.
            username: Profile username used as the filename prefix.
            fallback_id: Media ID to use when one cannot be parsed from the JSON.
            post_date: Post datetime used for the filename timestamp and file mtime.

        Returns:
            list: List of processed filenames, or empty list if no files processed
        """
        import shutil
        import re
        import json
        import lzma
        from datetime import timedelta

        processed_files = []

        # Format date for filename - subtract 4 hours for timezone adjustment
        # NOTE(review): presumably a UTC -> US-Eastern shift to match FastDL
        # filenames — confirm; it mirrors _update_file_timestamp's adjustment.
        adjusted_date_for_filename = post_date - timedelta(hours=4)
        date_str = adjusted_date_for_filename.strftime('%Y%m%d_%H%M%S')

        # Build a mapping of original filenames to media IDs from JSON
        # Keys are carousel slot indices as strings ("1", "2", ...) or the
        # literal key 'single' for non-carousel posts.
        media_id_map = {}

        # Load JSON file to get media IDs from URLs
        # Instaloader writes compressed sidecars (.json.xz) by default; fall
        # back to plain .json if compression is disabled.
        json_files = list(temp_dir.glob('*.json.xz'))
        if not json_files:
            json_files = list(temp_dir.glob('*.json'))

        if json_files:
            try:
                json_file = json_files[0]
                if json_file.suffix == '.xz':
                    with lzma.open(json_file, 'rt') as f:
                        data = json.load(f)
                else:
                    with open(json_file, 'r') as f:
                        data = json.load(f)

                # Extract media IDs from URLs in carousel or single image
                if 'node' in data:
                    node = data['node']

                    # Check for carousel in iphone_struct
                    if 'iphone_struct' in node and 'carousel_media' in node['iphone_struct']:
                        # Carousel post - each image has its own media ID
                        # enumerate from 1 to match Instaloader's _1, _2 file suffixes
                        for idx, item in enumerate(node['iphone_struct']['carousel_media'], 1):
                            if 'image_versions2' in item and 'candidates' in item['image_versions2']:
                                url = item['image_versions2']['candidates'][0]['url']
                                # Extract media ID from URL
                                parts = url.split('/')
                                for part in parts:
                                    if '.jpg' in part or '.mp4' in part:
                                        # Strip the query string, keep the bare filename
                                        filename = part.split('?')[0]
                                        # Remove extension and _n suffix
                                        media_id = filename.replace('.jpg', '').replace('.mp4', '').replace('_n', '')
                                        # Map the index to media ID
                                        media_id_map[str(idx)] = media_id
                                        break

                    # Check for single image/video
                    elif 'display_url' in node or ('iphone_struct' in node and 'image_versions2' in node['iphone_struct']):
                        # Single post
                        url = node.get('display_url', '')
                        if not url and 'iphone_struct' in node and 'image_versions2' in node['iphone_struct']:
                            url = node['iphone_struct']['image_versions2']['candidates'][0]['url']

                        if url:
                            parts = url.split('/')
                            for part in parts:
                                if '.jpg' in part or '.mp4' in part:
                                    filename = part.split('?')[0]
                                    media_id = filename.replace('.jpg', '').replace('.mp4', '').replace('_n', '')
                                    media_id_map['single'] = media_id
                                    break

            except Exception as e:
                # Best-effort: fall back to fallback_id for every file below.
                self.log(f"Could not extract media IDs from JSON: {e}", "debug")

        # Process all downloaded files
        for file_path in temp_dir.iterdir():
            if file_path.is_file():
                # Skip JSON metadata files
                if file_path.suffix.lower() in ['.json', '.xz', '.txt']:
                    continue

                # Get file extension
                ext = file_path.suffix.lower()

                # Check if it's a multi-image post (has _1, _2, etc. in filename)
                match = re.search(r'_(\d+)\.(jpg|jpeg|png|mp4|mov)', file_path.name, re.IGNORECASE)
                if match:
                    index = match.group(1)
                    # Use the media ID for this specific index
                    media_id = media_id_map.get(index, fallback_id)
                    new_filename = f"{username}_{date_str}_{media_id}{ext}"
                else:
                    # Single image/video
                    media_id = media_id_map.get('single', fallback_id)
                    new_filename = f"{username}_{date_str}_{media_id}{ext}"

                # Move and rename file
                new_path = output_path / new_filename
                shutil.move(str(file_path), str(new_path))

                # Check for duplicate hash before finalizing (hash blacklist persists even if original deleted)
                # The hash is computed on the moved file; unified_db may be
                # None (database disabled), in which case dedup is skipped.
                file_hash = self.unified_db.get_file_hash(str(new_path)) if self.unified_db else None
                if file_hash:
                    existing = self.unified_db.get_download_by_file_hash(file_hash)
                    if existing and existing.get('file_path') and str(new_path) != existing.get('file_path'):
                        # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
                        self.log(f"⚠ Duplicate content detected (hash match): {new_filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                        # Delete the duplicate regardless of whether original file still exists
                        try:
                            new_path.unlink()
                            self.log(f"Deleted duplicate (hash blacklist): {new_filename}", "debug")
                            # Skip timestamping/recording for the removed duplicate
                            continue
                        except Exception as e:
                            # NOTE(review): on a failed delete the file falls
                            # through and IS recorded below — looks deliberate
                            # (best-effort), but confirm.
                            self.log(f"Failed to delete duplicate {new_filename}: {e}", "warning")

                # Set file timestamps to post date
                self._update_file_timestamp(new_path, post_date)

                # Add to processed files list
                processed_files.append(new_filename)

        return processed_files
|
|
|
|
def _update_file_timestamp(self, filepath: Path, post_date: datetime):
|
|
"""Update file timestamps to match post date"""
|
|
try:
|
|
import os
|
|
# Convert datetime to timestamp with 4-hour adjustment
|
|
from datetime import timedelta
|
|
adjusted_date = post_date - timedelta(hours=4)
|
|
timestamp = adjusted_date.timestamp()
|
|
# Set both access and modification time
|
|
os.utime(filepath, (timestamp, timestamp))
|
|
self.log(f"Updated timestamp for {filepath.name} (adjusted -4 hours)", "debug")
|
|
except Exception as e:
|
|
self.log(f"Failed to update timestamp: {e}", "warning")
|
|
|
|
def _download_stories(self, profile, output_path: Path) -> int:
|
|
"""Download stories from profile"""
|
|
downloaded = 0
|
|
|
|
if not self.username:
|
|
self.log("Login required to download stories", "warning")
|
|
return 0
|
|
|
|
self.log(f"Downloading stories...", "info")
|
|
self.activity_manager.update_status("Checking stories")
|
|
|
|
try:
|
|
import instaloader
|
|
import shutil
|
|
|
|
# Get user ID for stories
|
|
user_id = profile.userid
|
|
|
|
# Download stories
|
|
for story in self.loader.get_stories([user_id]):
|
|
for item in story.get_items():
|
|
media_id = str(item.mediaid)
|
|
|
|
if self._is_already_downloaded(media_id):
|
|
self.log(f"Skipping already downloaded story: {media_id}", "debug")
|
|
continue
|
|
|
|
try:
|
|
self.log(f"Downloading story {media_id}", "info")
|
|
|
|
# Download story to temp dir
|
|
temp_dir = output_path / f"temp_story_{media_id}"
|
|
temp_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
self.loader.download_storyitem(item, target=temp_dir)
|
|
|
|
# Process and move files to match FastDL format
|
|
processed_files = self._process_downloaded_files(temp_dir, output_path, profile.username, media_id, item.date)
|
|
|
|
# Clean up temp directory
|
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
|
|
|
# Only record in database if files were successfully processed
|
|
if processed_files:
|
|
# Get the first processed filename for database record
|
|
filename = processed_files[0] if isinstance(processed_files, list) else None
|
|
self._record_download(
|
|
post_id=media_id,
|
|
username=profile.username,
|
|
content_type="story",
|
|
filename=filename,
|
|
post_date=item.date,
|
|
deferred=self.defer_database
|
|
)
|
|
downloaded += 1
|
|
self.activity_manager.update_status(
|
|
"Downloading stories",
|
|
progress_current=downloaded,
|
|
progress_total=max_downloads
|
|
)
|
|
else:
|
|
self.log(f"No files processed for story {media_id}, not recording in database", "warning")
|
|
|
|
self._smart_delay()
|
|
|
|
except Exception as e:
|
|
self.log(f"Error downloading story {media_id}: {e}", "error")
|
|
|
|
except Exception as e:
|
|
self.log(f"Error downloading stories: {e}", "error")
|
|
if "login" in str(e).lower():
|
|
self.log("Stories require login!", "warning")
|
|
|
|
self.log(f"Stories: {downloaded} downloaded", "info")
|
|
return downloaded
|
|
|
|
def _download_reels(self, profile, output_path: Path, max_downloads: int,
|
|
date_from: datetime, date_to: datetime) -> int:
|
|
"""Download reels from profile"""
|
|
downloaded = 0
|
|
|
|
self.log(f"Downloading reels...", "info")
|
|
self.activity_manager.update_status("Checking reels")
|
|
|
|
try:
|
|
# Reels are part of posts, filter for videos
|
|
posts = profile.get_posts()
|
|
|
|
for post in posts:
|
|
# Check if it's a reel (video post)
|
|
if not post.is_video:
|
|
continue
|
|
|
|
# Check date range
|
|
if date_from and post.date < date_from:
|
|
break
|
|
|
|
if date_to and post.date > date_to:
|
|
continue
|
|
|
|
# Check if already downloaded
|
|
media_id = str(post.mediaid)
|
|
shortcode = post.shortcode
|
|
if self._is_already_downloaded(media_id):
|
|
self.log(f"Skipping already downloaded reel: {shortcode}", "debug")
|
|
continue
|
|
|
|
try:
|
|
self.log(f"Downloading reel {shortcode}", "info")
|
|
|
|
# Download reel to temp dir
|
|
temp_dir = output_path / f"temp_reel_{shortcode}"
|
|
temp_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
self.loader.download_post(post, target=temp_dir)
|
|
|
|
# Process and move files to match FastDL format
|
|
self._process_downloaded_files(temp_dir, output_path, post.owner_username, media_id, post.date)
|
|
|
|
# Clean up temp directory
|
|
import shutil
|
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
|
|
|
# Record in database
|
|
self._record_download(
|
|
post_id=media_id,
|
|
username=post.owner_username,
|
|
content_type="reel",
|
|
post_date=post.date,
|
|
likes=post.likes,
|
|
comments=post.comments,
|
|
deferred=self.defer_database
|
|
)
|
|
|
|
downloaded += 1
|
|
|
|
# Update status
|
|
self.activity_manager.update_status(
|
|
"Downloading reels",
|
|
progress_current=downloaded,
|
|
progress_total=max_downloads
|
|
)
|
|
|
|
if max_downloads and downloaded >= max_downloads:
|
|
break
|
|
|
|
self._smart_delay()
|
|
|
|
except Exception as e:
|
|
self.log(f"Error downloading reel {media_id}: {e}", "error")
|
|
|
|
except Exception as e:
|
|
self.log(f"Error downloading reels: {e}", "error")
|
|
|
|
self.log(f"Reels: {downloaded} downloaded", "info")
|
|
return downloaded
|
|
|
|
def _smart_delay(self, batch_count=0):
|
|
"""Smart delay between downloads to avoid detection"""
|
|
# Random delay with exponential backoff if needed
|
|
base_delay = random.uniform(self.min_delay, self.max_delay)
|
|
|
|
# Add batch delay if we've downloaded a batch
|
|
if batch_count > 0 and batch_count % self.download_batch_size == 0:
|
|
self.log(f"Batch limit reached ({self.download_batch_size} items), taking a longer break", "info")
|
|
base_delay = self.batch_delay + random.uniform(0, 10)
|
|
|
|
# Add extra delay if we're downloading fast
|
|
elif len(self.request_times) > 10:
|
|
recent_requests = self.request_times[-10:]
|
|
avg_interval = (recent_requests[-1] - recent_requests[0]) / 9
|
|
if avg_interval < 5: # Too fast
|
|
base_delay += random.uniform(5, 10)
|
|
self.log("Slowing down to avoid detection", "debug")
|
|
|
|
time.sleep(base_delay)
|
|
|
|
def login(self, username: str, password: str = None) -> bool:
|
|
"""
|
|
Login to Instagram and save session
|
|
|
|
Args:
|
|
username: Instagram username
|
|
password: Instagram password (will prompt if not provided)
|
|
|
|
Returns:
|
|
True if login successful
|
|
"""
|
|
try:
|
|
if not password:
|
|
import getpass
|
|
password = getpass.getpass(f"Password for {username}: ")
|
|
|
|
self.log(f"Logging in as {username}...", "info")
|
|
self.loader.login(username, password)
|
|
|
|
# Save session
|
|
# Use configured session file path or default
|
|
if self.session_file:
|
|
session_file = Path(self.session_file).expanduser()
|
|
session_file.parent.mkdir(parents=True, exist_ok=True)
|
|
else:
|
|
session_file = self.session_dir / f"session-{username}"
|
|
self.loader.save_session_to_file(session_file)
|
|
self.log(f"Session saved to {session_file}", "success")
|
|
|
|
self.username = username
|
|
return True
|
|
|
|
except Exception as e:
|
|
self.log(f"Login failed: {e}", "error")
|
|
|
|
if "checkpoint" in str(e).lower():
|
|
self.log("Instagram requires verification (checkpoint)", "warning")
|
|
self.log("Complete verification in browser, then export session", "info")
|
|
elif "bad password" in str(e).lower():
|
|
self.log("Invalid username or password", "error")
|
|
elif "429" in str(e):
|
|
self.log("Too many login attempts, try again later", "error")
|
|
|
|
return False
|
|
|
|
def get_database_stats(self) -> Dict:
|
|
"""Get database statistics"""
|
|
if not self.use_database or not self.unified_db:
|
|
return {"enabled": False}
|
|
|
|
# Use unified database statistics
|
|
return self.unified_db.get_statistics(platform='instagram')
|
|
|
|
# Test function
|
|
def test_module():
    """Smoke-test the InstaLoader module with a small, bounded download.

    Returns:
        True when at least one item was downloaded.
    """
    print("Testing InstaLoader Module")
    print("=" * 60)

    # BUGFIX: the previous code passed ``db_path=...`` to the constructor,
    # but __init__ accepts ``unified_db`` (a database instance), not a path,
    # so construction raised TypeError before any download could run. Let
    # the module fall back to its default database wiring instead.
    module = InstaLoaderModule(
        show_progress=True,
        use_database=True
    )

    # Test download (limited): recent window + small cap keeps it quick.
    count = module.download(
        username="evalongoria",
        output_dir="/opt/temp/test/instagram/posts",
        content_type="posts",
        max_downloads=2,
        days_back=30
    )

    print(f"\nDownloaded {count} items")

    # Show stats
    stats = module.get_database_stats()
    print(f"\nDatabase stats:")
    print(f" Total: {stats.get('total_downloads', 0)}")
    print(f" By type: {stats.get('by_type', {})}")

    return count > 0
|
|
|
|
if __name__ == "__main__":
    # ``sys`` is already imported in the module header; the redundant local
    # import was removed. Exit nonzero on failure so scripts/CI can detect
    # a failed smoke test.
    success = test_module()
    sys.exit(0 if success else 1)