1117 lines
47 KiB
Python
1117 lines
47 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Toolzu Instagram Downloader Module
|
|
Downloads Instagram content at 1920x1440 resolution
|
|
"""
|
|
|
|
# Allow nested event loops for compatibility with asyncio contexts
|
|
try:
|
|
import nest_asyncio
|
|
nest_asyncio.apply()
|
|
except ImportError:
|
|
pass
|
|
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
|
|
import os
|
|
import re
|
|
import random
|
|
import time
|
|
import json
|
|
import requests
|
|
from modules.base_module import LoggingMixin
|
|
from modules.cloudflare_handler import (
|
|
CloudflareHandler, SiteStatus, get_flaresolverr_user_agent,
|
|
get_playwright_context_options, get_playwright_stealth_scripts
|
|
)
|
|
from modules.instagram_utils import (
|
|
extract_instagram_media_id,
|
|
scan_existing_files_for_media_ids,
|
|
record_instagram_download,
|
|
is_instagram_downloaded
|
|
)
|
|
|
|
|
|
class ToolzuDownloader(LoggingMixin):
|
|
"""
|
|
Toolzu Instagram downloader - provides 1920x1440 resolution downloads
|
|
|
|
Example usage:
|
|
from toolzu_module import ToolzuDownloader
|
|
|
|
downloader = ToolzuDownloader()
|
|
count = downloader.download(
|
|
username="evalongoria",
|
|
content_type="posts",
|
|
output_dir="downloads/posts"
|
|
)
|
|
print(f"Downloaded {count} items")
|
|
"""
|
|
|
|
def __init__(self, headless=True, show_progress=True, use_database=True,
             log_callback=None, unified_db=None,
             cookie_file=None, toolzu_email=None, toolzu_password=None):
    """
    Set up downloader state, storage backends and the Cloudflare handler.

    Args:
        headless: Run browser in headless mode
        show_progress: Print progress messages
        use_database: Use database to track downloads (only honoured when
            unified_db is also given)
        log_callback: Optional callback function for logging
        unified_db: Optional UnifiedDatabase instance
        cookie_file: Path to cookie file for session persistence (only used
            when no unified_db is given)
        toolzu_email: Email for Toolzu login (optional, for auto-login)
        toolzu_password: Password for Toolzu login (optional, for auto-login)
    """
    # Logging must be ready first: later setup steps call self.log()
    self._init_logger('Instagram', log_callback, default_module='Download')

    # Browser behaviour and credentials
    self.headless = headless
    self.show_progress = show_progress
    self.toolzu_email = toolzu_email
    self.toolzu_password = toolzu_password

    # Toolzu endpoints: one profile page with tabs, plus the login form
    self.toolzu_url = 'https://toolzu.com/downloader/instagram/profile/'
    self.login_url = 'https://toolzu.com/login'

    # Download bookkeeping
    self.downloaded_files = set()
    self.download_count = 0
    self.pending_downloads = []  # Downloads waiting for deferred database recording

    # Database handles
    self.use_database = use_database
    self.unified_db = unified_db  # Kept for scraper config / cookie access
    self.scraper_id = 'toolzu'  # Scraper ID in database

    # Rate limiting settings (seconds, and items per batch)
    self.min_delay = 5
    self.max_delay = 15
    self.batch_size = 10
    self.batch_delay_min = 30
    self.batch_delay_max = 60

    # Download tracking needs both a database and the use_database switch
    if unified_db and use_database:
        from modules.unified_database import ToolzuDatabaseAdapter
        self.db = ToolzuDatabaseAdapter(unified_db)
    else:
        self.db = None
        self.use_database = False

    # Activity status manager for real-time updates
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)

    # Storage/proxy defaults; cookie_file stays None when a database is used
    self.proxy_url = None
    self.cookie_file = None

    if unified_db:
        # Database mode: read the optional proxy from the scraper configuration
        scraper_config = unified_db.get_scraper(self.scraper_id)
        proxy_configured = (
            scraper_config
            and scraper_config.get('proxy_enabled')
            and scraper_config.get('proxy_url')
        )
        if proxy_configured:
            self.proxy_url = scraper_config['proxy_url']
            self.log(f"Using proxy: {self.proxy_url}", "info")
    else:
        # File mode: use the given cookie file or the default location.
        # An explicit cookie_file is NOT used when unified_db is given.
        self.cookie_file = (
            Path(cookie_file)
            if cookie_file
            else Path('/opt/media-downloader/cookies/toolzu_cookies.json')
        )

    # User-Agent fetched via the FlareSolverr helper so both sides agree
    self.user_agent = get_flaresolverr_user_agent()

    # Universal Cloudflare handler; cookie_file=None means database storage
    handler_cookie_path = str(self.cookie_file) if self.cookie_file else None
    self.cf_handler = CloudflareHandler(
        module_name="Toolzu",
        cookie_file=handler_cookie_path,
        user_agent=self.user_agent,
        logger=self.logger,
        aggressive_expiry=True,
        proxy_url=self.proxy_url  # Pass proxy to FlareSolverr
    )

    # Mirrors of handler settings, kept for backwards compatibility
    self.flaresolverr_url = self.cf_handler.flaresolverr_url
    self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

    # Seed the handler with cookies from the database if available
    self._load_cookies_from_db()
|
|
|
|
def _load_cookies_from_db(self):
|
|
"""Load cookies from database if available"""
|
|
if not self.unified_db:
|
|
return
|
|
|
|
try:
|
|
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
|
if cookies:
|
|
# Load into CloudflareHandler
|
|
self.cf_handler._cookies = cookies
|
|
self.log(f"Loaded {len(cookies)} cookies from database", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies from database: {e}", "warning")
|
|
|
|
def _save_cookies_to_db(self, cookies: list, user_agent: str = None):
|
|
"""Save cookies to database
|
|
|
|
Args:
|
|
cookies: List of cookie dictionaries
|
|
user_agent: User agent to associate with cookies (important for cf_clearance).
|
|
If not provided, uses self.user_agent as fallback.
|
|
"""
|
|
if not self.unified_db:
|
|
return
|
|
|
|
try:
|
|
# Use provided user_agent or fall back to self.user_agent
|
|
ua = user_agent or self.user_agent
|
|
self.unified_db.save_scraper_cookies(
|
|
self.scraper_id,
|
|
cookies,
|
|
user_agent=ua,
|
|
merge=True
|
|
)
|
|
self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error saving cookies to database: {e}", "warning")
|
|
|
|
def _has_valid_cookies(self):
|
|
"""Check if we have valid cookies (either in file or database)"""
|
|
if self.unified_db:
|
|
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
|
return cookies and len(cookies) > 0
|
|
elif self.cookie_file:
|
|
return self.cookie_file.exists()
|
|
return False
|
|
|
|
def _cookies_expired(self):
|
|
"""Check if cookies are expired - delegates to CloudflareHandler"""
|
|
return self.cf_handler.cookies_expired()
|
|
|
|
def _get_cookies_for_requests(self):
|
|
"""Get cookies in format for requests library - delegates to CloudflareHandler"""
|
|
return self.cf_handler.get_cookies_dict()
|
|
|
|
def _get_cookies_via_flaresolverr(self, url="https://toolzu.com/", max_retries=2):
|
|
"""Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler
|
|
|
|
Args:
|
|
url: URL to fetch
|
|
max_retries: Maximum number of retry attempts (default: 2)
|
|
|
|
Returns:
|
|
True if cookies obtained successfully, False otherwise
|
|
"""
|
|
success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)
|
|
|
|
# Save cookies to database if successful
|
|
if success and self.unified_db:
|
|
cookies_list = self.cf_handler.get_cookies_list()
|
|
if cookies_list:
|
|
# CRITICAL: Get the user_agent from FlareSolverr solution, not self.user_agent
|
|
# cf_clearance cookies are fingerprinted to the browser that solved the challenge
|
|
flaresolverr_ua = self.cf_handler.get_user_agent()
|
|
self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua)
|
|
|
|
return success
|
|
|
|
def _smart_delay(self):
|
|
"""Implement smart delays with randomization"""
|
|
self.download_count += 1
|
|
|
|
if self.download_count % self.batch_size == 0:
|
|
delay = random.uniform(self.batch_delay_min, self.batch_delay_max)
|
|
self.log(f"Batch delay: waiting {delay:.1f} seconds", "debug")
|
|
else:
|
|
delay = random.uniform(self.min_delay, self.max_delay)
|
|
self.log(f"Waiting {delay:.1f} seconds", "debug")
|
|
|
|
time.sleep(delay)
|
|
|
|
def _load_cookies(self, context):
|
|
"""Load cookies from database or file into browser context"""
|
|
# Try loading from database first
|
|
if self.unified_db:
|
|
try:
|
|
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
|
if cookies:
|
|
# Clean cookies - remove unsupported properties and convert expiry->expires
|
|
cleaned_cookies = []
|
|
for cookie in cookies:
|
|
cleaned = {k: v for k, v in cookie.items()
|
|
if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
|
|
# FlareSolverr uses 'expiry' but Playwright uses 'expires'
|
|
if 'expiry' in cleaned and 'expires' not in cleaned:
|
|
cleaned['expires'] = cleaned.pop('expiry')
|
|
cleaned_cookies.append(cleaned)
|
|
|
|
# CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
|
|
try:
|
|
context.clear_cookies()
|
|
except Exception:
|
|
pass
|
|
|
|
context.add_cookies(cleaned_cookies)
|
|
self.log(f"Loaded {len(cleaned_cookies)} cookies from database", "info")
|
|
return
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies from database: {e}", "warning")
|
|
|
|
# Fallback to file-based cookies
|
|
if not self.cookie_file or not self.cookie_file.exists():
|
|
self.log("No saved cookies found", "debug")
|
|
return
|
|
|
|
try:
|
|
import json
|
|
with open(self.cookie_file, 'r') as f:
|
|
data = json.load(f)
|
|
cookies = data.get('cookies', [])
|
|
|
|
if cookies:
|
|
# Convert expiry->expires for Playwright compatibility
|
|
cleaned_cookies = []
|
|
for cookie in cookies:
|
|
cleaned = dict(cookie)
|
|
if 'expiry' in cleaned and 'expires' not in cleaned:
|
|
cleaned['expires'] = cleaned.pop('expiry')
|
|
cleaned_cookies.append(cleaned)
|
|
|
|
# CRITICAL: Clear existing cookies first
|
|
try:
|
|
context.clear_cookies()
|
|
except Exception:
|
|
pass
|
|
|
|
context.add_cookies(cleaned_cookies)
|
|
self.log(f"Loaded {len(cleaned_cookies)} cookies from file", "info")
|
|
except Exception as e:
|
|
self.log(f"Failed to load cookies: {e}", "warning")
|
|
|
|
def _save_cookies(self, context):
|
|
"""Save cookies to database or file"""
|
|
try:
|
|
import json
|
|
cookies = context.cookies()
|
|
|
|
# Save to database if available
|
|
if self.unified_db:
|
|
try:
|
|
# CRITICAL: Include user_agent for cf_clearance cookies to work
|
|
self.unified_db.save_scraper_cookies(
|
|
self.scraper_id,
|
|
cookies,
|
|
user_agent=self.user_agent,
|
|
merge=True
|
|
)
|
|
self.log(f"Saved {len(cookies)} cookies to database", "debug")
|
|
return
|
|
except Exception as e:
|
|
self.log(f"Error saving cookies to database: {e}", "warning")
|
|
|
|
# Fallback to file-based storage
|
|
if self.cookie_file:
|
|
# Ensure directory exists
|
|
self.cookie_file.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(self.cookie_file, 'w') as f:
|
|
json.dump({'cookies': cookies}, f, indent=2)
|
|
|
|
self.log(f"Saved {len(cookies)} cookies to file", "debug")
|
|
except Exception as e:
|
|
self.log(f"Failed to save cookies: {e}", "warning")
|
|
|
|
def login(self, page, context):
|
|
"""
|
|
Log in to Toolzu using provided credentials
|
|
|
|
Args:
|
|
page: Playwright page object
|
|
context: Browser context for saving cookies
|
|
|
|
Returns:
|
|
True if login successful, False otherwise
|
|
"""
|
|
if not self.toolzu_email or not self.toolzu_password:
|
|
self.log("No Toolzu credentials provided, cannot auto-login", "warning")
|
|
return False
|
|
|
|
try:
|
|
self.log("Attempting to log in to Toolzu...")
|
|
|
|
# Navigate to login page
|
|
page.goto(self.login_url, wait_until="domcontentloaded", timeout=30000)
|
|
page.wait_for_timeout(2000)
|
|
|
|
# Fill in email
|
|
email_input = page.locator("#loginform-email").first
|
|
if not email_input.is_visible():
|
|
self.log("Login form not found", "error")
|
|
return False
|
|
|
|
self.log(f"Filling in email: {self.toolzu_email}")
|
|
email_input.fill(self.toolzu_email)
|
|
page.wait_for_timeout(500)
|
|
|
|
# Fill in password
|
|
password_input = page.locator("#loginform-password").first
|
|
password_input.fill(self.toolzu_password)
|
|
page.wait_for_timeout(500)
|
|
|
|
# Handle reCAPTCHA v3 if present
|
|
try:
|
|
# Wait a bit for reCAPTCHA to execute
|
|
page.wait_for_timeout(2000)
|
|
|
|
# Check if reCAPTCHA token field exists and is populated
|
|
recaptcha_field = page.locator("#loginform-recaptcha").first
|
|
if recaptcha_field:
|
|
recaptcha_value = recaptcha_field.get_attribute("value")
|
|
if recaptcha_value:
|
|
self.log("reCAPTCHA v3 token detected", "debug")
|
|
else:
|
|
self.log("reCAPTCHA v3 token not populated yet, waiting...", "debug")
|
|
page.wait_for_timeout(3000)
|
|
except Exception:
|
|
pass
|
|
|
|
# Submit the form
|
|
submit_button = page.locator("button[type='submit'], button:has-text('Log in')").first
|
|
if submit_button.is_visible():
|
|
self.log("Submitting login form...")
|
|
submit_button.click()
|
|
else:
|
|
# Try pressing Enter on password field
|
|
password_input.press("Enter")
|
|
|
|
# Wait for navigation or error
|
|
page.wait_for_timeout(5000)
|
|
|
|
# Check if login was successful
|
|
# Success: redirected away from login page or see user menu
|
|
current_url = page.url
|
|
if "/login" not in current_url or page.locator("a:has-text('Log out'), .user-menu, .dropdown-toggle").first.is_visible():
|
|
self.log("Login successful!", "success")
|
|
|
|
# Save cookies with login session
|
|
self._save_cookies(context)
|
|
return True
|
|
else:
|
|
# Check for error messages
|
|
error_msg = page.locator(".alert-danger, .help-block-error, .invalid-feedback").first
|
|
if error_msg.is_visible():
|
|
error_text = error_msg.inner_text()
|
|
self.log(f"Login failed: {error_text}", "error")
|
|
else:
|
|
self.log("Login failed (still on login page)", "error")
|
|
return False
|
|
|
|
except Exception as e:
|
|
self.log(f"Login error: {e}", "error")
|
|
import traceback
|
|
self.log(traceback.format_exc(), "debug")
|
|
return False
|
|
|
|
def _check_if_login_needed(self, page):
|
|
"""
|
|
Check if we need to log in (e.g., hit download limit, session expired)
|
|
|
|
Args:
|
|
page: Playwright page object
|
|
|
|
Returns:
|
|
True if login is needed, False otherwise
|
|
"""
|
|
try:
|
|
# Check for download limit message
|
|
limit_msg = page.locator("text=EXCEEDED THE LIMIT, text=login to continue, text=sign in").first
|
|
if limit_msg.is_visible():
|
|
self.log("Download limit detected, login required", "info")
|
|
return True
|
|
|
|
# Check if redirected to login page
|
|
if "/login" in page.url:
|
|
self.log("Redirected to login page", "info")
|
|
return True
|
|
|
|
return False
|
|
except Exception:
|
|
return False
|
|
|
|
def _extract_timestamp_from_url(self, url):
|
|
"""
|
|
Extract timestamp from Toolzu thumbnail URL query parameter
|
|
|
|
NOTE: Toolzu does NOT provide actual post dates anywhere on the page.
|
|
The 'time=' parameter in thumbnail URLs is the page load time, not post date.
|
|
|
|
This method returns None - download time will be used as fallback.
|
|
|
|
Args:
|
|
url: Toolzu thumbnail URL with time parameter
|
|
|
|
Returns:
|
|
None (Toolzu doesn't provide reliable post dates)
|
|
"""
|
|
# Don't extract timestamps from Toolzu - they're page load times, not post dates
|
|
return None
|
|
|
|
def _extract_media_id_from_url(self, url):
|
|
"""
|
|
Extract media ID from Instagram CDN URL
|
|
|
|
Args:
|
|
url: Instagram CDN URL
|
|
|
|
Returns:
|
|
Media ID string
|
|
"""
|
|
# Pattern: number_MEDIAID_number_n.jpg
|
|
pattern = r'(\d+)_(\d{17,19})_\d+_n\.(jpg|mp4)'
|
|
match = re.search(pattern, url)
|
|
if match:
|
|
return match.group(2) # Return the media ID
|
|
|
|
# Fallback: extract from filename
|
|
try:
|
|
filename = url.split('/')[-1].split('?')[0]
|
|
return Path(filename).stem
|
|
except Exception:
|
|
return None
|
|
|
|
def _is_already_downloaded(self, media_id):
|
|
"""Check if media_id was already downloaded by ANY Instagram downloader (uses centralized function)"""
|
|
if not self.use_database:
|
|
return False
|
|
|
|
try:
|
|
# Use centralized function for consistent cross-module detection
|
|
return is_instagram_downloaded(self.db.db if hasattr(self.db, 'db') else self.db, media_id)
|
|
except Exception as e:
|
|
self.log(f"Error checking database for {media_id}: {e}", "error")
|
|
return False # Don't skip on error - try to download
|
|
|
|
def _record_download(self, media_id, username, content_type, filename,
|
|
download_url=None, post_date=None, metadata=None, deferred=False):
|
|
"""Record download in database (uses centralized function)
|
|
|
|
Args:
|
|
deferred: If True, don't record to database now - add to pending_downloads list
|
|
for later recording after file move is complete
|
|
"""
|
|
# If deferred, store for later recording instead of recording now
|
|
if deferred:
|
|
file_path = str(filename) # Full path
|
|
filename_only = Path(filename).name # Just the filename
|
|
self.pending_downloads.append({
|
|
'media_id': media_id,
|
|
'username': username,
|
|
'filename': filename_only,
|
|
'url': download_url,
|
|
'post_date': post_date.isoformat() if post_date else None,
|
|
'file_path': file_path,
|
|
'content_type': content_type,
|
|
'metadata': metadata
|
|
})
|
|
self.log(f"Deferred recording for {media_id}", "debug")
|
|
return True
|
|
|
|
if not self.use_database:
|
|
return
|
|
|
|
# Extract just the filename from the full path for database
|
|
file_path = str(filename) # Full path
|
|
filename_only = Path(filename).name # Just the filename
|
|
|
|
try:
|
|
# Use centralized function for consistent cross-module storage
|
|
result = record_instagram_download(
|
|
db=self.db.db if hasattr(self.db, 'db') else self.db,
|
|
media_id=media_id,
|
|
username=username,
|
|
content_type=content_type,
|
|
filename=filename_only,
|
|
download_url=download_url,
|
|
post_date=post_date,
|
|
file_path=file_path,
|
|
method='toolzu',
|
|
extra_metadata=metadata or {}
|
|
)
|
|
if result:
|
|
self.log(f"Recorded download for {media_id}", "debug")
|
|
else:
|
|
self.log(f"Failed to record download for {media_id} (possibly duplicate)", "debug")
|
|
except Exception as e:
|
|
self.log(f"Failed to record download: {e}", "warning")
|
|
|
|
def get_pending_downloads(self):
|
|
"""Get list of downloads that were deferred for later recording"""
|
|
return self.pending_downloads.copy()
|
|
|
|
def clear_pending_downloads(self):
|
|
"""Clear the pending downloads list after they've been recorded"""
|
|
self.pending_downloads = []
|
|
|
|
def _update_file_timestamps(self, filepath, post_date):
|
|
"""Update file timestamps to match post date"""
|
|
if not post_date:
|
|
return
|
|
|
|
timestamp = post_date.timestamp()
|
|
try:
|
|
os.utime(filepath, (timestamp, timestamp))
|
|
self.log(f"Updated timestamps to {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
|
|
except Exception as e:
|
|
self.log(f"Failed to update timestamps: {e}", "debug")
|
|
|
|
def download(self, username, content_type="posts", output_dir="downloads",
|
|
max_downloads=None, days_back=None, date_from=None, date_to=None,
|
|
defer_database=False):
|
|
"""
|
|
Download content from Instagram via Toolzu
|
|
|
|
Args:
|
|
username: Instagram username
|
|
content_type: 'posts' or 'stories' (Toolzu doesn't support reels)
|
|
output_dir: Directory to save downloads
|
|
max_downloads: Maximum number of items to download
|
|
days_back: Number of days back to download
|
|
date_from: Start date for range
|
|
date_to: End date for range
|
|
defer_database: If True, don't record to database immediately - store in
|
|
pending_downloads for later recording after file move is complete
|
|
|
|
Returns:
|
|
Number of successfully downloaded items
|
|
"""
|
|
# Clear downloaded_files cache between accounts to prevent memory growth
|
|
self.downloaded_files.clear()
|
|
|
|
# Check site status before doing anything else
|
|
self.log("Checking Toolzu site status...", "debug")
|
|
site_status, error_msg = self.cf_handler.check_site_status("https://toolzu.com/", timeout=10)
|
|
|
|
if self.cf_handler.should_skip_download(site_status):
|
|
self.log(f"Skipping download - Toolzu is unavailable: {error_msg}", "warning")
|
|
return 0
|
|
elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
|
|
self.log("Cloudflare challenge detected, will attempt bypass during download", "info")
|
|
|
|
# Validate content type
|
|
if content_type not in ['posts', 'stories']:
|
|
self.log(f"Toolzu only supports 'posts' and 'stories', not '{content_type}'", "warning")
|
|
return 0
|
|
|
|
self.username = username
|
|
self.content_type = content_type
|
|
self.output_dir = Path(output_dir)
|
|
self.max_downloads = max_downloads
|
|
self.profile_name = username.lower()
|
|
self.defer_database = defer_database # Store for deferred recording
|
|
|
|
# Setup date filtering
|
|
self._setup_date_filtering(days_back, date_from, date_to)
|
|
|
|
# Scan existing files
|
|
self._scan_existing_files()
|
|
|
|
# Run download
|
|
return self._run_download()
|
|
|
|
def _setup_date_filtering(self, days_back, date_from, date_to):
|
|
"""Setup date range for filtering"""
|
|
self.date_from = None
|
|
self.date_to = None
|
|
|
|
if date_from:
|
|
if isinstance(date_from, str):
|
|
self.date_from = datetime.strptime(date_from, "%Y-%m-%d")
|
|
else:
|
|
self.date_from = date_from
|
|
|
|
if date_to:
|
|
if isinstance(date_to, str):
|
|
self.date_to = datetime.strptime(date_to, "%Y-%m-%d")
|
|
else:
|
|
self.date_to = date_to
|
|
|
|
if days_back and not self.date_from:
|
|
now = datetime.now()
|
|
self.date_to = datetime(now.year, now.month, now.day, 23, 59, 59)
|
|
self.date_from = (now - timedelta(days=days_back-1)).replace(hour=0, minute=0, second=0)
|
|
self.log(f"Downloading content from last {days_back} days ({self.date_from.strftime('%Y-%m-%d')} to {self.date_to.strftime('%Y-%m-%d')})")
|
|
|
|
def _scan_existing_files(self):
    """Populate downloaded_files with media IDs found for this profile in the output directory."""
    known_ids = scan_existing_files_for_media_ids(self.output_dir, self.profile_name)
    self.downloaded_files = known_ids

    if not known_ids:
        return
    self.log(f"Found {len(known_ids)} existing media IDs, will skip duplicates")
|
|
|
|
def _run_download(self):
    """Run the actual download process.

    Refreshes Cloudflare cookies when needed, launches Firefox through
    Playwright, submits the username on the Toolzu profile page, opens the
    Stories tab when stories were requested, and hands the loaded page to
    _download_content(). Cookies are saved and the browser is closed on
    every exit path.

    Returns:
        Number of items reported by _download_content(), or 0 when the page
        could not be prepared or an error occurred
    """
    success_count = 0

    # Update activity status
    self.activity_manager.update_status(f"Checking {self.content_type}")

    # Try to get fresh cookies via FlareSolverr if we don't have them or they're old.
    # _has_valid_cookies() covers both storage backends; self.cookie_file is
    # None in database mode, so calling .exists() on it directly would raise.
    try:
        have_cookies = self._has_valid_cookies()
    except Exception as e:
        self.log(f"Could not check stored cookies: {e}", "debug")
        have_cookies = False

    if not have_cookies or self._cookies_expired():
        self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info")
        if self._get_cookies_via_flaresolverr():
            self.log("Successfully got fresh cookies from FlareSolverr", "info")
        else:
            self.log("FlareSolverr unavailable, will try with Playwright", "warning")

    # Set Playwright browser path and display
    # Use environment variable if set, otherwise use standard location
    os.environ.setdefault('PLAYWRIGHT_BROWSERS_PATH', '/root/.cache/ms-playwright')
    os.environ['DISPLAY'] = ':100'  # Use Xvfb virtual display
    os.environ['HOME'] = '/root'  # Fix Firefox launch as root
    os.environ.pop('XAUTHORITY', None)  # Remove user's XAUTHORITY

    with sync_playwright() as p:
        browser = p.firefox.launch(
            headless=self.headless,
            firefox_user_prefs={
                # Disable automation indicators
                'dom.webdriver.enabled': False,
                'useAutomationExtension': False,
                'general.platform.override': 'Win32',
                'general.appversion.override': '5.0 (Windows)',
                'general.oscpu.override': 'Windows NT 10.0; Win64; x64'
            }
        )

        # CRITICAL: Browser fingerprint must match FlareSolverr for cookies to work
        # Get dynamic fingerprint settings (Firefox doesn't use Sec-Ch-Ua headers)
        context_options = get_playwright_context_options()
        # Firefox-specific: remove Chrome-specific headers, keep Accept-Language only
        if 'extra_http_headers' in context_options:
            context_options['extra_http_headers'] = {
                'Accept-Language': context_options['extra_http_headers'].get('Accept-Language', 'en-US,en;q=0.9')
            }
        context_options['ignore_https_errors'] = True

        # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent
        # Cloudflare cf_clearance cookies are fingerprinted to the browser that solved the challenge
        try:
            stored_user_agent = None
            if self.unified_db:
                stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)

            if stored_user_agent:
                self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug")
                context_options['user_agent'] = stored_user_agent
            else:
                self.log(f"Using fingerprint: UA={context_options['user_agent'][:50]}...", "debug")
        except Exception as e:
            self.log(f"Error getting stored user_agent, using default: {e}", "debug")

        context = browser.new_context(**context_options)

        # Load cookies for session persistence
        self._load_cookies(context)

        page = context.new_page()

        # Add comprehensive anti-detection scripts
        page.add_init_script(get_playwright_stealth_scripts())

        try:
            # Navigate to Toolzu profile page
            self.log("Navigating to Toolzu profile downloader")
            page.goto(self.toolzu_url, wait_until="domcontentloaded", timeout=30000)
            page.wait_for_timeout(2000)

            # Fill in the download box with username
            try:
                # Look for input box
                input_selector = "input[name='profile'], input[type='text'], input.form-control"
                input_box = page.locator(input_selector).first

                if not input_box.is_visible():
                    self.log("Input box not found", "error")
                    return 0

                self.log(f"Filling in username: @{self.username}")
                input_box.fill(f"@{self.username}")
                page.wait_for_timeout(500)

                # Submit form
                submit_button = page.locator("button[type='submit'], button:has-text('Download'), .btn-primary").first
                if submit_button.is_visible():
                    self.log("Submitting form...")
                    submit_button.click()
                    page.wait_for_timeout(5000)  # Wait for page to load
                else:
                    # Try pressing Enter
                    input_box.press("Enter")
                    page.wait_for_timeout(5000)
            except Exception as e:
                self.log(f"Form submission error: {e}", "error")
                return 0

            # Wait for page to stabilize after form submission
            page.wait_for_timeout(3000)

            # Check if page loaded results (should have nav tabs or download cards)
            try:
                page.wait_for_selector("#pills-tab, .download-card, #photo-tab", timeout=15000)
                self.log("Results page loaded", "debug")
            except Exception:
                self.log("Results page didn't load - may be blocked by reCAPTCHA", "warning")
                # Take screenshot for debugging
                try:
                    page.screenshot(path="/tmp/toolzu_blocked.png")
                    self.log("Screenshot saved to /tmp/toolzu_blocked.png", "debug")
                except Exception:
                    pass
                return 0

            # If downloading stories, click the Stories tab
            if self.content_type == 'stories':
                self.log("Clicking Stories tab...")
                try:
                    # Wait for the nav tabs to load first
                    page.wait_for_selector("#stories-tab", timeout=30000)
                    stories_tab = page.locator("#stories-tab").first
                    if not stories_tab.is_visible():
                        self.log("Stories tab not found", "error")
                        return 0

                    # Click and wait for AJAX navigation
                    stories_tab.click()
                    self.log("Waiting for Stories AJAX content to load...")

                    # Wait for the stories tab to become active
                    page.wait_for_selector("#stories-tab.active", timeout=10000)

                    # Wait for the stories content div to be visible
                    page.wait_for_selector("#stories.active", timeout=10000)

                    # Wait a bit more for AJAX to populate content
                    page.wait_for_timeout(3000)

                    # Verify stories cards loaded
                    try:
                        page.wait_for_selector("#stories .download-card", timeout=30000)  # 30 seconds for AJAX
                        download_cards_count = len(page.locator("#stories .download-card").all())
                        self.log(f"Found {download_cards_count} download cards in Stories tab", "debug")
                        self.log("Stories tab loaded successfully")
                    except PlaywrightTimeout:
                        # Check if we hit Toolzu's download limit
                        if not self._check_if_login_needed(page):
                            self.log("No stories found in Stories tab (or loading timed out)", "warning")
                            return 0

                        self.log("Download limit reached, attempting auto-login...", "info")
                        if not self.login(page, context):
                            self.log("Auto-login failed, cannot continue", "error")
                            return 0

                        # Login successful, go back to the downloader page
                        self.log("Retrying download after login...")
                        page.goto(self.toolzu_url, wait_until="domcontentloaded", timeout=30000)
                        # NOTE(review): execution continues straight to
                        # _download_content() without re-submitting the
                        # username or re-opening the Stories tab, so this
                        # "retry" presumably finds no cards - confirm and
                        # consider re-running the form steps here.
                except Exception as e:
                    self.log(f"Failed to click Stories tab: {e}", "error")
                    return 0
            else:
                # For posts, wait for content to load
                try:
                    page.wait_for_selector(".download-card", timeout=120000)  # 2 minutes for reCAPTCHA
                    self.log("Content loaded successfully")
                except PlaywrightTimeout:
                    self.log("Timeout waiting for content (reCAPTCHA may have failed)", "warning")
                    # Check if there's an actual error message
                    error_msg = page.locator(".alert-danger, .error-message, .alert-warning").first
                    if error_msg.is_visible():
                        error_text = error_msg.inner_text()
                        self.log(f"Error on page: {error_text}", "error")

            # Download content from whichever tab is now showing
            success_count = self._download_content(page, context)

            # Save cookies after successful download
            self._save_cookies(context)

        except Exception as e:
            self.log(f"Error: {e}", "error")
        finally:
            try:
                # Save cookies even on error (to preserve session)
                self._save_cookies(context)

                context.close()
                browser.close()
                self.log("Browser closed", "debug")
            except Exception:
                pass

    return success_count
|
|
|
|
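# Note: _navigate_to_content_type() was removed - it is no longer needed.
# Toolzu serves posts and stories from a single profile page; stories are
# reached by clicking the Stories tab in _run_download(), not via a separate URL.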
|
|
|
|
def _download_content(self, page, context):
    """Download every not-yet-seen media item listed on the current page.

    The method scrolls the page so all cards are rendered, collects the
    download URL and media ID of each card up front (so later page changes
    cannot invalidate element references), applies the download limit, and
    then fetches each item through the browser context's request API.

    Items are skipped when their media ID (raw or normalized) was already
    handled in this session or is reported as downloaded by
    ``_is_already_downloaded``. After a file is written, its hash is checked
    against the database; a file whose content matches a different recorded
    path is deleted instead of being recorded.

    Args:
        page: Playwright page currently showing the Toolzu results.
        context: Playwright browser context; ``context.request`` performs
            the actual file downloads.

    Returns:
        Number of files that were saved and recorded during this call.
    """
    # Stories are rendered inside their own tab pane; everything else lives
    # in the default "Photos & videos" pane.
    if self.content_type == 'stories':
        card_selector = "#stories .download-card"
        self.log("Looking for stories in #stories tab...")
    else:
        card_selector = ".download-card"

    self.log("Scrolling to load all content...")
    self._scroll_to_load_content(page, card_selector)

    download_cards = page.locator(card_selector).all()
    if not download_cards:
        self.log("No download cards found")
        return 0

    self.log(f"Found {len(download_cards)} items to download")

    # Extract all download info BEFORE starting downloads
    # (downloading can change page state and invalidate element references).
    download_items = []
    for card_number, card in enumerate(download_cards, 1):
        try:
            download_link = card.locator("a[download]").first
            if not download_link or not download_link.is_visible():
                continue

            download_url = download_link.get_attribute("href")
            if not download_url:
                continue

            media_id = self._extract_media_id_from_url(download_url)
            if not media_id:
                continue

            download_items.append({
                'download_url': download_url,
                'media_id': media_id,
            })
        except Exception as e:
            self.log(f"Error extracting info from card {card_number}: {e}", "debug")
            continue

    if not download_items:
        self.log("No valid download links found")
        return 0

    self.log(f"Extracted {len(download_items)} valid download links")

    # Limit downloads (default 15 for daily checks)
    if self.max_downloads:
        download_items = download_items[:self.max_downloads]
        self.log(f"Limited to {len(download_items)} items")
    elif len(download_items) > 15:
        # Default limit: only check 15 most recent posts
        download_items = download_items[:15]
        self.log(f"Limited to {len(download_items)} items (default for frequent checks)")

    total = len(download_items)
    success_count = 0

    # Set initial progress so dashboard shows 0/N immediately
    self.activity_manager.update_status(
        f"Downloading {self.content_type}",
        progress_current=0,
        progress_total=total
    )

    for position, item in enumerate(download_items, 1):
        download_url = item['download_url']
        media_id = item['media_id']
        # Label by position in the (filtered, limited) work list so it can
        # never exceed the total shown next to it.
        label = f"[{position}/{total}]"

        # Update progress at start of each iteration (fires even on skips)
        self.activity_manager.update_status(
            f"Downloading {self.content_type}",
            progress_current=position,
            progress_total=total
        )

        try:
            # Check for duplicates - check both original and normalized media ID
            normalized_media_id = extract_instagram_media_id(media_id)
            if media_id in self.downloaded_files or normalized_media_id in self.downloaded_files:
                self.log(f"{label} Skipping duplicate (session): {media_id}")
                continue

            if self._is_already_downloaded(media_id) or (
                normalized_media_id != media_id
                and self._is_already_downloaded(normalized_media_id)
            ):
                self.log(f"{label} Skipping duplicate (database): {media_id}")
                self.downloaded_files.add(media_id)
                self.downloaded_files.add(normalized_media_id)
                continue

            # ".jpg" wins when both markers appear; ".jpg" is also the fallback.
            is_video = ".mp4" in download_url and ".jpg" not in download_url
            ext = ".mp4" if is_video else ".jpg"

            # Toolzu exposes no post date, so the download time is used.
            date_str = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"{self.profile_name}_{date_str}_{media_id}{ext}"

            # Create username subdirectory for organization
            user_output_dir = self.output_dir / self.profile_name
            user_output_dir.mkdir(parents=True, exist_ok=True)
            filepath = user_output_dir / filename

            try:
                if not self._fetch_media_file(context, download_url, filepath, label):
                    # Nothing was written (failure already logged): skip the
                    # item instead of recording a download that never happened.
                    continue

                # Check for duplicate hash before recording (hash blacklist
                # persists even if the original file was deleted).
                file_hash = self.db.get_file_hash(str(filepath)) if self.db else None
                if file_hash:
                    existing = self.db.get_download_by_file_hash(file_hash)
                    if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
                        self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                        # Delete the duplicate regardless of whether the
                        # original file still exists.
                        try:
                            filepath.unlink()
                        except Exception as e:
                            # Deletion failed: fall through and record the
                            # file, as the previous implementation did.
                            self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
                        else:
                            self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
                            continue

                # Record with the normalized media_id for cross-module detection
                self._record_download(
                    media_id=normalized_media_id,
                    username=self.profile_name,
                    content_type=self.content_type,
                    filename=str(filepath),
                    download_url=download_url,
                    post_date=None,
                    metadata={'resolution': '1920x1440'},
                    deferred=self.defer_database
                )

                self.downloaded_files.add(media_id)
                self.downloaded_files.add(normalized_media_id)
                success_count += 1

                self.log(f"✓ {label} Saved: {filename}", "success")

                # Smart delay between downloads (not after the last item)
                if position < total:
                    self._smart_delay()

            except PlaywrightTimeout:
                self.log(f"{label} Download timeout", "error")
                continue
            except Exception as e:
                self.log(f"{label} Download error: {e}", "error")
                continue

        except Exception as e:
            self.log(f"{label} Error processing item: {e}", "error")
            continue

    return success_count


def _fetch_media_file(self, context, download_url, filepath, label, max_retries=2):
    """Fetch ``download_url`` into ``filepath``, retrying on failure.

    Uses ``context.request`` rather than clicking the link, which avoids
    stale element and navigation issues.

    Args:
        context: Playwright browser context whose request API is used.
        download_url: URL of the media file.
        filepath: Destination path; written only after an OK response.
        label: Progress prefix (e.g. ``"[3/15]"``) used in log messages.
        max_retries: Total number of attempts (default 2).

    Returns:
        True if the file was written, False if the last attempt returned a
        non-OK HTTP status (already logged).

    Raises:
        Exception: Whatever the final attempt raised (request error, timeout,
            or a failure while writing the file).
    """
    for attempt in range(1, max_retries + 1):
        is_last_attempt = attempt == max_retries
        try:
            response = context.request.get(download_url, timeout=60000)  # 60 second timeout
            if response.ok:
                with open(filepath, 'wb') as f:
                    f.write(response.body())
                return True

            if is_last_attempt:
                self.log(f"{label} Download failed: HTTP {response.status}", "error")
                return False
            self.log(f"{label} HTTP {response.status}, retrying...", "warning")
        except Exception as retry_error:
            if is_last_attempt:
                raise
            self.log(f"{label} Download error, retrying: {retry_error}", "warning")

        # Brief pause before the next attempt
        time.sleep(3)

    return False
|
|
|
|
def _scroll_to_load_content(self, page, card_selector=".download-card",
                            max_scrolls=15, max_stalls=3):
    """Scroll the page in small steps until lazy-loaded cards stop appearing.

    Each round scrolls down gradually, waits for content to settle, and then
    compares the number of elements matching ``card_selector`` with the
    number seen before the round.

    Args:
        page: Playwright page to scroll.
        card_selector: CSS selector matching one result card; the match
            count is used to detect newly loaded content.
        max_scrolls: Maximum number of scroll rounds (default 15).
        max_stalls: Number of consecutive rounds without new cards after
            which scrolling stops (default 3).

    Returns:
        None.
    """
    scroll_steps_per_round = 5

    # The locator is re-evaluated on every count() call, so it can be
    # created once and asked for the match count without building a list
    # of element locators.
    cards = page.locator(card_selector)
    known_count = cards.count()
    stalled_rounds = 0

    for _ in range(max_scrolls):
        # Slow, gradual scrolling
        for _ in range(scroll_steps_per_round):
            page.evaluate("window.scrollBy(0, 200)")
            page.wait_for_timeout(500)

        # Give lazy-loaded content time to render before counting again
        page.wait_for_timeout(2000)

        current_count = cards.count()
        if current_count > known_count:
            self.log(f"Loaded more items: {known_count} → {current_count}", "debug")
            stalled_rounds = 0
        else:
            stalled_rounds += 1

        if stalled_rounds >= max_stalls:
            self.log("No more content loading", "debug")
            break

        # Nothing touches the page between rounds, so this round's final
        # count is the next round's starting count.
        known_count = current_count
|
|
|
|
|
|
def download_instagram_content(username, content_type="posts", output_dir="downloads",
                               use_database=True, **kwargs):
    """
    Convenience wrapper: build a headless ToolzuDownloader and run one download

    Args:
        username: Instagram username to fetch content for
        content_type: Kind of content to fetch ('posts', 'stories', 'reels', or 'all')
        output_dir: Directory the files are saved under
        use_database: Passed to ToolzuDownloader to enable download tracking
        **kwargs: Extra keyword options forwarded unchanged to ToolzuDownloader.download()

    Returns:
        The value returned by ToolzuDownloader.download() (number of downloaded items)
    """
    toolzu = ToolzuDownloader(headless=True, use_database=use_database)
    item_count = toolzu.download(username, content_type, output_dir, **kwargs)
    return item_count
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo run: fetch recent posts for a single user into a scratch directory
    demo_request = {
        "username": "evalongoria",
        "content_type": "posts",
        "output_dir": "test_downloads",
        "days_back": 3,
        "max_downloads": 15,  # Only check 15 most recent (runs every 4 hours)
    }
    downloaded_total = download_instagram_content(**demo_request)
    print(f"\nTotal downloaded: {downloaded_total} items")
|