#!/usr/bin/env python3
"""
Toolzu Instagram Downloader Module
Downloads Instagram content at 1920x1440 resolution
"""

# Allow nested event loops for compatibility with asyncio contexts
try:
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    pass

from pathlib import Path
from datetime import datetime, timedelta
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
import os
import re
import random
import time
import json
import requests

from modules.base_module import LoggingMixin
from modules.cloudflare_handler import (
    CloudflareHandler,
    SiteStatus,
    get_flaresolverr_user_agent,
    get_playwright_context_options,
    get_playwright_stealth_scripts
)
from modules.instagram_utils import (
    extract_instagram_media_id,
    scan_existing_files_for_media_ids,
    record_instagram_download,
    is_instagram_downloaded
)


class ToolzuDownloader(LoggingMixin):
    """
    Toolzu Instagram downloader - provides 1920x1440 resolution downloads

    Example usage:
        from toolzu_module import ToolzuDownloader

        downloader = ToolzuDownloader()
        count = downloader.download(
            username="evalongoria",
            content_type="posts",
            output_dir="downloads/posts"
        )
        print(f"Downloaded {count} items")
    """

    def __init__(self, headless=True, show_progress=True, use_database=True,
                 log_callback=None, unified_db=None, cookie_file=None,
                 toolzu_email=None, toolzu_password=None):
        """
        Initialize the downloader

        Args:
            headless: Run browser in headless mode
            show_progress: Print progress messages
            use_database: Use database to track downloads
            log_callback: Optional callback function for logging
            unified_db: Optional UnifiedDatabase instance
            cookie_file: Path to cookie file for session persistence
            toolzu_email: Email for Toolzu login (optional, for auto-login)
            toolzu_password: Password for Toolzu login (optional, for auto-login)
        """
        # Initialize logging via mixin
        self._init_logger('Instagram', log_callback, default_module='Download')

        self.headless = headless
        self.show_progress = show_progress
        # Toolzu now uses unified profile page with tabs
        self.toolzu_url = 'https://toolzu.com/downloader/instagram/profile/'
        self.login_url = 'https://toolzu.com/login'
        self.downloaded_files = set()
        self.use_database = use_database
        self.toolzu_email = toolzu_email
        self.toolzu_password = toolzu_password
        self.unified_db = unified_db  # Store for scraper config access
        self.scraper_id = 'toolzu'  # Scraper ID in database

        # Rate limiting settings
        self.min_delay = 5
        self.max_delay = 15
        self.batch_size = 10
        self.batch_delay_min = 30
        self.batch_delay_max = 60
        self.download_count = 0
        self.pending_downloads = []  # Track downloads for deferred database recording

        # Use unified database
        if unified_db and use_database:
            from modules.unified_database import ToolzuDatabaseAdapter
            self.db = ToolzuDatabaseAdapter(unified_db)
        else:
            self.db = None
            self.use_database = False

        # Initialize activity status manager for real-time updates
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)

        # Load scraper configuration from database if available
        self.proxy_url = None
        self.cookie_file = None  # Default to None (use database)

        if unified_db:
            scraper_config = unified_db.get_scraper(self.scraper_id)
            if scraper_config:
                # Get proxy configuration
                if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")

        # Fall back to cookie file if no database or if explicitly provided
        if not unified_db:
            if cookie_file:
                self.cookie_file = Path(cookie_file)
            else:
                self.cookie_file = Path('/opt/media-downloader/cookies/toolzu_cookies.json')

        # User-Agent to match FlareSolverr (dynamically fetched for consistency)
        self.user_agent = get_flaresolverr_user_agent()

        # Initialize universal Cloudflare handler
        # Pass proxy_url if configured, and cookie_file=None for database storage
        self.cf_handler = CloudflareHandler(
            module_name="Toolzu",
            cookie_file=str(self.cookie_file) if self.cookie_file else None,
            user_agent=self.user_agent,
            logger=self.logger,
            aggressive_expiry=True,
            proxy_url=self.proxy_url  # Pass proxy to FlareSolverr
        )

        # Keep for backwards compatibility
        self.flaresolverr_url = self.cf_handler.flaresolverr_url
        self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

        # Load cookies from database if available
        self._load_cookies_from_db()

    def _load_cookies_from_db(self):
        """Load cookies from database if available"""
        if not self.unified_db:
            return
        try:
            cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
            if cookies:
                # Load into CloudflareHandler
                self.cf_handler._cookies = cookies
                self.log(f"Loaded {len(cookies)} cookies from database", "debug")
        except Exception as e:
            self.log(f"Error loading cookies from database: {e}", "warning")

    def _save_cookies_to_db(self, cookies: list, user_agent: str = None):
        """Save cookies to database

        Args:
            cookies: List of cookie dictionaries
            user_agent: User agent to associate with cookies (important for cf_clearance).
                        If not provided, uses self.user_agent as fallback.
        """
        if not self.unified_db:
            return
        try:
            # Use provided user_agent or fall back to self.user_agent
            ua = user_agent or self.user_agent
            self.unified_db.save_scraper_cookies(
                self.scraper_id,
                cookies,
                user_agent=ua,
                merge=True
            )
            self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug")
        except Exception as e:
            self.log(f"Error saving cookies to database: {e}", "warning")

    def _has_valid_cookies(self):
        """Check if we have valid cookies (either in file or database)"""
        if self.unified_db:
            cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
            return cookies and len(cookies) > 0
        elif self.cookie_file:
            return self.cookie_file.exists()
        return False

    def _cookies_expired(self):
        """Check if cookies are expired - delegates to CloudflareHandler"""
        return self.cf_handler.cookies_expired()

    def _get_cookies_for_requests(self):
        """Get cookies in format for requests library - delegates to CloudflareHandler"""
        return self.cf_handler.get_cookies_dict()

    def _get_cookies_via_flaresolverr(self, url="https://toolzu.com/", max_retries=2):
        """Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler

        Args:
            url: URL to fetch
            max_retries: Maximum number of retry attempts (default: 2)

        Returns:
            True if cookies obtained successfully, False otherwise
        """
        success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)
        # Save cookies to database if successful
        if success and self.unified_db:
            cookies_list = self.cf_handler.get_cookies_list()
            if cookies_list:
                # CRITICAL: Get the user_agent from FlareSolverr solution, not self.user_agent
                # cf_clearance cookies are fingerprinted to the browser that solved the challenge
                flaresolverr_ua = self.cf_handler.get_user_agent()
                self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua)
        return success

    def _smart_delay(self):
        """Implement smart delays with randomization"""
        self.download_count += 1
        if self.download_count % self.batch_size == 0:
            delay = random.uniform(self.batch_delay_min, self.batch_delay_max)
            self.log(f"Batch delay: waiting {delay:.1f} seconds", "debug")
        else:
            delay = random.uniform(self.min_delay, self.max_delay)
            self.log(f"Waiting {delay:.1f} seconds", "debug")
        time.sleep(delay)

    def _load_cookies(self, context):
        """Load cookies from database or file into browser context"""
        # Try loading from database first
        if self.unified_db:
            try:
                cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
                if cookies:
                    # Clean cookies - remove unsupported properties and convert expiry->expires
                    cleaned_cookies = []
                    for cookie in cookies:
                        cleaned = {k: v for k, v in cookie.items()
                                   if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
                        # FlareSolverr uses 'expiry' but Playwright uses 'expires'
                        if 'expiry' in cleaned and 'expires' not in cleaned:
                            cleaned['expires'] = cleaned.pop('expiry')
                        cleaned_cookies.append(cleaned)
                    # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
                    try:
                        context.clear_cookies()
                    except Exception:
                        pass
                    context.add_cookies(cleaned_cookies)
                    self.log(f"Loaded {len(cleaned_cookies)} cookies from database", "info")
                    return
            except Exception as e:
                self.log(f"Error loading cookies from database: {e}", "warning")

        # Fallback to file-based cookies
        if not self.cookie_file or not self.cookie_file.exists():
            self.log("No saved cookies found", "debug")
            return

        try:
            import json
            with open(self.cookie_file, 'r') as f:
                data = json.load(f)
            cookies = data.get('cookies', [])
            if cookies:
                # Convert expiry->expires for Playwright compatibility
                cleaned_cookies = []
                for cookie in cookies:
                    cleaned = dict(cookie)
                    if 'expiry' in cleaned and 'expires' not in cleaned:
                        cleaned['expires'] = cleaned.pop('expiry')
                    cleaned_cookies.append(cleaned)
                # CRITICAL: Clear existing cookies first
                try:
                    context.clear_cookies()
                except Exception:
                    pass
                context.add_cookies(cleaned_cookies)
                self.log(f"Loaded {len(cleaned_cookies)} cookies from file", "info")
        except Exception as e:
            self.log(f"Failed to load cookies: {e}", "warning")

    def _save_cookies(self, context):
        """Save cookies to database or file"""
        try:
            import json
            cookies = context.cookies()

            # Save to database if available
            if self.unified_db:
                try:
                    # CRITICAL: Include user_agent for cf_clearance cookies to work
                    self.unified_db.save_scraper_cookies(
                        self.scraper_id,
                        cookies,
                        user_agent=self.user_agent,
                        merge=True
                    )
                    self.log(f"Saved {len(cookies)} cookies to database", "debug")
                    return
                except Exception as e:
                    self.log(f"Error saving cookies to database: {e}", "warning")

            # Fallback to file-based storage
            if self.cookie_file:
                # Ensure directory exists
                self.cookie_file.parent.mkdir(parents=True, exist_ok=True)
                with open(self.cookie_file, 'w') as f:
                    json.dump({'cookies': cookies}, f, indent=2)
                self.log(f"Saved {len(cookies)} cookies to file", "debug")
        except Exception as e:
            self.log(f"Failed to save cookies: {e}", "warning")

    def login(self, page, context):
        """
        Log in to Toolzu using provided credentials

        Args:
            page: Playwright page object
            context: Browser context for saving cookies

        Returns:
            True if login successful, False otherwise
        """
        if not self.toolzu_email or not self.toolzu_password:
            self.log("No Toolzu credentials provided, cannot auto-login", "warning")
            return False

        try:
            self.log("Attempting to log in to Toolzu...")

            # Navigate to login page
            page.goto(self.login_url, wait_until="domcontentloaded", timeout=30000)
            page.wait_for_timeout(2000)

            # Fill in email
            email_input = page.locator("#loginform-email").first
            if not email_input.is_visible():
                self.log("Login form not found", "error")
                return False
            self.log(f"Filling in email: {self.toolzu_email}")
            email_input.fill(self.toolzu_email)
            page.wait_for_timeout(500)

            # Fill in password
            password_input = page.locator("#loginform-password").first
            password_input.fill(self.toolzu_password)
            page.wait_for_timeout(500)

            # Handle reCAPTCHA v3 if present
            try:
                # Wait a bit for reCAPTCHA to execute
                page.wait_for_timeout(2000)
                # Check if reCAPTCHA token field exists and is populated
                recaptcha_field = page.locator("#loginform-recaptcha").first
                if recaptcha_field:
                    recaptcha_value = recaptcha_field.get_attribute("value")
                    if recaptcha_value:
                        self.log("reCAPTCHA v3 token detected", "debug")
                    else:
                        self.log("reCAPTCHA v3 token not populated yet, waiting...", "debug")
                        page.wait_for_timeout(3000)
            except Exception:
                pass

            # Submit the form
            submit_button = page.locator("button[type='submit'], button:has-text('Log in')").first
            if submit_button.is_visible():
                self.log("Submitting login form...")
                submit_button.click()
            else:
                # Try pressing Enter on password field
                password_input.press("Enter")

            # Wait for navigation or error
            page.wait_for_timeout(5000)

            # Check if login was successful
            # Success: redirected away from login page or see user menu
            current_url = page.url
            if "/login" not in current_url or page.locator("a:has-text('Log out'), .user-menu, .dropdown-toggle").first.is_visible():
                self.log("Login successful!", "success")
                # Save cookies with login session
                self._save_cookies(context)
                return True
            else:
                # Check for error messages
                error_msg = page.locator(".alert-danger, .help-block-error, .invalid-feedback").first
                if error_msg.is_visible():
                    error_text = error_msg.inner_text()
                    self.log(f"Login failed: {error_text}", "error")
                else:
                    self.log("Login failed (still on login page)", "error")
                return False

        except Exception as e:
            self.log(f"Login error: {e}", "error")
            import traceback
            self.log(traceback.format_exc(), "debug")
            return False

    def _check_if_login_needed(self, page):
        """
        Check if we need to log in (e.g., hit download limit, session expired)

        Args:
            page: Playwright page object

        Returns:
            True if login is needed, False otherwise
        """
        try:
            # Check for download limit message
            limit_msg = page.locator("text=EXCEEDED THE LIMIT, text=login to continue, text=sign in").first
            if limit_msg.is_visible():
                self.log("Download limit detected, login required", "info")
                return True
            # Check if redirected to login page
            if "/login" in page.url:
                self.log("Redirected to login page", "info")
                return True
            return False
        except Exception:
            return False

    def _extract_timestamp_from_url(self, url):
        """
        Extract timestamp from Toolzu thumbnail URL query parameter

        NOTE: Toolzu does NOT provide actual post dates anywhere on the page.
        The 'time=' parameter in thumbnail URLs is the page load time, not post date.
        This method returns None - download time will be used as fallback.

        Args:
            url: Toolzu thumbnail URL with time parameter

        Returns:
            None (Toolzu doesn't provide reliable post dates)
        """
        # Don't extract timestamps from Toolzu - they're page load times, not post dates
        return None

    def _extract_media_id_from_url(self, url):
        """
        Extract media ID from Instagram CDN URL

        Args:
            url: Instagram CDN URL

        Returns:
            Media ID string
        """
        # Pattern: number_MEDIAID_number_n.jpg
        pattern = r'(\d+)_(\d{17,19})_\d+_n\.(jpg|mp4)'
        match = re.search(pattern, url)
        if match:
            return match.group(2)  # Return the media ID
        # Fallback: extract from filename
        try:
            filename = url.split('/')[-1].split('?')[0]
            return Path(filename).stem
        except Exception:
            return None

    def _is_already_downloaded(self, media_id):
        """Check if media_id was already downloaded by ANY Instagram downloader (uses centralized function)"""
        if not self.use_database:
            return False
        try:
            # Use centralized function for consistent cross-module detection
            return is_instagram_downloaded(self.db.db if hasattr(self.db, 'db') else self.db, media_id)
        except Exception as e:
            self.log(f"Error checking database for {media_id}: {e}", "error")
            return False  # Don't skip on error - try to download

    def _record_download(self, media_id, username, content_type, filename,
                         download_url=None, post_date=None, metadata=None, deferred=False):
        """Record download in database (uses centralized function)

        Args:
            deferred: If True, don't record to database now - add to pending_downloads
                      list for later recording after file move is complete
        """
        # If deferred, store for later recording instead of recording now
        if deferred:
            file_path = str(filename)  # Full path
            filename_only = Path(filename).name  # Just the filename
            self.pending_downloads.append({
                'media_id': media_id,
                'username': username,
                'filename': filename_only,
                'url': download_url,
                'post_date': post_date.isoformat() if post_date else None,
                'file_path': file_path,
                'content_type': content_type,
                'metadata': metadata
            })
            self.log(f"Deferred recording for {media_id}", "debug")
            return True

        if not self.use_database:
            return

        # Extract just the filename from the full path for database
        file_path = str(filename)  # Full path
        filename_only = Path(filename).name  # Just the filename

        try:
            # Use centralized function for consistent cross-module storage
            result = record_instagram_download(
                db=self.db.db if hasattr(self.db, 'db') else self.db,
                media_id=media_id,
                username=username,
                content_type=content_type,
                filename=filename_only,
                download_url=download_url,
                post_date=post_date,
                file_path=file_path,
                method='toolzu',
                extra_metadata=metadata or {}
            )
            if result:
                self.log(f"Recorded download for {media_id}", "debug")
            else:
                self.log(f"Failed to record download for {media_id} (possibly duplicate)", "debug")
        except Exception as e:
            self.log(f"Failed to record download: {e}", "warning")

    def get_pending_downloads(self):
        """Get list of downloads that were deferred for later recording"""
        return self.pending_downloads.copy()

    def clear_pending_downloads(self):
        """Clear the pending downloads list after they've been recorded"""
        self.pending_downloads = []

    def _update_file_timestamps(self, filepath, post_date):
        """Update file timestamps to match post date"""
        if not post_date:
            return
        timestamp = post_date.timestamp()
        try:
            os.utime(filepath, (timestamp, timestamp))
            self.log(f"Updated timestamps to {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
        except Exception as e:
            self.log(f"Failed to update timestamps: {e}", "debug")

    def download(self, username, content_type="posts", output_dir="downloads",
                 max_downloads=None, days_back=None, date_from=None, date_to=None,
                 defer_database=False):
        """
        Download content from Instagram via Toolzu

        Args:
            username: Instagram username
            content_type: 'posts' or 'stories' (Toolzu doesn't support reels)
            output_dir: Directory to save downloads
            max_downloads: Maximum number of items to download
            days_back: Number of days back to download
            date_from: Start date for range
            date_to: End date for range
            defer_database: If True, don't record to database immediately - store in
                            pending_downloads for later recording after file move is complete

        Returns:
            Number of successfully downloaded items
        """
        # Clear downloaded_files cache between accounts to prevent memory growth
        self.downloaded_files.clear()

        # Check site status before doing anything else
        self.log("Checking Toolzu site status...", "debug")
        site_status, error_msg = self.cf_handler.check_site_status("https://toolzu.com/", timeout=10)
        if self.cf_handler.should_skip_download(site_status):
            self.log(f"Skipping download - Toolzu is unavailable: {error_msg}", "warning")
            return 0
        elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
            self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

        # Validate content type
        if content_type not in ['posts', 'stories']:
            self.log(f"Toolzu only supports 'posts' and 'stories', not '{content_type}'", "warning")
            return 0

        self.username = username
        self.content_type = content_type
        self.output_dir = Path(output_dir)
        self.max_downloads = max_downloads
        self.profile_name = username.lower()
        self.defer_database = defer_database  # Store for deferred recording

        # Setup date filtering
        self._setup_date_filtering(days_back, date_from, date_to)

        # Scan existing files
        self._scan_existing_files()

        # Run download
        return self._run_download()

    def _setup_date_filtering(self, days_back, date_from, date_to):
        """Setup date range for filtering"""
        self.date_from = None
        self.date_to = None

        if date_from:
            if isinstance(date_from, str):
                self.date_from = datetime.strptime(date_from, "%Y-%m-%d")
            else:
                self.date_from = date_from
        if date_to:
            if isinstance(date_to, str):
                self.date_to = datetime.strptime(date_to, "%Y-%m-%d")
            else:
                self.date_to = date_to

        if days_back and not self.date_from:
            now = datetime.now()
            self.date_to = datetime(now.year, now.month, now.day, 23, 59, 59)
            self.date_from = (now - timedelta(days=days_back - 1)).replace(hour=0, minute=0, second=0)
            self.log(f"Downloading content from last {days_back} days ({self.date_from.strftime('%Y-%m-%d')} to {self.date_to.strftime('%Y-%m-%d')})")

    def _scan_existing_files(self):
        """Scan existing files to avoid re-downloading"""
        self.downloaded_files = scan_existing_files_for_media_ids(self.output_dir, self.profile_name)
        if self.downloaded_files:
            self.log(f"Found {len(self.downloaded_files)} existing media IDs, will skip duplicates")

    def _run_download(self):
        """Run the actual download process"""
        success_count = 0

        # Update activity status
        self.activity_manager.update_status(f"Checking {self.content_type}")

        # Try to get fresh cookies via FlareSolverr if we don't have them or they're old
        # FIX: previously called self.cookie_file.exists() directly, which raises
        # AttributeError when cookie_file is None (the database-backed mode set in __init__).
        # _has_valid_cookies() handles both storage modes.
        if not self._has_valid_cookies() or self._cookies_expired():
            self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info")
            if self._get_cookies_via_flaresolverr():
                self.log("Successfully got fresh cookies from FlareSolverr", "info")
            else:
                self.log("FlareSolverr unavailable, will try with Playwright", "warning")

        # Set Playwright browser path and display
        import os
        # Use environment variable if set, otherwise use standard location
        if 'PLAYWRIGHT_BROWSERS_PATH' not in os.environ:
            os.environ['PLAYWRIGHT_BROWSERS_PATH'] = '/root/.cache/ms-playwright'
        os.environ['DISPLAY'] = ':100'  # Use Xvfb virtual display
        os.environ['HOME'] = '/root'  # Fix Firefox launch as root
        if 'XAUTHORITY' in os.environ:
            del os.environ['XAUTHORITY']  # Remove user's XAUTHORITY

        with sync_playwright() as p:
            browser = p.firefox.launch(
                headless=self.headless,
                firefox_user_prefs={
                    # Disable automation indicators
                    'dom.webdriver.enabled': False,
                    'useAutomationExtension': False,
                    'general.platform.override': 'Win32',
                    'general.appversion.override': '5.0 (Windows)',
                    'general.oscpu.override': 'Windows NT 10.0; Win64; x64'
                }
            )

            # CRITICAL: Browser fingerprint must match FlareSolverr for cookies to work
            # Get dynamic fingerprint settings (Firefox doesn't use Sec-Ch-Ua headers)
            context_options = get_playwright_context_options()
            # Firefox-specific: remove Chrome-specific headers
            if 'extra_http_headers' in context_options:
                context_options['extra_http_headers'] = {
                    'Accept-Language': context_options['extra_http_headers'].get('Accept-Language', 'en-US,en;q=0.9')
                }
            context_options['ignore_https_errors'] = True

            # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent
            # Cloudflare cf_clearance cookies are fingerprinted to the browser that solved the challenge
            try:
                if self.unified_db:
                    stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
                    if stored_user_agent:
                        self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug")
                        context_options['user_agent'] = stored_user_agent
                    else:
                        self.log(f"Using fingerprint: UA={context_options['user_agent'][:50]}...", "debug")
                else:
                    self.log(f"Using fingerprint: UA={context_options['user_agent'][:50]}...", "debug")
            except Exception as e:
                self.log(f"Error getting stored user_agent, using default: {e}", "debug")

            context = browser.new_context(**context_options)

            # Load cookies for session persistence
            self._load_cookies(context)

            page = context.new_page()

            # Add comprehensive anti-detection scripts
            page.add_init_script(get_playwright_stealth_scripts())

            try:
                # Navigate to Toolzu profile page
                self.log(f"Navigating to Toolzu profile downloader")
                page.goto(self.toolzu_url, wait_until="domcontentloaded", timeout=30000)
                page.wait_for_timeout(2000)

                # Fill in the download box with username
                try:
                    # Look for input box
                    input_selector = "input[name='profile'], input[type='text'], input.form-control"
                    input_box = page.locator(input_selector).first
                    if input_box.is_visible():
                        self.log(f"Filling in username: @{self.username}")
                        input_box.fill(f"@{self.username}")
                        page.wait_for_timeout(500)
                        # Submit form
                        submit_button = page.locator("button[type='submit'], button:has-text('Download'), .btn-primary").first
                        if submit_button.is_visible():
                            self.log("Submitting form...")
                            submit_button.click()
                            page.wait_for_timeout(5000)  # Wait for page to load
                        else:
                            # Try pressing Enter
                            input_box.press("Enter")
                            page.wait_for_timeout(5000)
                    else:
                        self.log("Input box not found", "error")
                        return 0
                except Exception as e:
                    self.log(f"Form submission error: {e}", "error")
                    return 0

                # Wait for page to stabilize after form submission
                page.wait_for_timeout(3000)

                # Check if page loaded results (should have nav tabs or download cards)
                try:
                    page.wait_for_selector("#pills-tab, .download-card, #photo-tab", timeout=15000)
                    self.log("Results page loaded", "debug")
                except Exception:
                    self.log("Results page didn't load - may be blocked by reCAPTCHA", "warning")
                    # Take screenshot for debugging
                    try:
                        page.screenshot(path="/tmp/toolzu_blocked.png")
                        self.log("Screenshot saved to /tmp/toolzu_blocked.png", "debug")
                    except Exception:
                        pass
                    return 0

                # If downloading stories, click the Stories tab
                if self.content_type == 'stories':
                    self.log("Clicking Stories tab...")
                    try:
                        # Wait for the nav tabs to load first
                        page.wait_for_selector("#stories-tab", timeout=30000)
                        stories_tab = page.locator("#stories-tab").first
                        if stories_tab.is_visible():
                            # Click and wait for AJAX navigation
                            stories_tab.click()
                            self.log("Waiting for Stories AJAX content to load...")
                            # Wait for the stories tab to become active
                            page.wait_for_selector("#stories-tab.active", timeout=10000)
                            # Wait for the stories content div to be visible
                            page.wait_for_selector("#stories.active", timeout=10000)
                            # Wait a bit more for AJAX to populate content
                            page.wait_for_timeout(3000)
                            # Verify stories cards loaded
                            try:
                                page.wait_for_selector("#stories .download-card", timeout=30000)  # 30 seconds for AJAX
                                download_cards_count = len(page.locator("#stories .download-card").all())
                                self.log(f"Found {download_cards_count} download cards in Stories tab", "debug")
                                self.log("Stories tab loaded successfully")
                            except PlaywrightTimeout:
                                # Check if we hit Toolzu's download limit
                                if self._check_if_login_needed(page):
                                    self.log("Download limit reached, attempting auto-login...", "info")
                                    if self.login(page, context):
                                        # Login successful, retry the download
                                        self.log("Retrying download after login...")
                                        page.goto(self.toolzu_url, wait_until="domcontentloaded", timeout=30000)
                                        # Continue with the download flow by not returning
                                        # (let it fall through to retry)
                                    else:
                                        self.log("Auto-login failed, cannot continue", "error")
                                        return 0
                                else:
                                    self.log("No stories found in Stories tab (or loading timed out)", "warning")
                                    return 0
                        else:
                            self.log("Stories tab not found", "error")
                            return 0
                    except Exception as e:
                        self.log(f"Failed to click Stories tab: {e}", "error")
                        return 0
                else:
                    # For posts, wait for content to load
                    try:
                        page.wait_for_selector(".download-card", timeout=120000)  # 2 minutes for reCAPTCHA
                        self.log("Content loaded successfully")
                    except PlaywrightTimeout:
                        self.log("Timeout waiting for content (reCAPTCHA may have failed)", "warning")
                        # Check if there's an actual error message
                        error_msg = page.locator(".alert-danger, .error-message, .alert-warning").first
                        if error_msg.is_visible():
                            error_text = error_msg.inner_text()
                            self.log(f"Error on page: {error_text}", "error")

                # Download content (no tab navigation needed - different URLs per type)
                success_count = self._download_content(page, context)

                # Save cookies after successful download
                self._save_cookies(context)

            except Exception as e:
                self.log(f"Error: {e}", "error")
            finally:
                try:
                    # Save cookies even on error (to preserve session)
                    self._save_cookies(context)
                    context.close()
                    browser.close()
                    self.log("Browser closed", "debug")
                except Exception:
                    pass

        return success_count

    # Note: _navigate_to_content_type() removed - no longer needed
    # Toolzu uses separate URLs for posts and stories, not tabs

    def _download_content(self, page, context):
        """Download content from the page"""
        success_count = 0

        # Determine the correct selector based on content type
        if self.content_type == 'stories':
            # Only look in the Stories tab content
            card_selector = "#stories .download-card"
            self.log("Looking for stories in #stories tab...")
        else:
            # Look in the default Photos & videos tab
            card_selector = ".download-card"

        # Scroll to load all content
        self.log("Scrolling to load all content...")
        self._scroll_to_load_content(page, card_selector)

        # Find all download cards
        download_cards = page.locator(card_selector).all()
        if not download_cards:
            self.log("No download cards found")
            return 0

        self.log(f"Found {len(download_cards)} items to download")

        # Extract all download info BEFORE starting downloads
        # (clicking downloads can change page state and invalidate element references)
        download_items = []
        for i, card in enumerate(download_cards, 1):
            try:
                # Get download link
                download_link = card.locator("a[download]").first
                if not download_link or not download_link.is_visible():
                    continue
                download_url = download_link.get_attribute("href")
                if not download_url:
                    continue
                # Extract media ID
                media_id = self._extract_media_id_from_url(download_url)
                if not media_id:
                    continue
                download_items.append({
                    'download_url': download_url,
                    'media_id': media_id,
                    'index': i
                })
            except Exception as e:
                self.log(f"Error extracting info from card {i}: {e}", "debug")
                continue

        if not download_items:
            self.log("No valid download links found")
            return 0

        self.log(f"Extracted {len(download_items)} valid download links")

        # Limit downloads (default 15 for daily checks)
        if self.max_downloads:
            download_items = download_items[:self.max_downloads]
            self.log(f"Limited to {len(download_items)} items")
        elif len(download_items) > 15:
            # Default limit: only check 15 most recent posts
            download_items = download_items[:15]
            self.log(f"Limited to {len(download_items)} items (default for frequent checks)")

        consecutive_old_posts = 0

        # Set initial progress so dashboard shows 0/N immediately
        self.activity_manager.update_status(
            f"Downloading {self.content_type}",
            progress_current=0,
            progress_total=len(download_items)
        )

        # Now download each item
        for item_idx, item in enumerate(download_items):
            i = item['index']
            download_url = item['download_url']
            media_id = item['media_id']

            # Update progress at start of each iteration (fires even on skips)
            self.activity_manager.update_status(
                f"Downloading {self.content_type}",
                progress_current=item_idx + 1,
                progress_total=len(download_items)
            )

            try:
                # Check for duplicates - check both original and normalized media ID
                normalized_media_id = extract_instagram_media_id(media_id)
                if media_id in self.downloaded_files or normalized_media_id in self.downloaded_files:
                    self.log(f"[{i}/{len(download_items)}] Skipping duplicate (session): {media_id}")
                    continue
                if self._is_already_downloaded(media_id) or (normalized_media_id != media_id and self._is_already_downloaded(normalized_media_id)):
                    self.log(f"[{i}/{len(download_items)}] Skipping duplicate (database): {media_id}")
                    self.downloaded_files.add(media_id)
                    self.downloaded_files.add(normalized_media_id)
                    continue

                # Determine file extension
                ext = ".jpg" if ".jpg" in download_url else ".mp4" if ".mp4" in download_url else ".jpg"

                # Create filename (no post_date from Toolzu)
                date_str = datetime.now().strftime('%Y%m%d_%H%M%S')
                filename = f"{self.profile_name}_{date_str}_{media_id}{ext}"

                # Create username subdirectory for organization
                user_output_dir = self.output_dir / self.profile_name
                user_output_dir.mkdir(parents=True, exist_ok=True)
                filepath = user_output_dir / filename

                # Download file using context.request (avoids stale element and navigation issues)
                try:
                    # Use Playwright's request API to download directly with retry
                    max_retries = 2
                    # FIX: track success explicitly. Previously the failure path used
                    # 'continue' inside the retry loop, which only advanced the retry
                    # loop; on the final failed attempt control fell through and the
                    # item was recorded in the database without a file on disk.
                    download_ok = False
                    for attempt in range(max_retries):
                        try:
                            response = context.request.get(download_url, timeout=60000)  # 60 second timeout
                            if response.ok:
                                # Save the downloaded content
                                with open(filepath, 'wb') as f:
                                    f.write(response.body())
                                download_ok = True
                                break
                            else:
                                if attempt < max_retries - 1:
                                    self.log(f"[{i}/{len(download_items)}] HTTP {response.status}, retrying...", "warning")
                                    time.sleep(3)
                                else:
                                    self.log(f"[{i}/{len(download_items)}] Download failed: HTTP {response.status}", "error")
                        except Exception as retry_error:
                            if attempt < max_retries - 1:
                                self.log(f"[{i}/{len(download_items)}] Download error, retrying: {retry_error}", "warning")
                                time.sleep(3)
                            else:
                                raise

                    if not download_ok:
                        # All retries exhausted - skip to next item without recording
                        continue

                    # Check for duplicate hash before recording (hash blacklist persists even if original deleted)
                    from pathlib import Path
                    file_hash = self.db.get_file_hash(str(filepath)) if self.db else None
                    if file_hash:
                        existing = self.db.get_download_by_file_hash(file_hash)
                        if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
                            # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
                            self.log(f"⚠ Duplicate content detected (hash match): (unknown) matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                            # Delete the duplicate regardless of whether original file still exists
                            try:
                                filepath.unlink()
                                self.log(f"Deleted duplicate (hash blacklist): (unknown)", "debug")
                                continue
                            except Exception as e:
                                self.log(f"Failed to delete duplicate (unknown): {e}", "warning")

                    # Record in database with normalized media_id for cross-module detection
                    self._record_download(
                        media_id=normalized_media_id,
                        username=self.profile_name,
                        content_type=self.content_type,
                        filename=str(filepath),
                        download_url=download_url,
                        post_date=None,
                        metadata={'resolution': '1920x1440'},
                        deferred=self.defer_database
                    )
                    self.downloaded_files.add(media_id)
                    self.downloaded_files.add(normalized_media_id)
                    success_count += 1
                    self.log(f"✓ [{i}/{len(download_items)}] Saved: (unknown)", "success")

                    # Smart delay between downloads
                    if i < len(download_items):
                        self._smart_delay()

                except PlaywrightTimeout:
                    self.log(f"[{i}/{len(download_items)}] Download timeout", "error")
                    continue
                except Exception as e:
                    self.log(f"[{i}/{len(download_items)}] Download error: {e}", "error")
                    continue
            except Exception as e:
                self.log(f"[{i}/{len(download_items)}] Error processing item: {e}", "error")
                continue

        return success_count

    def _scroll_to_load_content(self, page, card_selector=".download-card"):
        """Scroll to load all lazy-loaded content"""
        no_change_count = 0
        max_scrolls = 15
        for scroll_set in range(max_scrolls):
            old_count = len(page.locator(card_selector).all())
            # Slow, gradual scrolling
            for small_scroll in range(5):
                page.evaluate("window.scrollBy(0, 200)")
                page.wait_for_timeout(500)
            page.wait_for_timeout(2000)
            new_count = len(page.locator(card_selector).all())
            if new_count > old_count:
                self.log(f"Loaded more items: {old_count} → {new_count}", "debug")
                no_change_count = 0
            else:
                no_change_count += 1
                if no_change_count >= 3:
                    self.log("No more content loading", "debug")
                    break


def download_instagram_content(username, content_type="posts", output_dir="downloads",
                               use_database=True, **kwargs):
    """
    Simple function to download Instagram content via Toolzu

    Args:
        username: Instagram username
        content_type: 'posts', 'stories', 'reels', or 'all'
        output_dir: Where to save files
        use_database: Use database to track downloads
        **kwargs: Additional options

    Returns:
        Number of downloaded items
    """
    downloader = ToolzuDownloader(headless=True, use_database=use_database)
    return downloader.download(username, content_type, output_dir, **kwargs)


if __name__ == "__main__":
    # Example: Download posts for a user
    count = download_instagram_content(
        username="evalongoria",
        content_type="posts",
        output_dir="test_downloads",
        days_back=3,
        max_downloads=15  # Only check 15 most recent (runs every 4 hours)
    )
    print(f"\nTotal downloaded: {count} items")