#!/usr/bin/env python3 """ FastDL Instagram Downloader Module Can be imported and used in other scripts """ from pathlib import Path from datetime import datetime, timedelta from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout import os import re import urllib.parse import json import random import time import platform import subprocess import requests from concurrent.futures import ThreadPoolExecutor, as_completed from modules.base_module import LoggingMixin from modules.cloudflare_handler import ( CloudflareHandler, SiteStatus, get_flaresolverr_user_agent, get_playwright_context_options, get_playwright_stealth_scripts ) from modules.instagram_utils import ( extract_instagram_media_id, scan_existing_files_for_media_ids, record_instagram_download, is_instagram_downloaded ) class FastDLDownloader(LoggingMixin): """ FastDL Instagram downloader that can be used as a module Example usage: from fastdl_module import FastDLDownloader # Download stories for a user downloader = FastDLDownloader() count = downloader.download( username="evalongoria", content_type="stories", output_dir="downloads/stories" ) print(f"Downloaded {count} items") """ def __init__(self, headless=True, show_progress=True, use_database=True, log_callback=None, unified_db=None, high_res=False): """ Initialize the downloader Args: headless: Run browser in headless mode show_progress: Print progress messages use_database: Use SQLite database to track downloads (set False to re-download) db_path: Path to SQLite database file (ignored if unified_db provided) log_callback: Optional callback function for logging (tag, level, message) unified_db: Optional UnifiedDatabase instance for centralized tracking high_res: Use high-resolution download mode (searches individual Instagram URLs) """ # Initialize logging via mixin self._init_logger('Instagram', log_callback, default_module='Download') self.headless = headless self.show_progress = show_progress self.fastdl_url = "https://fastdl.app/en2" self.downloaded_files = set() self.use_database = use_database self.high_res = high_res self.unified_db = unified_db # Store for scraper config access self.scraper_id = 'fastdl' # Scraper ID in database self.pending_downloads = [] # Track downloads for deferred database recording self._cdn_to_pk_map = {} # CDN filename -> Instagram pk map (for browser fallback) # Rate limiting settings (matching InstaLoader improvements) self.min_delay = 1 # Minimum delay between downloads (seconds) self.max_delay = 3 # Maximum delay between downloads (seconds) self.batch_size = 10 # Downloads before longer break self.batch_delay_min = 30 # Minimum batch delay (seconds) self.batch_delay_max = 60 # Maximum batch delay (seconds) self.download_count = 0 # Track downloads for batch delays # Use unified database only if unified_db and use_database: from modules.unified_database import FastDLDatabaseAdapter self.db = FastDLDatabaseAdapter(unified_db) else: self.db = None self.use_database = False # Initialize activity status manager for real-time updates from modules.activity_status import get_activity_manager self.activity_manager = get_activity_manager(unified_db) # Load scraper configuration from database if available self.proxy_url = None self.cookie_file = None # Default to None (use database) if unified_db: scraper_config = unified_db.get_scraper(self.scraper_id) if scraper_config: # Get proxy configuration if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'): self.proxy_url = scraper_config['proxy_url'] self.log(f"Using proxy: {self.proxy_url}", "info") # Fall back to cookie file if no database if not unified_db: self.cookie_file = Path("cookies/fastdl_cookies.json") self.cookie_file.parent.mkdir(exist_ok=True) # User-Agent to match FlareSolverr (dynamically fetched for consistency) self.user_agent = get_flaresolverr_user_agent() # Initialize universal Cloudflare handler # Pass proxy_url if configured, and cookie_file=None for database storage self.cf_handler = CloudflareHandler( module_name="FastDL", cookie_file=str(self.cookie_file) if self.cookie_file else None, user_agent=self.user_agent, logger=self.logger, aggressive_expiry=True, proxy_url=self.proxy_url # Pass proxy to FlareSolverr ) # Keep for backwards compatibility self.flaresolverr_url = self.cf_handler.flaresolverr_url # Load cookies from database if available self._load_cookies_from_db() self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled def _load_cookies_from_db(self): """Load cookies from database if available""" if not self.unified_db: return try: cookies = self.unified_db.get_scraper_cookies(self.scraper_id) if cookies: # Load into CloudflareHandler self.cf_handler._cookies = cookies self.log(f"Loaded {len(cookies)} cookies from database", "debug") except Exception as e: self.log(f"Error loading cookies from database: {e}", "warning") def _save_cookies_to_db(self, cookies: list, user_agent: str = None): """Save cookies to database Args: cookies: List of cookie dictionaries user_agent: User agent to associate with cookies (important for cf_clearance). If not provided, uses self.user_agent as fallback. """ if not self.unified_db: return try: # Use provided user_agent or fall back to self.user_agent ua = user_agent or self.user_agent self.unified_db.save_scraper_cookies( self.scraper_id, cookies, user_agent=ua, merge=True ) self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug") except Exception as e: self.log(f"Error saving cookies to database: {e}", "warning") def _has_valid_cookies(self): """Check if we have valid cookies (either in file or database)""" if self.unified_db: cookies = self.unified_db.get_scraper_cookies(self.scraper_id) return cookies and len(cookies) > 0 elif self.cookie_file: return self.cookie_file.exists() return False def _cookies_expired(self): """Check if cookies are expired - delegates to CloudflareHandler""" return self.cf_handler.cookies_expired() def _get_cookies_for_requests(self): """Get cookies in format for requests library - delegates to CloudflareHandler""" return self.cf_handler.get_cookies_dict() def _get_cookies_via_flaresolverr(self, url="https://fastdl.app/", max_retries=2): """Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler Args: url: URL to fetch max_retries: Maximum number of retry attempts (default: 2) Returns: True if cookies obtained successfully, False otherwise """ success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries) # Save cookies to database if successful if success and self.unified_db: cookies_list = self.cf_handler.get_cookies_list() if cookies_list: # CRITICAL: Get the user_agent from FlareSolverr solution, not self.user_agent # cf_clearance cookies are fingerprinted to the browser that solved the challenge flaresolverr_ua = self.cf_handler.get_user_agent() self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua) return success def _media_id_to_shortcode(self, media_id): """Convert Instagram media ID to shortcode Instagram uses a custom base64 alphabet: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_ Args: media_id: Instagram media ID (string or int) Returns: Instagram shortcode string """ alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' # Convert string ID to integer media_id = int(media_id) # Convert to base64 shortcode shortcode = '' while media_id > 0: remainder = media_id % 64 media_id = media_id // 64 shortcode = alphabet[remainder] + shortcode return shortcode or 'A' def _extract_media_ids_from_fastdl_url(self, url): """Extract Instagram media IDs from FastDL proxied URLs FastDL URLs contain Instagram CDN URLs with media IDs like: 561378837_18538674661006538_479694548187839800_n.jpg The second number (18538674661006538) is the Instagram media ID Args: url: FastDL URL string Returns: List of media IDs found in the URL """ # Pattern: number_MEDIAID_number_n.jpg pattern = r'(\d+)_(\d{17,19})_\d+_n\.(jpg|mp4)' matches = re.findall(pattern, url) if matches: # Return the media ID (second capture group) return [match[1] for match in matches] return [] def _search_instagram_url_on_fastdl(self, page, instagram_url): """Search for a specific Instagram URL on FastDL to get high-res download links Args: page: Playwright page object instagram_url: Instagram post URL (e.g., https://www.instagram.com/p/BB3NONxpzK/) Returns: List of tuples: [(download_link, file_extension, is_high_res), ...] Empty list if search fails """ try: self.log(f"Searching FastDL for: {instagram_url}", "debug") # Navigate to FastDL homepage page.goto(self.fastdl_url, wait_until="domcontentloaded", timeout=60000) page.wait_for_timeout(2000) # Enter Instagram URL input_field = page.locator("input[type='text']").first if not input_field or not input_field.is_visible(): self.log("Could not find FastDL input field", "error") return [] input_field.fill(instagram_url) page.wait_for_timeout(500) # Click download button download_button = page.locator("button:has-text('Download')").first if not download_button or not download_button.is_visible(): self.log("Could not find Download button", "error") return [] download_button.click(force=True) self.log("Loading post from Instagram URL...", "debug") # Wait for content to load - Instagram URL searches take longer try: page.wait_for_selector(".loader-component", timeout=60000, state="detached") self.log("Loader dismissed", "debug") except Exception: self.log("Loader still visible after 60s...", "warning") # Wait additional time for content to render page.wait_for_timeout(5000) # Check for errors first error_elem = page.locator(".error-message__text").first if error_elem and error_elem.is_visible(): error_text = error_elem.text_content() or "Unknown error" self.log(f"FastDL returned error: {error_text}", "error") return [] # Try waiting for actual content elements try: page.wait_for_selector(".button__download, a[href*='media.fastdl.app']", timeout=10000) self.log("Post content loaded successfully", "debug") except Exception: self.log("Post content did not load as expected", "warning") # Check for error message in HTML html = page.content() if "Something went wrong" in html or "error-message" in html: self.log("FastDL encountered an error fetching this post (may be deleted/unavailable)", "error") return [] # Extract download links - try multiple selectors # FastDL uses button elements with specific classes for download links download_links = page.locator("a.button__download, a[href*='media.fastdl.app'], a[href*='.jpg'], a[href*='.mp4']").all() if not download_links: self.log("No download links found for this Instagram URL", "warning") return [] # Analyze links to find high-res versions results = [] for link in download_links: href = link.get_attribute("href") if not href: continue # Determine file type ext = ".jpg" if ".jpg" in href else ".mp4" # Check if it's high-res by looking for resolution indicators is_high_res = False if 'p1080x1080' in href or 'p1440x1440' in href or 'p2048x2048' in href: is_high_res = True elif 'p640x640' in href: is_high_res = False else: # No resolution indicator, assume it might be high-res is_high_res = True results.append((href, ext, is_high_res)) # Filter to only high-res links if available high_res_only = [r for r in results if r[2]] if high_res_only: self.log(f"Found {len(high_res_only)} high-res download link(s)", "info") return high_res_only else: self.log(f"Found {len(results)} download link(s) (resolution unknown)", "info") return results except Exception as e: self.log(f"Error searching Instagram URL on FastDL: {e}", "error") return [] def _fetch_highres_via_api_convert(self, page, instagram_url): """Trigger FastDL to process an Instagram URL and intercept the /api/convert response. Navigates to FastDL, enters the Instagram URL, clicks Download, and captures the POST /api/convert response that FastDL makes internally. Args: page: Playwright page object instagram_url: Instagram post URL (e.g. https://instagram.com/p/SHORTCODE/) Returns: Parsed JSON list from /api/convert response, or None on failure. """ convert_response = [None] # mutable container for closure def _intercept_convert(response): try: if '/api/convert' in response.url and response.status == 200: content_type = response.headers.get('content-type', '') if 'json' in content_type: convert_response[0] = response.json() except Exception: pass try: page.on("response", _intercept_convert) # Navigate to FastDL homepage page.goto(self.fastdl_url, wait_until="domcontentloaded", timeout=60000) page.wait_for_timeout(2000) # Dismiss any consent overlay self._dismiss_consent_dialog(page) # Enter Instagram URL input_field = page.locator("input[type='text']").first if not input_field or not input_field.is_visible(): self.log("Could not find FastDL input field for /api/convert", "error") return None input_field.fill(instagram_url) page.wait_for_timeout(500) # Click download button download_button = page.locator("button:has-text('Download')").first if not download_button or not download_button.is_visible(): self.log("Could not find Download button for /api/convert", "error") return None download_button.click(force=True) self.log(f"Waiting for /api/convert response for {instagram_url}...", "debug") # Poll until response captured or timeout (30s) for _ in range(60): if convert_response[0] is not None: break page.wait_for_timeout(500) if convert_response[0] is None: self.log(f"Timeout waiting for /api/convert response for {instagram_url}", "warning") return None self.log(f"Captured /api/convert response with {len(convert_response[0])} item(s)", "debug") return convert_response[0] except Exception as e: self.log(f"Error fetching /api/convert for {instagram_url}: {e}", "error") return None finally: try: page.remove_listener("response", _intercept_convert) except Exception: pass def _extract_highres_items_from_convert_response(self, convert_data, shortcode, fallback_date=None): """Parse /api/convert response into download items suitable for _download_items_parallel(). Args: convert_data: JSON list from /api/convert response shortcode: Instagram shortcode for this post fallback_date: Fallback datetime if meta.taken_at is missing Returns: List of dicts with keys: download_url, filename, media_id, normalized_media_id, post_date, ext, metadata """ items = [] profile = self.profile_name or "unknown" if not isinstance(convert_data, list): convert_data = [convert_data] for idx, entry in enumerate(convert_data): try: # Extract download URL — first url entry has highest res url_list = entry.get('url', []) if not url_list: continue best_url = url_list[0] download_url = best_url.get('url', '') if not download_url: continue ext_raw = best_url.get('ext', 'jpg') ext = f".{ext_raw}" if not ext_raw.startswith('.') else ext_raw # Extract metadata meta = entry.get('meta', {}) taken_at = meta.get('taken_at', 0) post_date = datetime.fromtimestamp(taken_at) if taken_at else fallback_date caption = meta.get('title', '') post_shortcode = meta.get('shortcode', shortcode) # Extract media_id from thumb URL's filename= param or uri= param media_id = None thumb_url = entry.get('thumb', '') if thumb_url and 'filename=' in thumb_url: try: parsed = urllib.parse.urlparse(thumb_url) params = urllib.parse.parse_qs(parsed.query) fn = params.get('filename', [''])[0] if fn: media_id = Path(fn).stem except Exception: pass if not media_id and thumb_url and 'uri=' in thumb_url: try: parsed = urllib.parse.urlparse(thumb_url) params = urllib.parse.parse_qs(parsed.query) uri = params.get('uri', [''])[0] if uri: media_id = self._extract_media_id_from_cdn_url(uri) except Exception: pass if not media_id and download_url and 'uri=' in download_url: try: parsed = urllib.parse.urlparse(download_url) params = urllib.parse.parse_qs(parsed.query) uri = params.get('uri', [''])[0] if uri: media_id = self._extract_media_id_from_cdn_url(uri) except Exception: pass if not media_id: # Final fallback: shortcode + index media_id = f"{post_shortcode}_{idx}" if len(convert_data) > 1 else post_shortcode normalized = extract_instagram_media_id(media_id) if media_id else media_id date_str = post_date.strftime('%Y%m%d_%H%M%S') if post_date else datetime.now().strftime('%Y%m%d_%H%M%S') filename = f"{profile}_{date_str}_{media_id}{ext}" items.append({ 'media_id': media_id, 'normalized_media_id': normalized, 'download_url': download_url, 'filename': filename, 'post_date': post_date, 'ext': ext, 'shortcode': post_shortcode, 'caption': caption, 'metadata': {'high_res': True, 'instagram_url': f"https://www.instagram.com/p/{post_shortcode}/"}, }) except Exception as e: self.log(f"Error parsing /api/convert entry {idx}: {e}", "debug") continue return items def _check_post_phrases(self, page, phrase_config): """ Check if post contains required phrases Args: page: Playwright page object phrase_config: Phrase search configuration Returns: True if post matches phrase criteria, False otherwise """ try: # Get post caption/text from FastDL detail page # The caption is typically in p.media-content__caption on the detail page caption_selectors = [ 'p.media-content__caption', # Primary caption selector on detail page '.media-content__caption', '.caption', '.post-caption', 'div[class*="caption"]', 'p[class*="caption"]', '.media-content__description', 'div.content', 'p.content' ] post_text = "" for selector in caption_selectors: try: elements = page.locator(selector).all() for element in elements: if element.is_visible(): text = element.text_content() or "" if text: post_text += " " + text except Exception: continue # Also check any visible text in media content area try: media_content = page.locator('.media-content, .post-content').first if media_content.count() > 0: post_text += " " + (media_content.text_content() or "") except Exception: pass if not post_text: self.log("Could not extract post text for phrase matching", "debug") # If we can't get text, default to downloading (avoid false negatives) return True # Clean up text post_text = ' '.join(post_text.split()) # Normalize whitespace phrases = phrase_config.get('phrases', []) if not phrases: return True # No phrases to match = match all case_sensitive = phrase_config.get('case_sensitive', False) match_all = phrase_config.get('match_all', False) if not case_sensitive: post_text = post_text.lower() phrases = [p.lower() for p in phrases] # Check phrase matching matches = [] for phrase in phrases: if phrase in post_text: matches.append(phrase) self.log(f"Found phrase match: '{phrase}'", "debug") if match_all: # All phrases must be found result = len(matches) == len(phrases) else: # At least one phrase must be found result = len(matches) > 0 if result: self.log(f"Post matches phrase criteria ({len(matches)}/{len(phrases)} phrases found)", "info") else: self.log(f"Post does not match phrase criteria ({len(matches)}/{len(phrases)} phrases found)", "info") return result except Exception as e: self.log(f"Error checking phrases: {e}", "error") # On error, default to downloading (avoid false negatives) return True def _dismiss_consent_dialog(self, page): """Dismiss cookie consent / GDPR overlay if present (Google FundingChoices).""" try: consent_btn = page.locator( 'button.fc-cta-consent, ' 'button.fc-cta-do-not-consent, ' 'button[aria-label="Consent"], ' 'button.fc-dismiss-button, ' '.fc-dialog button.fc-primary-button' ).first if consent_btn.count() > 0 and consent_btn.is_visible(): consent_btn.click(force=True) self.log("Dismissed consent dialog", "debug") import time time.sleep(0.5) return overlay = page.locator('.fc-consent-root, .fc-dialog-overlay').first if overlay.count() > 0: page.evaluate("document.querySelectorAll('.fc-consent-root, .fc-dialog-overlay, .fc-dialog-container').forEach(el => el.remove())") self.log("Removed consent overlay via JS", "debug") except Exception: pass def _smart_delay(self): """Implement smart delays with randomization to avoid detection""" self.download_count += 1 # Check if we need a batch delay if self.download_count % self.batch_size == 0: delay = random.uniform(self.batch_delay_min, self.batch_delay_max) self.log(f"Batch delay: waiting {delay:.1f} seconds after {self.download_count} downloads", "debug") else: # Regular delay with randomization delay = random.uniform(self.min_delay, self.max_delay) self.log(f"Waiting {delay:.1f} seconds before next download", "debug") time.sleep(delay) def _update_all_timestamps(self, filepath, post_date): """Update all timestamps for a file: filesystem and EXIF Args: filepath: Path to the file post_date: datetime object with the target date/time """ if not post_date: return timestamp = post_date.timestamp() # 1. Update file system timestamps (access time and modification time) try: os.utime(filepath, (timestamp, timestamp)) self.log(f"Updated file timestamps to {post_date.strftime('%Y-%m-%d %H:%M:%S')}") except Exception as e: self.log(f"Failed to update file timestamps: {e}", "error") # 2. Update creation time (platform-specific) try: if platform.system() == 'Darwin': # macOS # Use SetFile command on macOS to set creation date date_str = post_date.strftime('%m/%d/%Y %H:%M:%S') subprocess.run( ['SetFile', '-d', date_str, str(filepath)], capture_output=True, text=True ) elif platform.system() == 'Windows': # On Windows, we can use PowerShell to set creation time # Escape special characters to prevent command injection filepath_escaped = str(filepath).replace("'", "''") # PowerShell single-quote escape # isoformat() produces safe strings like "2024-01-15T10:30:00" but escape anyway date_escaped = post_date.isoformat().replace("'", "''") ps_command = f"(Get-Item -LiteralPath '{filepath_escaped}').CreationTime = Get-Date '{date_escaped}'" subprocess.run( ['powershell', '-Command', ps_command], capture_output=True, text=True ) # Linux doesn't support changing creation time except Exception as e: # SetFile might not be available on newer macOS versions pass # 3. Update EXIF data for images if str(filepath).lower().endswith(('.jpg', '.jpeg', '.png')): self._update_exif_timestamp(filepath, post_date) # 4. Update MP4 metadata for videos if str(filepath).lower().endswith(('.mp4', '.mov')): self._update_video_metadata(filepath, post_date) def _update_exif_timestamp(self, filepath, post_date): """Update EXIF timestamps in image files Requires exiftool to be installed: brew install exiftool (macOS) or apt-get install exiftool (Linux) """ try: # Check if exiftool is available result = subprocess.run(['which', 'exiftool'], capture_output=True, text=True) if result.returncode != 0: # Try to use piexif as fallback if available try: import piexif self._update_exif_with_piexif(filepath, post_date) except ImportError: pass # Silently skip if no EXIF tools available return # Format date for EXIF exif_date = post_date.strftime('%Y:%m:%d %H:%M:%S') # Update all date fields in EXIF including MetadataDate for Immich cmd = [ 'exiftool', '-overwrite_original', '-quiet', f'-AllDates={exif_date}', f'-MetadataDate={exif_date}', '-HistoryWhen=', f'-FileModifyDate={exif_date}', str(filepath) ] subprocess.run(cmd, capture_output=True, text=True) self.log(f"Updated EXIF timestamps to {post_date.strftime('%Y-%m-%d %H:%M:%S')}") except Exception as e: pass # Silently skip EXIF updates if tools not available def _update_exif_with_piexif(self, filepath, post_date): """Update EXIF using piexif library as fallback""" try: import piexif from PIL import Image # Format date for EXIF exif_date = post_date.strftime('%Y:%m:%d %H:%M:%S').encode('utf-8') # Load existing EXIF or create new exif_dict = {'0th': {}, 'Exif': {}, 'GPS': {}, 'Interop': {}, '1st': {}, 'thumbnail': None} try: with Image.open(filepath) as img: exif_dict = piexif.load(img.info.get('exif', b'')) except Exception: pass # Use default empty dict # Update date fields exif_dict['0th'][piexif.ImageIFD.DateTime] = exif_date exif_dict['Exif'][piexif.ExifIFD.DateTimeOriginal] = exif_date exif_dict['Exif'][piexif.ExifIFD.DateTimeDigitized] = exif_date # Save with updated EXIF exif_bytes = piexif.dump(exif_dict) with Image.open(filepath) as img: img.save(filepath, exif=exif_bytes) self.log(f"Updated EXIF with piexif to {post_date.strftime('%Y-%m-%d %H:%M:%S')}") except Exception as e: pass # Silently skip if piexif not available def _update_video_metadata(self, filepath, post_date): """Update MP4/MOV video metadata timestamps Uses ffmpeg if available to update video metadata """ try: # Check if ffmpeg is available result = subprocess.run(['which', 'ffmpeg'], capture_output=True, text=True) if result.returncode != 0: return # ffmpeg not available # Format date for video metadata meta_date = post_date.strftime('%Y-%m-%d %H:%M:%S') # Create temp file temp_file = str(filepath) + '.temp.mp4' # Update metadata using ffmpeg cmd = [ 'ffmpeg', '-i', str(filepath), '-metadata', f'creation_time={post_date.isoformat()}Z', '-metadata', f'date={meta_date}', '-c', 'copy', # Copy streams without re-encoding '-y', # Overwrite temp_file ] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode == 0: # Replace original with temp file os.replace(temp_file, filepath) # Re-apply file timestamps (os.replace creates a new file with current mtime) timestamp = post_date.timestamp() os.utime(str(filepath), (timestamp, timestamp)) self.log(f"Updated video metadata to {post_date.strftime('%Y-%m-%d %H:%M:%S')}") else: # Clean up temp file if it exists if os.path.exists(temp_file): os.remove(temp_file) except Exception as e: pass # Silently skip video metadata updates def _is_already_downloaded(self, media_id): """Check if media_id has already been downloaded (uses centralized function)""" if not self.use_database: return False # Use centralized function for consistent cross-module detection return is_instagram_downloaded(self.db.db if hasattr(self.db, 'db') else self.db, media_id) def _record_download(self, media_id, username, content_type, filename, download_url=None, post_date=None, metadata=None, deferred=False): """Record a successful download in the database (uses centralized function) Args: deferred: If True, don't record to database now - add to pending_downloads list for later recording after file move is complete """ # If deferred, store for later recording instead of recording now if deferred: file_path = str(filename) # Full path filename_only = Path(filename).name # Just the filename self.pending_downloads.append({ 'media_id': media_id, 'username': username, 'filename': filename_only, 'url': download_url, 'post_date': post_date.isoformat() if post_date else None, 'file_path': file_path, 'content_type': content_type, 'metadata': metadata }) self.log(f"Deferred recording for {media_id}", "debug") return True if not self.use_database: self.log(f"Database recording disabled (use_database=False)", "debug") return # Extract just the filename from the full path for database file_path = str(filename) # Full path filename_only = Path(filename).name # Just the filename self.log(f"Recording download in database: filename={filename_only}, media_id={media_id}, user={username}", "debug") # Use centralized function for consistent cross-module storage result = record_instagram_download( db=self.db.db if hasattr(self.db, 'db') else self.db, media_id=media_id, username=username, content_type=content_type, filename=filename_only, download_url=download_url, post_date=post_date, file_path=file_path, method='fastdl', extra_metadata=metadata ) if result: self.log(f"Successfully recorded download for {filename_only}", "debug") else: self.log(f"Failed to record download for {filename_only} (possibly duplicate)", "debug") return result def get_pending_downloads(self): """Get list of downloads that were deferred for later recording""" return self.pending_downloads.copy() def clear_pending_downloads(self): """Clear the pending downloads list after they've been recorded""" self.pending_downloads = [] def _record_checked(self, media_id, username, content_type, reason="checked", post_date=None): """Record that a post was checked but not downloaded Args: media_id: The media ID that was checked username: Instagram username content_type: Type of content reason: Reason for skipping ('old_post', 'phrase_checked', 'checked') post_date: Optional post date """ if not self.use_database: return # Create a marker filename similar to ImgInn marker_filename = f"_{reason}_{media_id}" # Use centralized function for consistent cross-module storage return record_instagram_download( db=self.db.db if hasattr(self.db, 'db') else self.db, media_id=media_id, username=username, content_type=content_type, filename=marker_filename, post_date=post_date, method='fastdl', extra_metadata={'marker': True, 'reason': reason} ) def reset_database(self, username=None, content_type=None): """Reset database by removing tracking records Args: username: If specified, only reset records for this user content_type: If specified, only reset records for this content type Returns: Number of records deleted """ if not self.use_database or not self.db: self.log("Database is disabled") return 0 # Use unified database return self.db.reset_database(username, content_type) def remove_tracking(self, media_ids): """Remove specific media IDs from tracking Args: media_ids: Single media_id string or list of media_ids to remove Returns: Number of records deleted """ if not self.use_database or not self.db: return 0 # Use unified database return self.db.remove_tracking(media_ids) def get_tracked_items(self, username=None, content_type=None): """Get list of tracked items from database Args: username: Filter by username content_type: Filter by content type Returns: List of dictionaries with tracking info """ if not self.use_database or not self.db: return [] # Use unified database return self.db.get_tracked_items(username, content_type) def get_database_stats(self): """Get statistics about the database Returns: Dictionary with database statistics """ if not self.use_database or not self.db: return {'enabled': False} # Use unified database return self.db.get_database_stats() def download(self, username, content_type="all", output_dir="downloads", max_downloads=None, days_back=None, date_from=None, date_to=None, phrase_config=None, defer_database=False): """ Download content from Instagram via FastDL Args: username: Instagram username or URL content_type: Type of content ('posts', 'stories', 'reels', 'highlights', 'all') output_dir: Directory to save downloads max_downloads: Maximum number of items to download days_back: Number of days back to download posts/reels date_from: Start date for range (datetime object or YYYY-MM-DD string) date_to: End date for range (datetime object or YYYY-MM-DD string) phrase_config: Optional phrase search configuration for posts/reels { 'enabled': bool, 'phrases': list of phrases to search for, 'case_sensitive': bool, 'match_all': bool (True = all phrases must match, False = any phrase) } defer_database: If True, don't record to database immediately - store in pending_downloads for later recording after file move is complete Returns: Number of successfully downloaded items """ # Clear downloaded_files cache between accounts to prevent memory growth self.downloaded_files.clear() # Check site status before doing anything else self.log("Checking FastDL site status...", "debug") site_status, error_msg = self.cf_handler.check_site_status("https://fastdl.app/", timeout=10) if self.cf_handler.should_skip_download(site_status): self.log(f"Skipping download - FastDL is unavailable: {error_msg}", "warning") return 0 elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE: self.log("Cloudflare challenge detected, will attempt bypass during download", "info") # Setup self.username = username self.content_type = content_type self.output_dir = Path(output_dir) # Don't create output_dir here - only create when we have files to download self.max_downloads = max_downloads self.phrase_config = phrase_config self.defer_database = defer_database # Store for deferred recording # Extract profile name self.profile_name = self._extract_profile_name(username) # Setup date filtering self._setup_date_filtering(days_back, date_from, date_to) # Scan existing files self._scan_existing_files() # Run the download return self._run_download() def download_multi(self, username, content_types, output_dirs, max_downloads=None, days_back=None, date_from=None, date_to=None, phrase_configs=None, defer_database=False): """Download multiple content types in a single browser session. Args: username: Instagram username content_types: List like ['stories', 'reels', 'posts'] output_dirs: Dict {content_type: output_dir_path} phrase_configs: Dict {content_type: phrase_config} or None (other args same as download()) Returns: Dict: {content_type: {'count': N, 'pending_downloads': [...]}} """ # Clear downloaded_files cache between accounts to prevent memory growth self.downloaded_files.clear() # Check site status before doing anything else self.log("Checking FastDL site status...", "debug") site_status, error_msg = self.cf_handler.check_site_status("https://fastdl.app/", timeout=10) if self.cf_handler.should_skip_download(site_status): self.log(f"Skipping download - FastDL is unavailable: {error_msg}", "warning") return {ct: {'count': 0, 'pending_downloads': []} for ct in content_types} elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE: self.log("Cloudflare challenge detected, will attempt bypass during download", "info") # Setup self.username = username self.profile_name = self._extract_profile_name(username) self.max_downloads = max_downloads self.defer_database = defer_database # Setup date filtering self._setup_date_filtering(days_back, date_from, date_to) # Do NOT call _scan_existing_files() here — done per content type inside _run_download_multi() # Run the multi-content download return self._run_download_multi(content_types, output_dirs, phrase_configs or {}) def _run_download_multi(self, content_types, output_dirs, phrase_configs): """Single browser session for all content types. Args: content_types: List of content types to download output_dirs: Dict {content_type: output_dir_path} phrase_configs: Dict {content_type: phrase_config} Returns: Dict: {content_type: {'count': N, 'pending_downloads': [...]}} """ results = {} # Try to get fresh cookies via FlareSolverr if we don't have them or they're old if not self._has_valid_cookies() or self._cookies_expired(): self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info") if self._get_cookies_via_flaresolverr(): self.log("Successfully got fresh cookies from FlareSolverr", "info") else: self.log("FlareSolverr unavailable, will try with Playwright", "warning") with sync_playwright() as p: browser = p.chromium.launch( headless=self.headless, args=[ '--disable-blink-features=AutomationControlled', '--disable-infobars', '--disable-background-timer-throttling', '--disable-backgrounding-occluded-windows', '--disable-renderer-backgrounding' ] ) # CRITICAL: Browser fingerprint must match FlareSolverr for cookies to work context_options = get_playwright_context_options() context_options['accept_downloads'] = True context_options['ignore_https_errors'] = True # Use stored cookie user_agent if available try: if self.unified_db: stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id) if stored_user_agent: self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug") context_options['user_agent'] = stored_user_agent else: self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug") else: self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug") except Exception as e: self.log(f"Error getting stored user_agent, using default: {e}", "debug") context = browser.new_context(**context_options) # Load cookies from database or file cookies_loaded = False if self.unified_db: try: cookies = self.unified_db.get_scraper_cookies(self.scraper_id) if cookies: cleaned_cookies = [] for cookie in cookies: cleaned = {k: v for k, v in cookie.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']} if 'expiry' in cleaned and 'expires' not in cleaned: cleaned['expires'] = cleaned.pop('expiry') cleaned_cookies.append(cleaned) try: context.clear_cookies() except Exception: pass context.add_cookies(cleaned_cookies) self.log(f"Loaded {len(cleaned_cookies)} cookies from database", "debug") cookies_loaded = True except Exception as e: self.log(f"Error loading cookies from database: {e}", "warning") # Fallback to file-based cookies if not cookies_loaded and self.cookie_file and self.cookie_file.exists(): try: with open(self.cookie_file, 'r') as f: data = json.load(f) cookies = data.get('cookies', []) if cookies: cleaned_cookies = [] for cookie in cookies: cleaned = dict(cookie) if 'expiry' in cleaned and 'expires' not in cleaned: cleaned['expires'] = cleaned.pop('expiry') cleaned_cookies.append(cleaned) try: context.clear_cookies() except Exception: pass context.add_cookies(cleaned_cookies) self.log(f"Loaded {len(cleaned_cookies)} cookies from file", "debug") except Exception as e: self.log(f"Failed to load cookies: {e}", "warning") # Handle popups def handle_popup(page): if len(context.pages) > 1: self.log("Blocking popup") page.close() context.on("page", handle_popup) page = context.new_page() page.on("popup", lambda popup: popup.close()) # Add anti-detection scripts page.add_init_script(get_playwright_stealth_scripts()) try: # Intercept all API responses to discover FastDL's backend endpoints api_responses = [] def _capture_api_response(response): try: url = response.url if 'fastdl.app' in url and response.status == 200: content_type_header = response.headers.get('content-type', '') if 'json' in content_type_header: try: body = response.json() api_responses.append({ 'url': url, 'body': body, 'size': len(str(body)), }) endpoint = url.split('/')[-1].split('?')[0] if isinstance(body, dict) and 'result' in body: result = body['result'] if isinstance(result, list): self.log(f"[API] Captured {endpoint}: {len(result)} items", "info") elif isinstance(result, dict) and 'edges' in result: self.log(f"[API] Captured {endpoint}: {len(result['edges'])} edges (count: {result.get('count', '?')})", "info") else: self.log(f"[API] Captured {endpoint}", "info") except Exception: pass except Exception: pass page.on("response", _capture_api_response) # Navigate to FastDL self.log(f"Navigating to FastDL...") page.goto(self.fastdl_url, wait_until="domcontentloaded", timeout=60000) page.wait_for_timeout(2000) # Enter username input_field = page.locator("input[type='text']").first if not input_field or not input_field.is_visible(): self.log("Could not find input field", "error") return {ct: {'count': 0, 'pending_downloads': []} for ct in content_types} self.log(f"Entering username: {self.username}") input_field.fill(self.username) page.wait_for_timeout(500) # Click download button download_button = page.locator("button:has-text('Download')").first if download_button and download_button.is_visible(): download_button.click(force=True) self.log("Loading profile...") try: self.log("Waiting for profile to load...") page.wait_for_selector(".loader-component", timeout=30000, state="detached") self.log("Profile loading complete") except PlaywrightTimeout: self.log("Profile still loading after 30s, continuing anyway...", "warning") except Exception as e: self.log(f"Error waiting for loader: {e}", "debug") page.wait_for_timeout(2000) try: page.wait_for_selector("ul.tabs-component", timeout=5000, state="attached") tabs_count = page.locator("button.tabs-component__button").count() if tabs_count > 0: self.log(f"Profile loaded successfully - found {tabs_count} tabs") else: self.log("Tabs container found but no buttons, waiting...", "warning") page.wait_for_timeout(5000) except PlaywrightTimeout: self.log("Tabs container not found after 5s, continuing anyway...", "warning") except Exception as e: self.log(f"Error checking tabs: {e}", "warning") # Dismiss consent dialog self._dismiss_consent_dialog(page) # Scroll to load all paginated posts within date range # Only scroll when posts content type is requested — reels uses # the initial postsV2 capture (first page) without extra scrolling if 'posts' in content_types: self._scroll_to_load_api_posts(page, api_responses) # Track API responses per content type by recording list boundaries # Initial profile load + scrolling captures posts/postsV2 — used by both posts and reels # (postsV2 contains all timeline content; _extract_posts_from_api filters by type) api_responses_for = {} initial_responses = list(api_responses) # snapshot after scrolling if 'posts' in content_types: api_responses_for['posts'] = initial_responses if 'reels' in content_types: api_responses_for['reels'] = initial_responses # same postsV2, filtered in extraction # Click non-default tabs and capture their API responses separately for ct in content_types: if ct != 'posts': start_idx = len(api_responses) self.content_type = ct self._navigate_to_content_tab(page) # Stories has its own /stories endpoint — use only responses from its tab click if ct == 'stories': api_responses_for[ct] = api_responses[start_idx:] # Process each content type in order: stories -> reels -> posts # Posts go last because _download_highres_via_api_convert navigates away from profile ordered = sorted(content_types, key=lambda ct: {'stories': 0, 'reels': 1, 'posts': 2}.get(ct, 9)) for ct in ordered: try: prev_pending = len(self.pending_downloads) self.content_type = ct self.output_dir = Path(output_dirs[ct]) self.phrase_config = phrase_configs.get(ct) # Scan existing files for THIS content type's dir (accumulate, don't replace) ct_existing = scan_existing_files_for_media_ids(self.output_dir, self.profile_name) self.downloaded_files.update(ct_existing) # Use only API responses relevant to this content type ct_api_responses = api_responses_for.get(ct, []) # Same download decision tree as _run_download api_result = -1 use_api = ct in ('stories', 'posts', 'reels') and ct_api_responses if use_api and self.high_res and ct == 'posts': self.log("High-res mode enabled for posts, trying /api/convert approach", "info") api_convert_result = self._download_highres_via_api_convert(page, ct_api_responses) if api_convert_result < 0: self.log("Falling back to browser-based high-res download", "info") else: api_result = api_convert_result use_api = False if use_api: api_result = self._download_from_api(ct_api_responses) if api_result >= 0: self.log(f"API-based download complete for {ct}: {api_result} items") count = api_result else: self.log(f"No API data available for {ct}, skipping", "debug") count = 0 results[ct] = { 'count': count, 'pending_downloads': self.pending_downloads[prev_pending:] } except Exception as e: self.log(f"Error downloading {ct}: {e}", "error") import traceback self.log(traceback.format_exc(), "debug") results[ct] = {'count': 0, 'pending_downloads': []} # Stop API interception page.remove_listener("response", _capture_api_response) except Exception as e: self.log(f"Error: {e}", "error") finally: try: context.close() self.log("Browser context closed", "debug") except Exception: pass try: browser.close() self.log("Browser closed", "debug") except Exception: pass # Fill in any missing content types with empty results for ct in content_types: if ct not in results: results[ct] = {'count': 0, 'pending_downloads': []} return results def _extract_profile_name(self, input_value): """Extract profile name from username or URL""" if "/" in input_value: # It's a URL, extract username parts = input_value.rstrip('/').split('/') for i, part in enumerate(parts): if part == "p" and i + 1 < len(parts): # It's a post URL, get username from different position return None elif part in ["stories", "highlights", "reels"] and i > 0: # Username is before the content type return parts[i-1] # Default to last part for profile URLs return parts[-1] else: # Direct username return input_value.lower() def _setup_date_filtering(self, days_back, date_from, date_to): """Setup date range for filtering""" self.date_from = None self.date_to = None if date_from: if isinstance(date_from, str): self.date_from = datetime.strptime(date_from, "%Y-%m-%d") else: self.date_from = date_from if date_to: if isinstance(date_to, str): self.date_to = datetime.strptime(date_to, "%Y-%m-%d") else: self.date_to = date_to if days_back and not self.date_from: # Set date range to include full days now = datetime.now() self.date_to = datetime(now.year, now.month, now.day, 23, 59, 59) # End of today self.date_from = (now - timedelta(days=days_back-1)).replace(hour=0, minute=0, second=0) # Start of N days ago self.log(f"Downloading content from last {days_back} days ({self.date_from.strftime('%Y-%m-%d')} to {self.date_to.strftime('%Y-%m-%d')})") def _scan_existing_files(self): """Scan existing files to avoid re-downloading""" self.downloaded_files = scan_existing_files_for_media_ids(self.output_dir, self.profile_name) if self.downloaded_files: self.log(f"Found {len(self.downloaded_files)} existing media IDs, will skip duplicates") def _extract_media_id_from_filename(self, filename): """Extract media ID from filename""" name_without_ext = Path(filename).stem if self.profile_name and name_without_ext.startswith(self.profile_name): remaining = name_without_ext[len(self.profile_name):].lstrip('_') else: remaining = name_without_ext return remaining if remaining else name_without_ext def _run_download(self): """Run the actual download process""" success_count = 0 # Try to get fresh cookies via FlareSolverr if we don't have them or they're old if not self._has_valid_cookies() or self._cookies_expired(): self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info") if self._get_cookies_via_flaresolverr(): self.log("Successfully got fresh cookies from FlareSolverr", "info") else: self.log("FlareSolverr unavailable, will try with Playwright", "warning") with sync_playwright() as p: browser = p.chromium.launch( headless=self.headless, args=[ '--disable-blink-features=AutomationControlled', '--disable-infobars', '--disable-background-timer-throttling', '--disable-backgrounding-occluded-windows', '--disable-renderer-backgrounding' ] ) # CRITICAL: Browser fingerprint must match FlareSolverr for cookies to work # Get dynamic fingerprint settings from FlareSolverr context_options = get_playwright_context_options() context_options['accept_downloads'] = True context_options['ignore_https_errors'] = True # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent # Cloudflare cf_clearance cookies are fingerprinted to the browser that solved the challenge try: if self.unified_db: stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id) if stored_user_agent: self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug") context_options['user_agent'] = stored_user_agent else: self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug") else: self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug") except Exception as e: self.log(f"Error getting stored user_agent, using default: {e}", "debug") context = browser.new_context(**context_options) # Load cookies from database or file cookies_loaded = False if self.unified_db: try: cookies = self.unified_db.get_scraper_cookies(self.scraper_id) if cookies: # Clean cookies - remove unsupported properties and convert expiry->expires cleaned_cookies = [] for cookie in cookies: cleaned = {k: v for k, v in cookie.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']} # FlareSolverr uses 'expiry' but Playwright uses 'expires' if 'expiry' in cleaned and 'expires' not in cleaned: cleaned['expires'] = cleaned.pop('expiry') cleaned_cookies.append(cleaned) # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect try: context.clear_cookies() except Exception: pass context.add_cookies(cleaned_cookies) self.log(f"Loaded {len(cleaned_cookies)} cookies from database", "debug") cookies_loaded = True except Exception as e: self.log(f"Error loading cookies from database: {e}", "warning") # Fallback to file-based cookies if not cookies_loaded and self.cookie_file and self.cookie_file.exists(): try: with open(self.cookie_file, 'r') as f: data = json.load(f) cookies = data.get('cookies', []) if cookies: # Convert expiry->expires for Playwright compatibility cleaned_cookies = [] for cookie in cookies: cleaned = dict(cookie) if 'expiry' in cleaned and 'expires' not in cleaned: cleaned['expires'] = cleaned.pop('expiry') cleaned_cookies.append(cleaned) # CRITICAL: Clear existing cookies first try: context.clear_cookies() except Exception: pass context.add_cookies(cleaned_cookies) self.log(f"Loaded {len(cleaned_cookies)} cookies from file", "debug") except Exception as e: self.log(f"Failed to load cookies: {e}", "warning") # Handle popups def handle_popup(page): if len(context.pages) > 1: self.log("Blocking popup") page.close() context.on("page", handle_popup) page = context.new_page() page.on("popup", lambda popup: popup.close()) # Add anti-detection scripts page.add_init_script(get_playwright_stealth_scripts()) try: # Intercept all API responses to discover FastDL's backend endpoints api_responses = [] def _capture_api_response(response): try: url = response.url if 'fastdl.app' in url and response.status == 200: content_type = response.headers.get('content-type', '') if 'json' in content_type: try: body = response.json() api_responses.append({ 'url': url, 'body': body, 'size': len(str(body)), }) # Extract endpoint name for logging endpoint = url.split('/')[-1].split('?')[0] if isinstance(body, dict) and 'result' in body: result = body['result'] if isinstance(result, list): self.log(f"[API] Captured {endpoint}: {len(result)} items", "info") elif isinstance(result, dict) and 'edges' in result: self.log(f"[API] Captured {endpoint}: {len(result['edges'])} edges (count: {result.get('count', '?')})", "info") else: self.log(f"[API] Captured {endpoint}", "info") except Exception: pass except Exception: pass page.on("response", _capture_api_response) # Navigate to FastDL self.log(f"Navigating to FastDL...") page.goto(self.fastdl_url, wait_until="domcontentloaded", timeout=60000) page.wait_for_timeout(2000) # Enter username input_field = page.locator("input[type='text']").first if not input_field or not input_field.is_visible(): self.log("Could not find input field", "error") return 0 self.log(f"Entering username: {self.username}") input_field.fill(self.username) page.wait_for_timeout(500) # Click download button download_button = page.locator("button:has-text('Download')").first if download_button and download_button.is_visible(): download_button.click(force=True) self.log("Loading profile...") # Wait for the profile loading message to disappear try: self.log("Waiting for profile to load...") # Wait for the loader component to disappear (max 30 seconds) page.wait_for_selector(".loader-component", timeout=30000, state="detached") self.log("Profile loading complete") except PlaywrightTimeout: self.log("Profile still loading after 30s, continuing anyway...", "warning") except Exception as e: self.log(f"Error waiting for loader: {e}", "debug") # Additional wait for tabs to render page.wait_for_timeout(2000) # Wait for tabs to exist in DOM (they'll become actionable when clicked) try: page.wait_for_selector("ul.tabs-component", timeout=5000, state="attached") tabs_count = page.locator("button.tabs-component__button").count() if tabs_count > 0: self.log(f"Profile loaded successfully - found {tabs_count} tabs") else: self.log("Tabs container found but no buttons, waiting...", "warning") page.wait_for_timeout(5000) except PlaywrightTimeout: self.log("Tabs container not found after 5s, continuing anyway...", "warning") except Exception as e: self.log(f"Error checking tabs: {e}", "warning") # Dismiss consent dialog self._dismiss_consent_dialog(page) # Navigate to content tab (this also triggers the API call for that content type) if self.content_type != "all": self._navigate_to_content_tab(page) # Scroll to load all paginated posts/reels within date range if self.content_type in ('posts', 'reels'): self._scroll_to_load_api_posts(page, api_responses) # Try API-based download first (much faster — no scrolling/DOM needed) # postsV2 is already captured from initial profile load + scrolling # stories/reels are captured when we click their tab above # Skip API for posts with high_res — need browser to access Instagram directly api_result = -1 use_api = self.content_type in ('stories', 'posts', 'reels') and api_responses if use_api and self.high_res and self.content_type == 'posts': self.log("High-res mode enabled for posts, trying /api/convert approach", "info") api_convert_result = self._download_highres_via_api_convert(page, api_responses) if api_convert_result < 0: # postsV2 data missing, fall back to browser-based high-res self.log("Falling back to browser-based high-res download", "info") else: api_result = api_convert_result use_api = False # Don't also run normal API download for posts if use_api: api_result = self._download_from_api(api_responses) if api_result >= 0: self.log(f"API-based download complete: {api_result} items") success_count = api_result else: if api_responses and self.content_type in ('stories', 'posts', 'reels'): self.log("API data not usable, falling back to browser-based download", "info") # Build a pk lookup map from API responses so the browser # fallback can still tag downloads with the Instagram pk. self._cdn_to_pk_map = {} if self.content_type == 'stories' and api_responses: self._build_pk_map_from_api(api_responses) success_count = self._download_content(page) # Stop API interception page.remove_listener("response", _capture_api_response) except Exception as e: self.log(f"Error: {e}", "error") finally: try: context.close() self.log("Browser context closed", "debug") except Exception: pass try: browser.close() self.log("Browser closed", "debug") except Exception: pass return success_count def _navigate_to_content_tab(self, page): """Navigate to specific content type tab""" # All tabs are lowercase on FastDL tab_map = { "stories": "stories", "posts": "posts", "reels": "reels", "highlights": "highlights" } if self.content_type in tab_map: tab_name = tab_map[self.content_type] # Use the tabs-component__button selector tab_selector = f"button.tabs-component__button:has-text('{tab_name}')" try: # Wait for the specific tab to exist in DOM page.wait_for_selector(tab_selector, timeout=5000, state="attached") # Get the tab element tab = page.locator(tab_selector).first # Dismiss consent overlay before clicking tab self._dismiss_consent_dialog(page) # Use dispatch_event to fire a DOM click event directly on the element # force=True only dispatches mouse events at coordinates which Vue.js doesn't register self.log(f"Clicking {tab_name} tab") tab.dispatch_event('click') page.wait_for_timeout(2000) # Verify tab switched by checking for active class is_active = tab.evaluate("el => el.classList.contains('tabs-component__button--active')") if not is_active: self.log(f"dispatch_event didn't activate tab, trying JS click", "debug") tab.evaluate("el => el.click()") page.wait_for_timeout(2000) is_active = tab.evaluate("el => el.classList.contains('tabs-component__button--active')") if not is_active: self.log(f"JS click also failed to activate {tab_name} tab", "warning") else: self.log(f"JS click activated {tab_name} tab", "debug") else: self.log(f"{tab_name} tab is now active", "debug") # Wait for tab content to load page.wait_for_timeout(3000) except PlaywrightTimeout: self.log(f"Timeout waiting for {tab_name} tab to become clickable", "warning") except Exception as e: self.log(f"Could not click {tab_name} tab: {e}", "warning") def _extract_shortcodes_from_json(self, data, shortcodes_list): """Recursively extract Instagram shortcodes from JSON data Args: data: JSON data (dict, list, or primitive) shortcodes_list: List to append found shortcodes to """ if isinstance(data, dict): # Check for common keys that might contain shortcodes for key in ['shortcode', 'code', 'post_id', 'media_id', 'id', 'pk', 'shortCode']: if key in data: value = data[key] if isinstance(value, str) and len(value) == 11: # Validate it looks like a shortcode instagram_alphabet = set('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_') if set(value).issubset(instagram_alphabet): shortcodes_list.append(value) # Recursively check all values for value in data.values(): self._extract_shortcodes_from_json(value, shortcodes_list) elif isinstance(data, list): # Recursively check all items for item in data: self._extract_shortcodes_from_json(item, shortcodes_list) def _extract_instagram_url_from_item(self, item, page): """Extract Instagram post URL from a profile item by clicking on it FastDL displays Instagram shortcodes when you click on a post thumbnail. We'll click the item, extract the Instagram URL from the detail view, then go back to the grid. Args: item: Profile media list item element page: Playwright page object Returns: Instagram post URL string or None """ try: # Method 1: Check for data attributes first (fast) for attr in ['data-url', 'data-post-url', 'data-instagram-url', 'data-shortcode']: value = item.get_attribute(attr) if value: if 'instagram.com/p/' in value: return value # Check if it's just a shortcode elif len(value) == 11 and value.replace('_', '').replace('-', '').isalnum(): return f"https://www.instagram.com/p/{value}/" # Method 2: Check for Instagram links in the HTML all_links = item.locator("a").all() for link in all_links: href = link.get_attribute("href") if href and 'instagram.com/p/' in href: return href # Method 3: Click on the item to open detail view # Find the clickable image or container clickable = item.locator("img.media-content__image").first if not clickable or not clickable.is_visible(): # Try finding any clickable element in the item clickable = item.locator("a, button, .media-content__image").first if clickable and clickable.is_visible(): self.log("Clicking item to extract Instagram URL...", "debug") # Store current URL to know if we navigated current_url = page.url # Click the item clickable.click(force=True) page.wait_for_timeout(2000) # Wait for detail view to load # Look for Instagram URL in the detail view # Check page source for Instagram URLs page_content = page.content() # Look for instagram.com/p/ URLs in the HTML import re instagram_pattern = r'https?://(?:www\.)?instagram\.com/p/([A-Za-z0-9_-]{11})' matches = re.findall(instagram_pattern, page_content) if matches: instagram_url = f"https://www.instagram.com/p/{matches[0]}/" self.log(f"Found Instagram URL in detail view: {instagram_url}", "debug") # Go back to grid view page.go_back() page.wait_for_timeout(1000) return instagram_url # If we didn't find anything, go back if page.url != current_url: page.go_back() page.wait_for_timeout(1000) except Exception as e: self.log(f"Error extracting Instagram URL: {e}", "debug") # Try to go back if we're stuck try: page.go_back() page.wait_for_timeout(500) except Exception: pass return None def _download_content_highres(self, page): """Download content in high-resolution mode by searching individual Instagram URLs""" success_count = 0 # STEP 0: Try to intercept API responses to find shortcodes api_shortcodes = [] def handle_response(response): """Intercept API responses to extract shortcodes""" try: # Check if this is a FastDL API response if 'fastdl.app' in response.url and response.status == 200: content_type = response.headers.get('content-type', '') if 'json' in content_type: try: data = response.json() # Look for shortcodes in the JSON response self._extract_shortcodes_from_json(data, api_shortcodes) except Exception: pass except Exception as e: self.log(f"Error intercepting response: {e}", "debug") # Start listening to responses page.on("response", handle_response) # STEP 1: Scroll to load ALL content from the profile self.log(f"Loading all {self.content_type} from profile...") self._scroll_to_load_content(page) # Stop listening page.remove_listener("response", handle_response) if api_shortcodes: self.log(f"Extracted {len(api_shortcodes)} shortcodes from API responses!") else: self.log("No shortcodes found in API responses", "debug") # STEP 1.5: Try to extract all Instagram shortcodes from page source first (faster) self.log("Checking page source for Instagram URLs and shortcodes...") page_content = page.content() # Method 1: Look for full Instagram URLs (most reliable) instagram_pattern = r'https?://(?:www\.)?instagram\.com/p/([A-Za-z0-9_-]{11})' instagram_urls_found = re.findall(instagram_pattern, page_content) # Method 2: Look for shortcodes in specific contexts only # Look in data attributes that explicitly mention shortcode/post/media data_attr_pattern = r'data-(?:shortcode|post-id|media-id|code)=["\']([A-Za-z0-9_-]{11})["\']' data_attr_shortcodes = re.findall(data_attr_pattern, page_content, re.IGNORECASE) # Method 3: Look in JavaScript objects with explicit keys js_pattern = r'["\']?(?:shortcode|code|post_id|media_id)["\']?\s*[:=]\s*["\']([A-Za-z0-9_-]{11})["\']' js_shortcodes = re.findall(js_pattern, page_content, re.IGNORECASE) # Combine initial findings potential_shortcodes = list(set(instagram_urls_found + data_attr_shortcodes + js_shortcodes)) # Filter out common false positives (HTML attributes, common words) blacklist = { 'crossorigin', 'placeholder', 'description', 'attribution', 'information', 'application', 'xsrfcookie', 'performance', 'credentials', 'stylesheets', 'stylesheet_', 'javascript', 'touchstart', 'touchcancel', 'transparent', 'comfortable' } # Additional validation: Instagram shortcodes typically have mixed case # and often contain numbers, underscores, or hyphens def is_valid_shortcode(sc): sc_lower = sc.lower() # Reject if in blacklist if sc_lower in blacklist: return False # Reject if all lowercase letters (likely a word) if sc.islower() and sc.isalpha(): return False # Reject if starts with common prefixes if sc_lower.startswith(('data', 'http', 'www', 'src', 'href')): return False # Must use Instagram's alphabet only instagram_alphabet = set('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_') if not set(sc).issubset(instagram_alphabet): return False # Should have at least one uppercase OR number OR special char if not any(c.isupper() or c.isdigit() or c in '-_' for c in sc): return False return True valid_shortcodes = [sc for sc in potential_shortcodes if is_valid_shortcode(sc)] # Also validate API shortcodes api_shortcodes = [sc for sc in api_shortcodes if is_valid_shortcode(sc)] # Combine all found shortcodes (from API responses, page source, JS) all_shortcodes = set(api_shortcodes + valid_shortcodes) if all_shortcodes: self.log(f"Found {len(all_shortcodes)} valid Instagram shortcodes") if api_shortcodes: self.log(f" - {len(api_shortcodes)} from API responses") if valid_shortcodes: self.log(f" - {len(valid_shortcodes)} from page source/HTML") instagram_urls_set = set(f"https://www.instagram.com/p/{shortcode}/" for shortcode in all_shortcodes) # Log a few examples for verification examples = list(all_shortcodes)[:5] self.log(f"Example shortcodes: {', '.join(examples)}") else: instagram_urls_set = set() self.log("No Instagram shortcodes found - high-res mode will not work", "warning") # STEP 2: Collect all items and look for Instagram URLs or shortcodes self.log("Extracting Instagram post URLs from profile content...") all_media_items = page.locator("li.profile-media-list__item").all() # Build mapping of Instagram URLs to their metadata media_items_data = [] # If we found Instagram URLs in page source and count matches items, # assume they're in order (much faster than clicking each item) use_page_source_urls = len(instagram_urls_set) > 0 and len(instagram_urls_set) >= len(all_media_items) * 0.8 if use_page_source_urls: self.log(f"Using Instagram URLs from page source (found {len(instagram_urls_set)}, items {len(all_media_items)})") instagram_urls_list = list(instagram_urls_set) for idx, item in enumerate(all_media_items): # Get the date for this item post_date = None date_str = None time_elem = item.locator("p.media-content__meta-time").first if time_elem and time_elem.is_visible(): date_str = time_elem.get_attribute("title") if date_str: try: post_date = datetime.strptime(date_str, "%m/%d/%Y, %I:%M:%S %p") except Exception: pass # Try to get Instagram URL instagram_url = None # First, try to use pre-extracted URLs from page source (if available) if use_page_source_urls and idx < len(instagram_urls_list): instagram_url = instagram_urls_list[idx] self.log(f"Item {idx+1}: Using URL from page source: {instagram_url}", "debug") else: # Otherwise, extract from the item itself (may click on it) instagram_url = self._extract_instagram_url_from_item(item, page) if instagram_url: media_items_data.append({ 'instagram_url': instagram_url, 'post_date': post_date, 'date_str': date_str }) else: # Fallback: Try to extract media ID and convert (unreliable) self.log(f"Item {idx+1}: No Instagram URL found, trying CDN media ID (unreliable)", "warning") item_links = item.locator("a[href*='.jpg'], a[href*='.mp4']").all() for link in item_links: href = link.get_attribute("href") if href: # Extract media IDs from this URL media_ids = self._extract_media_ids_from_fastdl_url(href) for media_id in media_ids: media_items_data.append({ 'media_id': media_id, 'instagram_url': None, 'post_date': post_date, 'date_str': date_str }) break # Only process first link per item if not media_items_data: self.log("No Instagram post URLs or media IDs found in profile content", "warning") self.log("", "info") self.log("╔═══════════════════════════════════════════════════════════════════════╗", "warning") self.log("║ HIGH-RES MODE FAILED: FastDL doesn't expose Instagram shortcodes ║", "warning") self.log("║ ║", "warning") self.log("║ Recommendation: Disable high_res mode in settings.json for FastDL ║", "warning") self.log("║ Regular FastDL downloads are already good quality (640x640 or better)║", "warning") self.log("╚═══════════════════════════════════════════════════════════════════════╝", "warning") return 0 self.log(f"Found {len(media_items_data)} media items to download in high-res") # Apply max_downloads limit if self.max_downloads: media_items_data = media_items_data[:self.max_downloads] self.log(f"Limited to {len(media_items_data)} items") # Set initial progress so dashboard shows 0/N immediately self.activity_manager.update_status( f"Downloading {self.content_type}", progress_current=0, progress_total=len(media_items_data) ) # STEP 3: For each item, get Instagram URL and search on FastDL consecutive_old_posts = 0 for i, item_data in enumerate(media_items_data, 1): # Update progress at start of each iteration (fires even on skips) self.activity_manager.update_status( f"Downloading {self.content_type}", progress_current=i, progress_total=len(media_items_data) ) instagram_url = item_data.get('instagram_url') media_id = item_data.get('media_id') post_date = item_data['post_date'] # Extract media ID for tracking if instagram_url: # Extract shortcode from Instagram URL for tracking # URL format: https://www.instagram.com/p/SHORTCODE/ shortcode_match = re.search(r'/p/([A-Za-z0-9_-]+)', instagram_url) if shortcode_match: tracking_id = shortcode_match.group(1) else: tracking_id = instagram_url # Use full URL as fallback elif media_id: tracking_id = media_id else: self.log(f"[{i}/{len(media_items_data)}] No Instagram URL or media ID found, skipping") continue # Check if already downloaded - check both original and normalized media ID normalized_tracking_id = extract_instagram_media_id(tracking_id) if tracking_id in self.downloaded_files or normalized_tracking_id in self.downloaded_files: self.log(f"[{i}/{len(media_items_data)}] Skipping duplicate (session): {tracking_id}") continue if self._is_already_downloaded(tracking_id) or (normalized_tracking_id != tracking_id and self._is_already_downloaded(normalized_tracking_id)): self.log(f"[{i}/{len(media_items_data)}] Skipping duplicate (database): {tracking_id}") self.downloaded_files.add(tracking_id) self.downloaded_files.add(normalized_tracking_id) continue # Check date filtering if post_date and (self.date_from or self.date_to): if self.date_from and post_date < self.date_from: self.log(f"[{i}/{len(media_items_data)}] Skipping - too old: {post_date.strftime('%Y-%m-%d')}") # Record as checked so we don't check again self._record_checked(tracking_id, self.profile_name, self.content_type, reason="old_post", post_date=post_date) consecutive_old_posts += 1 if consecutive_old_posts >= 5: self.log("Reached old posts, stopping...") break continue if self.date_to and post_date > self.date_to: self.log(f"[{i}/{len(media_items_data)}] Skipping - too new: {post_date.strftime('%Y-%m-%d')}") # Record as checked so we don't check again self._record_checked(tracking_id, self.profile_name, self.content_type, reason="too_new", post_date=post_date) continue consecutive_old_posts = 0 # Get Instagram URL - either directly or by converting media ID if not instagram_url: # Fallback: Try to convert media ID to Instagram shortcode try: shortcode = self._media_id_to_shortcode(media_id) instagram_url = f"https://www.instagram.com/p/{shortcode}/" self.log(f"[{i}/{len(media_items_data)}] Converting media ID {media_id} → {shortcode}", "warning") self.log(f"[{i}/{len(media_items_data)}] NOTE: This conversion may not be accurate", "warning") except Exception as e: self.log(f"[{i}/{len(media_items_data)}] Error converting media ID {media_id}: {e}", "error") continue else: self.log(f"[{i}/{len(media_items_data)}] Using Instagram URL: {instagram_url}") # Search for this Instagram URL on FastDL to get high-res links high_res_links = self._search_instagram_url_on_fastdl(page, instagram_url) if not high_res_links: self.log(f"[{i}/{len(media_items_data)}] No high-res links found for {instagram_url}", "warning") continue # Check for phrase matching on high-res page (if configured) if self.phrase_config and self.phrase_config.get('enabled'): # Extract caption from the high-res detail page caption_text = "" try: # Try multiple caption selectors on the high-res page caption_selectors = [ 'div.output-list__caption p', '.output-list__caption', 'div.output-list__caption', '.media-content__caption', 'p.media-content__caption' ] for selector in caption_selectors: try: caption_elem = page.locator(selector).first if caption_elem and caption_elem.is_visible(): text = caption_elem.text_content() or "" if text: caption_text = text break except Exception: continue if caption_text: # Clean up text caption_text = ' '.join(caption_text.split()) phrases = self.phrase_config.get('phrases', []) if phrases: case_sensitive = self.phrase_config.get('case_sensitive', False) match_all = self.phrase_config.get('match_all', False) if not case_sensitive: caption_text = caption_text.lower() phrases = [p.lower() for p in phrases] matches = [] for phrase in phrases: if phrase in caption_text: matches.append(phrase) if match_all: result = len(matches) == len(phrases) else: result = len(matches) > 0 if not result: self.log(f"[{i}/{len(media_items_data)}] Post doesn't match phrase criteria, skipping", "info") # Record as checked so we don't check again self._record_checked(tracking_id, self.profile_name, self.content_type, reason="phrase_checked", post_date=post_date) continue else: self.log(f"[{i}/{len(media_items_data)}] Post matches phrase criteria ({len(matches)}/{len(phrases)} phrases found)", "info") else: self.log(f"[{i}/{len(media_items_data)}] No caption found on high-res page, downloading anyway", "debug") except Exception as e: self.log(f"Error checking phrases on high-res page: {e}", "warning") # On error, proceed with download to avoid false negatives # Download each high-res link for link_idx, (download_url, ext, is_high_res) in enumerate(high_res_links): try: # Create clickable element or use direct download # For now, we'll try to find and click the download link download_link = page.locator(f"a[href='{download_url}']").first if not download_link or not download_link.is_visible(): self.log(f"Could not find clickable link for high-res download", "debug") continue # Download the file profile = self.profile_name or "unknown" if post_date: date_str_formatted = post_date.strftime('%Y%m%d_%H%M%S') else: date_str_formatted = datetime.now().strftime('%Y%m%d_%H%M%S') if len(high_res_links) > 1: new_filename = f"{profile}_{date_str_formatted}_{tracking_id}_{link_idx+1}{ext}" else: new_filename = f"{profile}_{date_str_formatted}_{tracking_id}{ext}" filepath = self.output_dir / new_filename self.output_dir.mkdir(parents=True, exist_ok=True) try: with page.expect_download(timeout=30000) as download_info: download_link.click(force=True) download = download_info.value download.save_as(filepath) except Exception: self.log(f"Browser download failed, trying direct HTTP download", "debug") resp = requests.get(download_url, timeout=60, stream=True) resp.raise_for_status() with open(filepath, 'wb') as f: for chunk in resp.iter_content(chunk_size=8192): f.write(chunk) # Check for duplicate hash before recording (hash blacklist persists even if original deleted) file_hash = self.db.get_file_hash(str(filepath)) if self.db else None if file_hash: existing = self.db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): # Duplicate hash found - content was already downloaded (prevents redownload of deleted content) self.log(f"⚠ Duplicate content detected (hash match): {filepath.name} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") # Delete the duplicate regardless of whether original file still exists try: filepath.unlink() self.log(f"Deleted duplicate (hash blacklist): {filepath.name}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filepath.name}: {e}", "warning") # Update timestamps if post_date: self._update_all_timestamps(filepath, post_date) self.log(f"✓ [{i}/{len(media_items_data)}] Saved high-res: {filepath.name} (dated: {post_date.strftime('%Y-%m-%d %H:%M')})", "success") else: self.log(f"✓ [{i}/{len(media_items_data)}] Saved high-res: {filepath.name}", "success") # Record in database with normalized media_id for cross-module detection self._record_download( media_id=normalized_tracking_id, username=self.profile_name, content_type=self.content_type, filename=str(filepath), download_url=download_url, post_date=post_date, metadata={'high_res': True, 'instagram_url': instagram_url}, deferred=self.defer_database ) self.downloaded_files.add(tracking_id) self.downloaded_files.add(normalized_tracking_id) success_count += 1 # Smart delay between downloads self._smart_delay() except Exception as e: self.log(f"Error downloading high-res file: {e}", "error") continue return success_count def _download_highres_via_api_convert(self, page, api_responses): """Download high-res posts using /api/convert endpoint instead of browser scraping. Uses postsV2 data (already captured from profile load) to get shortcodes, then triggers /api/convert for each post to get high-res download URLs. Args: page: Playwright page object api_responses: List of captured API responses from profile load Returns: Number of files downloaded, or -1 if postsV2 data not available (triggers fallback). """ # Find postsV2 data from captured API responses (merge all pages) posts_data = None for resp in api_responses: url = resp.get('url', '') body = resp.get('body', {}) if not isinstance(body, dict) or 'result' not in body: continue if '/postsV2' in url: result = body['result'] if isinstance(result, dict) and 'edges' in result: if posts_data is None: posts_data = result else: more_edges = result.get('edges', []) if more_edges: posts_data['edges'].extend(more_edges) if posts_data is None: self.log("No postsV2 data found in API responses, cannot use /api/convert", "warning") return -1 # Signal caller to fall back to browser-based high-res edges = posts_data.get('edges', []) if not edges: self.log("postsV2 has no edges") return 0 # Extract shortcodes + dates from postsV2 post_entries = [] for edge in edges: node = edge.get('node', edge) shortcode = node.get('shortcode', '') if not shortcode: continue taken_at = node.get('taken_at_timestamp') or node.get('taken_at', 0) post_date = datetime.fromtimestamp(taken_at) if taken_at else None post_entries.append({ 'shortcode': shortcode, 'post_date': post_date, 'post_id': str(node.get('id', '')), }) self.log(f"Found {len(post_entries)} posts from postsV2 for high-res /api/convert download") # Filter: dedup (session + DB), date range, max_downloads filtered_entries = [] consecutive_old = 0 for entry in post_entries: shortcode = entry['shortcode'] post_date = entry['post_date'] # Session dedup if shortcode in self.downloaded_files: continue # Database dedup if self._is_already_downloaded(shortcode): self.downloaded_files.add(shortcode) continue # Date filtering if post_date and (self.date_from or self.date_to): if self.date_from and post_date < self.date_from: self.log(f"Skipping old post: {shortcode} ({post_date.strftime('%Y-%m-%d')})") self._record_checked(shortcode, self.profile_name, self.content_type, reason="old_post", post_date=post_date) consecutive_old += 1 if consecutive_old >= 5: self.log("Reached old posts, stopping") break continue if self.date_to and post_date > self.date_to: self.log(f"Skipping future post: {shortcode} ({post_date.strftime('%Y-%m-%d')})") continue consecutive_old = 0 filtered_entries.append(entry) # Apply max_downloads limit if self.max_downloads and len(filtered_entries) > self.max_downloads: filtered_entries = filtered_entries[:self.max_downloads] self.log(f"Limiting to {self.max_downloads} posts") if not filtered_entries: self.log("No new posts to download after filtering") return 0 self.log(f"Processing {len(filtered_entries)} posts via /api/convert for high-res download...") # Set initial progress self.activity_manager.update_status( f"Downloading {self.content_type} (high-res)", progress_current=0, progress_total=len(filtered_entries) ) # For each post: fetch via /api/convert, extract items, apply phrase matching all_items = [] for i, entry in enumerate(filtered_entries, 1): shortcode = entry['shortcode'] fallback_date = entry['post_date'] instagram_url = f"https://instagram.com/p/{shortcode}/" self.activity_manager.update_status( f"Fetching high-res post {i}/{len(filtered_entries)}", progress_current=i, progress_total=len(filtered_entries) ) self.log(f"[{i}/{len(filtered_entries)}] Fetching /api/convert for {shortcode}...") convert_data = self._fetch_highres_via_api_convert(page, instagram_url) if not convert_data: self.log(f"[{i}/{len(filtered_entries)}] No /api/convert data for {shortcode}, skipping", "warning") continue items = self._extract_highres_items_from_convert_response(convert_data, shortcode, fallback_date) if not items: self.log(f"[{i}/{len(filtered_entries)}] No downloadable items from /api/convert for {shortcode}", "warning") continue # Phrase matching using caption from /api/convert response (meta.title) if self.phrase_config and self.phrase_config.get('enabled'): caption = items[0].get('caption', '') if items else '' if caption: phrases = self.phrase_config.get('phrases', []) if phrases: case_sensitive = self.phrase_config.get('case_sensitive', False) match_all = self.phrase_config.get('match_all', False) check_text = caption if case_sensitive else caption.lower() check_phrases = phrases if case_sensitive else [p.lower() for p in phrases] matches = [p for p in check_phrases if p in check_text] if match_all: passed = len(matches) == len(check_phrases) else: passed = len(matches) > 0 if not passed: self.log(f"[{i}/{len(filtered_entries)}] Post {shortcode} doesn't match phrase criteria, skipping") self._record_checked(shortcode, self.profile_name, self.content_type, reason="phrase_checked", post_date=fallback_date) continue else: self.log(f"[{i}/{len(filtered_entries)}] Post matches phrases ({len(matches)}/{len(phrases)})") else: self.log(f"[{i}/{len(filtered_entries)}] No caption from /api/convert, downloading anyway", "debug") # Dedup individual carousel items new_for_post = 0 for item in items: mid = item['media_id'] norm = item.get('normalized_media_id', mid) if mid in self.downloaded_files or norm in self.downloaded_files: continue if self._is_already_downloaded(mid) or (norm != mid and self._is_already_downloaded(norm)): self.downloaded_files.add(mid) self.downloaded_files.add(norm) continue all_items.append(item) new_for_post += 1 # Record shortcode as processed so next run skips the /api/convert fetch if new_for_post == 0: # All items already downloaded — record shortcode to avoid re-fetching self.downloaded_files.add(shortcode) self._record_checked(shortcode, self.profile_name, self.content_type, reason="downloaded", post_date=fallback_date) # Smart delay between posts (not between carousel items) if i < len(filtered_entries): self._smart_delay() if not all_items: self.log("No new high-res items to download after processing") return 0 self.log(f"Downloading {len(all_items)} high-res items via parallel HTTP...") count = self._download_items_parallel(all_items) # Record all processed shortcodes so next run skips the /api/convert fetch for entry in filtered_entries: sc = entry['shortcode'] self.downloaded_files.add(sc) self._record_checked(sc, self.profile_name, self.content_type, reason="downloaded", post_date=entry.get('post_date')) return count def _download_from_api(self, api_responses): """Download content directly from intercepted API responses (no browser needed). Returns: Number of files downloaded, or -1 if API data not available for this content type. """ # Find the relevant API response(s) for our content type api_data = None for resp in api_responses: url = resp.get('url', '') body = resp.get('body', {}) if not isinstance(body, dict) or 'result' not in body: continue if self.content_type == 'stories' and '/stories' in url: api_data = body['result'] break elif self.content_type in ('posts', 'reels') and '/postsV2' in url: result = body['result'] if api_data is None: api_data = result elif isinstance(api_data, dict) and 'edges' in api_data and isinstance(result, dict): # Merge edges from additional paginated responses more_edges = result.get('edges', []) if more_edges: api_data['edges'].extend(more_edges) if api_data is None: return -1 # No API data for this content type # Extract download items based on content type items = [] if self.content_type == 'stories': if not isinstance(api_data, list): return -1 items = self._extract_stories_from_api(api_data) elif self.content_type in ('posts', 'reels'): if not isinstance(api_data, dict) or 'edges' not in api_data: return -1 items = self._extract_posts_from_api(api_data) if not items: self.log("No downloadable items found in API response") return 0 self.log(f"Found {len(items)} items from API response") # Filter out already-downloaded items new_items = [] for item in items: media_id = item['media_id'] normalized = item.get('normalized_media_id', media_id) if media_id in self.downloaded_files or normalized in self.downloaded_files: continue if self._is_already_downloaded(media_id) or (normalized and normalized != media_id and self._is_already_downloaded(normalized)): self.downloaded_files.add(media_id) if normalized: self.downloaded_files.add(normalized) continue new_items.append(item) if not new_items: self.log("All items already downloaded") return 0 # Apply date filtering filtered_items = [] consecutive_old = 0 for item in new_items: post_date = item.get('post_date') if post_date and (self.date_from or self.date_to): if self.date_from and post_date < self.date_from: self.log(f"Skipping old item: {post_date.strftime('%Y-%m-%d')}") self._record_checked(item['media_id'], self.profile_name, self.content_type, reason="old_post", post_date=post_date) # Track shortcode so other content types don't re-check the same post if item.get('shortcode'): self.downloaded_files.add(item['shortcode']) consecutive_old += 1 if self.content_type != 'stories' and consecutive_old >= 5: self.log("Reached old posts, stopping") break continue if self.date_to and post_date > self.date_to: self.log(f"Skipping future item: {post_date.strftime('%Y-%m-%d')}") continue consecutive_old = 0 self.log(f"Item within date range: {post_date.strftime('%Y-%m-%d')}") filtered_items.append(item) # Apply max_downloads limit if self.max_downloads and len(filtered_items) > self.max_downloads: filtered_items = filtered_items[:self.max_downloads] self.log(f"Limiting to {self.max_downloads} items") if not filtered_items: self.log("No items passed filtering") return 0 self.log(f"Downloading {len(filtered_items)} items via API (parallel HTTP)...") return self._download_items_parallel(filtered_items) def _download_items_parallel(self, filtered_items): """Download items in parallel via HTTP with post-processing. Items need: download_url, filename, media_id, normalized_media_id, post_date, ext Returns: number of successfully downloaded files. """ if not filtered_items: return 0 # Set initial progress self.activity_manager.update_status( f"Downloading {self.content_type}", progress_current=0, progress_total=len(filtered_items) ) # Download all items in parallel via HTTP self.output_dir.mkdir(parents=True, exist_ok=True) success_count = 0 results = [] def _download_single(item): """Download a single file via HTTP with retry on server errors. Thread-safe.""" last_error = None for attempt in range(3): try: resp = requests.get(item['download_url'], timeout=60, stream=True) resp.raise_for_status() filepath = self.output_dir / item['filename'] with open(filepath, 'wb') as f: for chunk in resp.iter_content(chunk_size=8192): f.write(chunk) return {**item, 'filepath': filepath, 'success': True} except requests.exceptions.HTTPError as e: last_error = e if resp.status_code >= 500 and attempt < 2: time.sleep(2 * (attempt + 1)) continue break except Exception as e: last_error = e break self.log(f"Download failed for {item['media_id']}: {last_error}", "warning") return {**item, 'success': False, 'error': str(last_error)} max_workers = min(4, len(filtered_items)) with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = {} for idx, item in enumerate(filtered_items): future = executor.submit(_download_single, item) futures[future] = item if idx < len(filtered_items) - 1: time.sleep(0.2) for future in as_completed(futures): result = future.result() if result.get('success'): results.append(result) self.activity_manager.update_status( f"Downloading {self.content_type}", progress_current=len(results), progress_total=len(filtered_items) ) # Post-process: timestamps, hash check, DB recording (sequential) for result in results: filepath = result['filepath'] media_id = result['media_id'] normalized = result.get('normalized_media_id', media_id) post_date = result.get('post_date') download_url = result.get('download_url', '') # Hash duplicate check file_hash = self.db.get_file_hash(str(filepath)) if self.db else None if file_hash: existing = self.db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): self.log(f"Duplicate detected: {filepath.name}", "warning") try: filepath.unlink() continue except Exception: pass # Update timestamps if post_date: self._update_all_timestamps(filepath, post_date) self.log(f"Saved: {filepath.name} (dated: {post_date.strftime('%Y-%m-%d %H:%M')})") else: self.log(f"Saved: {filepath.name}") # Record in database — include pk for stories so callers # can use the stable Instagram primary key as story ID meta = result.get('metadata') or {} if result.get('pk'): meta['pk'] = result['pk'] self._record_download( media_id=normalized or media_id, username=self.profile_name, content_type=self.content_type, filename=str(filepath), download_url=download_url, post_date=post_date, metadata=meta or None, deferred=self.defer_database ) self.downloaded_files.add(media_id) if normalized: self.downloaded_files.add(normalized) success_count += 1 return success_count def _extract_media_id_from_cdn_url(self, url): """Extract Instagram media ID from a CDN URL path. Instagram CDN URLs look like: https://scontent-xxx.cdninstagram.com/.../643551919_18095277650490921_7199803193185481374_n.jpg?... Returns the filename stem like '643551919_18095277650490921_7199803193185481374_n' """ if not url: return None try: # Parse the URL path, get the last segment before query params path = urllib.parse.urlparse(url).path filename = Path(path).stem # filename without extension # Validate it looks like an Instagram media filename (contains underscores and digits) if filename and '_' in filename and any(c.isdigit() for c in filename): return filename except Exception: pass return None def _build_pk_map_from_api(self, api_responses): """Build a CDN-filename-to-pk map from captured API responses. When the API-based download fails and we fall back to browser scraping, we still need the pk for each story so callers can use stable IDs. This extracts pk from the raw API data and maps it by CDN filename. """ for resp in api_responses: url = resp.get('url', '') body = resp.get('body', {}) if not isinstance(body, dict) or 'result' not in body: continue if '/stories' not in url: continue result = body['result'] if not isinstance(result, list): continue for story in result: pk = str(story.get('pk', '')) if not pk: continue # Map CDN filenames from all video/image versions to this pk for vv in story.get('video_versions', []): cdn_url = vv.get('url', '') fname = self._extract_media_id_from_cdn_url(cdn_url) if fname: self._cdn_to_pk_map[fname] = pk for cand in story.get('image_versions2', {}).get('candidates', []): cdn_url = cand.get('url', '') fname = self._extract_media_id_from_cdn_url(cdn_url) if fname: self._cdn_to_pk_map[fname] = pk if self._cdn_to_pk_map: self.log(f"Built pk map for {len(self._cdn_to_pk_map)} CDN filenames from API data", "debug") def _extract_stories_from_api(self, stories_data): """Extract download items from stories API response.""" items = [] profile = self.profile_name or "unknown" for story in stories_data: try: pk = str(story.get('pk', '')) taken_at = story.get('taken_at', 0) post_date = datetime.fromtimestamp(taken_at) if taken_at else None # Determine if video or image video_versions = story.get('video_versions', []) if video_versions: # Video — get highest resolution best = max(video_versions, key=lambda v: v.get('height', 0) * v.get('width', 0)) cdn_url = best.get('url', '') download_url = best.get('url_downloadable') or cdn_url ext = '.mp4' else: # Image — get highest resolution candidate candidates = story.get('image_versions2', {}).get('candidates', []) if not candidates: continue best = max(candidates, key=lambda c: c.get('height', 0) * c.get('width', 0)) cdn_url = best.get('url', '') download_url = best.get('url_downloadable') or cdn_url ext = '.jpg' if not download_url: continue # Extract media_id from the CDN URL (has Instagram filename) # url_downloadable is a FastDL proxy URL, cdn url has the real filename media_id = self._extract_media_id_from_cdn_url(cdn_url) if not media_id: # Fallback: try extracting from url_downloadable's filename param if 'filename=' in download_url: parsed = urllib.parse.urlparse(download_url) params = urllib.parse.parse_qs(parsed.query) fn = params.get('filename', [''])[0] if fn: media_id = Path(fn).stem if not media_id: media_id = pk normalized = extract_instagram_media_id(media_id) if media_id else pk date_str = post_date.strftime('%Y%m%d_%H%M%S') if post_date else datetime.now().strftime('%Y%m%d_%H%M%S') filename = f"{profile}_{date_str}_{media_id}{ext}" items.append({ 'media_id': media_id, 'normalized_media_id': normalized, 'download_url': download_url, 'filename': filename, 'post_date': post_date, 'ext': ext, 'pk': pk, }) except Exception as e: self.log(f"Error parsing story item: {e}", "debug") continue return items def _extract_posts_from_api(self, posts_data): """Extract download items from postsV2 API response.""" items = [] profile = self.profile_name or "unknown" edges = posts_data.get('edges', []) for edge in edges: try: node = edge.get('node', edge) # Some formats wrap in 'node' post_id = str(node.get('id', '')) shortcode = node.get('shortcode', '') is_video = node.get('is_video', False) taken_at = node.get('taken_at_timestamp') or node.get('taken_at', 0) post_date = datetime.fromtimestamp(taken_at) if taken_at else None # Filter by content type: reels are always videos # product_type "clips" = reels (if available in API data) if self.content_type == 'reels': product_type = node.get('product_type', '') if product_type: # If product_type is available, use it for precise filtering if product_type != 'clips': continue elif not is_video: # Fallback: at minimum, reels must be videos continue cdn_url = '' download_url = '' if is_video: download_url = node.get('video_url', '') cdn_url = download_url if not download_url: resources = node.get('display_resources', []) if resources: best = max(resources, key=lambda r: r.get('config_width', 0) * r.get('config_height', 0)) cdn_url = best.get('src', '') download_url = best.get('url_downloadable') or cdn_url ext = '.mp4' else: resources = node.get('display_resources', []) if resources: best = max(resources, key=lambda r: r.get('config_width', 0) * r.get('config_height', 0)) cdn_url = best.get('src', '') download_url = best.get('url_downloadable') or cdn_url else: cdn_url = node.get('display_url', '') download_url = cdn_url ext = '.jpg' if not download_url: continue # Extract media_id from CDN URL (has Instagram filename) media_id = self._extract_media_id_from_cdn_url(cdn_url) if not media_id: # Fallback: try url_downloadable's filename param if 'filename=' in download_url: parsed = urllib.parse.urlparse(download_url) params = urllib.parse.parse_qs(parsed.query) fn = params.get('filename', [''])[0] if fn: media_id = Path(fn).stem if not media_id: media_id = shortcode or post_id normalized = extract_instagram_media_id(media_id) if media_id else post_id date_str = post_date.strftime('%Y%m%d_%H%M%S') if post_date else datetime.now().strftime('%Y%m%d_%H%M%S') filename = f"{profile}_{date_str}_{media_id}{ext}" items.append({ 'media_id': media_id, 'normalized_media_id': normalized, 'download_url': download_url, 'filename': filename, 'post_date': post_date, 'ext': ext, 'shortcode': shortcode, 'post_id': post_id, }) except Exception as e: self.log(f"Error parsing post edge: {e}", "debug") continue return items def _download_content(self, page): """Download content from the page""" # Special handling for highlights if self.content_type == "highlights": return self._download_highlights(page) # Use high-res mode ONLY for posts (stories/reels already at best quality) if self.high_res and self.content_type == "posts": self.log("Using high-resolution download mode for posts", "info") return self._download_content_highres(page) success_count = 0 # Update activity status self.activity_manager.update_status(f"Checking {self.content_type}") # STEP 1: Scroll to load ALL content first self.log(f"Scrolling to load all {self.content_type} content...") self._scroll_to_load_content(page) # STEP 2: After scrolling, collect all items and their dates self.log("Collecting all items and dates after scrolling...") all_media_items = page.locator("li.profile-media-list__item").all() # Build a mapping of media items to dates item_dates = {} for item in all_media_items: time_elem = item.locator("p.media-content__meta-time").first if time_elem and time_elem.is_visible(): date_str = time_elem.get_attribute("title") if date_str: try: # Parse date - use m/d/Y format date_obj = datetime.strptime(date_str, "%m/%d/%Y, %I:%M:%S %p") # Map all download links in this item to this date item_links = item.locator("a[href*='.jpg'], a[href*='.mp4']").all() for link in item_links: href = link.get_attribute("href") if href: item_dates[href] = (date_str, date_obj) except Exception: pass # STEP 3: Get all download links after everything is loaded all_download_links = page.locator("a[href*='.jpg'], a[href*='.mp4']").all() if not all_download_links: self.log("No downloadable items found") return 0 # STEP 3.5: Filter out duplicates BEFORE counting download_links = [] skipped_duplicates = 0 for element in all_download_links: if not element.is_visible(): continue # Check for duplicates during collection href = element.get_attribute("href") or "" if "filename=" in href: parsed = urllib.parse.urlparse(href) params = urllib.parse.parse_qs(parsed.query) if 'filename' in params: url_filename = params['filename'][0] media_id = self._extract_media_id_from_filename(url_filename) normalized_media_id = extract_instagram_media_id(media_id) if media_id else None # Check in-memory cache first (both original and normalized) if media_id in self.downloaded_files or (normalized_media_id and normalized_media_id in self.downloaded_files): skipped_duplicates += 1 continue # Check database (both original and normalized) if self._is_already_downloaded(media_id) or (normalized_media_id and normalized_media_id != media_id and self._is_already_downloaded(normalized_media_id)): self.downloaded_files.add(media_id) # Add to cache if normalized_media_id: self.downloaded_files.add(normalized_media_id) skipped_duplicates += 1 continue # Not a duplicate, add to download list download_links.append(element) if skipped_duplicates > 0: self.log(f"Filtered out {skipped_duplicates} already-downloaded items") if not download_links: self.log("No new items to download (all are duplicates)") return 0 self.log(f"Found {len(download_links)} new items to download") # Limit downloads if specified limit = len(download_links) if self.max_downloads and self.max_downloads < limit: limit = self.max_downloads self.log(f"Limiting to {limit} items (max_downloads setting)") # Set initial progress so dashboard shows 0/N immediately self.activity_manager.update_status( f"Downloading {self.content_type}", progress_current=0, progress_total=limit ) # Dismiss any cookie consent overlay before clicking download links self._dismiss_consent_dialog(page) # STEP 4: Download all items in batch consecutive_old_posts = 0 # Track posts outside date range for i in range(limit): if i >= len(download_links): break # Update progress at start of each iteration (fires even on skips) self.activity_manager.update_status( f"Downloading {self.content_type}", progress_current=i + 1, progress_total=limit ) element = download_links[i] if not element.is_visible(): continue # Find the date for this specific item post_date = None try: # Get the href of this link to look up its date href = element.get_attribute("href") if href and href in item_dates: date_str, post_date = item_dates[href] self.log(f"Found date for item {i+1}: {date_str}") # Fallback: Try to find the parent li and get its date if not post_date: parent_li = element.locator("xpath=ancestor::li[@class='profile-media-list__item']").first if parent_li and parent_li.is_visible(): time_elem = parent_li.locator("p.media-content__meta-time").first if time_elem and time_elem.is_visible(): date_str = time_elem.get_attribute("title") if date_str: # Parse date - use m/d/Y format post_date = datetime.strptime(date_str, "%m/%d/%Y, %I:%M:%S %p") self.log(f"Found date via parent li: {date_str}") except Exception as e: self.log(f"Could not extract date: {e}") # Check date filtering for all content types when date range is specified if post_date and (self.date_from or self.date_to): # Extract media_id for tracking href = element.get_attribute("href") or "" media_id_for_tracking = None if "filename=" in href: parsed = urllib.parse.urlparse(href) params = urllib.parse.parse_qs(parsed.query) if 'filename' in params: url_filename = params['filename'][0] media_id_for_tracking = self._extract_media_id_from_filename(url_filename) # Apply date filtering if self.date_from and post_date < self.date_from: self.log(f"Skipping item - too old: {post_date.strftime('%Y-%m-%d')}") # Record as checked if we have media_id if media_id_for_tracking: self._record_checked(media_id_for_tracking, self.profile_name, self.content_type, reason="old_post", post_date=post_date) consecutive_old_posts += 1 # If we've seen 5 consecutive old posts, stop checking # (posts are usually in chronological order) # For highlights, don't stop early as they may have mixed dates if self.content_type != "highlights" and consecutive_old_posts >= 5: self.log("Reached old posts, stopping...") break continue if self.date_to and post_date > self.date_to: self.log(f"Skipping item - too new: {post_date.strftime('%Y-%m-%d')}") # Record as checked if we have media_id if media_id_for_tracking: self._record_checked(media_id_for_tracking, self.profile_name, self.content_type, reason="too_new", post_date=post_date) continue # Post is within range consecutive_old_posts = 0 # Reset counter self.log(f"Item within date range: {post_date.strftime('%Y-%m-%d')}") # Check for phrase matching if configured (only for posts, not reels or stories) if self.phrase_config and self.phrase_config.get('enabled'): if self.content_type == 'posts': # The caption is visible on the profile page itself # Find the parent li element that contains this download link parent_item = element.locator("xpath=ancestor::li[@class='profile-media-list__item']").first if parent_item and parent_item.is_visible(): # Get the caption from this specific post item caption_elem = parent_item.locator("p.media-content__caption").first if caption_elem and caption_elem.is_visible(): caption_text = caption_elem.text_content() or "" # Check if caption matches phrases phrases = self.phrase_config.get('phrases', []) if phrases: case_sensitive = self.phrase_config.get('case_sensitive', False) match_all = self.phrase_config.get('match_all', False) if not case_sensitive: caption_text = caption_text.lower() phrases = [p.lower() for p in phrases] matches = [] for phrase in phrases: if phrase in caption_text: matches.append(phrase) if match_all: result = len(matches) == len(phrases) else: result = len(matches) > 0 if not result: self.log(f"Post {i+1} caption doesn't match phrases, skipping") # Extract media_id for tracking href = element.get_attribute("href") or "" if "filename=" in href: parsed = urllib.parse.urlparse(href) params = urllib.parse.parse_qs(parsed.query) if 'filename' in params: url_filename = params['filename'][0] media_id_for_phrase = self._extract_media_id_from_filename(url_filename) # Record as checked so we don't check again self._record_checked(media_id_for_phrase, self.profile_name, self.content_type, reason="phrase_checked", post_date=post_date) continue else: self.log(f"Post {i+1} matches phrase criteria ({len(matches)}/{len(phrases)} phrases found)") else: # No caption found, skip phrase check for this item self.log(f"No caption found for post {i+1}, skipping phrase check", "debug") # Download the file try: href = element.get_attribute("href") or "" download_timeout = 30000 # 30 seconds for videos # Try browser download first, fall back to direct HTTP download filepath = None try: with page.expect_download(timeout=download_timeout) as download_info: element.click(force=True) download = download_info.value original_filename = download.suggested_filename media_id = self._extract_media_id_from_filename(original_filename) normalized_media_id = extract_instagram_media_id(media_id) if media_id else media_id ext = Path(original_filename).suffix profile = self.profile_name or "unknown" if post_date: date_str = post_date.strftime('%Y%m%d_%H%M%S') else: date_str = datetime.now().strftime('%Y%m%d_%H%M%S') new_filename = f"{profile}_{date_str}_{media_id}{ext}" filepath = self.output_dir / new_filename self.output_dir.mkdir(parents=True, exist_ok=True) download.save_as(filepath) except Exception as dl_err: if not href: raise dl_err self.log(f"Browser download failed ({dl_err}), trying direct HTTP download", "debug") # Direct HTTP download fallback using the href URL url_filename = "" if "filename=" in href: parsed = urllib.parse.urlparse(href) params = urllib.parse.parse_qs(parsed.query) url_filename = params.get('filename', [''])[0] if not url_filename: url_filename = Path(urllib.parse.urlparse(href).path).name media_id = self._extract_media_id_from_filename(url_filename) normalized_media_id = extract_instagram_media_id(media_id) if media_id else media_id ext = Path(url_filename).suffix if url_filename else '.jpg' profile = self.profile_name or "unknown" if post_date: date_str = post_date.strftime('%Y%m%d_%H%M%S') else: date_str = datetime.now().strftime('%Y%m%d_%H%M%S') new_filename = f"{profile}_{date_str}_{media_id}{ext}" filepath = self.output_dir / new_filename self.output_dir.mkdir(parents=True, exist_ok=True) resp = requests.get(href, timeout=60, stream=True) resp.raise_for_status() with open(filepath, 'wb') as f: for chunk in resp.iter_content(chunk_size=8192): f.write(chunk) # Check for duplicate hash before recording file_hash = self.db.get_file_hash(str(filepath)) if self.db else None if file_hash: existing = self.db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): # Duplicate file with same hash exists existing_path = Path(existing['file_path']) if existing_path.exists(): self.log(f"⚠ Duplicate file detected: {filepath.name} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") # Delete the duplicate and skip to next try: filepath.unlink() self.log(f"Deleted duplicate: {filepath.name}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filepath.name}: {e}", "warning") # Update all timestamps if we have the post date if post_date: self._update_all_timestamps(filepath, post_date) self.log(f"Saved: {filepath.name} (dated: {post_date.strftime('%Y-%m-%d %H:%M')})") else: self.log(f"Saved: {filepath.name}") # Record in database with normalized media_id for cross-module detection # Include pk in metadata if available from API capture (for stories) dl_metadata = None pk_map = getattr(self, '_cdn_to_pk_map', {}) if pk_map: pk = None # Try matching media_id directly (works if it's _n format) if media_id: pk = pk_map.get(media_id) or pk_map.get(normalized_media_id) # Try extracting CDN filename from download URL path if not pk and href: cdn_filename = self._extract_media_id_from_cdn_url(href) if cdn_filename: pk = pk_map.get(cdn_filename) # Fallback: check if href has a url= param with embedded CDN URL if not pk and href and 'url=' in href: try: href_params = urllib.parse.parse_qs(urllib.parse.urlparse(href).query) inner_url = href_params.get('url', [''])[0] if inner_url: cdn_filename = self._extract_media_id_from_cdn_url(inner_url) if cdn_filename: pk = pk_map.get(cdn_filename) except Exception: pass if pk: dl_metadata = {'pk': pk} self.log(f"Mapped browser download {media_id} -> pk {pk}", "debug") elif pk_map: self.log(f"Could not map browser download {media_id} to pk (map has {len(pk_map)} entries)", "warning") self._record_download( media_id=normalized_media_id, username=self.profile_name, content_type=self.content_type, filename=str(filepath), download_url=href if 'href' in locals() else None, post_date=post_date, metadata=dl_metadata, deferred=self.defer_database ) self.downloaded_files.add(media_id) self.downloaded_files.add(normalized_media_id) success_count += 1 # Add smart delay between downloads if i < len(download_links) - 1: # Don't delay after last item self._smart_delay() except Exception as e: self.log(f"Error downloading item {i+1}: {e}") continue return success_count def _download_highlights(self, page): """Download highlights - each highlight category is clicked and downloaded""" total_downloaded = 0 # Find all highlight categories highlight_buttons = page.locator("li.highlight button.highlight__button").all() if not highlight_buttons: self.log("No highlight categories found") return 0 self.log(f"Found {len(highlight_buttons)} highlight categories") # Get all category names first categories = [] for button in highlight_buttons: title_elem = button.locator("p.highlight__title").first if title_elem and title_elem.is_visible(): name = title_elem.text_content().strip() categories.append(name) # Process each highlight category for i, highlight_name in enumerate(categories): try: self.log(f"\nProcessing highlight {i+1}/{len(categories)}: {highlight_name}") self.log("="*50) # Create folder for this highlight only when needed highlight_folder = self.output_dir / highlight_name # Re-find and click the highlight button (page may have changed) # Use filter instead of CSS selector to handle special characters all_buttons = page.locator("button.highlight__button").all() button = None for btn in all_buttons: title = btn.locator("p.highlight__title").first if title and title.is_visible(): if title.text_content().strip() == highlight_name: button = btn break if not button or not button.is_visible(): self.log(f"Could not find button for {highlight_name}") continue self.log(f"Clicking on {highlight_name}...") button.click(force=True) page.wait_for_timeout(5000) # Wait for content to load (increased for reliability) # FIRST: Scroll to load ALL content self.log("Scrolling to load all content...") self._scroll_to_load_content(page) # SECOND: Collect all items and their dates after scrolling is complete self.log("Collecting all items after scrolling...") all_media_items = page.locator("li.profile-media-list__item").all() item_dates = {} for item in all_media_items: time_elem = item.locator("p.media-content__meta-time").first if time_elem and time_elem.is_visible(): date_str = time_elem.get_attribute("title") if date_str: try: date_obj = datetime.strptime(date_str, "%m/%d/%Y, %I:%M:%S %p") # Map all download links in this item to this date item_links = item.locator("a[href*='.jpg'], a[href*='.mp4']").all() for link in item_links: href = link.get_attribute("href") if href: item_dates[href] = (date_str, date_obj) except Exception: pass # THIRD: Get all download links after everything is loaded download_links = page.locator("a[href*='.jpg'], a[href*='.mp4']").all() if not download_links: self.log(f"No items found in highlight: {highlight_name}") # Go back to highlights list highlights_tab = page.locator("button.tabs-component__button:has-text('highlights')").first if highlights_tab and highlights_tab.is_visible(): highlights_tab.click(force=True) page.wait_for_timeout(2000) continue self.log(f"Found {len(download_links)} items in {highlight_name}") self._dismiss_consent_dialog(page) # Download each item in the highlight for j, element in enumerate(download_links): if not element.is_visible(): continue # Check for duplicates before downloading href = element.get_attribute("href") or "" media_id = None if "filename=" in href: parsed = urllib.parse.urlparse(href) params = urllib.parse.parse_qs(parsed.query) if 'filename' in params: url_filename = params['filename'][0] media_id = self._extract_media_id_from_filename(url_filename) normalized_media_id = extract_instagram_media_id(media_id) if media_id else None # Check duplicates (both original and normalized) if media_id in self.downloaded_files or (normalized_media_id and normalized_media_id in self.downloaded_files): self.log(f"Skipping duplicate (session): {url_filename}") continue # Check database (both original and normalized) if self._is_already_downloaded(media_id) or (normalized_media_id and normalized_media_id != media_id and self._is_already_downloaded(normalized_media_id)): self.log(f"Skipping duplicate (database): {url_filename}", "info") self.downloaded_files.add(media_id) if normalized_media_id: self.downloaded_files.add(normalized_media_id) continue try: # Extract info for filename if not media_id: # Will be set from download filename below pass if not normalized_media_id: normalized_media_id = extract_instagram_media_id(media_id) if media_id else media_id profile = self.profile_name or "unknown" # Try to get the date for this item post_date = None dl_href = element.get_attribute("href") or "" if dl_href and dl_href in item_dates: date_str_found, post_date = item_dates[dl_href] date_str = post_date.strftime('%Y%m%d_%H%M%S') self.log(f"Found date for highlight item: {date_str_found}") else: date_str = datetime.now().strftime('%Y%m%d_%H%M%S') highlight_folder.mkdir(parents=True, exist_ok=True) # Try browser download, fall back to direct HTTP try: with page.expect_download(timeout=30000) as download_info: element.click(force=True) download = download_info.value original_filename = download.suggested_filename if not media_id: media_id = self._extract_media_id_from_filename(original_filename) normalized_media_id = extract_instagram_media_id(media_id) if media_id else media_id ext = Path(original_filename).suffix new_filename = f"{profile}_{date_str}_{media_id}{ext}" filepath = highlight_folder / new_filename download.save_as(filepath) except Exception: if not dl_href: raise self.log(f"Browser download failed, trying direct HTTP download", "debug") if not media_id: url_fn = "" if "filename=" in dl_href: parsed_url = urllib.parse.urlparse(dl_href) url_params = urllib.parse.parse_qs(parsed_url.query) url_fn = url_params.get('filename', [''])[0] if not url_fn: url_fn = Path(urllib.parse.urlparse(dl_href).path).name media_id = self._extract_media_id_from_filename(url_fn) normalized_media_id = extract_instagram_media_id(media_id) if media_id else media_id ext = Path(url_fn).suffix if url_fn else '.jpg' else: ext = '.mp4' if '.mp4' in dl_href else '.jpg' new_filename = f"{profile}_{date_str}_{media_id}{ext}" filepath = highlight_folder / new_filename resp = requests.get(dl_href, timeout=60, stream=True) resp.raise_for_status() with open(filepath, 'wb') as f: for chunk in resp.iter_content(chunk_size=8192): f.write(chunk) # Check for duplicate hash before recording file_hash = self.db.get_file_hash(str(filepath)) if self.db else None if file_hash: existing = self.db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): # Duplicate file with same hash exists existing_path = Path(existing['file_path']) if existing_path.exists(): self.log(f"⚠ Duplicate file detected: {filepath.name} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") # Delete the duplicate and skip to next try: filepath.unlink() self.log(f"Deleted duplicate: {filepath.name}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filepath.name}: {e}", "warning") # Update all timestamps if we have the post date if post_date: self._update_all_timestamps(filepath, post_date) self.log(f"Saved: {highlight_name}/{new_filename} (dated: {post_date.strftime('%Y-%m-%d %H:%M')})") else: self.log(f"Saved: {highlight_name}/{new_filename}") # Record in database with normalized media_id for cross-module detection self._record_download( media_id=normalized_media_id or media_id, username=self.profile_name, content_type="highlights", filename=str(filepath), download_url=href if href else None, post_date=post_date, metadata={"highlight_name": highlight_name}, deferred=self.defer_database ) # Track downloaded file (both original and normalized) self.downloaded_files.add(media_id) if normalized_media_id: self.downloaded_files.add(normalized_media_id) total_downloaded += 1 # Use smart delay instead of fixed delay self._smart_delay() except Exception as e: self.log(f"Error downloading item {j+1} from {highlight_name}: {e}") continue # Go back to highlights list for next category self.log(f"Finished {highlight_name}, returning to highlights list...") # Try multiple methods to return to highlights list returned = False # Method 1: Click highlights tab highlights_tab = page.locator("button.tabs-component__button:has-text('highlights')").first if highlights_tab and highlights_tab.is_visible(): self.log("Clicking highlights tab to return to list") highlights_tab.click(force=True) page.wait_for_timeout(3000) # Check if it worked highlight_buttons_check = page.locator("li.highlight button.highlight__button").all() if highlight_buttons_check: self.log(f"Successfully returned via tab ({len(highlight_buttons_check)} categories)") returned = True # Method 2: If tab didn't work, try clicking a different tab then back if not returned: self.log("Tab click didn't work, trying tab switch...") posts_tab = page.locator("button.tabs-component__button:has-text('posts')").first if posts_tab and posts_tab.is_visible(): posts_tab.click(force=True) page.wait_for_timeout(2000) highlights_tab = page.locator("button.tabs-component__button:has-text('highlights')").first if highlights_tab and highlights_tab.is_visible(): highlights_tab.click(force=True) page.wait_for_timeout(3000) highlight_buttons_check = page.locator("li.highlight button.highlight__button").all() if highlight_buttons_check: self.log(f"Successfully returned via tab switch ({len(highlight_buttons_check)} categories)") returned = True if not returned: self.log("ERROR: Could not return to highlights list, stopping") break except Exception as e: self.log(f"Error processing highlight category {i+1}: {e}") continue return total_downloaded def _scroll_to_load_api_posts(self, page, api_responses): """Scroll slowly to trigger paginated /postsV2 API calls. FastDL lazy-loads posts as the user scrolls. The API response listener captures each /postsV2 response automatically — we just need to scroll to trigger the pagination requests. Stops when no new API responses arrive after several scroll attempts, or when posts are older than the configured date_from. """ self.log("Scrolling to load all posts within date range...") initial_count = len(api_responses) no_new_responses = 0 scroll_set = 0 while no_new_responses < 5: old_count = len(api_responses) # Slow, gradual scrolling — 200px at a time, 500ms between for _ in range(10): page.evaluate("window.scrollBy(0, 200)") page.wait_for_timeout(500) # Wait for API response to arrive page.wait_for_timeout(3000) new_count = len(api_responses) if new_count > old_count: self.log(f"Scroll {scroll_set + 1}: captured {new_count - old_count} new API response(s) (total: {new_count})") no_new_responses = 0 scroll_set += 1 # Check if the latest postsV2 response has posts older than date_from if self.date_from: for resp in reversed(api_responses): if '/postsV2' not in resp.get('url', ''): continue body = resp.get('body', {}) if not isinstance(body, dict) or 'result' not in body: continue result = body['result'] if not isinstance(result, dict) or 'edges' not in result: continue edges = result['edges'] if not edges: continue last_edge = edges[-1] node = last_edge.get('node', last_edge) taken_at = node.get('taken_at_timestamp') or node.get('taken_at', 0) if taken_at: post_date = datetime.fromtimestamp(taken_at) if post_date < self.date_from: self.log(f"Reached posts older than date range ({post_date.strftime('%Y-%m-%d')}), stopping scroll") total_new = len(api_responses) - initial_count self.log(f"Scrolling complete: captured {total_new} additional API response(s)") return break # Only check the latest postsV2 response else: no_new_responses += 1 scroll_set += 1 total_new = len(api_responses) - initial_count self.log(f"Scrolling complete: captured {total_new} additional API response(s)") def _scroll_to_load_content(self, page): """Scroll to load all lazy-loaded content""" self.log("Scrolling to load content...") # Count downloadable items initial_count = len(page.locator("a[href*='.jpg'], a[href*='.mp4']").all()) no_change_count = 0 consecutive_old_items = 0 # Scroll slowly like you requested - human-like scrolling # Highlights may have many items (80+), so increase scrolls max_scrolls = 50 if self.content_type == "highlights" else 15 for scroll_set in range(max_scrolls): old_height = page.evaluate("document.body.scrollHeight") old_count = len(page.locator("a[href*='.jpg'], a[href*='.mp4']").all()) # Slow, gradual scrolling - 200px at a time for small_scroll in range(10): page.evaluate("window.scrollBy(0, 200)") page.wait_for_timeout(500) # 0.5 second between small scrolls # Wait for content to load after scrolling page.wait_for_timeout(3000) # 3 seconds for new content # Check for new content new_height = page.evaluate("document.body.scrollHeight") new_count = len(page.locator("a[href*='.jpg'], a[href*='.mp4']").all()) if new_count > old_count: self.log(f"Loaded more items: {old_count} → {new_count}") no_change_count = 0 # Check if we should stop based on dates (for posts/reels with date filtering) if self.content_type in ["posts", "reels"] and self.date_from: # Check the dates of the last few items all_items = page.locator("li.profile-media-list__item").all() if len(all_items) >= 10: # Check last 10 items for dates old_dates_found = 0 for item in all_items[-10:]: time_elem = item.locator("p.media-content__meta-time").first if time_elem and time_elem.is_visible(): date_str = time_elem.get_attribute("title") if date_str: try: date_obj = datetime.strptime(date_str, "%m/%d/%Y, %I:%M:%S %p") if date_obj < self.date_from: old_dates_found += 1 except Exception: pass # If ALL of the last items are too old, stop scrolling # This ensures we don't miss content at the boundary if old_dates_found >= 10: self.log(f"All {old_dates_found} items in last batch are too old, stopping scroll") break else: no_change_count += 1 # If nothing changed for 5 scrolls, stop if no_change_count >= 5: self.log("No more content loading, stopping scroll") break # Example usage function def download_instagram_content(username, content_type="all", output_dir="downloads", use_database=True, db_path="fastdl_downloads.db", **kwargs): """ Simple function to download Instagram content Args: username: Instagram username content_type: 'posts', 'stories', 'reels', 'highlights', or 'all' output_dir: Where to save files use_database: Use SQLite database to track downloads (set False to re-download) db_path: Path to SQLite database file **kwargs: Additional options (max_downloads, days_back, phrase_config, etc.) Returns: Number of downloaded items """ downloader = FastDLDownloader(headless=True, use_database=use_database, db_path=db_path) return downloader.download(username, content_type, output_dir, **kwargs) if __name__ == "__main__": # Example: Download stories for a user count = download_instagram_content( username="evalongoria", content_type="stories", output_dir="test_downloads" ) print(f"\nTotal downloaded: {count} items")