#!/usr/bin/env python3 """ ImgInn downloader module with FastDL-compatible file naming Format: {profile}_{YYYYMMDD_HHMMSS}_{media_id}{ext} """ import os import json import time import random import re import subprocess import platform import requests from pathlib import Path from datetime import datetime, timedelta from modules.base_module import LoggingMixin from modules.cloudflare_handler import ( CloudflareHandler, SiteStatus, get_flaresolverr_user_agent, get_playwright_context_options, get_playwright_stealth_scripts ) from modules.instagram_utils import ( extract_instagram_media_id, scan_existing_files_for_media_ids, record_instagram_download, is_instagram_downloaded ) from typing import Dict, Optional from playwright.sync_api import sync_playwright class ImgInnDownloader(LoggingMixin): """ImgInn downloader with FastDL-compatible naming""" def __init__(self, headless: bool = True, cookie_file: str = "/opt/media-downloader/cookies/imginn_cookies.json", show_progress: bool = True, use_database: bool = True, log_callback=None, unified_db=None, ): """Initialize downloader compatible with media-downloader system""" # Initialize logging via mixin self._init_logger('Instagram', log_callback, default_module='Download') self.headless = headless self.downloaded_files = set() # Track downloaded media IDs self.show_progress = show_progress self.use_database = use_database self.download_count = 0 self.unified_db = unified_db # Store for scraper config access self.scraper_id = 'imginn' # Scraper ID in database self.pending_downloads = [] # Track downloads for deferred database recording # Rate limiting - track last scrape time to avoid hitting Cloudflare self._last_scrape_time = None self._min_scrape_interval = 15 # Minimum seconds between scrape types # Track transient page load failures per session self._page_load_failures = 0 self._page_load_failure_threshold = 5 # Escalate to error after this many # Browser reuse across profiles self.playwright = None self.browser = None 
self.context = None self.page = None # Use unified database directly (no adapter needed) if unified_db and use_database: self.unified_db = unified_db else: self.unified_db = None self.use_database = False # Initialize activity status manager for real-time updates from modules.activity_status import get_activity_manager self.activity_manager = get_activity_manager(unified_db) # Load scraper configuration from database if available self.proxy_url = None self.cookie_file = None # Default to None (use database) if unified_db: scraper_config = unified_db.get_scraper(self.scraper_id) if scraper_config: # Get proxy configuration if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'): self.proxy_url = scraper_config['proxy_url'] self.log(f"Using proxy: {self.proxy_url}", "info") # Fall back to cookie file if no database if not unified_db: self.cookie_file = Path(cookie_file) self.cookie_file.parent.mkdir(parents=True, exist_ok=True) # User-Agent to match FlareSolverr (dynamically fetched for consistency) self.user_agent = get_flaresolverr_user_agent() # Initialize universal Cloudflare handler # Pass proxy_url if configured, and cookie_file=None for database storage self.cf_handler = CloudflareHandler( module_name="ImgInn", cookie_file=str(self.cookie_file) if self.cookie_file else None, user_agent=self.user_agent, logger=self.logger, aggressive_expiry=True, # Refresh cookies expiring within 7 days proxy_url=self.proxy_url # Pass proxy to FlareSolverr ) # Keep for backwards compatibility self.flaresolverr_url = self.cf_handler.flaresolverr_url self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled # Load cookies from database if available self._load_cookies_from_db() def _load_cookies_from_db(self): """Load cookies from database if available""" if not self.unified_db: return try: cookies = self.unified_db.get_scraper_cookies(self.scraper_id) if cookies: # Load into CloudflareHandler self.cf_handler._cookies = cookies self.log(f"Loaded 
{len(cookies)} cookies from database", "debug") except Exception as e: self.log(f"Error loading cookies from database: {e}", "warning") def _save_cookies_to_db(self, cookies: list, user_agent: str = None): """Save cookies to database Args: cookies: List of cookie dictionaries user_agent: User agent to associate with cookies (important for cf_clearance). If not provided, uses self.user_agent as fallback. """ if not self.unified_db: return try: # Use provided user_agent or fall back to self.user_agent ua = user_agent or self.user_agent self.unified_db.save_scraper_cookies( self.scraper_id, cookies, user_agent=ua, merge=True ) self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug") except Exception as e: self.log(f"Error saving cookies to database: {e}", "warning") def _cookies_expired(self): """Check if cookies are expired - delegates to CloudflareHandler""" return self.cf_handler.cookies_expired() def _get_cookies_for_requests(self): """Get cookies in format for requests library - delegates to CloudflareHandler""" return self.cf_handler.get_cookies_dict() def _get_cookies_via_flaresolverr(self, url="https://imginn.com/", max_retries=2): """Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler Args: url: URL to fetch max_retries: Maximum number of retry attempts (default: 2) Returns: True if cookies obtained successfully, False otherwise """ success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries) # Save cookies to database if successful if success and self.unified_db: cookies_list = self.cf_handler.get_cookies_list() if cookies_list: # CRITICAL: Get the user_agent from FlareSolverr solution, not self.user_agent # cf_clearance cookies are fingerprinted to the browser that solved the challenge flaresolverr_ua = self.cf_handler.get_user_agent() self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua) return success def _enforce_rate_limit(self, scrape_type: str = "scrape"): """Enforce rate limiting 
between scrape operations to avoid Cloudflare blocks. Args: scrape_type: Type of scrape (posts, stories, tagged) for logging """ import random if self._last_scrape_time is not None: elapsed = time.time() - self._last_scrape_time if elapsed < self._min_scrape_interval: # Add random jitter (5-15 seconds) to the delay jitter = random.uniform(5, 15) wait_time = self._min_scrape_interval - elapsed + jitter self.log(f"Rate limiting: waiting {wait_time:.1f}s before {scrape_type} (Cloudflare avoidance)", "info") time.sleep(wait_time) self._last_scrape_time = time.time() def _has_valid_cookies(self): """Check if we have valid cookies (either in file or database)""" if self.unified_db: cookies = self.unified_db.get_scraper_cookies(self.scraper_id) return cookies and len(cookies) > 0 elif self.cookie_file: return self.cookie_file.exists() return False def _start_browser(self): """Start browser if not already running (reusable across profiles)""" # Try to get fresh cookies via FlareSolverr if we don't have them or they're old # Do this BEFORE the browser reuse check so cookies are always checked if not self._has_valid_cookies() or self._cookies_expired(): self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info") if self._get_cookies_via_flaresolverr(): self.log("Successfully got fresh cookies from FlareSolverr", "info") else: self.log("FlareSolverr unavailable, will try with Playwright", "warning") if self.browser is not None: self.log("Browser already running, reusing...", "debug") return import os # Use environment variable if set, otherwise use standard location if 'PLAYWRIGHT_BROWSERS_PATH' not in os.environ: os.environ['PLAYWRIGHT_BROWSERS_PATH'] = '/root/.cache/ms-playwright' os.environ['DISPLAY'] = ':100' # Use Xvfb virtual display self.log("Starting browser (Chromium)...", "info") self.playwright = sync_playwright().start() self.browser = self.playwright.chromium.launch( headless=self.headless, args=[ 
'--disable-blink-features=AutomationControlled', '--disable-dev-shm-usage', '--no-sandbox', '--disable-setuid-sandbox', '--disable-infobars', '--disable-background-timer-throttling', '--disable-backgrounding-occluded-windows', '--disable-renderer-backgrounding' ] ) # CRITICAL: Browser fingerprint must match FlareSolverr for cookies to work # Get dynamic fingerprint settings from FlareSolverr context_options = get_playwright_context_options() # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent # Cloudflare cf_clearance cookies are fingerprinted to the browser that solved the challenge try: stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id) if stored_user_agent: self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug") context_options['user_agent'] = stored_user_agent else: self.log(f"Using fingerprint: UA={context_options['user_agent'][:50]}...", "debug") except Exception as e: self.log(f"Error getting stored user_agent, using default: {e}", "debug") self.context = self.browser.new_context(**context_options) # Load cookies self.load_cookies(self.context) self.page = self.context.new_page() # Add comprehensive anti-detection scripts (dynamically from cloudflare_handler) self.page.add_init_script(get_playwright_stealth_scripts()) self.log("Browser started and ready", "info") def _stop_browser(self): """Stop the browser safely with proper error handling""" # Close context first if self.context: try: self.context.close() self.log("Browser context closed", "debug") except Exception as e: self.log(f"Error closing browser context: {e}", "warning") finally: self.context = None # Close browser if self.browser: try: self.browser.close() self.log("Browser closed", "debug") except Exception as e: self.log(f"Error closing browser: {e}", "warning") finally: self.browser = None # Stop playwright if self.playwright: try: self.playwright.stop() except Exception as e: self.log(f"Error stopping playwright: 
{e}", "warning") finally: self.playwright = None self.page = None def __del__(self): """Cleanup browser when instance is destroyed""" self._stop_browser() def __enter__(self): """Context manager entry - allows using 'with' statement""" return self def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit - ensures browser cleanup""" self._stop_browser() return False # Don't suppress exceptions def get_profile_info(self, username: str) -> Optional[Dict]: """Extract profile info (avatar URL, bio, display name) from imginn profile page. Returns dict with keys: avatar_url, bio, display_name, or None on failure. """ import time as _time import random as _random self._enforce_rate_limit("posts") self._start_browser() page = self.page if not page: return None try: url = f"https://imginn.com/{username.lower()}/?ref=index" self.log(f"Fetching profile info for @{username} from imginn", "info") page.goto(url, wait_until='domcontentloaded') wait_time = 5 + _random.uniform(0, 2) _time.sleep(wait_time) if not self.wait_for_cloudflare(page): self.log("Page didn't load for profile info extraction", "warning") return None self.save_cookies(self.context) _time.sleep(2) # Use JavaScript to extract profile info with multiple selector strategies profile_info = page.evaluate("""() => { const result = {}; // --- Avatar --- // Strategy 1: img inside a profile/user info section const avatarSelectors = [ '.profile-avatar img', '.user-avatar img', '.avatar img', '.profile-info img', '.info img:first-of-type', 'header img', '.user img', ]; for (const sel of avatarSelectors) { const el = document.querySelector(sel); if (el && el.src && !el.src.includes('lazy') && !el.src.includes('data:')) { result.avatar_url = el.src; break; } } // Strategy 2: find small/round img with scontent or profile in src if (!result.avatar_url) { const imgs = document.querySelectorAll('img'); for (const img of imgs) { const src = img.src || ''; if ((src.includes('scontent') || src.includes('profile') || 
src.includes('avatar') || src.includes('imginn.com')) && !src.includes('lazy') && !src.includes('data:')) { const rect = img.getBoundingClientRect(); if (rect.width > 20 && rect.width < 250) { result.avatar_url = src; break; } } } } // Clean avatar URL: strip query params (imginn CDN works without them // and the full URL often has malformed double-? from Instagram CDN paths) if (result.avatar_url && result.avatar_url.includes('?')) { result.avatar_url = result.avatar_url.split('?')[0]; } // --- Bio --- const bioSelectors = [ '.biography', '.bio', '.user-bio', '.profile-bio', '.profile-info .description', '.info .bio', ]; for (const sel of bioSelectors) { const el = document.querySelector(sel); if (el && el.textContent.trim().length > 2) { result.bio = el.textContent.trim(); break; } } // --- Display Name --- const nameSelectors = [ '.fullname', '.display-name', '.profile-name', '.name', '.user-info h1', 'h1', ]; for (const sel of nameSelectors) { const el = document.querySelector(sel); if (el && el.textContent.trim().length > 1 && el.textContent.trim().length < 100) { result.display_name = el.textContent.trim(); break; } } return result; }""") # Save debug screenshot for future selector tuning try: screenshot_path = Path(f"/tmp/imginn_profile_{username}.png") page.screenshot(path=str(screenshot_path)) self.log(f"Profile screenshot saved to {screenshot_path}", "debug") except Exception: pass if profile_info and any(profile_info.values()): self.log(f"Extracted profile info: avatar={'yes' if profile_info.get('avatar_url') else 'no'}, " f"bio={'yes' if profile_info.get('bio') else 'no'}, " f"name={profile_info.get('display_name', 'no')}", "info") return profile_info else: # Save page HTML for debugging try: html_path = Path(f"/tmp/imginn_profile_{username}.html") html_path.write_text(page.content()[:50000]) self.log(f"No profile info found - HTML saved to {html_path}", "warning") except Exception: pass return None except Exception as e: self.log(f"Error getting 
profile info for @{username}: {e}", "error") return None def _extract_media_id_from_url(self, url: str) -> str: """Extract Instagram media ID from URL""" # URL format: https://imginn.com/p/MEDIA_ID/ # or just /p/MEDIA_ID/ match = re.search(r'/p/([^/]+)/?', url) if match: return match.group(1) return None def _update_file_timestamps(self, filepath: Path, post_date: datetime): """Update all timestamps for a file to match the post date""" try: # Convert datetime to timestamp timestamp = post_date.timestamp() # 1. Update file system timestamps (access time and modification time) os.utime(filepath, (timestamp, timestamp)) self.log(f"Updated file timestamps to {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug") # 2. Try to update creation time (platform-specific) if platform.system() == 'Darwin': # macOS # Use SetFile command on macOS date_str = post_date.strftime('%m/%d/%Y %H:%M:%S') try: subprocess.run( ['SetFile', '-d', date_str, str(filepath)], capture_output=True, text=True ) except (subprocess.SubprocessError, FileNotFoundError, OSError): pass # SetFile not available on this system elif platform.system() == 'Windows': # On Windows, use PowerShell with proper escaping to prevent injection filepath_escaped = str(filepath).replace("'", "''") date_escaped = post_date.isoformat().replace("'", "''") ps_command = f"(Get-Item -LiteralPath '{filepath_escaped}').CreationTime = Get-Date '{date_escaped}'" try: subprocess.run( ['powershell', '-Command', ps_command], capture_output=True, text=True ) except (subprocess.SubprocessError, FileNotFoundError, OSError): pass # PowerShell command failed # Linux doesn't support changing creation time # 3. 
Update EXIF data for images if str(filepath).lower().endswith(('.jpg', '.jpeg', '.png', '.heic')): self._update_exif_timestamps(filepath, post_date) except Exception as e: self.log(f"Error updating timestamps: {e}", "warning") def _update_exif_timestamps(self, filepath: Path, post_date: datetime): """Update EXIF timestamps in image files""" try: # Check if exiftool is available result = subprocess.run(['which', 'exiftool'], capture_output=True, text=True) if result.returncode == 0: # Format date for EXIF exif_date = post_date.strftime('%Y:%m:%d %H:%M:%S') # Update all date fields in EXIF including MetadataDate for Immich cmd = [ 'exiftool', '-overwrite_original', '-quiet', f'-AllDates={exif_date}', f'-MetadataDate={exif_date}', '-HistoryWhen=', f'-FileModifyDate={exif_date}', str(filepath) ] subprocess.run(cmd, capture_output=True, text=True) self.log(f"Updated EXIF timestamps", "debug") except Exception: # Silently skip if exiftool not available pass def _extract_post_date(self, page) -> datetime: """Try to extract post date from page""" try: # Wait a moment for dynamic content to load page.wait_for_timeout(500) # FIRST: Look for data-created attribute (Unix timestamp) elements_with_data_created = page.locator('[data-created]').all() self.log(f"Found {len(elements_with_data_created)} elements with data-created attribute", "debug") for elem in elements_with_data_created: timestamp_str = elem.get_attribute('data-created') if timestamp_str: try: # Convert Unix timestamp to datetime timestamp = int(timestamp_str) post_date = datetime.fromtimestamp(timestamp) self.log(f"Found data-created timestamp: {timestamp} -> {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug") return post_date except Exception as e: self.log(f"Failed to parse timestamp {timestamp_str}: {e}", "debug") pass # If no data-created found, wait a bit more and try again if len(elements_with_data_created) == 0: self.log("No data-created elements found, waiting for dynamic content...", "debug") # Try to 
wait for the element to appear try: page.wait_for_selector('[data-created]', timeout=2000) elements_with_data_created = page.locator('[data-created]').all() self.log(f"After waiting for selector: found {len(elements_with_data_created)} elements with data-created", "debug") except Exception: # Still try one more time with a longer wait page.wait_for_timeout(1500) elements_with_data_created = page.locator('[data-created]').all() self.log(f"After timeout wait: found {len(elements_with_data_created)} elements with data-created", "debug") for elem in elements_with_data_created: timestamp_str = elem.get_attribute('data-created') if timestamp_str: try: timestamp = int(timestamp_str) post_date = datetime.fromtimestamp(timestamp) self.log(f"Found data-created timestamp after wait: {timestamp} -> {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug") return post_date except Exception as e: self.log(f"Failed to parse timestamp {timestamp_str}: {e}", "debug") # Fallback: Look for other date elements date_selectors = [ 'time[datetime]', 'time', '.date', '[datetime]', 'span.date', 'div.date' ] for selector in date_selectors: elem = page.locator(selector).first if elem.count() > 0: # Try datetime attribute first datetime_str = elem.get_attribute('datetime') if datetime_str: # Parse ISO format for fmt in ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d']: try: return datetime.strptime(datetime_str.split('.')[0].replace('Z', ''), fmt) except Exception: continue # Try text content text = elem.text_content() if text: # Parse various date formats # Could be "2 days ago", "September 6, 2025", etc. 
if "ago" in text.lower(): # Handle relative dates if "hour" in text: hours = int(re.search(r'(\d+)', text).group(1)) return datetime.now() - timedelta(hours=hours) elif "day" in text: days = int(re.search(r'(\d+)', text).group(1)) return datetime.now() - timedelta(days=days) elif "week" in text: weeks = int(re.search(r'(\d+)', text).group(1)) return datetime.now() - timedelta(weeks=weeks) else: # Try parsing absolute date for fmt in ['%B %d, %Y', '%b %d, %Y', '%Y-%m-%d']: try: return datetime.strptime(text, fmt) except Exception: continue except Exception as e: self.log(f"Error extracting date: {e}", "debug") return None def _scan_existing_files(self, output_dir: Path, profile_name: str): """Scan directory for existing files and extract media IDs""" self.downloaded_files = scan_existing_files_for_media_ids( output_dir, profile_name, min_file_size=20000, recursive=False ) if self.downloaded_files: self.log(f"Found {len(self.downloaded_files)} existing media IDs for {profile_name}", "debug") def _is_already_downloaded(self, media_id: str) -> bool: """Check if media_id has already been downloaded (uses centralized function)""" if not self.use_database or not self.unified_db: return False # Use centralized function for consistent cross-module detection return is_instagram_downloaded(self.unified_db, media_id) def _record_download(self, media_id: str, username: str, filename: str, url: str = None, post_date=None, file_path: str = None, content_type: str = 'post', metadata: dict = None, deferred: bool = False): """Record a successful download in the database (uses centralized function) Args: deferred: If True, don't record to database now - add to pending_downloads list for later recording after file move is complete """ # If deferred, store for later recording instead of recording now if deferred: self.pending_downloads.append({ 'media_id': media_id, 'username': username, 'filename': filename, 'url': url, 'post_date': post_date.isoformat() if post_date else None, 
'file_path': file_path, 'content_type': content_type, 'metadata': metadata }) self.log(f"Deferred recording for {media_id}", "debug") return True if not self.use_database or not self.unified_db: return False try: # Use centralized function for consistent cross-module storage result = record_instagram_download( db=self.unified_db, media_id=media_id, username=username, content_type=content_type, filename=filename, url=url, post_date=post_date, file_path=file_path, method='imginn', extra_metadata=metadata ) if result: self.log(f"Recorded download for {media_id}", "debug") return result except Exception as e: self.log(f"Failed to record download: {e}", "debug") return False def get_pending_downloads(self): """Get list of downloads that were deferred for later recording Returns: List of download metadata dicts ready for database recording """ return self.pending_downloads.copy() def clear_pending_downloads(self): """Clear the pending downloads list after they've been recorded""" self.pending_downloads = [] def _get_processed_posts(self, username: str) -> set: """Get set of post/story IDs that have been processed from database NOTE: Checks ALL Instagram posts globally, not just this user's, because the same post can appear on multiple profiles (shared posts, tags, reposts) """ processed = set() if not self.unified_db: return processed try: with self.unified_db.get_connection() as conn: cursor = conn.cursor() # Get all Instagram posts globally (same post can appear on multiple profiles) cursor.execute(''' SELECT url, filename, metadata FROM downloads WHERE platform = 'instagram' ''') for row in cursor.fetchall(): url, filename, metadata_str = row # Add full URL to processed set if url: processed.add(url) # Also extract and add post ID from URL for backward compatibility if url and '/p/' in url: match = re.search(r'/p/([^/]+)/', url) if match: processed.add(match.group(1)) # For stories, extract media_id from filename if filename and '_story' in filename: # Extract the 
long media ID before _story # Format: username_date_MEDIAID_storyN.ext parts = filename.split('_story') if len(parts) >= 2: # Get everything before _story, then get the media ID (last underscore-separated part) pre_story = parts[0] # Split by underscore and skip first 3 parts (username_YYYYMMDD_HHMMSS) id_parts = pre_story.split('_') if len(id_parts) > 3: # Join everything after date as the media_id media_id_full = '_'.join(id_parts[3:]) processed.add(media_id_full) # Also add the extracted Instagram media ID (18-digit number) normalized_id = extract_instagram_media_id(media_id_full) if normalized_id and normalized_id != media_id_full: processed.add(normalized_id) # Also check metadata for media_id if metadata_str: try: metadata = json.loads(metadata_str) if 'post_id' in metadata: processed.add(metadata['post_id']) if 'media_id' in metadata: media_id = metadata['media_id'] processed.add(media_id) # Also add the extracted Instagram media ID normalized_id = extract_instagram_media_id(media_id) if normalized_id and normalized_id != media_id: processed.add(normalized_id) if 'media_id_full' in metadata: processed.add(metadata['media_id_full']) except Exception: pass if processed: self.log(f"Found {len(processed)} processed posts in database for {username}", "debug") except Exception as e: self.log(f"Error loading processed posts from database: {e}", "debug") return processed def save_cookies(self, context): """Save cookies to database or file""" cookies = context.cookies() # Save to database if available if self.unified_db: try: # CRITICAL: Include user_agent for cf_clearance cookies to work self.unified_db.save_scraper_cookies( self.scraper_id, cookies, user_agent=self.user_agent, merge=True ) self.log(f"Saved {len(cookies)} cookies to database", "debug") return except Exception as e: self.log(f"Error saving cookies to database: {e}", "warning") # Fallback to file-based storage if self.cookie_file: storage_data = { 'cookies': cookies, 'timestamp': 
datetime.now().isoformat() } with open(self.cookie_file, 'w') as f: json.dump(storage_data, f, indent=2) self.log(f"Saved {len(cookies)} cookies to file", "debug") def load_cookies(self, context): """Load saved cookies from database or file""" # Try loading from database first if self.unified_db: try: cookies = self.unified_db.get_scraper_cookies(self.scraper_id) if cookies: # Clean cookies - remove unsupported properties and convert expiry->expires cleaned_cookies = [] for cookie in cookies: cleaned = {k: v for k, v in cookie.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']} # FlareSolverr uses 'expiry' but Playwright uses 'expires' if 'expiry' in cleaned and 'expires' not in cleaned: cleaned['expires'] = cleaned.pop('expiry') cleaned_cookies.append(cleaned) # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect # Otherwise old cookies may override new ones from FlareSolverr try: context.clear_cookies() self.log("Cleared existing browser cookies", "debug") except Exception as e: self.log(f"Could not clear cookies: {e}", "debug") context.add_cookies(cleaned_cookies) self.log(f"Loaded {len(cleaned_cookies)} cookies from database", "info") return True except Exception as e: self.log(f"Error loading cookies from database: {e}", "warning") # Fallback to file-based cookies if not self.cookie_file or not self.cookie_file.exists(): return False try: with open(self.cookie_file, 'r') as f: data = json.load(f) # Check age (24 hours) saved_time = datetime.fromisoformat(data['timestamp']) if datetime.now() - saved_time > timedelta(hours=24): self.log("Cookies expired", "debug") return False # Clean cookies - remove unsupported properties and convert expiry->expires cleaned_cookies = [] for cookie in data['cookies']: # Remove Chrome-specific properties that Playwright doesn't support cleaned = {k: v for k, v in cookie.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']} # FlareSolverr uses 'expiry' but Playwright uses 
'expires' if 'expiry' in cleaned and 'expires' not in cleaned: cleaned['expires'] = cleaned.pop('expiry') cleaned_cookies.append(cleaned) context.add_cookies(cleaned_cookies) self.log(f"Loaded {len(cleaned_cookies)} cookies from file", "info") return True except Exception as e: self.log(f"Failed to load cookies: {e}", "warning") return False def wait_for_cloudflare(self, page): """Wait for Cloudflare to auto-solve or page to load - uses CloudflareHandler with ImgInn-specific checks""" self.log("Waiting for page to load...", "debug") max_wait = 120 # Extended wait - Cloudflare challenges can take up to 120 seconds flaresolverr_attempts = 0 max_flaresolverr_attempts = 3 for i in range(max_wait): time.sleep(1) # Check current URL and content try: current_url = page.url content = page.content().lower() except Exception as e: # Page is still navigating, wait and try again if "navigating" in str(e).lower(): self.log("Page still navigating, waiting...", "debug") continue else: # Some other error, re-raise it raise # First check if the actual content is visible (not Cloudflare) # ImgInn pages will have profile content when loaded if 'imginn' in current_url.lower() and ('posts' in content or 'followers' in content or 'following' in content): # We have actual content, not a challenge self.log(f"Page loaded successfully after {i+1} seconds", "info") return True # Check for actual Cloudflare challenge or server error # NOTE: 'challenge-platform' is NOT a reliable indicator - it's embedded JS that stays on the page # even after successful bypass. Only check for visible interstitial text. 
challenge_indicators = ['checking your browser', 'just a moment', 'verify you are human', 'enable javascript'] error_indicators = ['internal server error', 'error code 500', 'error code 502', 'error code 503'] has_challenge = any(indicator in content for indicator in challenge_indicators) has_error = any(indicator in content for indicator in error_indicators) if has_error: self.log("Server error detected (500/502/503) - site is likely down", "error") # Save screenshot for debugging try: debug_dir = Path("debug") debug_dir.mkdir(exist_ok=True) screenshot_path = debug_dir / f"server_error_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png" page.screenshot(path=str(screenshot_path)) self.log(f"Screenshot saved to {screenshot_path}", "debug") except Exception: pass return False if has_challenge: # Try FlareSolverr at specific intervals (0s, 15s, 30s) # Note: Turnstile checkbox clicking doesn't work - it's designed to block automation if i == 0 or (i in [15, 30] and flaresolverr_attempts < max_flaresolverr_attempts): flaresolverr_attempts += 1 self.log(f"Cloudflare challenge detected, attempting FlareSolverr bypass (attempt {flaresolverr_attempts})...", "info") # Get current browser user_agent for comparison current_browser_ua = None try: current_browser_ua = page.evaluate('() => navigator.userAgent') except Exception: pass # Try to get fresh cookies via FlareSolverr if self._get_cookies_via_flaresolverr(page.url): self.log("Got fresh cookies, reloading page...", "info") # Check if user_agent changed - if so, restart browser new_ua = None try: new_ua = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id) self.log(f"Stored cookie UA: {new_ua[:60] if new_ua else 'None'}...", "debug") self.log(f"Browser UA: {current_browser_ua[:60] if current_browser_ua else 'None'}...", "debug") except Exception as e: self.log(f"Error getting stored UA: {e}", "debug") if new_ua and current_browser_ua and new_ua != current_browser_ua: self.log("User-agent changed, restarting browser 
with new fingerprint...", "info") self._stop_browser() self._start_browser() page = self.page try: page.goto(current_url, wait_until='domcontentloaded', timeout=30000) except Exception as e: self.log(f"Error navigating after browser restart: {e}", "debug") else: # Reload cookies in browser context try: self.load_cookies(self.context) # Reload the page with new cookies page.reload(wait_until='domcontentloaded', timeout=10000) # CRITICAL: Wait for Cloudflare background JS validation (5-7 seconds) wait_time = 5 + random.uniform(0, 2) self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug") time.sleep(wait_time) except Exception as e: self.log(f"Error reloading page with new cookies: {e}", "debug") else: self.log("FlareSolverr failed, waiting for challenge to resolve...", "warning") # Continue waiting for challenge to resolve continue # Check if we're on the correct page with content if '/p/' in current_url: # Post page # Look for download button or image if 'download' in content or 'data-created' in content: self.log(f"Post page loaded after {i+1} seconds", "info") return True elif '/stories/' in current_url: # Stories page # Stories pages have swiper, reels, or story content if 'swiper' in content or 'data-uid' in content or 'reel' in content: self.log(f"Stories page loaded after {i+1} seconds", "info") return True # Also check for counter/profile info which is on stories pages too if 'counter-item' in content or ('posts' in content and 'followers' in content): self.log(f"Stories page loaded after {i+1} seconds", "info") return True elif '/tagged/' in current_url: # Tagged page # Tagged pages have items grid if 'class="item"' in content or 'data-uid' in content: self.log(f"Tagged page loaded after {i+1} seconds", "info") return True if 'posts' in content and 'followers' in content: self.log(f"Tagged page loaded after {i+1} seconds", "info") return True else: # Profile page # Check if profile content is visible - ImgInn specific if 
'imginn' in current_url.lower(): if ('posts' in content and 'followers' in content) or 'following' in content: self.log(f"Profile page loaded after {i+1} seconds", "info") return True # Also check for actual post links if 'href="/p/' in content or 'class="item"' in content: self.log(f"Profile page loaded after {i+1} seconds", "info") return True # Debug: Log what we're seeing if we've been waiting a while if i == 15: self.log(f"Debug: URL={current_url[:50]}, has posts={('posts' in content)}, has swiper={('swiper' in content)}", "debug") # Status updates (only if we haven't detected content yet) if i == 10: self.log("Still waiting (10s)... page loading", "debug") elif i == 20: self.log("Still waiting (20s)... page not ready yet", "info") elif i == 30: self.log("Still waiting (30s)... slow response from server", "info") elif i == 45: self.log("Still waiting (45s)... checking if blocked", "info") elif i == 60: self.log("Still waiting (60s)... page load is slow", "warning") elif i == 90: self.log("Still waiting (90s)... this is taking too long", "warning") # Timeout reached - page didn't load self._page_load_failures += 1 level = "error" if self._page_load_failures >= self._page_load_failure_threshold else "warning" self.log(f"Page load timeout ({self._page_load_failures}x this session). 
def _dismiss_consent_dialog(self, page):
    """Dismiss cookie consent / GDPR overlay if present (Google FundingChoices).

    Best-effort: first tries to click one of the known consent/dismiss
    buttons; if no button is usable (or the click fails) the overlay nodes
    are removed from the DOM via JavaScript. Never raises.

    Args:
        page: Playwright page object
    """
    # Step 1: click a consent/dismiss button when one is present and visible.
    try:
        consent_btn = page.locator(
            'button.fc-cta-consent, '           # "Consent" button
            'button.fc-cta-do-not-consent, '    # "Do not consent" button
            'button[aria-label="Consent"], '
            'button.fc-dismiss-button, '        # Dismiss/close button
            '.fc-dialog button.fc-primary-button'
        ).first
        if consent_btn.count() > 0 and consent_btn.is_visible():
            consent_btn.click(force=True)
            self.log("Dismissed consent dialog", "debug")
            time.sleep(0.5)  # Give the dialog a moment to disappear
            return
    except Exception as e:
        # A failed click must not prevent the JS fallback below.
        self.log(f"Consent button click failed, trying overlay removal: {e}", "debug")

    # Step 2 (fallback): strip the overlay elements out of the DOM.
    try:
        overlay = page.locator('.fc-consent-root, .fc-dialog-overlay').first
        if overlay.count() > 0:
            page.evaluate("document.querySelectorAll('.fc-consent-root, .fc-dialog-overlay, .fc-dialog-container').forEach(el => el.remove())")
            self.log("Removed consent overlay via JS", "debug")
    except Exception as e:
        self.log(f"Could not remove consent overlay: {e}", "debug")


def _safe_go_back(self, page, username: str, tagged: bool = False):
    """Navigate back to profile page safely with timeout handling.

    Tries go_back() first with a short timeout and falls back to direct
    navigation to the profile URL when go_back() raises OR when it returns
    None (Playwright's signal that there was no history entry to go back to,
    in which case the browser would otherwise stay on the current page).

    Args:
        page: Playwright page object
        username: Profile whose page should be shown afterwards
        tagged: If True, fall back to the profile's /tagged/ page instead
    """
    try:
        response = page.go_back(timeout=10000)
    except Exception:
        self.log("go_back timed out, navigating directly to profile", "debug")
    else:
        if response is not None:
            return  # History navigation succeeded
        self.log("go_back had no history entry, navigating directly to profile", "debug")

    suffix = "/tagged/?ref=index" if tagged else "/?ref=index"
    try:
        page.goto(f"https://imginn.com/{username}{suffix}", timeout=15000)
    except Exception as nav_err:
        self.log(f"Direct navigation back also failed: {nav_err}", "warning")


def _is_cloudflare_challenge(self, page) -> bool:
    """Check if current page is a Cloudflare challenge page.

    The page title is checked first (most reliable); the HTML is only
    fetched when the title is inconclusive, and only its first 2000
    lower-cased characters are searched.

    Args:
        page: Playwright page object

    Returns:
        True if Cloudflare challenge detected, False otherwise (including
        when the page cannot be inspected)
    """
    # NOTE(review): 'cloudflare' and 'enable javascript' are broad markers -
    # an ordinary page containing them in its title or first 2000 characters
    # is reported as a challenge too.
    challenge_indicators = ('just a moment', 'checking your browser', 'verify you are human', 'enable javascript', 'cloudflare')
    try:
        title = page.title().lower()
        if any(marker in title for marker in challenge_indicators):
            return True

        content = page.content().lower()[:2000]  # Check first 2000 chars
        return any(marker in content for marker in challenge_indicators)
    except Exception:
        return False


def _handle_cloudflare_on_post(self, page, post_url: str, max_retries: int = 2) -> bool:
    """Handle Cloudflare challenge on a post page by getting fresh cookies and retrying.

    Each attempt sleeps (15-25s before the first, 30-60s before later ones),
    requests fresh cookies through FlareSolverr, restarts the browser when
    the stored cookie user-agent differs from the browser's, reloads the
    cookies, re-opens the post and re-checks for a challenge.

    NOTE(review): when the browser is restarted only the local ``page`` is
    rebound to ``self.page``; the caller's ``page`` reference still points at
    the previous page object - callers should re-read ``self.page``.

    Args:
        page: Playwright page object
        post_url: URL of the post to retry
        max_retries: Maximum number of retry attempts

    Returns:
        True if page loaded successfully (no Cloudflare), False if still blocked
    """
    if not self._is_cloudflare_challenge(page):
        return True  # No challenge, page is good

    self.log("Cloudflare challenge detected on post page, attempting bypass...", "warning")

    for attempt in range(max_retries):
        attempt_no = attempt + 1

        # Wait before FlareSolverr attempt - give Cloudflare time to cool down
        wait_time = random.uniform(15, 25) if attempt == 0 else random.uniform(30, 60)
        self.log(f"Waiting {wait_time:.1f}s before FlareSolverr attempt {attempt_no}...", "info")
        time.sleep(wait_time)

        # Get fresh cookies via FlareSolverr using the post URL
        if not self._get_cookies_via_flaresolverr(post_url):
            self.log(f"FlareSolverr failed (attempt {attempt_no})", "warning")
            continue

        self.log(f"Got fresh cookies (attempt {attempt_no}), reloading post...", "info")

        # Check if user_agent changed - if so, restart browser so the
        # browser fingerprint matches the one the cookies were issued for
        try:
            current_browser_ua = page.evaluate('() => navigator.userAgent')
            new_ua = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
            if new_ua and current_browser_ua and new_ua != current_browser_ua:
                self.log("User-agent changed, restarting browser...", "info")
                self._stop_browser()
                self._start_browser()
                page = self.page
        except Exception as e:
            self.log(f"Error checking user_agent: {e}", "debug")

        # Reload cookies into browser context
        try:
            self.load_cookies(self.context)
        except Exception as e:
            self.log(f"Error loading cookies: {e}", "debug")

        # Navigate directly to the post URL
        try:
            page.goto(post_url, wait_until='domcontentloaded', timeout=30000)

            # Wait for Cloudflare background JS validation (5-7 seconds)
            wait_time = 5 + random.uniform(0, 2)
            self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug")
            time.sleep(wait_time)

            # Check if still blocked
            if self._is_cloudflare_challenge(page):
                self.log(f"Still blocked after retry {attempt_no}", "warning")
                continue

            self.log("Cloudflare bypass successful on post page", "info")
            # IMPORTANT: Save browser cookies after successful bypass
            # This captures any cookies set by Cloudflare's JS validation
            try:
                self.save_cookies(self.context)
                self.log("Saved browser cookies after successful bypass", "debug")
            except Exception as e:
                self.log(f"Error saving cookies after bypass: {e}", "debug")
            return True
        except Exception as e:
            self.log(f"Navigation failed after cookie refresh: {e}", "warning")

    self.log("Failed to bypass Cloudflare on post page after all retries", "error")
    return False


def _check_post_phrases(self, page, phrase_config: dict) -> bool:
    """Check if post contains required phrases

    Text is collected from the known caption selectors (element text, or
    the ``content`` attribute for meta tags) plus the main content area,
    whitespace-normalised, and searched by plain substring matching.

    Args:
        page: Playwright page object
        phrase_config: Phrase search configuration
            {
                'phrases': list of phrases to search for (a single string
                           is treated as one phrase),
                'case_sensitive': bool,
                'match_all': bool (True = all phrases must match, False = any phrase)
            }

    Returns:
        True if post matches phrase criteria (or no phrases are configured),
        False otherwise - including when no post text could be extracted or
        an unexpected error occurred
    """
    try:
        config = phrase_config or {}
        phrases = config.get('phrases', [])
        if isinstance(phrases, str):
            # A bare string would otherwise be iterated character by character
            phrases = [phrases]
        if not phrases:
            return True  # No phrases to match = match all

        # Get post caption/text
        caption_selectors = [
            '.caption',
            '.post-caption',
            'meta[property="og:description"]',
            'meta[name="description"]',
            '.content',
            'div[class*="caption"]',
            'span[class*="caption"]'
        ]

        fragments = []
        for selector in caption_selectors:
            try:
                element = page.locator(selector).first
                if element.count() > 0:
                    text = element.text_content() or element.get_attribute('content') or ""
                    if text:
                        fragments.append(text)
            except Exception:
                continue

        # Also check visible text in the main content area
        try:
            main_content = page.locator('main, article, .post-content, div[role="main"]').first
            if main_content.count() > 0:
                fragments.append(main_content.text_content() or "")
        except Exception:
            pass

        # Normalize whitespace BEFORE the emptiness check so that
        # whitespace-only extractions count as "no text"
        post_text = ' '.join(' '.join(fragments).split())
        if not post_text:
            self.log("Could not extract post text for phrase matching", "warning")
            return False

        case_sensitive = config.get('case_sensitive', False)
        match_all = config.get('match_all', False)

        if not case_sensitive:
            post_text = post_text.lower()
            phrases = [p.lower() for p in phrases]

        self.log(f"Checking post text ({len(post_text)} chars) for phrases: {phrases}", "debug")

        # Check phrase matching
        matches = [phrase for phrase in phrases if phrase in post_text]
        for phrase in matches:
            self.log(f"Found phrase: '{phrase}'", "debug")

        if match_all:
            # All phrases must be found
            result = len(matches) == len(phrases)
            if not result:
                missing = [p for p in phrases if p not in matches]
                self.log(f"Missing required phrases: {missing}", "debug")
        else:
            # At least one phrase must be found
            result = len(matches) > 0
            if not result:
                self.log("No matching phrases found", "debug")

        return result

    except Exception as e:
        self.log(f"Error checking post phrases: {e}", "error")
        return False


def download(self, username: str, content_type: str = "posts", days_back: int = 14,
             max_downloads: int = 50, output_dir: Optional[str] = None,
             phrase_config: Optional[dict] = None, defer_database: bool = False) -> int:
    """Download content from a user - compatible with media-downloader interface

    Resets the per-account tracking state, resolves the output directory and
    dispatches to download_posts / download_stories / download_tagged.

    Args:
        username: Instagram username
        content_type: Type of content ("posts", "stories", or "tagged")
        days_back: How many days back to search
        max_downloads: Maximum posts to download
        output_dir: Output directory; files go to ``output_dir/username``.
            When omitted, ``/opt/media-downloader/downloads/<username>`` is used
        phrase_config: Optional phrase search configuration (forwarded for
            "posts" and "tagged" only)
            {
                'enabled': bool,
                'phrases': list of phrases to search for,
                'case_sensitive': bool,
                'match_all': bool (True = all phrases must match, False = any phrase)
            }
        defer_database: If True, defer database recording to pending_downloads list
                        for later recording after file move is complete

    Returns:
        Number of files reported by the selected download method, or 0 for
        an unsupported content type
    """
    # Clear downloaded_files cache between accounts to prevent memory growth
    self.downloaded_files.clear()

    # Clear pending downloads for fresh batch
    self.pending_downloads = []

    # Set output directory
    if output_dir:
        output_path = Path(output_dir) / username
    else:
        output_path = Path(f"/opt/media-downloader/downloads/{username}")

    # Route to appropriate download method
    if content_type == "posts":
        files = self.download_posts(
            username=username,
            days_back=days_back,
            max_posts=max_downloads,
            output_dir=output_path,
            phrase_config=phrase_config,
            defer_database=defer_database
        )
    elif content_type == "stories":
        files = self.download_stories(
            username=username,
            days_back=days_back,
            max_stories=max_downloads,
            output_dir=output_path,
            defer_database=defer_database
        )
    elif content_type == "tagged":
        files = self.download_tagged(
            username=username,
            days_back=days_back,
            max_posts=max_downloads,
            output_dir=output_path,
            phrase_config=phrase_config,
            defer_database=defer_database
        )
    else:
        self.log(f"ImgInn does not support content type: {content_type}", "warning")
        return 0

    # Guard against a sub-downloader returning None instead of a list
    return len(files) if files else 0
configuration skip_database: If True, don't record downloads in database (for temporary processing) max_age_hours: If specified, only download posts newer than N hours (overrides days_back) defer_database: If True, defer database recording to pending_downloads list for later recording after file move is complete """ # Rate limiting to avoid Cloudflare blocks self._enforce_rate_limit("posts") profile_name = username.lower() if output_dir is None: output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}") output_dir.mkdir(parents=True, exist_ok=True) # Check site status before doing anything else self.log("Checking ImgInn site status...", "debug") site_status, error_msg = self.cf_handler.check_site_status("https://imginn.com/", timeout=10) if self.cf_handler.should_skip_download(site_status): self.log(f"Skipping download for @{profile_name} - ImgInn is unavailable: {error_msg}", "warning") self.activity_manager.update_status(f"Skipped - ImgInn unavailable ({error_msg})") return [] elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE: self.log("Cloudflare challenge detected, will attempt bypass during download", "info") # Scan existing files self._scan_existing_files(output_dir, profile_name) # Get processed posts from database processed_posts = self._get_processed_posts(profile_name) self.log(f"Loaded {len(processed_posts)} processed posts for {profile_name} from database", "info") if len(processed_posts) > 0 and len(processed_posts) < 20: self.log(f"Processed post IDs: {processed_posts}", "debug") downloaded_files = [] # Use max_age_hours if specified, otherwise use days_back if max_age_hours is not None: cutoff_date = datetime.now() - timedelta(hours=max_age_hours) else: cutoff_date = datetime.now() - timedelta(days=days_back) # Update activity status if specific_post_url and profile_name == 'unknown': self.activity_manager.update_status(f"Fetching post...") else: self.activity_manager.update_status("Checking posts") # Start or reuse browser 
self._start_browser() page = self.page try: # If specific post URL provided, go directly to it if specific_post_url: self.log(f"Navigating to specific post", "info") page.goto(specific_post_url, wait_until='domcontentloaded') else: # Navigate to profile self.log(f"Navigating to @{username} profile", "info") page.goto(f"https://imginn.com/{username}/?ref=index", wait_until='domcontentloaded') # CRITICAL: Wait 5-7 seconds for Cloudflare background JS challenges to complete # Per browserless.io: "Allow 5+ seconds post-page load for background JavaScript challenges" import random wait_time = 5 + random.uniform(0, 2) # 5-7 seconds self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug") time.sleep(wait_time) # Wait for page to load if not self.wait_for_cloudflare(page): self._page_load_failures += 1 level = "error" if self._page_load_failures >= self._page_load_failure_threshold else "warning" self.log(f"Page didn't load properly ({self._page_load_failures}x this session)", level) return [] # Save cookies self.save_cookies(self.context) # Wait for JavaScript to load posts (ImgInn loads posts dynamically) self.log("Waiting for posts to load via JavaScript...", "info") try: # Wait for post links to appear (up to 10 seconds) page.wait_for_selector('a[href*="/p/"]', timeout=10000) self.log("Posts loaded successfully", "info") except Exception: # Timeout - posts might not exist, or page structure changed self.log("Timeout waiting for posts to appear", "warning") time.sleep(2) # Give it a bit more time anyway # If specific post, process it directly if specific_post_url: self.log("Processing specific post", "info") # Extract media ID from URL media_id = self._extract_media_id_from_url(specific_post_url) if not media_id: self.log("Could not extract media ID", "warning") return [] self.log(f"URL Media ID: {media_id}", "debug") # Process this single post (bypass date filter for specific posts) post_links = [None] # Dummy list for iteration 
bypass_date_filter = True else: # Find posts on profile page self.log("Finding posts...", "info") # Debug: Check what's actually on the page page_content = page.content() if 'no posts' in page_content.lower() or 'page not found' in page_content.lower(): self.log("Page shows 'no posts' or 'not found'", "warning") post_links = page.locator('a[href*="/p/"]').all() self.log(f"Found {len(post_links)} posts", "info") if not post_links: # Debug: Save screenshot to see what's wrong try: screenshot_path = Path(f"/tmp/imginn_no_posts_{username}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png") page.screenshot(path=str(screenshot_path)) self.log(f"No posts found - screenshot saved to {screenshot_path}", "warning") except Exception: pass self.log("No posts found", "warning") return [] bypass_date_filter = False self.log(f"Processing posts (max {max_posts})", "info") # Collect all post URLs upfront to avoid stale element issues post_urls_to_process = [] if not specific_post_url: for idx, pl in enumerate(post_links[:max_posts]): try: href = pl.get_attribute('href', timeout=5000) if href: if not href.startswith('http'): href = f"https://imginn.com{href}" post_urls_to_process.append(href) except Exception as e: self.log(f"Post {idx+1}: Failed to get URL: {str(e)[:50]}", "debug") continue self.log(f"Collected {len(post_urls_to_process)} post URLs", "debug") # Track consecutive old posts to handle pinned posts consecutive_old_posts = 0 max_consecutive_old_posts = 5 # Allow up to 5 old posts (pinned) before stopping # Set initial progress so dashboard shows 0/N immediately total_posts = len(post_urls_to_process) if not specific_post_url else 1 self.activity_manager.update_status( "Downloading posts", progress_current=0, progress_total=total_posts ) for i, post_url in enumerate(post_urls_to_process if not specific_post_url else [specific_post_url]): # Update progress at start of each iteration (fires even on skips) self.activity_manager.update_status( "Downloading posts", 
progress_current=i + 1, progress_total=total_posts ) try: # Handle specific post vs regular posts if specific_post_url: # Already on the specific post page post_url = specific_post_url media_id = self._extract_media_id_from_url(post_url) else: # URL already collected and formatted media_id = self._extract_media_id_from_url(post_url) if not media_id: self.log(f"Post {i+1}: Could not extract media ID", "warning") continue # Check if post was already processed (from database) if media_id in processed_posts: # Skip if in database - trust the database tracking self.log(f"Post {i+1}: {media_id} already processed (database), skipping", "debug") continue # Rate limiting between post downloads to avoid Cloudflare blocks if i > 0: post_delay = random.uniform(3, 8) self.log(f"Rate limit: waiting {post_delay:.1f}s before post {i+1}", "debug") time.sleep(post_delay) self.log(f"Post {i+1}: Processing {media_id}", "info") # Navigate directly to post URL (more reliable than clicking which can timeout) try: page.goto(post_url, wait_until='domcontentloaded', timeout=30000) except Exception as nav_err: self.log(f"Post {i+1}: Navigation failed: {nav_err}", "warning") continue # Wait for page to load time.sleep(2) # Wait for navigation to complete try: page.wait_for_load_state('networkidle', timeout=5000) except Exception: # Continue even if network isn't idle - page might still be usable self.log("Network didn't idle, but continuing", "debug") # Check if on post page if "/p/" not in page.url: self.log(f"Not a downloadable post (URL: {page.url})", "warning") self._safe_go_back(page, username) continue # IMPORTANT: Wait for post page content to fully render # This ensures download buttons are from the POST PAGE, not profile page preview try: # Wait for the post container to be visible (imginn uses main-content now) page.wait_for_selector('div.main-content, div.post, div.content, div.single-post', timeout=3000) time.sleep(1) # Additional wait for download buttons to render except 
Exception: self.log("Post container not found, checking for Cloudflare...", "debug") # Check for Cloudflare challenge and handle it cloudflare_bypassed = False if self._is_cloudflare_challenge(page): self.log(f"Cloudflare challenge detected on post {media_id}", "warning") if not self._handle_cloudflare_on_post(page, post_url): # Cloudflare bypass failed - skip this post WITHOUT marking as processed # so it can be retried on next run self.log(f"Skipping post {media_id} due to Cloudflare block (will retry later)", "warning") try: page.goto(f"https://imginn.com/{username}/?ref=index") time.sleep(3) except Exception: pass continue cloudflare_bypassed = True self.log(f"Navigated to post page: {page.url}", "debug") self._dismiss_consent_dialog(page) # Extract actual username from post page if we don't have it (e.g., specific_post_url with unknown user) if profile_name == 'unknown' or specific_post_url: try: username_elem = page.locator('div.username a').first if username_elem.count() > 0: username_href = username_elem.get_attribute('href') if username_href: # Extract username from href like "/evalongoria/" -> "evalongoria" extracted_username = username_href.strip('/').lower() if extracted_username and extracted_username != 'unknown': profile_name = extracted_username self.log(f"Extracted username from post page: @{profile_name}", "info") # Update activity status with real username self.activity_manager.update_status("Downloading posts") except Exception as e: self.log(f"Could not extract username from post page: {e}", "debug") # Extract post date - ALWAYS extract for proper file naming post_date = self._extract_post_date(page) # Use post date for filename, or current date if post_date: date_str = post_date.strftime('%Y%m%d_%H%M%S') self.log(f"Original post date: {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug") else: date_str = datetime.now().strftime('%Y%m%d_%H%M%S') self.log(f"No original date found, using current time", "debug") # Check date filter AFTER extracting 
date (bypass for specific posts) if not bypass_date_filter and post_date and post_date < cutoff_date: consecutive_old_posts += 1 self.log(f"Post too old ({post_date.strftime('%Y-%m-%d')}), skipping (consecutive old: {consecutive_old_posts}/{max_consecutive_old_posts})", "info") # Mark this old post as checked in database to avoid re-checking # Only mark if doing phrase search (has phrase_config) if phrase_config and media_id: self._record_download( media_id=media_id, username=profile_name, filename=f"_old_post_{media_id}", url=post_url, post_date=post_date, content_type='post', metadata={'marker': True, 'reason': 'old_post'} ) self._safe_go_back(page, username) # Stop only after 5 consecutive old posts (handles pinned posts at top) if consecutive_old_posts >= max_consecutive_old_posts: self.log(f"Found {consecutive_old_posts} consecutive old posts - stopping", "info") break else: continue # Skip this old post but keep checking (might be pinned) # Reset consecutive old posts counter - we found a post within date range consecutive_old_posts = 0 # Check for phrase matching if configured if phrase_config and phrase_config.get('enabled'): if not self._check_post_phrases(page, phrase_config): self.log(f"Post does not match phrase criteria, skipping download", "info") # Mark this post as checked (but not downloaded) in database # This prevents re-checking the same post every run if media_id: self._record_download( media_id=media_id, username=profile_name, filename=f"_phrase_checked_{media_id}", url=post_url, post_date=post_date, content_type='post', metadata={'marker': True, 'reason': 'phrase_checked'} ) self._safe_go_back(page, username) continue else: self.log(f"Post matches phrase criteria, using high-res download", "info") # Check for carousel carousel_next = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first has_carousel = carousel_next.count() > 0 if has_carousel: self.log(f"Carousel detected - will download all carousel images", 
"info") self._dismiss_consent_dialog(page) # CRITICAL: Wait for POST PAGE carousel download buttons to be ready # This prevents downloading from the profile page preview try: # Wait for download buttons with POST PAGE URLs (have "scontent" or "post" in them) page.wait_for_selector('a.btn[href*="scontent"], a[download], a.download', timeout=3000) time.sleep(1.5) # Additional wait for all carousel images to load self.log("Carousel download buttons ready on post page", "debug") except Exception: self.log("Download buttons not found, but continuing", "debug") else: self.log("Single image post", "debug") # Handle downloads - always use download buttons from post page image_count = 0 max_images = 10 # Download images (carousel or single) if has_carousel: # First, let's find all carousel slides all_slides = page.locator('.swiper-slide').all() self.log(f"Found {len(all_slides)} carousel slides", "debug") # Download each slide's image for slide_index in range(min(len(all_slides), max_images)): self.log(f"Processing carousel slide {slide_index + 1}/{len(all_slides)}", "debug") # Get the current slide element to scope our searches current_slide = all_slides[slide_index] # Click next to navigate to this slide (except for first one) if slide_index > 0: next_btn = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first if next_btn.count() > 0 and next_btn.is_visible(): try: next_btn.click(force=True) except Exception: self.log(f"Carousel next button click timed out at slide {slide_index + 1}, stopping carousel", "warning") break time.sleep(2) # Wait for slide transition and image to load # First, try to find a download button for this carousel item # IMPORTANT: Search within CURRENT SLIDE only, not entire page download_btn = None download_url = None webp_fallback_url = None # Look for download button on the current slide - prefer high-res, fallback to .webp download_selectors = [ 'a.btn[href*="scontent"][href*=".jpg"]', # High-res jpg 
'a.btn[href*="scontent"][href*=".mp4"]', # Video 'a.btn[href*="scontent"]', # Any scontent 'a[download][href*=".jpg"]', 'a[download][href*=".mp4"]', 'a.download', 'a[download]', 'a[href*="/post"]' ] # Search for download buttons - first try within slide, then try page-level # Imginn often has download buttons outside the .swiper-slide elements search_contexts = [current_slide, page] for search_context in search_contexts: if download_url: # Already found, skip other contexts break for selector in download_selectors: btn = search_context.locator(selector).first if btn.count() > 0: temp_url = btn.get_attribute('href') if temp_url and temp_url != '#' and temp_url != 'javascript:void(0)': if not temp_url.startswith('http'): temp_url = f"https://imginn.com{temp_url}" # Store .webp as fallback, but keep looking for better if '.webp' in temp_url.lower(): if not webp_fallback_url: webp_fallback_url = temp_url self.log(f"Found .webp link (fallback): {temp_url[:80]}...", "debug") continue # Found non-.webp link, use it download_btn = btn download_url = temp_url self.log(f"Found high-res download for carousel slide {slide_index + 1}: {download_url[:80]}...", "debug") break # Use .webp fallback if no high-res found used_webp_fallback = False if not download_url and webp_fallback_url: download_url = webp_fallback_url used_webp_fallback = True self.log(f"Using .webp fallback for carousel slide {slide_index + 1}", "info") # If we found a download button, use it for high-res if download_url: try: import requests from urllib.parse import urlparse, unquote response = requests.get(download_url, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 'https://imginn.com/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() # Extract filename and media ID from the actual file url_path = urlparse(download_url).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) if original_name.startswith('post'): original_name = original_name[4:] # The media 
ID is the filename without extension actual_media_id = Path(original_name).stem ext = Path(original_name).suffix or '.jpg' # Build filename for carousel image using actual media ID filename = f"{profile_name}_{date_str}_{actual_media_id}_{slide_index + 1}{ext}" filepath = output_dir / filename # Save file with open(filepath, 'wb') as f: f.write(response.content) # Check for duplicate hash before recording if self.unified_db: from pathlib import Path as PathLib # Check for duplicate hash (hash blacklist persists even if original deleted) file_hash = self.unified_db.get_file_hash(str(filepath)) if file_hash: existing = self.unified_db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): # Duplicate hash found - content was already downloaded (prevents redownload of deleted content) self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") # Delete the duplicate regardless of whether original file still exists try: filepath.unlink() self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filename}: {e}", "warning") # Update timestamps if post_date: self._update_file_timestamps(filepath, post_date) # Log with appropriate quality label quality_label = "fallback" if used_webp_fallback else "high-res" self.log(f"Downloaded ({quality_label}): {filename} ({len(response.content)} bytes)", "info") downloaded_files.append(str(filepath)) image_count += 1 # Add to tracking self.downloaded_files.add(actual_media_id) # Mark in database (or defer for later) if not skip_database or defer_database: unique_url = f"{post_url}#{filename}" self._record_download( media_id=actual_media_id, username=profile_name, filename=filename, url=unique_url, post_date=post_date, file_path=str(filepath), content_type='post', deferred=defer_database 
) continue # Skip to next slide except Exception as e: self.log(f"Failed to download high-res carousel image {slide_index + 1}: {e}, falling back to standard res", "warning") # Fallback: Find the current slide's media (img or video) if no download button # current_slide already defined at top of loop # Try img first, then video media_src = None slide_img = current_slide.locator('img').first if slide_img.count() > 0: media_src = slide_img.get_attribute('src') # If it's a lazy placeholder, wait for it to load if media_src and 'lazy.jpg' in media_src: self.log(f"Slide {slide_index + 1} is lazy, waiting for load...", "debug") # Trigger load by making it visible current_slide.scroll_into_view_if_needed() time.sleep(1) # Get src again media_src = slide_img.get_attribute('src') else: # Check for video tag slide_video = current_slide.locator('video source, video').first if slide_video.count() > 0: media_src = slide_video.get_attribute('src') self.log(f"Found video for slide {slide_index + 1}", "debug") if media_src and 'lazy.jpg' not in media_src and '483011604' not in media_src: self.log(f"Downloading carousel media {slide_index + 1} (standard res): {media_src[:80]}...", "debug") # Download this media try: import requests from urllib.parse import urlparse, unquote if not media_src.startswith('http'): media_src = f"https:{media_src}" if media_src.startswith('//') else f"https://imginn.com{media_src}" response = requests.get(media_src, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 'https://imginn.com/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() # Extract filename and media ID from the actual file url_path = urlparse(media_src).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) if original_name.startswith('post'): original_name = original_name[4:] # The media ID is the filename without extension actual_media_id = Path(original_name).stem ext = Path(original_name).suffix or '.jpg' # Build filename for carousel 
image using actual media ID filename = f"{profile_name}_{date_str}_{actual_media_id}_{slide_index + 1}{ext}" filepath = output_dir / filename # Save file with open(filepath, 'wb') as f: f.write(response.content) # Check for duplicate hash before recording if self.unified_db: from pathlib import Path as PathLib file_hash = self.unified_db.get_file_hash(str(filepath)) if file_hash: existing = self.unified_db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): existing_path = PathLib(existing['file_path']) if existing_path.exists(): self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") try: filepath.unlink() self.log(f"Deleted duplicate: {filename}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filename}: {e}", "warning") # Update timestamps if post_date: self._update_file_timestamps(filepath, post_date) self.log(f"Downloaded: {filename} ({len(response.content)} bytes)", "info") downloaded_files.append(str(filepath)) image_count += 1 # Add to tracking self.downloaded_files.add(actual_media_id) # Mark in database (or defer for later) if not skip_database or defer_database: unique_url = f"{post_url}#{filename}" self._record_download( media_id=actual_media_id, username=profile_name, filename=filename, url=unique_url, post_date=post_date, file_path=str(filepath), content_type='post', deferred=defer_database ) except Exception as e: self.log(f"Failed to download carousel media {slide_index + 1}: {e}", "error") else: self.log(f"Slide {slide_index + 1} has no valid media (img/video)", "warning") # Skip the old carousel download logic pass # This duplicate block is not needed - single image logic is already handled above # OLD carousel logic removed - handled above if False: # Wait for carousel content to load time.sleep(1) # Find download button AND image elements # ImgInn 
sometimes has the full image in an img tag, not just download button download_selectors = [ 'a[download]', 'a.download-btn', 'a[href*="scontent"]', 'a[href*="s3.imginn.com"]', 'a.download', 'a[href*="/post"][href*=".jpg"]', 'a[href*="/post"][href*=".mp4"]', 'button.download', 'a.btn-download' ] # For carousel images, we need to find the actual post image, not the profile thumbnail # Look for images that are NOT the profile pic and NOT lazy placeholders img_src = None # Try to find the carousel image (exclude profile pic and lazy images) possible_images = page.locator('img[src*="post"], img[src*="scontent"]:not([src*="profile"])').all() for img_elem in possible_images: src = img_elem.get_attribute('src') if src and 'lazy.jpg' not in src and '483011604' not in src: # Exclude profile pic img_src = src self.log(f"Found carousel image src: {img_src[:100]}...", "debug") break # If no good image found, wait and try again if not img_src or 'lazy.jpg' in img_src: time.sleep(1) # Try once more after waiting main_image = page.locator('img[src*="post"]:not([src*="lazy"])').first if main_image.count() > 0: img_src = main_image.get_attribute('src') if img_src: self.log(f"Found carousel image after wait: {img_src[:100]}...", "debug") download_btn = None for selector in download_selectors: btn = page.locator(selector).first if btn.count() > 0: download_btn = btn break if download_btn and download_btn.count() > 0: try: # For ImgInn, we should click the download button to get the full-size image # The href often points to a thumbnail, not the full image download_url = download_btn.get_attribute('href') self.log(f"Download button href: {download_url[:100] if download_url else 'None'}...", "debug") # Try clicking the button for browser download first try: self.log(f"Attempting browser download (clicking button)", "debug") with page.expect_download(timeout=5000) as download_info: download_btn.click() download = download_info.value original_name = download.suggested_filename 
media_id_from_file = Path(original_name).stem ext = Path(original_name).suffix or '.jpg' download_method = 'browser' response = None self.log(f"Browser download completed: {original_name}", "debug") except Exception: # Fallback to direct download if clicking doesn't work self.log(f"Browser download failed, trying direct download", "debug") # For carousels, if no download URL or it's invalid, use image src if has_carousel and (not download_url or download_url == "None" or download_url == "null"): if img_src: self.log(f"No download button for carousel, using image src", "debug") download_url = img_src # Be more lenient with download URLs - accept any https URL that looks like it could be an image/video if download_url and download_url.startswith('http'): # Make sure it's not just the post page URL if '/p/' not in download_url or download_url.endswith(('.jpg', '.jpeg', '.png', '.heic', '.mp4', '.webm')): import requests response = requests.get(download_url, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 'https://imginn.com/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() self.log(f"Downloaded {len(response.content)} bytes", "debug") download_method = 'direct' # Extract filename from URL from urllib.parse import urlparse, unquote url_path = urlparse(download_url).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) # Remove 'post' prefix if present if original_name.startswith('post'): original_name = original_name[4:] media_id_from_file = Path(original_name).stem # This is the actual media ID ext = Path(original_name).suffix or '.jpg' else: # Try to use image src instead if img_src: self.log(f"Download URL is post page, using image src instead", "debug") download_url = img_src if not download_url.startswith('http'): download_url = f"https://imginn.com{download_url}" import requests response = requests.get(download_url, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 'https://imginn.com/' }, 
cookies=self._get_cookies_for_requests()) response.raise_for_status() download_method = 'direct' from urllib.parse import urlparse, unquote url_path = urlparse(download_url).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) if original_name.startswith('post'): original_name = original_name[4:] media_id_from_file = Path(original_name).stem ext = Path(original_name).suffix or '.jpg' else: raise Exception("No valid download URL found") else: raise Exception("No valid download URL found") # Update our tracked media ID with the correct one from the file if media_id_from_file: media_id = media_id_from_file self.log(f"Media ID from file: {media_id}", "debug") # For carousels, if we don't get a unique media ID, generate one normalized_media_id = extract_instagram_media_id(media_id) if media_id else None if has_carousel and (not media_id or media_id in self.downloaded_files or (normalized_media_id and normalized_media_id in self.downloaded_files)): # Generate unique ID for this carousel image media_id = f"{media_id_base}_carousel_{carousel_index}" normalized_media_id = extract_instagram_media_id(media_id) self.log(f"Generated carousel media ID: {media_id}", "debug") # Check if this media ID is already downloaded (both original and normalized) if media_id in self.downloaded_files or (normalized_media_id and normalized_media_id in self.downloaded_files): self.log(f"Already have {media_id}, skipping download but continuing carousel", "debug") # Still count this as an image even if skipped image_count += 1 if has_carousel: carousel_index += 1 else: self.log(f"Downloading new file for {media_id}", "debug") # Build filename with FastDL format if has_carousel: # For carousel items, append index (simpler format) filename = f"{profile_name}_{date_str}_{media_id_base}_{carousel_index}{ext}" else: filename = f"{profile_name}_{date_str}_{media_id}{ext}" filepath = output_dir / filename # Save the downloaded content if download_method == 'direct': with open(filepath, 
'wb') as f: f.write(response.content) else: download.save_as(filepath) # Check for duplicate hash before recording if self.unified_db: from pathlib import Path as PathLib # Check for duplicate hash (hash blacklist persists even if original deleted) file_hash = self.unified_db.get_file_hash(str(filepath)) if file_hash: existing = self.unified_db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): # Duplicate hash found - content was already downloaded (prevents redownload of deleted content) self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") # Delete the duplicate regardless of whether original file still exists try: filepath.unlink() self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filename}: {e}", "warning") # Update file timestamps to match post date if post_date: self._update_file_timestamps(filepath, post_date) self.log(f"Downloaded: {filename}", "info") downloaded_files.append(str(filepath)) image_count += 1 # Add to tracking self.downloaded_files.add(media_id) # Increment carousel index for next image if has_carousel: carousel_index += 1 # Mark as downloaded in database (or defer for later) # Use per-slide URL for carousels so each slide gets a unique url_hash record_url = f"{post_url}?img_index={carousel_index + 1}" if has_carousel else post_url if not skip_database or defer_database: self._record_download( media_id=media_id, username=profile_name, filename=filename, url=record_url, post_date=post_date, file_path=str(filepath), content_type='post', deferred=defer_database ) except Exception as e: self.log(f"Download failed: {e}", "error") import traceback self.log(f"Traceback: {traceback.format_exc()}", "debug") break else: # No download button found, try using the image src as fallback 
page_url = page.url self.log(f"No download button found on {page_url}, trying image src", "warning") # Use the image src we found earlier if img_src: try: self.log(f"Using image src as fallback: {img_src[:100]}...", "debug") import requests from urllib.parse import urlparse, unquote # Ensure full URL if not img_src.startswith('http'): img_src = f"https://imginn.com{img_src}" response = requests.get(img_src, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 'https://imginn.com/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() # Extract filename from URL url_path = urlparse(img_src).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) if original_name.startswith('post'): original_name = original_name[4:] media_id = Path(original_name).stem ext = Path(original_name).suffix or '.jpg' # Build filename with carousel index if needed if has_carousel and carousel_index > 1: filename = f"{profile_name}_{date_str}_{media_id}_{carousel_index}{ext}" else: filename = f"{profile_name}_{date_str}_{media_id}{ext}" filepath = output_dir / filename # Save file with open(filepath, 'wb') as f: f.write(response.content) self.log(f"Downloaded via image src: {filename} ({len(response.content)} bytes)", "info") downloaded_files.append(str(filepath)) # Check for duplicate hash before recording if self.unified_db: from pathlib import Path as PathLib # Check for duplicate hash (hash blacklist persists even if original deleted) file_hash = self.unified_db.get_file_hash(str(filepath)) if file_hash: existing = self.unified_db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): # Duplicate hash found - content was already downloaded (prevents redownload of deleted content) self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") # Delete the duplicate regardless of 
whether original file still exists try: filepath.unlink() self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filename}: {e}", "warning") # Update timestamps if post_date: self._update_file_timestamps(filepath, post_date) image_count += 1 self.downloaded_files.add(media_id) # Mark in database (or defer for later) # Use per-slide URL for carousels so each slide gets a unique url_hash record_url = f"{post_url}?img_index={carousel_index + 1}" if has_carousel else post_url if not skip_database or defer_database: self._record_download( media_id=media_id, username=profile_name, filename=filename, url=record_url, post_date=post_date, file_path=str(filepath), content_type='post', deferred=defer_database ) except Exception as e: self.log(f"Failed to download via image src: {e}", "error") # Don't break here - might be a temporary issue with one image if not has_carousel: break else: self.log(f"No image src available as fallback", "debug") # For carousels, we might still have more images after clicking next if not has_carousel: break # Check for next image in carousel if has_carousel and image_count < max_images: next_btn = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first if next_btn.count() > 0 and next_btn.is_visible(): # Store current image src to detect when it changes current_img_src = img_src if img_src else "" self.log(f"Clicking next for carousel image {carousel_index}", "debug") try: next_btn.click(force=True) except Exception: self.log(f"Carousel next button click timed out at image {carousel_index}, stopping carousel", "warning") break # Wait for the image to change time.sleep(2) # Give more time for slide transition and new image to load else: self.log("No more carousel images", "debug") break else: break else: # Single image - download from post page using download button download_url = None webp_fallback_url = None download_selectors = [ 
'a.btn[href*="scontent"][href*=".jpg"]', # High-res jpg 'a.btn[href*="scontent"][href*=".mp4"]', # Video 'a.btn[href*="scontent"]', # Any scontent 'a[download][href*=".jpg"]', 'a[download][href*=".mp4"]', 'a.download', 'a[href*="/post"]' ] for selector in download_selectors: btn = page.locator(selector).first if btn.count() > 0: temp_url = btn.get_attribute('href') if temp_url and temp_url != '#' and temp_url != 'javascript:void(0)': if not temp_url.startswith('http'): temp_url = f"https://imginn.com{temp_url}" # Store .webp as fallback, but keep looking for better if '.webp' in temp_url.lower(): if not webp_fallback_url: webp_fallback_url = temp_url self.log(f"Found .webp link (fallback): {temp_url[:80]}...", "debug") continue # Found non-.webp link, use it download_url = temp_url self.log(f"Found high-res download for single image: {download_url[:80]}...", "debug") break # Use .webp fallback if no high-res found if not download_url and webp_fallback_url: download_url = webp_fallback_url self.log(f"Using .webp fallback for single image", "info") if download_url: try: import requests from urllib.parse import urlparse, unquote response = requests.get(download_url, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 'https://imginn.com/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() # Extract filename and media ID from the actual file url_path = urlparse(download_url).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) if original_name.startswith('post'): original_name = original_name[4:] # The media ID is the filename without extension actual_media_id = Path(original_name).stem ext = Path(original_name).suffix or '.jpg' # Build filename filename = f"{profile_name}_{date_str}_{actual_media_id}{ext}" filepath = output_dir / filename # Save file with open(filepath, 'wb') as f: f.write(response.content) self.log(f"Downloaded (high-res): {filename} ({len(response.content)} bytes)", "info") 
downloaded_files.append(str(filepath)) # Check for duplicate hash before recording if self.unified_db: from pathlib import Path as PathLib file_hash = self.unified_db.get_file_hash(str(filepath)) if file_hash: existing = self.unified_db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): existing_path = PathLib(existing['file_path']) if existing_path.exists(): self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") try: filepath.unlink() self.log(f"Deleted duplicate: {filename}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filename}: {e}", "warning") # Update timestamps if post_date: self._update_file_timestamps(filepath, post_date) image_count = 1 # Add to tracking self.downloaded_files.add(actual_media_id) # Mark in database (or defer for later) if not skip_database or defer_database: self._record_download( media_id=actual_media_id, username=profile_name, filename=filename, url=post_url, post_date=post_date, file_path=str(filepath), content_type='post', deferred=defer_database ) except Exception as e: self.log(f"Failed to download single image: {e}", "warning") else: # No download button found - try video/image src as fallback self.log("No download button found, trying video/image src fallback", "debug") media_src = None # Try video first - multiple selectors for different page structures video_selectors = [ 'video source[src]', 'video[src]', 'video source[type*="mp4"]', '.video-container video', '.post-video video', 'div[class*="video"] video', 'video' ] for v_selector in video_selectors: video_elem = page.locator(v_selector).first if video_elem.count() > 0: # Try src attribute first, then check source child media_src = video_elem.get_attribute('src') if not media_src: source_elem = video_elem.locator('source').first if source_elem.count() > 0: media_src = 
source_elem.get_attribute('src') if media_src and media_src != '#': self.log(f"Found video src via '{v_selector}': {media_src[:80]}...", "debug") break # If no video found, wait a bit and try again (videos may lazy-load) if not media_src: time.sleep(2) for v_selector in video_selectors: video_elem = page.locator(v_selector).first if video_elem.count() > 0: media_src = video_elem.get_attribute('src') if not media_src: source_elem = video_elem.locator('source').first if source_elem.count() > 0: media_src = source_elem.get_attribute('src') if media_src and media_src != '#': self.log(f"Found video src after wait via '{v_selector}': {media_src[:80]}...", "debug") break # Try image if no video if not media_src: img_elem = page.locator('img[src*="scontent"]:not([src*="profile"]), img[src*="post"]').first if img_elem.count() > 0: media_src = img_elem.get_attribute('src') if media_src and 'lazy.jpg' not in media_src: self.log(f"Found image src: {media_src[:80]}...", "debug") else: media_src = None if media_src: try: import requests from urllib.parse import urlparse, unquote if not media_src.startswith('http'): media_src = f"https://imginn.com{media_src}" response = requests.get(media_src, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 'https://imginn.com/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() # Extract filename from URL url_path = urlparse(media_src).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) if original_name.startswith('post'): original_name = original_name[4:] actual_media_id = Path(original_name).stem ext = Path(original_name).suffix or '.mp4' filename = f"{profile_name}_{date_str}_{actual_media_id}{ext}" filepath = output_dir / filename with open(filepath, 'wb') as f: f.write(response.content) self.log(f"Downloaded (fallback): {filename} ({len(response.content)} bytes)", "info") downloaded_files.append(str(filepath)) if post_date: self._update_file_timestamps(filepath, post_date) image_count = 
1 self.downloaded_files.add(actual_media_id) if not skip_database or defer_database: self._record_download( media_id=actual_media_id, username=profile_name, filename=filename, url=post_url, post_date=post_date, file_path=str(filepath), content_type='post', deferred=defer_database ) except Exception as e: self.log(f"Failed to download via fallback: {e}", "error") else: self.log("No download button or media src found for single post", "warning") # Debug: capture screenshot and page content when download fails try: debug_dir = Path("debug") debug_dir.mkdir(exist_ok=True) timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') screenshot_path = debug_dir / f"no_media_{media_id}_{timestamp}.png" page.screenshot(path=str(screenshot_path)) self.log(f"Debug screenshot saved: {screenshot_path}", "debug") # Also log page title and some content title = page.title() self.log(f"Page title: {title}", "debug") # Check if this is a Cloudflare block - don't mark as processed if so if self._is_cloudflare_challenge(page): self.log(f"Cloudflare block detected - NOT marking {media_id} as processed (will retry later)", "warning") # Skip to next post without marking as processed try: page.goto(f"https://imginn.com/{username}/?ref=index") time.sleep(3) except Exception: pass continue except Exception as e: self.log(f"Failed to capture debug screenshot: {e}", "debug") # Mark post as processed in database even if no downloads # (might be already downloaded or failed - but NOT if Cloudflare blocked) if image_count == 0: # Still mark the post URL as processed to avoid re-checking self._record_download( media_id=media_id, username=profile_name, filename=f"{media_id}_skipped", url=post_url, post_date=post_date, content_type='post', metadata={'marker': True, 'reason': 'skipped'} ) # Go back to profile self._safe_go_back(page, username) # If we just bypassed Cloudflare, wait longer to let session stabilize if cloudflare_bypassed: cooldown = random.uniform(15, 25) self.log(f"Post-bypass cooldown: 
waiting {cooldown:.1f}s to stabilize session", "info") time.sleep(cooldown) else: time.sleep(random.uniform(1, 3)) # Check if back on profile if username not in page.url: page.goto(f"https://imginn.com/{username}/?ref=index") time.sleep(3) except Exception as e: self.log(f"Error processing post: {e}", "error") try: page.goto(f"https://imginn.com/{username}/?ref=index") time.sleep(3) except Exception: pass self.log(f"Downloaded {len(downloaded_files)} files", "info") except Exception as e: self.log(f"Error: {e}", "error") # Don't close browser here - reuse it for next profile # Call _stop_browser() explicitly when done with all profiles return downloaded_files def download_tagged(self, username: str, days_back: int = 14, max_posts: int = 50, output_dir: Path = None, phrase_config: dict = None, defer_database: bool = False): """Download tagged posts from a user Args: username: Instagram username days_back: How many days back to search max_posts: Maximum posts to check output_dir: Output directory phrase_config: Optional phrase search configuration defer_database: If True, defer database recording to pending_downloads list for later recording after file move is complete """ # Rate limiting to avoid Cloudflare blocks self._enforce_rate_limit("tagged") profile_name = username.lower() if output_dir is None: output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}") output_dir.mkdir(parents=True, exist_ok=True) # Check site status before doing anything else self.log("Checking ImgInn site status...", "debug") site_status, error_msg = self.cf_handler.check_site_status("https://imginn.com/", timeout=10) if self.cf_handler.should_skip_download(site_status): self.log(f"Skipping tagged download for @{profile_name} - ImgInn is unavailable: {error_msg}", "warning") return [] elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE: self.log("Cloudflare challenge detected, will attempt bypass during download", "info") # Scan existing files 
self._scan_existing_files(output_dir, profile_name) # Get processed posts from database processed_posts = self._get_processed_posts(profile_name) self.log(f"Loaded {len(processed_posts)} processed tagged posts for {profile_name} from database", "info") downloaded_files = [] cutoff_date = datetime.now() - timedelta(days=days_back) # Start or reuse browser self._start_browser() page = self.page try: # Navigate to tagged page directly self.log(f"Navigating to @{username} tagged posts page", "info") page.goto(f"https://imginn.com/tagged/{username}/?ref=index", wait_until='domcontentloaded') # CRITICAL: Wait for Cloudflare background JS challenges import random wait_time = 5 + random.uniform(0, 2) self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug") time.sleep(wait_time) # Wait for page to load if not self.wait_for_cloudflare(page): self._page_load_failures += 1 level = "error" if self._page_load_failures >= self._page_load_failure_threshold else "warning" self.log(f"Page didn't load properly ({self._page_load_failures}x this session)", level) return [] # Save cookies self.save_cookies(self.context) # Wait for JavaScript to load posts (ImgInn loads posts dynamically on tagged page) self.log("Waiting for tagged posts to load via JavaScript...", "info") try: # Wait for post links to appear (up to 10 seconds) page.wait_for_selector('a[href*="/p/"]', timeout=10000) self.log("Tagged posts loaded successfully", "info") except Exception: # Timeout - posts might not exist, or page structure changed self.log("Timeout waiting for tagged posts to appear", "warning") time.sleep(2) # Give it a bit more time anyway # Scroll to load more posts (ImgInn uses infinite scroll on tagged page) self.log("Scrolling to load more tagged posts...", "info") previous_count = 0 scroll_attempts = 0 max_scroll_attempts = 10 # Scroll up to 10 times to load posts while scroll_attempts < max_scroll_attempts: # Get current count of post links current_count = 
page.locator('a[href*="/p/"]').count() if current_count == previous_count and scroll_attempts > 0: # No new posts loaded after scroll, we've reached the end self.log(f"No more tagged posts to load (total: {current_count})", "debug") break if current_count >= max_posts: # We have enough posts self.log(f"Loaded {current_count} tagged posts (reached max_posts limit)", "debug") break previous_count = current_count # Scroll to bottom of page page.evaluate("window.scrollTo(0, document.body.scrollHeight)") time.sleep(1.5) # Wait for new posts to load scroll_attempts += 1 self.log(f"Scroll {scroll_attempts}: Found {current_count} tagged posts", "debug") # Find posts on tagged page self.log("Finding tagged posts...", "info") # Debug: Check what's actually on the page page_content = page.content() if 'no posts' in page_content.lower() or 'page not found' in page_content.lower(): self.log("Page shows 'no posts' or 'not found'", "warning") post_links = page.locator('a[href*="/p/"]').all() self.log(f"Found {len(post_links)} tagged posts", "info") if not post_links: # Debug: Save screenshot to see what's wrong try: screenshot_path = Path(f"/tmp/imginn_no_tagged_{username}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png") page.screenshot(path=str(screenshot_path)) self.log(f"No tagged posts found - screenshot saved to {screenshot_path}", "warning") except Exception: pass self.log("No tagged posts found", "warning") return [] # Extract all post URLs upfront to avoid stale element issues # (elements become stale after page.go_back()) post_urls = [] for idx, post_link in enumerate(post_links[:max_posts]): try: href = post_link.get_attribute('href', timeout=5000) if href: # Ensure full URL if not href.startswith('http'): href = f"https://imginn.com{href}" post_urls.append(href) except Exception as e: self.log(f"Tagged {idx+1}: Failed to get URL: {str(e)[:50]}", "debug") continue self.log(f"Processing {len(post_urls)} tagged posts (max {max_posts})", "info") # Track consecutive old 
posts to handle pinned posts consecutive_old_posts = 0 max_consecutive_old_posts = 5 # Allow up to 5 old posts (pinned) before stopping # Set initial progress so dashboard shows 0/N immediately self.activity_manager.update_status( "Downloading tagged", progress_current=0, progress_total=len(post_urls) ) for i, post_url in enumerate(post_urls): # Update progress at start of each iteration (fires even on skips) self.activity_manager.update_status( "Downloading tagged", progress_current=i + 1, progress_total=len(post_urls) ) try: # Extract media ID from URL media_id = self._extract_media_id_from_url(post_url) if not media_id: self.log(f"Could not extract media ID from {post_url}", "warning") continue self.log(f"[{i+1}/{len(post_urls)}] Checking tagged post {media_id}", "debug") # Check if already processed (either downloaded or checked for phrases/age) if media_id in processed_posts or post_url in processed_posts: self.log(f"Post {media_id} already processed, skipping", "debug") continue # Rate limiting between post downloads to avoid Cloudflare blocks if i > 0: post_delay = random.uniform(3, 8) self.log(f"Rate limit: waiting {post_delay:.1f}s before tagged post {i+1}", "debug") time.sleep(post_delay) # For tagged posts, ALWAYS navigate to post page for high-res download # (Never use profile download which gives low-res .webp) page.goto(post_url, wait_until='domcontentloaded') # Wait for page to load time.sleep(2) # Wait for navigation to complete try: page.wait_for_load_state('networkidle', timeout=5000) except Exception: # Continue even if network isn't idle - page might still be usable self.log("Network didn't idle, but continuing", "debug") # Check if on post page if "/p/" not in page.url: self.log(f"Not a downloadable post (URL: {page.url})", "warning") self._safe_go_back(page, username, tagged=True) continue # IMPORTANT: Wait for post page content to fully render # This ensures download buttons are from the POST PAGE, not tagged page preview try: # Wait for the 
post container to be visible page.wait_for_selector('div.main-content, div.post, div.content, div.single-post', timeout=3000) time.sleep(1) # Additional wait for download buttons to render except Exception: self.log("Post container not found, checking for Cloudflare...", "debug") # Check for Cloudflare challenge and handle it cloudflare_bypassed = False if self._is_cloudflare_challenge(page): self.log(f"Cloudflare challenge detected on tagged post {media_id}", "warning") if not self._handle_cloudflare_on_post(page, post_url): # Cloudflare bypass failed - skip this post WITHOUT marking as processed # so it can be retried on next run self.log(f"Skipping tagged post {media_id} due to Cloudflare block (will retry later)", "warning") try: page.goto(f"https://imginn.com/tagged/{username}/?ref=index") time.sleep(3) except Exception: pass continue cloudflare_bypassed = True self.log(f"Navigated to tagged post page: {page.url}", "debug") self._dismiss_consent_dialog(page) # Extract the actual poster's username (not the tagged user) # On tagged pages, posts are FROM other users who tagged this user poster_username = profile_name # Default to tagged user try: username_elem = page.locator('div.username a').first if username_elem.count() > 0: username_href = username_elem.get_attribute('href') if username_href: # Extract username from href like "/evalongoria.of/" -> "evalongoria.of" poster_username = username_href.strip('/').lower() self.log(f"Poster username: @{poster_username}", "debug") except Exception as e: self.log(f"Could not extract poster username, using default: {e}", "debug") # Extract post date - ALWAYS extract for proper file naming post_date = self._extract_post_date(page) # Use post date for filename, or current date if post_date: date_str = post_date.strftime('%Y%m%d_%H%M%S') self.log(f"Original post date: {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug") else: date_str = datetime.now().strftime('%Y%m%d_%H%M%S') self.log(f"No original date found, using 
current time", "debug") # Check date filter if post_date and post_date < cutoff_date: consecutive_old_posts += 1 self.log(f"Tagged post too old ({post_date.strftime('%Y-%m-%d')}), skipping (consecutive old: {consecutive_old_posts}/{max_consecutive_old_posts})", "info") # Clean up temp file if exists if 'temp_download_path' in locals() and temp_download_path and temp_download_path.exists(): temp_download_path.unlink() self.log(f"Deleted temp file for old post", "debug") # Mark this old post as checked in database - use poster_username for tagged content if phrase_config and media_id: self._record_download( media_id=media_id, username=poster_username, filename=f"_old_post_{media_id}", url=post_url, post_date=post_date, content_type='tagged', metadata={'marker': True, 'reason': 'old_post'} ) self._safe_go_back(page, username, tagged=True) # Stop only after 5 consecutive old posts (handles pinned posts at top) if consecutive_old_posts >= max_consecutive_old_posts: self.log(f"Found {consecutive_old_posts} consecutive old tagged posts - stopping", "info") break else: continue # Skip this old post but keep checking (might be pinned) # Reset consecutive old posts counter - we found a post within date range consecutive_old_posts = 0 # Check for phrase matching if configured if phrase_config and phrase_config.get('enabled'): if not self._check_post_phrases(page, phrase_config): self.log(f"Tagged post does not match phrase criteria, skipping download", "info") # Clean up temp file if exists if 'temp_download_path' in locals() and temp_download_path and temp_download_path.exists(): temp_download_path.unlink() self.log(f"Deleted temp file for non-matching post", "debug") # Mark this post as checked (but not downloaded) in database - use poster_username if media_id: self._record_download( media_id=media_id, username=poster_username, filename=f"_phrase_checked_{media_id}", url=post_url, post_date=post_date, content_type='tagged', metadata={'marker': True, 'reason': 
'phrase_checked'} ) self._safe_go_back(page, username, tagged=True) continue else: self.log(f"Tagged post matches phrase criteria, using high-res download", "info") # Check for carousel carousel_next = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first has_carousel = carousel_next.count() > 0 if has_carousel: self.log(f"Carousel detected in tagged post - will download all carousel images", "info") self._dismiss_consent_dialog(page) # CRITICAL: Wait for POST PAGE carousel download buttons to be ready # This prevents downloading from the tagged page preview try: # Wait for download buttons with POST PAGE URLs (have "scontent" or "post" in them) page.wait_for_selector('a.btn[href*="scontent"], a[download], a.download', timeout=3000) time.sleep(1.5) # Additional wait for all carousel images to load self.log("Carousel download buttons ready on post page", "debug") except Exception: self.log("Download buttons not found, but continuing", "debug") else: self.log("Single image tagged post", "debug") # Handle downloads - always use download buttons from post page image_count = 0 max_images = 10 # Download images (carousel or single) if has_carousel: all_slides = page.locator('.swiper-slide').all() self.log(f"Found {len(all_slides)} carousel slides in tagged post", "debug") # Download each slide's image for slide_index in range(min(len(all_slides), max_images)): self.log(f"Processing carousel slide {slide_index + 1}/{len(all_slides)}", "debug") # Get the current slide element to scope our searches current_slide = all_slides[slide_index] # Click next to navigate to this slide (except for first one) if slide_index > 0: next_btn = page.locator('div[role="button"][aria-label*="Next"], .swiper-button-next').first if next_btn.count() > 0 and next_btn.is_visible(): try: next_btn.click(force=True) except Exception: self.log(f"Carousel next button click timed out at slide {slide_index + 1}, stopping carousel", "warning") break time.sleep(2) # Wait for 
slide transition and image to load # Look for download button - prefer high-res, fallback to .webp # IMPORTANT: Search within CURRENT SLIDE only, not entire page download_url = None webp_fallback_url = None slide_downloaded = False # Track if this specific slide was downloaded download_selectors = [ 'a.btn[href*="scontent"][href*=".jpg"]', # High-res jpg 'a.btn[href*="scontent"][href*=".mp4"]', # Video 'a.btn[href*="scontent"]', # Any scontent 'a[download][href*=".jpg"]', 'a[download][href*=".mp4"]', 'a.download', 'a[href*="/post"]' ] # Search for download buttons - first try within slide, then try page-level # Imginn often has download buttons outside the .swiper-slide elements search_contexts = [current_slide, page] for search_context in search_contexts: if download_url: # Already found, skip other contexts break for selector in download_selectors: btn = search_context.locator(selector).first if btn.count() > 0: temp_url = btn.get_attribute('href') if temp_url and temp_url != '#' and temp_url != 'javascript:void(0)': if not temp_url.startswith('http'): temp_url = f"https://imginn.com{temp_url}" # Store .webp as fallback, but keep looking for better if '.webp' in temp_url.lower(): if not webp_fallback_url: webp_fallback_url = temp_url self.log(f"Found .webp link (fallback): {temp_url[:80]}...", "debug") continue # Found non-.webp link, use it download_url = temp_url self.log(f"Found high-res download for carousel slide {slide_index + 1}: {download_url[:80]}...", "debug") break # Use .webp fallback if no high-res found used_webp_fallback = False if not download_url and webp_fallback_url: download_url = webp_fallback_url used_webp_fallback = True self.log(f"Using .webp fallback for carousel slide {slide_index + 1}", "info") # If we found a download button, use it for high-res if download_url: try: import requests from urllib.parse import urlparse, unquote response = requests.get(download_url, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 
'https://imginn.com/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() # Extract filename and media ID from the actual file url_path = urlparse(download_url).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) if original_name.startswith('post'): original_name = original_name[4:] # The media ID is the filename without extension actual_media_id = Path(original_name).stem ext = Path(original_name).suffix or '.jpg' # Build filename for carousel image using actual media ID (use poster's username) filename = f"{poster_username}_{date_str}_{actual_media_id}_{slide_index + 1}{ext}" filepath = output_dir / filename # Save file with open(filepath, 'wb') as f: f.write(response.content) # Log with appropriate quality label quality_label = "fallback" if used_webp_fallback else "high-res" self.log(f"Downloaded tagged ({quality_label}): {filename} from @{poster_username} ({len(response.content)} bytes)", "info") downloaded_files.append(str(filepath)) # Check for duplicate hash before recording if self.unified_db: from pathlib import Path as PathLib # Check for duplicate hash (hash blacklist persists even if original deleted) file_hash = self.unified_db.get_file_hash(str(filepath)) if file_hash: existing = self.unified_db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): # Duplicate hash found - content was already downloaded (prevents redownload of deleted content) self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") # Delete the duplicate regardless of whether original file still exists try: filepath.unlink() self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filename}: {e}", "warning") # Update timestamps if post_date: self._update_file_timestamps(filepath, post_date) 
image_count += 1 slide_downloaded = True # Mark this slide as successfully downloaded # Add to tracking self.downloaded_files.add(actual_media_id) # Mark in database (or defer for later) - use poster_username for tagged content unique_url = f"{post_url}#{filename}" self._record_download( media_id=actual_media_id, username=poster_username, filename=filename, url=unique_url, post_date=post_date, file_path=str(filepath), content_type='tagged', deferred=defer_database ) except Exception as e: self.log(f"Failed to download carousel image {slide_index + 1}: {e}", "error") # Don't continue - try fallback method below # Fallback: Download from current slide's img/video src if no download button worked if not slide_downloaded: self.log(f"Trying fallback: downloading from slide {slide_index + 1} media src", "debug") # current_slide already defined at top of loop # Try img first, then video media_src = None slide_img = current_slide.locator('img').first if slide_img.count() > 0: media_src = slide_img.get_attribute('src') else: # Check for video tag slide_video = current_slide.locator('video source, video').first if slide_video.count() > 0: media_src = slide_video.get_attribute('src') self.log(f"Found video for slide {slide_index + 1}", "debug") if media_src: # Skip lazy placeholders if 'lazy.jpg' not in media_src and '483011604' not in media_src: try: import requests from urllib.parse import urlparse, unquote if not media_src.startswith('http'): media_src = f"https:{media_src}" if media_src.startswith('//') else f"https://imginn.com{media_src}" response = requests.get(media_src, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 'https://imginn.com/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() # Extract filename url_path = urlparse(media_src).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) actual_media_id = Path(original_name).stem ext = Path(original_name).suffix or '.jpg' # Build filename filename = 
f"{poster_username}_{date_str}_{actual_media_id}_{slide_index + 1}{ext}" filepath = output_dir / filename # Save file with open(filepath, 'wb') as f: f.write(response.content) self.log(f"Downloaded tagged (fallback): {filename} from @{poster_username} ({len(response.content)} bytes)", "info") downloaded_files.append(str(filepath)) # Check for duplicate hash before recording if self.unified_db: from pathlib import Path as PathLib file_hash = self.unified_db.get_file_hash(str(filepath)) if file_hash: existing = self.unified_db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): existing_path = PathLib(existing['file_path']) if existing_path.exists(): self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") try: filepath.unlink() self.log(f"Deleted duplicate: {filename}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filename}: {e}", "warning") # Update timestamps if post_date: self._update_file_timestamps(filepath, post_date) image_count += 1 # Add to tracking self.downloaded_files.add(actual_media_id) # Mark in database (or defer for later) - use poster_username for tagged content unique_url = f"{post_url}#{filename}" self._record_download( media_id=actual_media_id, username=poster_username, filename=filename, url=unique_url, post_date=post_date, file_path=str(filepath), content_type='tagged', deferred=defer_database ) except Exception as e: self.log(f"Failed to download from media src for slide {slide_index + 1}: {e}", "error") else: self.log(f"No media (img/video) found for carousel slide {slide_index + 1}", "warning") else: # Single image - download from post page using download button download_url = None webp_fallback_url = None download_selectors = [ 'a.btn[href*="scontent"][href*=".jpg"]', # High-res jpg 'a.btn[href*="scontent"][href*=".mp4"]', # Video 
'a.btn[href*="scontent"]', # Any scontent 'a[download][href*=".jpg"]', 'a[download][href*=".mp4"]', 'a.download', 'a[href*="/post"]' ] for selector in download_selectors: btn = page.locator(selector).first if btn.count() > 0: temp_url = btn.get_attribute('href') if temp_url and temp_url != '#' and temp_url != 'javascript:void(0)': if not temp_url.startswith('http'): temp_url = f"https://imginn.com{temp_url}" # Store .webp as fallback, but keep looking for better if '.webp' in temp_url.lower(): if not webp_fallback_url: webp_fallback_url = temp_url self.log(f"Found .webp link (fallback): {temp_url[:80]}...", "debug") continue # Found non-.webp link, use it download_url = temp_url self.log(f"Found high-res download for single image: {download_url[:80]}...", "debug") break # Use .webp fallback if no high-res found if not download_url and webp_fallback_url: download_url = webp_fallback_url self.log(f"Using .webp fallback for single image", "info") if download_url: try: import requests from urllib.parse import urlparse, unquote response = requests.get(download_url, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 'https://imginn.com/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() # Extract filename and media ID from the actual file url_path = urlparse(download_url).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) if original_name.startswith('post'): original_name = original_name[4:] # The media ID is the filename without extension actual_media_id = Path(original_name).stem ext = Path(original_name).suffix or '.jpg' # Build filename using poster's username filename = f"{poster_username}_{date_str}_{actual_media_id}{ext}" filepath = output_dir / filename # Save file with open(filepath, 'wb') as f: f.write(response.content) self.log(f"Downloaded tagged (high-res): {filename} from @{poster_username} ({len(response.content)} bytes)", "info") downloaded_files.append(str(filepath)) # Check for duplicate hash before 
recording if self.unified_db: from pathlib import Path as PathLib file_hash = self.unified_db.get_file_hash(str(filepath)) if file_hash: existing = self.unified_db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): existing_path = PathLib(existing['file_path']) if existing_path.exists(): self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") try: filepath.unlink() self.log(f"Deleted duplicate: {filename}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filename}: {e}", "warning") # Update timestamps if post_date: self._update_file_timestamps(filepath, post_date) image_count = 1 # Add to tracking self.downloaded_files.add(actual_media_id) # Mark in database (or defer for later) - use poster_username for tagged content self._record_download( media_id=actual_media_id, username=poster_username, filename=filename, url=post_url, post_date=post_date, file_path=str(filepath), content_type='tagged', deferred=defer_database ) except Exception as e: self.log(f"Failed to download single image: {e}", "warning") else: # No download button found - try video/image src as fallback self.log("No download button found, trying video/image src fallback", "debug") media_src = None # Try video first - multiple selectors for different page structures video_selectors = [ 'video source[src]', 'video[src]', 'video source[type*="mp4"]', '.video-container video', '.post-video video', 'div[class*="video"] video', 'video' ] for v_selector in video_selectors: video_elem = page.locator(v_selector).first if video_elem.count() > 0: # Try src attribute first, then check source child media_src = video_elem.get_attribute('src') if not media_src: source_elem = video_elem.locator('source').first if source_elem.count() > 0: media_src = source_elem.get_attribute('src') if media_src and media_src != '#': 
self.log(f"Found video src via '{v_selector}': {media_src[:80]}...", "debug") break # If no video found, wait a bit and try again (videos may lazy-load) if not media_src: time.sleep(2) for v_selector in video_selectors: video_elem = page.locator(v_selector).first if video_elem.count() > 0: media_src = video_elem.get_attribute('src') if not media_src: source_elem = video_elem.locator('source').first if source_elem.count() > 0: media_src = source_elem.get_attribute('src') if media_src and media_src != '#': self.log(f"Found video src after wait via '{v_selector}': {media_src[:80]}...", "debug") break # Try image if no video if not media_src: img_elem = page.locator('img[src*="scontent"]:not([src*="profile"]), img[src*="post"]').first if img_elem.count() > 0: media_src = img_elem.get_attribute('src') if media_src and 'lazy.jpg' not in media_src: self.log(f"Found image src: {media_src[:80]}...", "debug") else: media_src = None if media_src: try: import requests from urllib.parse import urlparse, unquote if not media_src.startswith('http'): media_src = f"https://imginn.com{media_src}" response = requests.get(media_src, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 'https://imginn.com/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() url_path = urlparse(media_src).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) if original_name.startswith('post'): original_name = original_name[4:] actual_media_id = Path(original_name).stem ext = Path(original_name).suffix or '.mp4' filename = f"{poster_username}_{date_str}_{actual_media_id}{ext}" filepath = output_dir / filename with open(filepath, 'wb') as f: f.write(response.content) self.log(f"Downloaded (fallback): {filename} ({len(response.content)} bytes)", "info") downloaded_files.append(str(filepath)) if post_date: self._update_file_timestamps(filepath, post_date) image_count = 1 self.downloaded_files.add(actual_media_id) self._record_download( 
media_id=actual_media_id, username=poster_username, filename=filename, url=post_url, post_date=post_date, file_path=str(filepath), content_type='tagged', deferred=defer_database ) except Exception as e: self.log(f"Failed to download via fallback: {e}", "error") else: self.log("No download button or media src found for single post", "warning") # Debug: capture screenshot and page content when download fails try: debug_dir = Path("debug") debug_dir.mkdir(exist_ok=True) timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') screenshot_path = debug_dir / f"no_media_tagged_{media_id}_{timestamp}.png" page.screenshot(path=str(screenshot_path)) self.log(f"Debug screenshot saved: {screenshot_path}", "debug") # Also log page title title = page.title() self.log(f"Page title: {title}", "debug") # Check if this is a Cloudflare block - don't mark as processed if so if self._is_cloudflare_challenge(page): self.log(f"Cloudflare block detected - NOT marking tagged post {media_id} as processed (will retry later)", "warning") # Skip to next post without marking as processed try: page.goto(f"https://imginn.com/tagged/{username}/?ref=index") time.sleep(3) except Exception: pass continue except Exception as e: self.log(f"Failed to capture debug screenshot: {e}", "debug") # Navigate back to tagged page if image_count > 0: self.log(f"Successfully downloaded {image_count} image(s) from tagged post {media_id}", "info") self._safe_go_back(page, username, tagged=True) # If we just bypassed Cloudflare, wait longer to let session stabilize if cloudflare_bypassed: cooldown = random.uniform(15, 25) self.log(f"Post-bypass cooldown: waiting {cooldown:.1f}s to stabilize session", "info") time.sleep(cooldown) else: time.sleep(1) except KeyboardInterrupt: self.log("Download interrupted by user", "warning") break except Exception as e: self.log(f"Error processing tagged post: {e}", "error") self._safe_go_back(page, username, tagged=True) self.log(f"Downloaded {len(downloaded_files)} tagged files", 
"info") except Exception as e: self.log(f"Error: {e}", "error") # Don't close browser here - reuse it for next profile return downloaded_files def download_stories(self, username: str, days_back: int = 1, max_stories: int = 50, output_dir: Path = None, skip_database: bool = False, defer_database: bool = False): """Download stories from a user with FastDL naming Args: username: Instagram username days_back: How many days back to search (stories expire after 24h) max_stories: Maximum stories to download output_dir: Output directory skip_database: If True, don't record downloads in database (for temporary processing) defer_database: If True, defer database recording to pending_downloads list for later recording after file move is complete """ profile_name = username.lower() if output_dir is None: output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}") output_dir.mkdir(parents=True, exist_ok=True) # Check site status before doing anything else self.log("Checking ImgInn site status...", "debug") site_status, error_msg = self.cf_handler.check_site_status("https://imginn.com/", timeout=10) if self.cf_handler.should_skip_download(site_status): self.log(f"Skipping stories download for @{profile_name} - ImgInn is unavailable: {error_msg}", "warning") return [] elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE: self.log("Cloudflare challenge detected, will attempt bypass during download", "info") # Scan existing files self._scan_existing_files(output_dir, profile_name) # Get processed stories from database processed_stories = self._get_processed_posts(profile_name) self.log(f"Loaded {len(processed_stories)} processed stories for {profile_name} from database", "info") downloaded_files = [] cutoff_date = datetime.now() - timedelta(days=days_back) # Start or reuse browser self._start_browser() page = self.page try: # Navigate to stories page self.log(f"Navigating to @{username} stories page", "info") page.goto(f"https://imginn.com/stories/{username}/?ref=index", 
wait_until='domcontentloaded') # CRITICAL: Wait for Cloudflare background JS challenges import random wait_time = 5 + random.uniform(0, 2) self.log(f"Waiting {wait_time:.1f}s for Cloudflare background validation...", "debug") time.sleep(wait_time) # Wait for page to load if not self.wait_for_cloudflare(page): self.log("Stories page didn't load properly", "error") return [] # Save cookies self.save_cookies(self.context) # Wait for stories container to load self.log("Waiting for stories to load...", "info") try: page.wait_for_selector('.swiper-container.reels', timeout=10000) self.log("Stories container loaded", "info") except Exception: self.log("No stories found - may have expired or page structure changed", "warning") return [] # Find the Stories reel (first li.reel with data-uid and title "Stories") self.log("Looking for Stories reel...", "info") stories_reel = None reels = page.locator('li.reel[data-uid]').all() for reel in reels: try: # Check if this is the "Stories" reel title = reel.locator('.title').first.text_content() if title and title.strip().lower() == "stories": stories_reel = reel self.log(f"Found Stories reel", "info") break except Exception: continue if not stories_reel: self.log("No active Stories found for this user", "warning") return [] # Click the Stories reel to open viewer self.log("Opening Stories viewer...", "info") stories_reel.click() time.sleep(2) # Wait for viewer to open # Find all download buttons in the story viewer self.log("Finding story download links...", "info") download_links = page.locator('div.action a.download').all() if not download_links: self.log("No story download links found", "warning") return [] self.log(f"Found {len(download_links)} stories", "info") # Set initial progress so dashboard shows 0/N immediately stories_to_download = min(len(download_links), max_stories) self.activity_manager.update_status( "Downloading stories", progress_current=0, progress_total=stories_to_download ) # Download each story story_index = 
1 for i, download_link in enumerate(download_links[:max_stories]): # Update progress at start of each iteration (fires even on skips) self.activity_manager.update_status( "Downloading stories", progress_current=i + 1, progress_total=stories_to_download ) try: # Get download URL download_url = download_link.get_attribute('href') if not download_url or download_url == '#': self.log(f"Story {story_index}: Invalid download URL", "warning") continue self.log(f"Story {story_index}: {download_url[:80]}...", "debug") # Extract media ID from URL or generate unique ID from urllib.parse import urlparse, unquote url_path = urlparse(download_url).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) media_id_full = Path(original_name).stem # Full filename stem for unique naming ext = Path(original_name).suffix or '.jpg' # Extract real Instagram media ID (18-digit number) for duplicate checking media_id_for_tracking = extract_instagram_media_id(media_id_full) self.log(f"Story {story_index}: Full ID: {media_id_full[:40]}..., Tracking ID: {media_id_for_tracking}", "debug") # Check if already downloaded using the normalized media ID if media_id_for_tracking in self.downloaded_files or media_id_for_tracking in processed_stories: self.log(f"Story {story_index}: Already downloaded (tracking ID: {media_id_for_tracking}), skipping", "debug") story_index += 1 continue # Also check with full ID for backwards compatibility if media_id_full in self.downloaded_files or media_id_full in processed_stories: self.log(f"Story {story_index}: Already downloaded (full ID: {media_id_full[:30]}...), skipping", "debug") story_index += 1 continue # Use current date for stories (they expire after 24h) story_date = datetime.now() date_str = story_date.strftime('%Y%m%d_%H%M%S') # Build filename: {profile}_{date}_{media_id}_story{index}{ext} # Use full media ID in filename for uniqueness filename = f"{profile_name}_{date_str}_{media_id_full}_story{story_index}{ext}" filepath = output_dir / 
filename # Download the story try: import requests response = requests.get(download_url, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': 'https://imginn.com/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() # Save file with open(filepath, 'wb') as f: f.write(response.content) self.log(f"Downloaded story: {filename} ({len(response.content)} bytes)", "info") downloaded_files.append(str(filepath)) # Check for duplicate hash before recording if self.unified_db: from pathlib import Path as PathLib file_hash = self.unified_db.get_file_hash(str(filepath)) if file_hash: existing = self.unified_db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): existing_path = PathLib(existing['file_path']) if existing_path.exists(): self.log(f"⚠ Duplicate file detected: {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") try: filepath.unlink() self.log(f"Deleted duplicate: {filename}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filename}: {e}", "warning") # Update timestamps self._update_file_timestamps(filepath, story_date) # Add both tracking ID and full ID to tracking set for comprehensive duplicate prevention self.downloaded_files.add(media_id_for_tracking) self.downloaded_files.add(media_id_full) # Mark in database with media_id in metadata (or defer for later) # Use the normalized media ID for database tracking to prevent future duplicates if not skip_database or defer_database: self._record_download( media_id=media_id_for_tracking, username=profile_name, filename=filename, url=download_url, post_date=story_date, file_path=str(filepath), content_type='stories', metadata={'media_id_full': media_id_full}, deferred=defer_database ) story_index += 1 except Exception as e: self.log(f"Failed to download story {story_index}: {e}", "error") story_index += 1 continue except Exception as 
def main():
    """Test the downloader with FastDL naming.

    With an ``imginn.com/p/...`` URL as the last command-line argument, only
    that post is downloaded; otherwise the last two weeks of posts are
    fetched. A summary of the downloaded files and of the profile's download
    folder is printed afterwards.
    """
    import sys

    username = "evalongoria"
    separator = "=" * 60

    print(separator)
    print("ImgInn Downloader - FastDL Compatible Naming")
    print(separator)
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(separator)

    # ImgInnDownloader.__init__ has no api_key parameter; passing one raised
    # TypeError before any download could start.
    downloader = ImgInnDownloader(
        headless=False  # Use with xvfb
    )

    # Check for specific post URL in arguments
    if len(sys.argv) > 1 and 'imginn.com/p/' in sys.argv[-1]:
        # Download specific post without date filter
        files = downloader.download_posts(
            username=username,
            days_back=365,  # Use large value to bypass date filter
            max_posts=5,
            specific_post_url=sys.argv[-1]
        )
    else:
        # Download posts from last 2 weeks
        files = downloader.download_posts(
            username=username,
            days_back=14,
            max_posts=50
        )

    print("\n" + separator)
    print("RESULTS")
    print(separator)

    if files:
        print(f"Successfully downloaded {len(files)} files!")
        print("\n📁 Downloaded files (FastDL naming format):")
        for f in files:
            file_path = Path(f)
            name = file_path.name
            # Show the naming format: {profile}_{YYYYMMDD}_{HHMMSS}_{media_id}{ext}
            parts = name.split('_', 3)
            if len(parts) >= 4:
                print(f"  - {name}")
                print(f"    Profile: {parts[0]}")
                print(f"    Date: {parts[1]}_{parts[2]}")
                print(f"    Media ID: {parts[3].split('.')[0]}")
                try:
                    size = file_path.stat().st_size / 1024
                except OSError:
                    # The file may have been moved or removed after download
                    print("    Size: n/a")
                else:
                    print(f"    Size: {size:.1f} KB")
    else:
        print("No files downloaded")

    # Check total in folder
    download_dir = Path(f"/opt/media-downloader/downloads/{username}")
    if download_dir.exists():
        all_files = list(download_dir.glob("*"))
        total_size = sum(f.stat().st_size for f in all_files) / 1024
        print(f"\n📊 Total in folder: {len(all_files)} files ({total_size:.1f} KB)")


if __name__ == "__main__":
    main()