#!/usr/bin/env python3 """ ImgInn API-based downloader module. Uses ImgInn's JSON API endpoints instead of DOM scraping for reliable, structured Instagram content downloading. API Endpoints: /api/posts/ — Paginated posts with full carousel support via `srcs` array /api/story/ — Stories with direct CDN URLs /api/tagged — Tagged posts (minimal data, supplemented via post pages) Advantages over DOM scraping: - Carousel items grouped by post (srcs array) - Exact UNIX timestamps for post dates - Reliable cursor-based pagination - No Playwright dependency (uses curl_cffi for TLS fingerprint matching) - Pinned post detection (isPind flag) Uses curl_cffi to impersonate Chrome's TLS fingerprint, which is required for Cloudflare cf_clearance cookies to work outside a real browser. """ import os import re import json import time import hashlib from curl_cffi import requests as cf_requests from curl_cffi.requests.exceptions import ImpersonateError from pathlib import Path def _create_cf_session(**kwargs): """Create a curl_cffi session, trying multiple browser versions for compatibility.""" for browser in ("chrome131", "chrome136", "chrome"): try: return cf_requests.Session(impersonate=browser, **kwargs) except Exception: continue return cf_requests.Session(**kwargs) from datetime import datetime, timedelta from typing import Dict, List, Optional, Set, Tuple from modules.base_module import LoggingMixin from modules.cloudflare_handler import ( CloudflareHandler, SiteStatus, get_flaresolverr_user_agent, get_flaresolverr_fingerprint ) from modules.instagram_utils import ( extract_instagram_media_id, media_id_to_shortcode, scan_existing_files_for_media_ids, record_instagram_download, is_instagram_downloaded ) class ImgInnAPIDownloader(LoggingMixin): """ImgInn API-based downloader with full carousel grouping support.""" IMGINN_BASE = "https://imginn.com" def __init__(self, headless=True, cookie_file=None, show_progress=True, use_database=True, log_callback=None, unified_db=None): """Initialize downloader (compatible with ImgInnDownloader interface). Args: headless: Ignored (no browser needed), kept for interface compat cookie_file: Cookie file path (used only if no unified_db) show_progress: Whether to show progress updates use_database: Whether to use database for tracking log_callback: Optional log callback unified_db: UnifiedDatabase instance """ self._init_logger('Instagram', log_callback, default_module='Download') self.headless = headless self.downloaded_files: Set[str] = set() self.show_progress = show_progress self.use_database = use_database self.download_count = 0 self.unified_db = unified_db self.scraper_id = 'imginn' self.pending_downloads: List[dict] = [] if unified_db and use_database: self.unified_db = unified_db else: self.unified_db = None self.use_database = False # Activity status manager from modules.activity_status import get_activity_manager self.activity_manager = get_activity_manager(unified_db) # Proxy config from database self.proxy_url = None if unified_db: scraper_config = unified_db.get_scraper(self.scraper_id) if scraper_config: if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'): self.proxy_url = scraper_config['proxy_url'] self.log(f"Using proxy: {self.proxy_url}", "info") # User agent from FlareSolverr self.user_agent = get_flaresolverr_user_agent() # CloudflareHandler (no cookie file when using DB) self.cf_handler = CloudflareHandler( module_name="ImgInn", cookie_file=None if unified_db else (cookie_file or "/opt/media-downloader/cookies/imginn_cookies.json"), user_agent=self.user_agent, logger=self.logger, aggressive_expiry=True, proxy_url=self.proxy_url ) self._load_cookies_from_db() # HTTP session (curl_cffi with Chrome TLS fingerprint) self.session = _create_cf_session() self._setup_session() # Rate limiting self._last_request_time = None self._min_request_interval = 2 # seconds between requests # Cookie refresh cooldown (don't re-fetch within 5 minutes) self._last_cookie_refresh = None self._cookie_refresh_interval = 300 # 5 minutes # User ID cache (username -> id) self._user_id_cache: Dict[str, str] = {} # ==================== Cookie / Session ==================== def _recreate_session(self): """Recreate the curl_cffi session when impersonation fails at request time.""" self.log("Impersonation error, recreating curl_cffi session...", "warning") try: self.session.close() except Exception: pass self.session = _create_cf_session() self._setup_session() self._refresh_session_cookies() def _load_cookies_from_db(self): if not self.unified_db: return try: cookies = self.unified_db.get_scraper_cookies(self.scraper_id) if cookies: self.cf_handler._cookies = cookies self.log(f"Loaded {len(cookies)} cookies from database", "debug") except Exception as e: self.log(f"Error loading cookies: {e}", "warning") def _save_cookies_to_db(self, cookies, user_agent=None): if not self.unified_db: return try: ua = user_agent or self.user_agent self.unified_db.save_scraper_cookies(self.scraper_id, cookies, user_agent=ua, merge=True) except Exception as e: self.log(f"Error saving cookies: {e}", "warning") def _setup_session(self): """Configure curl_cffi session with CF-matching headers.""" fingerprint = get_flaresolverr_fingerprint() stored_ua = None if self.unified_db: try: stored_ua = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id) except Exception: pass self._stored_ua = stored_ua or fingerprint.get('user_agent', self.user_agent) self._default_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 'Accept-Language': fingerprint.get('accept_language', 'en-US,en;q=0.9'), 'Connection': 'keep-alive', 'User-Agent': self._stored_ua, } # Load CF cookies self._refresh_session_cookies() def _refresh_session_cookies(self): """Reload CF cookies into the curl_cffi session.""" cf_cookies = self.cf_handler.get_cookies_dict() # curl_cffi session uses a cookies dict for name, value in cf_cookies.items(): self.session.cookies.set(name, value, domain=".imginn.com") def _ensure_cookies(self, force: bool = False) -> bool: """Ensure valid CF cookies, refresh via FlareSolverr if needed. Uses a cooldown to avoid calling FlareSolverr too frequently. With aggressive_expiry=True, cookies_expired() returns True whenever cf_clearance expiry is within 7 days — but cf_clearance only lasts ~30 min, so without cooldown we'd call FlareSolverr on every single request. Args: force: If True, skip cooldown and expiry checks and always refresh. Used when a 403 proves the current cookies are invalid. """ if not force: # If we refreshed recently, skip the expiry check entirely if self._last_cookie_refresh: elapsed = time.time() - self._last_cookie_refresh if elapsed < self._cookie_refresh_interval: return True if not self.cf_handler.cookies_expired(): return True self.log("Cookies expired, refreshing via FlareSolverr...", "info") success = self.cf_handler.get_cookies_via_flaresolverr(f"{self.IMGINN_BASE}/") self._last_cookie_refresh = time.time() if success: cookies_list = self.cf_handler.get_cookies_list() flaresolverr_ua = self.cf_handler.get_user_agent() if cookies_list and self.unified_db: self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua) # Refresh session cookies and UA if flaresolverr_ua: self._stored_ua = flaresolverr_ua self._default_headers['User-Agent'] = flaresolverr_ua self._refresh_session_cookies() return True self.log("Failed to get fresh cookies", "warning") return False # ==================== HTTP Helpers ==================== def _rate_limit(self): if self._last_request_time: elapsed = time.time() - self._last_request_time if elapsed < self._min_request_interval: time.sleep(self._min_request_interval - elapsed) self._last_request_time = time.time() def _is_cf_challenge(self, text: str) -> bool: """Check if response is a Cloudflare challenge page.""" if len(text) > 10000: return False lower = text[:2000].lower() return any(ind in lower for ind in [ 'just a moment', 'checking your browser', 'verify you are human', 'challenge-platform' ]) and '