#!/usr/bin/env python3
"""
ImgInn API-based downloader module.

Uses ImgInn's JSON API endpoints instead of DOM scraping for reliable,
structured Instagram content downloading.

API Endpoints:
    /api/posts/ — Paginated posts with full carousel support via `srcs` array
    /api/story/ — Stories with direct CDN URLs
    /api/tagged — Tagged posts (minimal data, supplemented via post pages)

Advantages over DOM scraping:
    - Carousel items grouped by post (srcs array)
    - Exact UNIX timestamps for post dates
    - Reliable cursor-based pagination
    - No Playwright dependency (uses curl_cffi for TLS fingerprint matching)
    - Pinned post detection (isPind flag)

Uses curl_cffi to impersonate Chrome's TLS fingerprint, which is required
for Cloudflare cf_clearance cookies to work outside a real browser.
"""

import os
import re
import json
import time
import hashlib
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

from curl_cffi import requests as cf_requests
from curl_cffi.requests.exceptions import ImpersonateError

from modules.base_module import LoggingMixin
from modules.cloudflare_handler import (
    CloudflareHandler,
    SiteStatus,
    get_flaresolverr_user_agent,
    get_flaresolverr_fingerprint
)
from modules.instagram_utils import (
    extract_instagram_media_id,
    media_id_to_shortcode,
    scan_existing_files_for_media_ids,
    record_instagram_download,
    is_instagram_downloaded
)


def _create_cf_session(**kwargs):
    """Create a curl_cffi session, trying multiple browser versions for compatibility."""
    # Prefer pinned Chrome fingerprints; fall back to the generic "chrome"
    # alias, and finally to a plain session if no impersonation target works
    # (older/newer curl_cffi builds support different version strings).
    for browser in ("chrome131", "chrome136", "chrome"):
        try:
            return cf_requests.Session(impersonate=browser, **kwargs)
        except Exception:
            continue
    return cf_requests.Session(**kwargs)


class ImgInnAPIDownloader(LoggingMixin):
    """ImgInn API-based downloader with full carousel grouping support."""

    IMGINN_BASE = "https://imginn.com"

    def __init__(self, headless=True, cookie_file=None, show_progress=True,
                 use_database=True, log_callback=None, unified_db=None):
        """Initialize downloader (compatible with ImgInnDownloader interface).

        Args:
            headless: Ignored (no browser needed), kept for interface compat
            cookie_file: Cookie file path (used only if no unified_db)
            show_progress: Whether to show progress updates
            use_database: Whether to use database for tracking
            log_callback: Optional log callback
            unified_db: UnifiedDatabase instance
        """
        self._init_logger('Instagram', log_callback, default_module='Download')
        self.headless = headless
        self.downloaded_files: Set[str] = set()
        self.show_progress = show_progress
        self.use_database = use_database
        self.download_count = 0
        self.scraper_id = 'imginn'
        self.pending_downloads: List[dict] = []

        # DB-backed tracking requires both the handle and the flag.
        # (The original also pre-assigned self.unified_db = unified_db here,
        # which was always overwritten by this branch — removed as dead code.)
        if unified_db and use_database:
            self.unified_db = unified_db
        else:
            self.unified_db = None
            self.use_database = False

        # Activity status manager
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)

        # Proxy config from database
        self.proxy_url = None
        if unified_db:
            scraper_config = unified_db.get_scraper(self.scraper_id)
            if scraper_config:
                if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")

        # User agent from FlareSolverr
        self.user_agent = get_flaresolverr_user_agent()

        # CloudflareHandler (no cookie file when using DB)
        self.cf_handler = CloudflareHandler(
            module_name="ImgInn",
            cookie_file=None if unified_db else (cookie_file or "/opt/media-downloader/cookies/imginn_cookies.json"),
            user_agent=self.user_agent,
            logger=self.logger,
            aggressive_expiry=True,
            proxy_url=self.proxy_url
        )
        self._load_cookies_from_db()

        # HTTP session (curl_cffi with Chrome TLS fingerprint)
        self.session = _create_cf_session()
        self._setup_session()

        # Rate limiting
        self._last_request_time = None
        self._min_request_interval = 2  # seconds between requests

        # Cookie refresh cooldown (don't re-fetch within 5 minutes)
        self._last_cookie_refresh = None
        self._cookie_refresh_interval = 300  # 5 minutes

        # User ID cache (username -> id)
        self._user_id_cache: Dict[str, str] = {}

    # ==================== Cookie / Session ====================

    def _recreate_session(self):
        """Recreate the curl_cffi session when impersonation fails at request time."""
        self.log("Impersonation error, recreating curl_cffi session...", "warning")
        try:
            self.session.close()
        except Exception:
            pass
        self.session = _create_cf_session()
        self._setup_session()
        self._refresh_session_cookies()

    def _load_cookies_from_db(self):
        """Load previously saved CF cookies from the unified DB (no-op without a DB)."""
        if not self.unified_db:
            return
        try:
            cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
            if cookies:
                # Reaches into the handler's private store deliberately —
                # the handler has no public bulk-load API here.
                self.cf_handler._cookies = cookies
                self.log(f"Loaded {len(cookies)} cookies from database", "debug")
        except Exception as e:
            self.log(f"Error loading cookies: {e}", "warning")

    def _save_cookies_to_db(self, cookies, user_agent=None):
        """Persist CF cookies (and the UA they were issued for) to the unified DB."""
        if not self.unified_db:
            return
        try:
            ua = user_agent or self.user_agent
            self.unified_db.save_scraper_cookies(self.scraper_id, cookies, user_agent=ua, merge=True)
        except Exception as e:
            self.log(f"Error saving cookies: {e}", "warning")

    def _setup_session(self):
        """Configure curl_cffi session with CF-matching headers."""
        fingerprint = get_flaresolverr_fingerprint()
        stored_ua = None
        if self.unified_db:
            try:
                stored_ua = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
            except Exception:
                pass
        # Prefer the UA the stored cookies were issued for; cf_clearance is
        # tied to the issuing User-Agent.
        self._stored_ua = stored_ua or fingerprint.get('user_agent', self.user_agent)
        self._default_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': fingerprint.get('accept_language', 'en-US,en;q=0.9'),
            'Connection': 'keep-alive',
            'User-Agent': self._stored_ua,
        }
        # Load CF cookies
        self._refresh_session_cookies()

    def _refresh_session_cookies(self):
        """Reload CF cookies into the curl_cffi session."""
        cf_cookies = self.cf_handler.get_cookies_dict()
        # curl_cffi session uses a cookies dict
        for name, value in cf_cookies.items():
            self.session.cookies.set(name, value, domain=".imginn.com")
self.session.cookies.set(name, value, domain=".imginn.com") def _ensure_cookies(self, force: bool = False) -> bool: """Ensure valid CF cookies, refresh via FlareSolverr if needed. Uses a cooldown to avoid calling FlareSolverr too frequently. With aggressive_expiry=True, cookies_expired() returns True whenever cf_clearance expiry is within 7 days — but cf_clearance only lasts ~30 min, so without cooldown we'd call FlareSolverr on every single request. Args: force: If True, skip cooldown and expiry checks and always refresh. Used when a 403 proves the current cookies are invalid. """ if not force: # If we refreshed recently, skip the expiry check entirely if self._last_cookie_refresh: elapsed = time.time() - self._last_cookie_refresh if elapsed < self._cookie_refresh_interval: return True if not self.cf_handler.cookies_expired(): return True self.log("Cookies expired, refreshing via FlareSolverr...", "info") success = self.cf_handler.get_cookies_via_flaresolverr(f"{self.IMGINN_BASE}/") self._last_cookie_refresh = time.time() if success: cookies_list = self.cf_handler.get_cookies_list() flaresolverr_ua = self.cf_handler.get_user_agent() if cookies_list and self.unified_db: self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua) # Refresh session cookies and UA if flaresolverr_ua: self._stored_ua = flaresolverr_ua self._default_headers['User-Agent'] = flaresolverr_ua self._refresh_session_cookies() return True self.log("Failed to get fresh cookies", "warning") return False # ==================== HTTP Helpers ==================== def _rate_limit(self): if self._last_request_time: elapsed = time.time() - self._last_request_time if elapsed < self._min_request_interval: time.sleep(self._min_request_interval - elapsed) self._last_request_time = time.time() def _is_cf_challenge(self, text: str) -> bool: """Check if response is a Cloudflare challenge page.""" if len(text) > 10000: return False lower = text[:2000].lower() return any(ind in lower for ind in [ 'just a 
moment', 'checking your browser', 'verify you are human', 'challenge-platform' ]) and ' Optional[str]: """Fetch a page via curl_cffi (Chrome TLS), handle CF challenges.""" self._ensure_cookies() self._rate_limit() headers = {**self._default_headers} try: try: resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True) except ImpersonateError: self._recreate_session() resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True) if resp.status_code == 403 or self._is_cf_challenge(resp.text): self.log(f"CF challenge on {url}, trying FlareSolverr direct fetch...", "info") return self._fetch_html_via_flaresolverr(url) if resp.status_code == 404: self.log(f"Page not found: {url}", "warning") return None # ImgInn returns 410 for some valid profiles — treat as OK if body has content if resp.status_code == 410 and len(resp.text) > 1000: return resp.text # Retry on server errors (500/502/503) — often transient if resp.status_code >= 500: self.log(f"HTTP {resp.status_code} for {url}, retrying in 5s...", "warning") time.sleep(5) self._rate_limit() resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True) if resp.status_code >= 500: self.log(f"HTTP {resp.status_code} for {url} on retry, trying FlareSolverr...", "warning") return self._fetch_html_via_flaresolverr(url) if resp.status_code == 200: return resp.text if resp.status_code == 410 and len(resp.text) > 1000: return resp.text if resp.status_code != 200: self.log(f"HTTP {resp.status_code} for {url}", "warning") return None return resp.text except Exception as e: self.log(f"Error fetching {url}: {e}", "error") return None def _call_api(self, endpoint: str, params: dict) -> Optional[dict]: """Make an API call to ImgInn, return parsed JSON. Falls back to FlareSolverr if curl_cffi gets 403 (CF challenge). This is needed for endpoints like /api/story/ where Cloudflare applies stricter path-based rules. 
""" self._ensure_cookies() self._rate_limit() url = f"{self.IMGINN_BASE}{endpoint}" headers = { **self._default_headers, 'Accept': '*/*', 'Referer': f'{self.IMGINN_BASE}/', 'X-Requested-With': 'XMLHttpRequest', 'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Site': 'same-origin', } try: try: resp = self.session.get(url, params=params, headers=headers, timeout=30) except ImpersonateError: self._recreate_session() resp = self.session.get(url, params=params, headers=headers, timeout=30) if resp.status_code == 429: self.log("Rate limited (429), waiting 30s...", "warning") time.sleep(30) self._rate_limit() resp = self.session.get(url, params=params, headers=headers, timeout=30) if resp.status_code == 403 or self._is_cf_challenge(resp.text): self.log(f"CF challenge on {endpoint}, trying FlareSolverr...", "info") return self._call_api_via_flaresolverr(url, params) if resp.status_code != 200: self.log(f"API {resp.status_code} for {endpoint}", "warning") return None return resp.json() except (ValueError, json.JSONDecodeError): self.log(f"Invalid JSON from {endpoint}", "warning") return None except Exception as e: self.log(f"API error {endpoint}: {e}", "error") return None def _call_api_via_flaresolverr(self, url: str, params: dict) -> Optional[dict]: """Fetch an API endpoint through FlareSolverr's browser. Used as fallback when curl_cffi gets 403 from Cloudflare on certain API endpoints (e.g. /api/story/). 
""" import html as html_mod # Build full URL with query params from urllib.parse import urlencode full_url = f"{url}?{urlencode(params)}" if params else url try: import requests as std_requests payload = { 'cmd': 'request.get', 'url': full_url, 'maxTimeout': 60000, } resp = std_requests.post('http://localhost:8191/v1', json=payload, timeout=70) data = resp.json() if data.get('status') != 'ok': self.log(f"FlareSolverr error: {data.get('message', 'unknown')}", "warning") return None solution = data.get('solution', {}) response_text = solution.get('response', '') # Save cookies from FlareSolverr for future curl_cffi requests cookies_list = solution.get('cookies', []) if cookies_list: flaresolverr_ua = solution.get('userAgent', self.cf_handler.get_user_agent()) self.cf_handler.save_cookies(cookies_list, user_agent=flaresolverr_ua) if flaresolverr_ua: self._stored_ua = flaresolverr_ua self._default_headers['User-Agent'] = flaresolverr_ua if self.unified_db: self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua) self._refresh_session_cookies() if not response_text: return None # FlareSolverr wraps JSON responses in HTML
 tags
            pre_match = re.search(r']*>(.*?)
', response_text, re.DOTALL) if pre_match: json_text = html_mod.unescape(pre_match.group(1)) return json.loads(json_text) # Try parsing raw response as JSON return json.loads(response_text) except (ValueError, json.JSONDecodeError) as e: self.log(f"FlareSolverr JSON parse error: {e}", "warning") return None except Exception as e: self.log(f"FlareSolverr fetch error: {e}", "error") return None def _fetch_html_via_flaresolverr(self, url: str) -> Optional[str]: """Fetch an HTML page through FlareSolverr's browser. Used as fallback when curl_cffi gets 403 from Cloudflare on HTML pages (e.g. /tagged/) that have stricter path-based rules. """ try: import requests as std_requests payload = { 'cmd': 'request.get', 'url': url, 'maxTimeout': 120000, } resp = std_requests.post('http://localhost:8191/v1', json=payload, timeout=130) data = resp.json() if data.get('status') != 'ok': self.log(f"FlareSolverr error: {data.get('message', 'unknown')}", "warning") return None solution = data.get('solution', {}) response_text = solution.get('response', '') if not response_text: self.log("FlareSolverr returned empty response", "warning") return None if self._is_cf_challenge(response_text): self.log("FlareSolverr could not bypass CF challenge", "warning") return None # Save cookies from FlareSolverr for future curl_cffi requests cookies_list = solution.get('cookies', []) if cookies_list: flaresolverr_ua = solution.get('userAgent', self.cf_handler.get_user_agent()) self.cf_handler.save_cookies(cookies_list, user_agent=flaresolverr_ua) if flaresolverr_ua: self._stored_ua = flaresolverr_ua self._default_headers['User-Agent'] = flaresolverr_ua if self.unified_db: self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua) self._refresh_session_cookies() return response_text except Exception as e: self.log(f"FlareSolverr HTML fetch error: {e}", "error") return None def _download_file(self, url: str, output_path: Path) -> bool: """Download a file from CDN URL (no rate limit — goes to 
Instagram CDN, not ImgInn).""" try: # CDN downloads don't need ImgInn CF cookies - use session for TLS fingerprint resp = self.session.get( url, headers={'Referer': f'{self.IMGINN_BASE}/'}, timeout=120, ) if resp.status_code != 200: self.log(f"Download HTTP {resp.status_code}: {output_path.name}", "warning") return False output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, 'wb') as f: f.write(resp.content) size = output_path.stat().st_size if size < 1000: self.log(f"File too small ({size}B), discarding: {output_path.name}", "warning") output_path.unlink() return False return True except Exception as e: self.log(f"Download error: {e}", "error") if output_path.exists(): try: output_path.unlink() except OSError: pass return False # ==================== Profile / Page Data Extraction ==================== def _get_profile_info(self, username: str) -> Optional[dict]: """Fetch profile page and extract user_id, cursor, shortcodes.""" url = f"{self.IMGINN_BASE}/{username}/" html = self._fetch_html(url) if not html: # Retry once with forced cookie refresh (handles expired CF cookies) if self._ensure_cookies(force=True): html = self._fetch_html(url) if not html: return None info = { 'user_id': None, 'cursor': None, 'verified': False, 'shortcodes': [], } # data-id on container or load-more button id_match = re.search(r'data-id="(\d+)"', html) if id_match: info['user_id'] = id_match.group(1) self._user_id_cache[username] = info['user_id'] # data-cursor on load-more button cursor_match = re.search(r'data-cursor="([^"]+)"', html) if cursor_match: info['cursor'] = cursor_match.group(1) # verified flag if 'data-verified="true"' in html or 'data-verified="1"' in html: info['verified'] = True # Extract post shortcodes from grid links shortcodes = re.findall(r'href="/p/([A-Za-z0-9_-]+)/"', html) seen = set() for sc in shortcodes: if sc not in seen: seen.add(sc) info['shortcodes'].append(sc) return info def get_user_profile(self, username: str) -> 
    def get_user_profile(self, username: str) -> Optional[dict]:
        """Fetch public profile info: avatar, display name, bio, stats.

        Args:
            username: Instagram username

        Returns:
            Dict with keys: username, user_id, display_name, avatar_url,
            bio, followers, following, posts_count, verified
            Returns None if profile cannot be fetched.
        """
        import html as html_mod
        if not self._ensure_cookies():
            return None
        html = self._fetch_html(f"{self.IMGINN_BASE}/{username}/")
        if not html:
            return None
        # Every field is best-effort: filled only when its markup is present.
        profile = {
            'username': username,
            'user_id': None,
            'display_name': None,
            'avatar_url': None,
            'bio': None,
            'followers': None,
            'following': None,
            'posts_count': None,
            'verified': False,
        }
        # User ID
        id_match = re.search(r'data-id="(\d+)"', html)
        if id_match:
            profile['user_id'] = id_match.group(1)
            # Cache it so later story/tagged calls skip a profile re-fetch.
            self._user_id_cache[username] = profile['user_id']
        # Verified
        if 'data-verified="true"' in html or 'data-verified="1"' in html:
            profile['verified'] = True
        # Avatar from og:image
        og_img = re.search(r'property="og:image"\s*content="([^"]+)"', html)
        if og_img:
            profile['avatar_url'] = html_mod.unescape(og_img.group(1))
        # Display name from og:title: "View Display Name(@username)..."
        og_title = re.search(r'property="og:title"\s*content="([^"]+)"', html)
        if og_title:
            title_text = html_mod.unescape(og_title.group(1))
            name_match = re.match(r'View\s+(.+?)\s*\(@', title_text)
            if name_match:
                profile['display_name'] = name_match.group(1).strip()
        # Bio and stats from og:description
        # Format: "Bio text here Followers_count Followers, Following_count Following, Posts_count Posts"
        og_desc = re.search(r'property="og:description"\s*content="([^"]+)"', html)
        if og_desc:
            desc = html_mod.unescape(og_desc.group(1))
            # Extract stats from end of description
            # (counts stay as strings, e.g. "1.2M" — not parsed to ints)
            stats_match = re.search(
                r'([\d,.]+[MKk]?)\s*Followers?,\s*([\d,.]+[MKk]?)\s*Following,\s*([\d,.]+[MKk]?)\s*Posts?',
                desc
            )
            if stats_match:
                profile['followers'] = stats_match.group(1)
                profile['following'] = stats_match.group(2)
                profile['posts_count'] = stats_match.group(3)
                # Bio is everything before the stats
                bio = desc[:stats_match.start()].strip().rstrip(',').strip()
                if bio:
                    profile['bio'] = bio
        return profile

    def _get_stories_params(self, username: str, user_id: str = None) -> Optional[dict]:
        """Get parameters for the stories API call.

        The stories API requires uid, name, and hash parameters.
        - uid: Instagram numeric user ID (from profile page or cache)
        - name: Instagram username
        - hash: floor(current_time / 100000) — time-based hash

        Note: We don't fetch the /stories/ HTML page because Cloudflare
        applies stricter challenge rules to that path. Instead, we get the
        uid from the profile page (which works fine) and compute the hash
        directly.

        Returns:
            Dict with uid/name/hash keys, or None if user_id can't be resolved.
        """
        uid = user_id or self._user_id_cache.get(username)
        if not uid:
            # Fetch profile page to get user_id
            profile = self._get_profile_info(username)
            if profile:
                uid = profile['user_id']
        if not uid:
            self.log(f"Cannot resolve user_id for @{username}", "warning")
            return None
        # Hash computation: ceil(current_time / 100000)
        # Using ceil (floor + 1) to get the current cache period instead of the
        # previous one, which returns stale story data.
story_hash = str(int(time.time()) // 100000 + 1) return {'uid': uid, 'name': username, 'hash': story_hash} def _get_tagged_info(self, username: str) -> Optional[dict]: """Fetch tagged page and extract user_id + cursor.""" html = self._fetch_html(f"{self.IMGINN_BASE}/tagged/{username}/") if not html: return None info = {'user_id': None, 'cursor': None, 'shortcodes': []} id_match = re.search(r'data-id="(\d+)"', html) if id_match: info['user_id'] = id_match.group(1) self._user_id_cache[username] = info['user_id'] cursor_match = re.search(r'data-cursor="([^"]+)"', html) if cursor_match: info['cursor'] = cursor_match.group(1) # Extract tagged post shortcodes shortcodes = re.findall(r'href="/p/([A-Za-z0-9_-]+)/"', html) seen = set() for sc in shortcodes: if sc not in seen: seen.add(sc) info['shortcodes'].append(sc) return info def _get_post_detail(self, shortcode: str) -> Optional[dict]: """Fetch individual post page and extract media URLs + metadata.""" html = self._fetch_html(f"{self.IMGINN_BASE}/p/{shortcode}/") if not html: return None post = { 'code': shortcode, 'date': None, 'alt': '', 'author': None, 'srcs': [], 'isSidecar': False, 'isPind': False, } import html as html_mod # Extract date from data-created (UNIX timestamp) date_match = re.search(r'data-created="(\d+)"', html) if date_match: post['date'] = int(date_match.group(1)) else: # Fallback: try datetime attribute on