Files
media-downloader/modules/imginn_api_module.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

2020 lines
80 KiB
Python

#!/usr/bin/env python3
"""
ImgInn API-based downloader module.
Uses ImgInn's JSON API endpoints instead of DOM scraping for reliable,
structured Instagram content downloading.
API Endpoints:
/api/posts/ — Paginated posts with full carousel support via `srcs` array
/api/story/ — Stories with direct CDN URLs
/api/tagged — Tagged posts (minimal data, supplemented via post pages)
Advantages over DOM scraping:
- Carousel items grouped by post (srcs array)
- Exact UNIX timestamps for post dates
- Reliable cursor-based pagination
- No Playwright dependency (uses curl_cffi for TLS fingerprint matching)
- Pinned post detection (isPind flag)
Uses curl_cffi to impersonate Chrome's TLS fingerprint, which is required
for Cloudflare cf_clearance cookies to work outside a real browser.
"""
import os
import re
import json
import time
import hashlib
from curl_cffi import requests as cf_requests
from curl_cffi.requests.exceptions import ImpersonateError
from pathlib import Path
def _create_cf_session(**kwargs):
    """Build a curl_cffi session, falling back through impersonation targets.

    Tries specific Chrome fingerprints first; if the installed curl_cffi
    build supports none of them, returns a session without impersonation.
    """
    for target in ("chrome131", "chrome136", "chrome"):
        try:
            return cf_requests.Session(impersonate=target, **kwargs)
        except Exception:
            pass  # target unsupported by this curl_cffi build; try the next
    return cf_requests.Session(**kwargs)
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Set, Tuple
from modules.base_module import LoggingMixin
from modules.cloudflare_handler import (
CloudflareHandler, SiteStatus,
get_flaresolverr_user_agent,
get_flaresolverr_fingerprint
)
from modules.instagram_utils import (
extract_instagram_media_id,
media_id_to_shortcode,
scan_existing_files_for_media_ids,
record_instagram_download,
is_instagram_downloaded
)
class ImgInnAPIDownloader(LoggingMixin):
"""ImgInn API-based downloader with full carousel grouping support."""
IMGINN_BASE = "https://imginn.com"
def __init__(self, headless=True, cookie_file=None,
             show_progress=True, use_database=True,
             log_callback=None, unified_db=None):
    """Initialize downloader (compatible with ImgInnDownloader interface).

    Args:
        headless: Ignored (no browser needed), kept for interface compat
        cookie_file: Cookie file path (used only if no unified_db)
        show_progress: Whether to show progress updates
        use_database: Whether to use database for tracking
        log_callback: Optional log callback
        unified_db: UnifiedDatabase instance
    """
    self._init_logger('Instagram', log_callback, default_module='Download')
    self.headless = headless
    self.downloaded_files: Set[str] = set()
    self.show_progress = show_progress
    self.use_database = use_database
    self.download_count = 0
    self.scraper_id = 'imginn'
    self.pending_downloads: List[dict] = []
    # DB tracking requires both a handle and the flag; otherwise disable it.
    # (Removed the redundant unconditional self.unified_db assignment that
    # was immediately overwritten by this branch.)
    if unified_db and use_database:
        self.unified_db = unified_db
    else:
        self.unified_db = None
        self.use_database = False
    # Activity status manager (gets the raw handle even when tracking is off)
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)
    # Proxy config from database — reads the unified_db *parameter*, not
    # self.unified_db, so a proxy still applies when use_database=False.
    self.proxy_url = None
    if unified_db:
        scraper_config = unified_db.get_scraper(self.scraper_id)
        if scraper_config and scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
            self.proxy_url = scraper_config['proxy_url']
            self.log(f"Using proxy: {self.proxy_url}", "info")
    # User agent from FlareSolverr
    self.user_agent = get_flaresolverr_user_agent()
    # CloudflareHandler (no cookie file when using DB)
    self.cf_handler = CloudflareHandler(
        module_name="ImgInn",
        cookie_file=None if unified_db else (cookie_file or "/opt/media-downloader/cookies/imginn_cookies.json"),
        user_agent=self.user_agent,
        logger=self.logger,
        aggressive_expiry=True,
        proxy_url=self.proxy_url
    )
    self._load_cookies_from_db()
    # HTTP session (curl_cffi with Chrome TLS fingerprint)
    self.session = _create_cf_session()
    self._setup_session()
    # Rate limiting
    self._last_request_time = None
    self._min_request_interval = 2  # seconds between ImgInn requests
    # Cookie refresh cooldown (don't re-hit FlareSolverr within 5 minutes)
    self._last_cookie_refresh = None
    self._cookie_refresh_interval = 300  # 5 minutes
    # User ID cache (username -> numeric Instagram id)
    self._user_id_cache: Dict[str, str] = {}
# ==================== Cookie / Session ====================
def _recreate_session(self):
    """Tear down and rebuild the curl_cffi session after an impersonation failure."""
    self.log("Impersonation error, recreating curl_cffi session...", "warning")
    try:
        self.session.close()
    except Exception:
        pass  # best-effort: the old session may already be unusable
    self.session = _create_cf_session()
    self._setup_session()
    self._refresh_session_cookies()
def _load_cookies_from_db(self):
    """Seed the Cloudflare handler with cookies persisted in the unified DB."""
    if not self.unified_db:
        return
    try:
        stored = self.unified_db.get_scraper_cookies(self.scraper_id)
        if stored:
            self.cf_handler._cookies = stored
            self.log(f"Loaded {len(stored)} cookies from database", "debug")
    except Exception as e:
        self.log(f"Error loading cookies: {e}", "warning")
def _save_cookies_to_db(self, cookies, user_agent=None):
    """Persist cookies (merged) to the unified DB alongside their user agent."""
    if not self.unified_db:
        return
    try:
        self.unified_db.save_scraper_cookies(
            self.scraper_id, cookies,
            user_agent=user_agent or self.user_agent, merge=True)
    except Exception as e:
        self.log(f"Error saving cookies: {e}", "warning")
def _setup_session(self):
    """Configure the curl_cffi session with Cloudflare-matching headers."""
    fingerprint = get_flaresolverr_fingerprint()
    stored_ua = None
    if self.unified_db:
        try:
            stored_ua = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
        except Exception:
            stored_ua = None
    # Prefer the UA the stored cookies were issued under, then the
    # FlareSolverr fingerprint, then the instance default.
    self._stored_ua = stored_ua or fingerprint.get('user_agent', self.user_agent)
    self._default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': fingerprint.get('accept_language', 'en-US,en;q=0.9'),
        'Connection': 'keep-alive',
        'User-Agent': self._stored_ua,
    }
    # Load CF cookies into the fresh session
    self._refresh_session_cookies()
def _refresh_session_cookies(self):
    """Copy the handler's Cloudflare cookies into the curl_cffi session jar."""
    for cookie_name, cookie_value in self.cf_handler.get_cookies_dict().items():
        self.session.cookies.set(cookie_name, cookie_value, domain=".imginn.com")
def _ensure_cookies(self, force: bool = False) -> bool:
    """Ensure valid CF cookies, refreshing via FlareSolverr when needed.

    A cooldown prevents hammering FlareSolverr: with aggressive_expiry=True,
    cookies_expired() reports True whenever cf_clearance expires within
    7 days, yet cf_clearance only lasts ~30 minutes — without the cooldown
    every request would trigger a refresh.

    Args:
        force: Skip cooldown/expiry checks and always refresh (used when a
            403 proves the current cookies are invalid).
    Returns:
        True if cookies are believed valid after this call.
    """
    if not force:
        last = self._last_cookie_refresh
        # Refreshed recently? Trust the cookies without an expiry check.
        if last is not None and (time.time() - last) < self._cookie_refresh_interval:
            return True
        if not self.cf_handler.cookies_expired():
            return True
    self.log("Cookies expired, refreshing via FlareSolverr...", "info")
    refreshed = self.cf_handler.get_cookies_via_flaresolverr(f"{self.IMGINN_BASE}/")
    self._last_cookie_refresh = time.time()
    if not refreshed:
        self.log("Failed to get fresh cookies", "warning")
        return False
    cookie_list = self.cf_handler.get_cookies_list()
    flaresolverr_ua = self.cf_handler.get_user_agent()
    if cookie_list and self.unified_db:
        self._save_cookies_to_db(cookie_list, user_agent=flaresolverr_ua)
    # Keep the session UA in sync with the cookies
    if flaresolverr_ua:
        self._stored_ua = flaresolverr_ua
        self._default_headers['User-Agent'] = flaresolverr_ua
    self._refresh_session_cookies()
    return True
# ==================== HTTP Helpers ====================
def _rate_limit(self):
    """Sleep so consecutive ImgInn requests stay at least the minimum interval apart."""
    previous = self._last_request_time
    if previous:
        remaining = self._min_request_interval - (time.time() - previous)
        if remaining > 0:
            time.sleep(remaining)
    self._last_request_time = time.time()
def _is_cf_challenge(self, text: str) -> bool:
    """Heuristically detect a Cloudflare interstitial challenge page."""
    # Challenge pages are small; anything big is real content.
    if len(text) > 10000:
        return False
    head = text[:2000].lower()
    markers = ('just a moment', 'checking your browser',
               'verify you are human', 'challenge-platform')
    if not any(marker in head for marker in markers):
        return False
    # An early <form> suggests a normal page rather than an interstitial.
    return '<form' not in head[:500]
def _fetch_html(self, url: str) -> Optional[str]:
    """Fetch a page via curl_cffi (Chrome TLS), handling CF challenges.

    Fallback chain: curl_cffi -> one retry on 5xx -> FlareSolverr browser.

    Args:
        url: Absolute page URL.
    Returns:
        Page HTML, or None on 404 / unrecoverable errors.
    """
    self._ensure_cookies()
    self._rate_limit()
    headers = {**self._default_headers}
    try:
        try:
            resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True)
        except ImpersonateError:
            # Impersonation broke at request time; rebuild session and retry once
            self._recreate_session()
            resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True)
        if resp.status_code == 403 or self._is_cf_challenge(resp.text):
            self.log(f"CF challenge on {url}, trying FlareSolverr direct fetch...", "info")
            return self._fetch_html_via_flaresolverr(url)
        if resp.status_code == 404:
            self.log(f"Page not found: {url}", "warning")
            return None
        # Retry on server errors (500/502/503) — often transient
        if resp.status_code >= 500:
            self.log(f"HTTP {resp.status_code} for {url}, retrying in 5s...", "warning")
            time.sleep(5)
            self._rate_limit()
            resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True)
            if resp.status_code >= 500:
                self.log(f"HTTP {resp.status_code} for {url} on retry, trying FlareSolverr...", "warning")
                return self._fetch_html_via_flaresolverr(url)
        if resp.status_code == 200:
            return resp.text
        # ImgInn returns 410 for some valid profiles — treat as OK if body has
        # content. Checked after the 5xx retry too, so a 5xx-then-410 sequence
        # still succeeds. (Original duplicated this check and had an
        # unreachable trailing `return resp.text` — both cleaned up.)
        if resp.status_code == 410 and len(resp.text) > 1000:
            return resp.text
        self.log(f"HTTP {resp.status_code} for {url}", "warning")
        return None
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", "error")
        return None
def _call_api(self, endpoint: str, params: dict) -> Optional[dict]:
    """Make an API call to ImgInn and return the parsed JSON body.

    Falls back to FlareSolverr when curl_cffi gets a 403 (CF challenge) —
    needed for endpoints like /api/story/ where Cloudflare applies
    stricter path-based rules.
    """
    self._ensure_cookies()
    self._rate_limit()
    url = f"{self.IMGINN_BASE}{endpoint}"
    # XHR-style headers layered over the session defaults
    headers = dict(self._default_headers)
    headers['Accept'] = '*/*'
    headers['Referer'] = f'{self.IMGINN_BASE}/'
    headers['X-Requested-With'] = 'XMLHttpRequest'
    headers['Sec-Fetch-Dest'] = 'empty'
    headers['Sec-Fetch-Mode'] = 'cors'
    headers['Sec-Fetch-Site'] = 'same-origin'
    try:
        try:
            response = self.session.get(url, params=params, headers=headers, timeout=30)
        except ImpersonateError:
            self._recreate_session()
            response = self.session.get(url, params=params, headers=headers, timeout=30)
        if response.status_code == 429:
            # Back off once on rate limiting, then retry
            self.log("Rate limited (429), waiting 30s...", "warning")
            time.sleep(30)
            self._rate_limit()
            response = self.session.get(url, params=params, headers=headers, timeout=30)
        if response.status_code == 403 or self._is_cf_challenge(response.text):
            self.log(f"CF challenge on {endpoint}, trying FlareSolverr...", "info")
            return self._call_api_via_flaresolverr(url, params)
        if response.status_code != 200:
            self.log(f"API {response.status_code} for {endpoint}", "warning")
            return None
        return response.json()
    except (ValueError, json.JSONDecodeError):
        self.log(f"Invalid JSON from {endpoint}", "warning")
        return None
    except Exception as e:
        self.log(f"API error {endpoint}: {e}", "error")
        return None
def _call_api_via_flaresolverr(self, url: str, params: dict) -> Optional[dict]:
    """Fetch an API endpoint through FlareSolverr's browser.

    Used as fallback when curl_cffi gets 403 from Cloudflare on
    certain API endpoints (e.g. /api/story/).

    Args:
        url: Absolute endpoint URL (without query string).
        params: Query parameters to encode onto the URL.
    Returns:
        Parsed JSON dict, or None on FlareSolverr failure / unparseable body.
    """
    import html as html_mod
    # Build full URL with query params
    from urllib.parse import urlencode
    full_url = f"{url}?{urlencode(params)}" if params else url
    try:
        # Plain requests here: FlareSolverr is a local service, so no TLS
        # fingerprint matching is needed for this hop.
        import requests as std_requests
        payload = {
            'cmd': 'request.get',
            'url': full_url,
            'maxTimeout': 60000,
        }
        resp = std_requests.post('http://localhost:8191/v1', json=payload, timeout=70)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log(f"FlareSolverr error: {data.get('message', 'unknown')}", "warning")
            return None
        solution = data.get('solution', {})
        response_text = solution.get('response', '')
        # Save cookies from FlareSolverr for future curl_cffi requests
        cookies_list = solution.get('cookies', [])
        if cookies_list:
            flaresolverr_ua = solution.get('userAgent', self.cf_handler.get_user_agent())
            self.cf_handler.save_cookies(cookies_list, user_agent=flaresolverr_ua)
            if flaresolverr_ua:
                # Keep UA in sync with the cookies that were just solved
                self._stored_ua = flaresolverr_ua
                self._default_headers['User-Agent'] = flaresolverr_ua
            if self.unified_db:
                self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua)
            self._refresh_session_cookies()
        if not response_text:
            return None
        # FlareSolverr wraps JSON responses in HTML <pre> tags
        pre_match = re.search(r'<pre[^>]*>(.*?)</pre>', response_text, re.DOTALL)
        if pre_match:
            json_text = html_mod.unescape(pre_match.group(1))
            return json.loads(json_text)
        # Try parsing raw response as JSON
        return json.loads(response_text)
    except (ValueError, json.JSONDecodeError) as e:
        self.log(f"FlareSolverr JSON parse error: {e}", "warning")
        return None
    except Exception as e:
        self.log(f"FlareSolverr fetch error: {e}", "error")
        return None
def _fetch_html_via_flaresolverr(self, url: str) -> Optional[str]:
    """Fetch an HTML page through FlareSolverr's browser.

    Used as fallback when curl_cffi gets 403 from Cloudflare on
    HTML pages (e.g. /tagged/) that have stricter path-based rules.

    Args:
        url: Absolute page URL.
    Returns:
        Page HTML, or None on FlareSolverr failure / unsolved challenge.
    """
    try:
        # Plain requests here: FlareSolverr is a local service, so no TLS
        # fingerprint matching is needed for this hop.
        import requests as std_requests
        payload = {
            'cmd': 'request.get',
            'url': url,
            'maxTimeout': 120000,
        }
        resp = std_requests.post('http://localhost:8191/v1', json=payload, timeout=130)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log(f"FlareSolverr error: {data.get('message', 'unknown')}", "warning")
            return None
        solution = data.get('solution', {})
        response_text = solution.get('response', '')
        if not response_text:
            self.log("FlareSolverr returned empty response", "warning")
            return None
        if self._is_cf_challenge(response_text):
            self.log("FlareSolverr could not bypass CF challenge", "warning")
            return None
        # Save cookies from FlareSolverr for future curl_cffi requests
        cookies_list = solution.get('cookies', [])
        if cookies_list:
            flaresolverr_ua = solution.get('userAgent', self.cf_handler.get_user_agent())
            self.cf_handler.save_cookies(cookies_list, user_agent=flaresolverr_ua)
            if flaresolverr_ua:
                # Keep UA in sync with the cookies that were just solved
                self._stored_ua = flaresolverr_ua
                self._default_headers['User-Agent'] = flaresolverr_ua
            if self.unified_db:
                self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua)
            self._refresh_session_cookies()
        return response_text
    except Exception as e:
        self.log(f"FlareSolverr HTML fetch error: {e}", "error")
        return None
def _download_file(self, url: str, output_path: Path) -> bool:
    """Download one CDN asset to output_path; return True on success.

    No ImgInn rate limiting here — the request goes to the Instagram CDN,
    not imginn.com. The shared session is reused only for its TLS fingerprint.
    """
    try:
        resp = self.session.get(
            url,
            headers={'Referer': f'{self.IMGINN_BASE}/'},
            timeout=120,
        )
        if resp.status_code != 200:
            self.log(f"Download HTTP {resp.status_code}: {output_path.name}", "warning")
            return False
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(resp.content)
        # Tiny bodies are CDN error pages, not real media — discard them
        size = output_path.stat().st_size
        if size < 1000:
            self.log(f"File too small ({size}B), discarding: {output_path.name}", "warning")
            output_path.unlink()
            return False
        return True
    except Exception as e:
        self.log(f"Download error: {e}", "error")
        # Don't leave a partial file behind
        if output_path.exists():
            try:
                output_path.unlink()
            except OSError:
                pass
        return False
# ==================== Profile / Page Data Extraction ====================
def _get_profile_info(self, username: str) -> Optional[dict]:
    """Fetch a profile page and extract user_id, pagination cursor, shortcodes."""
    url = f"{self.IMGINN_BASE}/{username}/"
    html = self._fetch_html(url)
    if not html and self._ensure_cookies(force=True):
        # One retry with forced cookie refresh (handles expired CF cookies)
        html = self._fetch_html(url)
    if not html:
        return None
    info = {
        'user_id': None,
        'cursor': None,
        'verified': False,
        'shortcodes': [],
    }
    # data-id on container or load-more button
    m = re.search(r'data-id="(\d+)"', html)
    if m:
        info['user_id'] = m.group(1)
        self._user_id_cache[username] = info['user_id']
    # data-cursor on load-more button
    m = re.search(r'data-cursor="([^"]+)"', html)
    if m:
        info['cursor'] = m.group(1)
    # verified flag
    info['verified'] = ('data-verified="true"' in html or 'data-verified="1"' in html)
    # Post shortcodes from grid links, de-duplicated but order-preserving
    seen = set()
    for shortcode in re.findall(r'href="/p/([A-Za-z0-9_-]+)/"', html):
        if shortcode not in seen:
            seen.add(shortcode)
            info['shortcodes'].append(shortcode)
    return info
def get_user_profile(self, username: str) -> Optional[dict]:
    """Fetch public profile info: avatar, display name, bio, stats.

    Parsed entirely from the profile page's HTML meta tags (og:image,
    og:title, og:description) plus data-* attributes.

    Args:
        username: Instagram username
    Returns:
        Dict with keys: username, user_id, display_name, avatar_url,
        bio, followers, following, posts_count, verified
        (fields not found remain None / False).
        Returns None if profile cannot be fetched.
    """
    import html as html_mod
    if not self._ensure_cookies():
        return None
    html = self._fetch_html(f"{self.IMGINN_BASE}/{username}/")
    if not html:
        return None
    profile = {
        'username': username,
        'user_id': None,
        'display_name': None,
        'avatar_url': None,
        'bio': None,
        'followers': None,
        'following': None,
        'posts_count': None,
        'verified': False,
    }
    # User ID (data-id attribute on the posts grid / load-more button)
    id_match = re.search(r'data-id="(\d+)"', html)
    if id_match:
        profile['user_id'] = id_match.group(1)
        self._user_id_cache[username] = profile['user_id']
    # Verified
    if 'data-verified="true"' in html or 'data-verified="1"' in html:
        profile['verified'] = True
    # Avatar from og:image
    og_img = re.search(r'property="og:image"\s*content="([^"]+)"', html)
    if og_img:
        profile['avatar_url'] = html_mod.unescape(og_img.group(1))
    # Display name from og:title: "View Display Name(@username)..."
    og_title = re.search(r'property="og:title"\s*content="([^"]+)"', html)
    if og_title:
        title_text = html_mod.unescape(og_title.group(1))
        name_match = re.match(r'View\s+(.+?)\s*\(@', title_text)
        if name_match:
            profile['display_name'] = name_match.group(1).strip()
    # Bio and stats from og:description
    # Format: "Bio text here Followers_count Followers, Following_count Following, Posts_count Posts"
    og_desc = re.search(r'property="og:description"\s*content="([^"]+)"', html)
    if og_desc:
        desc = html_mod.unescape(og_desc.group(1))
        # Extract stats from end of description; counts may be abbreviated ("1.2M")
        stats_match = re.search(
            r'([\d,.]+[MKk]?)\s*Followers?,\s*([\d,.]+[MKk]?)\s*Following,\s*([\d,.]+[MKk]?)\s*Posts?',
            desc
        )
        if stats_match:
            # Stored as display strings (e.g. "1.2M"), not parsed integers
            profile['followers'] = stats_match.group(1)
            profile['following'] = stats_match.group(2)
            profile['posts_count'] = stats_match.group(3)
            # Bio is everything before the stats
            bio = desc[:stats_match.start()].strip().rstrip(',').strip()
            if bio:
                profile['bio'] = bio
    return profile
def _get_stories_params(self, username: str, user_id: str = None) -> Optional[dict]:
    """Get parameters for the stories API call.

    The stories API requires uid, name, and hash parameters.
    - uid: Instagram numeric user ID (from profile page or cache)
    - name: Instagram username
    - hash: floor(current_time / 100000) + 1 — time-based cache-period hash
      (the docstring previously said plain floor; the code uses floor + 1,
      see the comment below for why)

    Note: We don't fetch the /stories/ HTML page because Cloudflare applies
    stricter challenge rules to that path. Instead, we get the uid from the
    profile page (which works fine) and compute the hash directly.

    Returns:
        Dict with 'uid', 'name', 'hash' keys, or None if the user_id
        cannot be resolved.
    """
    uid = user_id or self._user_id_cache.get(username)
    if not uid:
        # Fetch profile page to get user_id
        profile = self._get_profile_info(username)
        if profile:
            uid = profile['user_id']
    if not uid:
        self.log(f"Cannot resolve user_id for @{username}", "warning")
        return None
    # Hash computation: ceil(current_time / 100000)
    # Using ceil (floor + 1) to get the current cache period instead of the
    # previous one, which returns stale story data.
    story_hash = str(int(time.time()) // 100000 + 1)
    return {'uid': uid, 'name': username, 'hash': story_hash}
def _get_tagged_info(self, username: str) -> Optional[dict]:
    """Fetch the /tagged/ page and extract user_id, cursor, and shortcodes."""
    html = self._fetch_html(f"{self.IMGINN_BASE}/tagged/{username}/")
    if not html:
        return None
    info = {'user_id': None, 'cursor': None, 'shortcodes': []}
    id_m = re.search(r'data-id="(\d+)"', html)
    if id_m:
        info['user_id'] = id_m.group(1)
        self._user_id_cache[username] = info['user_id']
    cursor_m = re.search(r'data-cursor="([^"]+)"', html)
    if cursor_m:
        info['cursor'] = cursor_m.group(1)
    # Tagged post shortcodes, de-duplicated but order-preserving
    seen = set()
    for shortcode in re.findall(r'href="/p/([A-Za-z0-9_-]+)/"', html):
        if shortcode not in seen:
            seen.add(shortcode)
            info['shortcodes'].append(shortcode)
    return info
def _get_post_detail(self, shortcode: str) -> Optional[dict]:
    """Fetch individual post page and extract media URLs + metadata.

    Args:
        shortcode: Instagram post shortcode (the /p/<code>/ path segment).
    Returns:
        Dict shaped like an /api/posts/ item (code, date, alt, author,
        srcs, isSidecar, isPind), or None if the page can't be fetched
        or yields no Instagram CDN media URLs.
    """
    html = self._fetch_html(f"{self.IMGINN_BASE}/p/{shortcode}/")
    if not html:
        return None
    post = {
        'code': shortcode,
        'date': None,           # UNIX timestamp, filled by the cascade below
        'alt': '',              # caption text
        'author': None,
        'srcs': [],             # full-res CDN URLs
        'isSidecar': False,     # True for multi-slide carousels
        'isPind': False,        # pinned flag (API spelling), not set from detail pages
    }
    import html as html_mod
    # Extract date from data-created (UNIX timestamp)
    date_match = re.search(r'data-created="(\d+)"', html)
    if date_match:
        post['date'] = int(date_match.group(1))
    else:
        # Fallback: try datetime attribute on <time> elements (ISO 8601)
        time_match = re.search(r'<time[^>]*datetime="([^"]+)"', html)
        if time_match:
            try:
                from datetime import timezone
                dt = datetime.fromisoformat(time_match.group(1).replace('Z', '+00:00'))
                post['date'] = int(dt.timestamp())
            except Exception:
                pass
    if not post['date']:
        # Fallback: try data-date or data-timestamp attributes
        alt_date = re.search(r'data-(?:date|timestamp|time)="(\d{10,13})"', html)
        if alt_date:
            ts = int(alt_date.group(1))
            if ts > 1e12:  # milliseconds
                ts = ts // 1000
            post['date'] = ts
    if not post['date']:
        self.log(f"Could not extract date for post {shortcode}", "warning")
    # Extract author username from div.username link (most reliable)
    # Format: <div class="username"><a href="/username/">...</a></div>
    username_link = re.search(r'class="username"[^>]*>\s*<a\s+href="/([^"]+?)/"', html)
    if username_link:
        author_candidate = username_link.group(1).strip().lower()
        # Sanity check: must look like a valid Instagram handle
        if re.match(r'^[a-zA-Z0-9_.]{1,30}$', author_candidate):
            post['author'] = author_candidate
    # Extract caption from og:description (format: "username: caption text")
    cap_match = re.search(r'<meta\s+property="og:description"\s+content="([^"]*)"', html)
    if cap_match:
        full_text = html_mod.unescape(cap_match.group(1))
        # Fallback: extract author from caption if not found above
        if not post['author'] and ':' in full_text:
            author_candidate = full_text.split(':')[0].strip()
            if re.match(r'^[a-zA-Z0-9_.]{1,30}$', author_candidate):
                post['author'] = author_candidate
        post['alt'] = full_text
    # Extract media URLs from swiper slides
    # Each swiper-slide has a data-src with the full-res CDN URL
    # Only grab data-src from within swiper-slide divs (not profile pics etc.)
    slide_pattern = re.compile(
        r'class="swiper-slide[^"]*"[^>]*data-src="([^"]+)"', re.DOTALL)
    slide_srcs = slide_pattern.findall(html)
    # Also check for plain data-src within the main post area (non-carousel)
    if not slide_srcs:
        # Look for the main download button link with scontent URL
        dl_pattern = re.compile(
            r'class="[^"]*downloads[^"]*"[^>]*href="(https://scontent[^"]+)"', re.DOTALL)
        dl_srcs = dl_pattern.findall(html)
        if not dl_srcs:
            # Broader: any scontent link with dl=1
            dl_srcs = re.findall(r'href="(https://scontent[^"]*dl=1[^"]*)"', html)
        slide_srcs = dl_srcs
    # Clean URLs and filter to CDN only
    urls = []
    seen_urls = set()
    for src in slide_srcs:
        src = html_mod.unescape(src)
        # Only keep Instagram CDN URLs
        if 'scontent' not in src and 'cdninstagram' not in src:
            continue
        # Deduplicate by path (ignore differing query strings)
        base = src.split('?')[0]
        if base in seen_urls:
            continue
        seen_urls.add(base)
        urls.append(src)
    post['srcs'] = urls
    post['isSidecar'] = len(urls) > 1
    # A post with no CDN media is useless to callers
    return post if urls else None
# ==================== File Naming ====================
def _extract_cdn_filename(self, url: str) -> str:
    """Return the CDN URL's final path component, extension stripped."""
    basename = url.split('?')[0].rsplit('/', 1)[-1]
    if '.' in basename:
        return basename.rsplit('.', 1)[0]
    return basename
def _extract_ext(self, url: str) -> str:
    """Guess the media file extension from a CDN URL (defaults to .jpg)."""
    path = url.split('?')[0]
    # Check in the same priority order as before; first match wins
    for candidate in ('.mp4', '.webp', '.png', '.jpeg'):
        if candidate in path:
            return candidate
    return '.jpg'
def _make_filename(self, profile: str, date_ts: int, cdn_filename: str,
                   ext: str, slide_index: int = None) -> str:
    """Build '{profile}_{YYYYMMDD_HHMMSS}_{cdn_filename}[_{idx}]{ext}'.

    Falls back to the current time when the post has no timestamp.
    """
    when = datetime.fromtimestamp(date_ts) if date_ts else datetime.now()
    parts = [profile, when.strftime('%Y%m%d_%H%M%S'), cdn_filename]
    # Slide 0 (or None) gets no suffix; later carousel slides are numbered
    if slide_index is not None and slide_index > 0:
        parts.append(str(slide_index))
    return '_'.join(parts) + ext
def _update_file_timestamps(self, filepath: Path, post_date_ts: int):
    """Best-effort: make the file's atime/mtime match the post date."""
    if not post_date_ts:
        return
    try:
        os.utime(str(filepath), (post_date_ts, post_date_ts))
    except Exception:
        pass  # cosmetic only; never fail a download over timestamps
# ==================== Duplicate Detection ====================
def _is_already_downloaded(self, media_id: str, username: str = None) -> bool:
    """True if this media id was seen this run or is recorded in the DB."""
    if media_id in self.downloaded_files:
        return True
    if not (self.unified_db and self.use_database):
        return False
    return is_instagram_downloaded(self.unified_db, media_id, username)
def _scan_existing_files(self, output_dir: Path, profile: str):
    """Prime the in-memory duplicate set with media ids already on disk."""
    found = scan_existing_files_for_media_ids(
        output_dir, profile, min_file_size=1000
    )
    self.downloaded_files.update(found)
    if found:
        self.log(f"Found {len(found)} existing files", "debug")
# ==================== Phrase Filtering ====================
def _check_phrases(self, caption: str, phrase_config: dict) -> bool:
    """Return True if the caption passes the phrase filter (or filtering is off)."""
    if not phrase_config or not phrase_config.get('enabled'):
        return True
    phrases = phrase_config.get('phrases', [])
    if not phrases:
        return True
    case_sensitive = phrase_config.get('case_sensitive', False)
    haystack = caption if case_sensitive else caption.lower()
    needles = [p if case_sensitive else p.lower() for p in phrases]
    # match_all -> every phrase must appear; otherwise any one suffices
    combine = all if phrase_config.get('match_all', False) else any
    return combine(needle in haystack for needle in needles)
# ==================== Database Recording ====================
def _record_download(self, media_id, username, filename, url=None,
                     post_date=None, file_path=None, content_type='post',
                     metadata=None, deferred=False):
    """Record a completed download, either queued in memory or written to the DB.

    With deferred=True the record is appended to pending_downloads and True
    is returned; otherwise it is written via record_instagram_download when
    DB tracking is enabled.
    """
    if deferred:
        self.pending_downloads.append({
            'media_id': media_id,
            'username': username,
            'filename': filename,
            'url': url or f'instagram://{media_id}',
            'post_date': post_date,
            'file_path': file_path,
            'content_type': content_type,
            'metadata': metadata or {},
        })
        return True
    if self.unified_db and self.use_database:
        # Numeric timestamps become datetimes for the DB layer
        when = post_date
        if isinstance(when, (int, float)):
            when = datetime.fromtimestamp(when)
        try:
            return record_instagram_download(
                self.unified_db,
                media_id=media_id,
                username=username,
                content_type=content_type,
                filename=filename,
                file_path=file_path,
                url=url,
                post_date=when,
                method='imginn',
                extra_metadata=metadata
            )
        except Exception as e:
            self.log(f"Error recording download: {e}", "warning")
            return False
def get_pending_downloads(self) -> list:
    """Return a shallow copy of records queued by deferred recording."""
    return list(self.pending_downloads)
def clear_pending_downloads(self):
    """Drop all deferred download records."""
    self.pending_downloads = []
# ==================== Post Processing ====================
def _batch_upgrade_to_hires(self, items: list) -> Tuple[dict, dict]:
    """Batch fetch post detail pages via FlareSolverr session for full-res URLs.

    Creates a persistent browser session that solves CF once, then
    reuses it for all subsequent requests (~0.5s each instead of ~10s).

    Args:
        items: List of API post items with 'code' keys
    Returns:
        Tuple of (srcs_map, dates_map) where:
        - srcs_map: Dict mapping shortcode -> list of full-res src URLs
        - dates_map: Dict mapping shortcode -> UNIX timestamp (for items
          missing dates)
        Both dicts are empty when the session cannot be created.
    """
    import requests as std_requests
    shortcodes = [item.get('code', '') for item in items if item.get('code')]
    if not shortcodes:
        return {}, {}
    results = {}
    dates = {}
    session_id = None
    total = len(shortcodes)
    try:
        resp = std_requests.post('http://localhost:8191/v1', json={
            'cmd': 'sessions.create'
        }, timeout=30)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log("Failed to create FlareSolverr session for full-res, using API URLs", 'warning')
            # BUGFIX: previously returned a bare {} here, which raised
            # ValueError in callers that unpack (srcs_map, dates_map).
            return {}, {}
        session_id = data.get('session')
        self.log(f"Fetching full-res URLs for {total} posts via browser session...", 'info')
        for i, code in enumerate(shortcodes):
            if self.show_progress and i % 10 == 0:
                self.activity_manager.update_status(
                    f"Fetching full-res {i + 1}/{total}")
            try:
                resp = std_requests.post('http://localhost:8191/v1', json={
                    'cmd': 'request.get',
                    'url': f'{self.IMGINN_BASE}/p/{code}/',
                    'session': session_id,
                    'maxTimeout': 60000,
                }, timeout=70)
                page_data = resp.json()
                if page_data.get('status') != 'ok':
                    continue
                html = page_data.get('solution', {}).get('response', '')
                if not html:
                    continue
                srcs = self._parse_detail_srcs(html)
                if srcs:
                    results[code] = srcs
                # Also extract date from detail page (for items missing dates)
                date_match = re.search(r'data-created="(\d+)"', html)
                if date_match:
                    dates[code] = int(date_match.group(1))
            except Exception as e:
                # One bad post shouldn't abort the whole batch
                self.log(f"Detail fetch failed for {code}: {e}", 'debug')
                continue
    except Exception as e:
        self.log(f"FlareSolverr session error: {e}", 'warning')
    finally:
        # Always destroy the browser session to free FlareSolverr resources
        if session_id:
            try:
                std_requests.post('http://localhost:8191/v1', json={
                    'cmd': 'sessions.destroy',
                    'session': session_id,
                }, timeout=10)
            except Exception:
                pass
    self.log(f"Got full-res URLs for {len(results)}/{total} posts, dates for {len(dates)}", 'info')
    return results, dates
@staticmethod
def _parse_detail_srcs(html: str) -> list:
    """Extract full-res Instagram CDN URLs from a post detail page's HTML."""
    import html as html_mod
    # Preferred source: data-src attributes on carousel swiper slides
    candidates = re.findall(
        r'class="swiper-slide[^"]*"[^>]*data-src="([^"]+)"', html, re.DOTALL)
    if not candidates:
        # Non-carousel posts: the main download button's scontent link
        candidates = re.findall(
            r'class="[^"]*downloads[^"]*"[^>]*href="(https://scontent[^"]+)"', html, re.DOTALL)
    if not candidates:
        # Last resort: any scontent link carrying dl=1
        candidates = re.findall(r'href="(https://scontent[^"]*dl=1[^"]*)"', html)
    urls = []
    seen = set()
    for raw in candidates:
        src = html_mod.unescape(raw)
        if 'scontent' not in src and 'cdninstagram' not in src:
            continue
        # Deduplicate by path, ignoring differing query strings
        key = src.split('?')[0]
        if key not in seen:
            seen.add(key)
            urls.append(src)
    return urls
    def _process_api_post(self, item: dict, username: str, output_dir: Path,
                          cutoff_ts: int, phrase_config: dict,
                          defer_database: bool, video_only: bool = False,
                          date_to_ts: int = None,
                          content_type: str = 'post') -> Tuple[str, List[str]]:
        """Process a single post from the API response.

        Applies date-range, phrase, and (optionally) video-only filters,
        deduplicates each media item, downloads every remaining slide of
        the post, stamps file timestamps from the post date, and records
        each download in the database.

        Args:
            item: API post item dict (keys used: code, date, alt,
                isSidecar, srcs, src)
            username: Instagram username (used in filenames and dedup keys)
            output_dir: Download directory
            cutoff_ts: Oldest allowed post timestamp (0/None = no lower bound)
            phrase_config: Phrase filter config
            defer_database: Whether to defer DB recording
            video_only: If True, only download video items (for reels mode)
            date_to_ts: Newest allowed post timestamp (None = no upper bound)
            content_type: DB content-type label ('post', 'tagged', 'reels', ...)
        Returns:
            Tuple of (status, downloaded_files) where status is:
            'downloaded', 'old', 'skipped', 'duplicate', 'filtered', 'future'
        """
        shortcode = item.get('code', '')
        post_date = item.get('date')  # UNIX timestamp (may be missing)
        if not post_date:
            self.log(f"Post {shortcode} has no date - timestamps will default to download time", "warning")
        caption = item.get('alt', '') or ''
        is_sidecar = item.get('isSidecar', False)
        srcs = item.get('srcs', [])
        # If no srcs, use src as fallback (single item)
        if not srcs:
            src = item.get('src', '')
            if src:
                srcs = [src]
        if not srcs:
            self.log(f"No media URLs for post {shortcode}", "debug")
            return ('skipped', [])
        # Date range: skip posts newer than date_to
        if post_date and date_to_ts and post_date > date_to_ts:
            return ('future', [])
        # Age check (cutoff_ts=0 or None means no lower bound).
        # This block only logs the comparison; the actual rejection happens
        # below. Undated posts bypass the age filter entirely.
        if cutoff_ts:
            if post_date:
                from datetime import datetime as _dt
                post_dt = _dt.fromtimestamp(post_date)
                cutoff_dt = _dt.fromtimestamp(cutoff_ts)
                self.log(f"Age check: post {shortcode} date={post_dt.isoformat()} cutoff={cutoff_dt.isoformat()} old={post_date < cutoff_ts}", "debug")
            else:
                self.log(f"Age check: post {shortcode} has no date (post_date={post_date}), skipping age filter", "debug")
        if post_date and cutoff_ts and post_date < cutoff_ts:
            return ('old', [])
        # Phrase check
        if not self._check_phrases(caption, phrase_config):
            self.log(f"Post {shortcode} filtered by phrase config", "debug")
            return ('filtered', [])
        # Video-only filter for reels mode: keep only .mp4 URLs
        # (extension checked on the path, ignoring query string)
        if video_only:
            video_srcs = [s for s in srcs if '.mp4' in s.split('?')[0]]
            if not video_srcs:
                return ('skipped', [])
            srcs = video_srcs
        downloaded = []
        for idx, src_url in enumerate(srcs):
            cdn_filename = self._extract_cdn_filename(src_url)
            media_id = extract_instagram_media_id(cdn_filename)
            ext = self._extract_ext(src_url)
            # Duplicate check — both by Instagram media id and raw CDN filename
            if self._is_already_downloaded(media_id, username):
                continue
            if self._is_already_downloaded(cdn_filename, username):
                continue
            # Build output filename; slide index only for real carousels
            slide_index = idx if is_sidecar and len(srcs) > 1 else None
            out_filename = self._make_filename(username, post_date, cdn_filename, ext, slide_index)
            out_path = output_dir / out_filename
            # Skip if file already exists on disk; remember both dedup keys
            if out_path.exists():
                self.downloaded_files.add(media_id)
                self.downloaded_files.add(cdn_filename)
                continue
            # Download
            if self._download_file(src_url, out_path):
                self.downloaded_files.add(media_id)
                self.downloaded_files.add(cdn_filename)
                # Set file timestamps to match the post date
                if post_date:
                    self._update_file_timestamps(out_path, post_date)
                # Record - use per-slide URL for sidecars so each slide gets a unique url_hash
                if shortcode:
                    if is_sidecar and len(srcs) > 1:
                        instagram_url = f"https://www.instagram.com/p/{shortcode}/?img_index={idx + 1}"
                    else:
                        instagram_url = f"https://www.instagram.com/p/{shortcode}/"
                else:
                    instagram_url = None
                self._record_download(
                    media_id=media_id,
                    username=username,
                    filename=out_filename,
                    url=instagram_url,
                    post_date=post_date,
                    file_path=str(out_path),
                    content_type=content_type,
                    metadata={
                        'shortcode': shortcode,
                        'is_sidecar': is_sidecar,
                        'slide_index': idx if is_sidecar else None,
                        'total_slides': len(srcs) if is_sidecar else 1,
                        'cdn_filename': cdn_filename,
                    },
                    deferred=defer_database
                )
                downloaded.append(str(out_path))
                self.log(f"Downloaded: {out_filename}", "info")
        # At least one new file means 'downloaded'; otherwise every slide
        # was deduplicated or its download failed.
        if downloaded:
            return ('downloaded', downloaded)
        return ('duplicate', [])
def _process_post_detail(self, post: dict, username: str, output_dir: Path,
cutoff_ts: int, phrase_config: dict,
defer_database: bool,
content_type: str = 'post') -> Tuple[str, List[str]]:
"""Process a post from _get_post_detail (HTML-extracted data).
Uses the post's author for the filename if available (important for
tagged posts where the author differs from the searched profile).
"""
# Use post author for filename if available, fall back to searched username
file_username = post.get('author') or username
# Convert to API-like format and delegate
api_item = {
'code': post.get('code', ''),
'date': post.get('date'),
'alt': post.get('alt', ''),
'isPind': post.get('isPind', False),
'isSidecar': post.get('isSidecar', False),
'srcs': post.get('srcs', []),
}
return self._process_api_post(api_item, file_username, output_dir,
cutoff_ts, phrase_config, defer_database,
content_type=content_type)
# ==================== Date Range Helpers ====================
def _parse_date(self, date_val) -> Optional[int]:
"""Parse a date value to UNIX timestamp.
Accepts: int/float (UNIX timestamp), ISO date string, datetime object, None.
"""
if date_val is None:
return None
if isinstance(date_val, (int, float)):
return int(date_val)
if isinstance(date_val, datetime):
return int(date_val.timestamp())
if isinstance(date_val, str):
for fmt in ('%Y-%m-%d', '%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S'):
try:
return int(datetime.strptime(date_val, fmt).timestamp())
except ValueError:
continue
return None
def _compute_cutoffs(self, days_back, date_from=None, date_to=None) -> Tuple[Optional[int], Optional[int]]:
"""Compute (cutoff_ts, date_to_ts) from days_back and explicit date range.
- days_back=0 means no lower bound (download everything)
- date_from overrides days_back if provided
- date_to sets an upper bound (skip posts newer than this)
"""
cutoff_ts = None
if date_from is not None:
cutoff_ts = self._parse_date(date_from)
elif days_back and days_back > 0:
cutoff_ts = int((datetime.now() - timedelta(days=days_back)).timestamp())
date_to_ts = self._parse_date(date_to) if date_to is not None else None
self.log(f"Cutoffs: days_back={days_back} cutoff_ts={cutoff_ts} ({datetime.fromtimestamp(cutoff_ts).isoformat() if cutoff_ts else 'None'}) date_to_ts={date_to_ts}", "info")
return cutoff_ts, date_to_ts
# ==================== Main Download Entry ====================
def download(self, username, content_type="posts", days_back=14,
max_downloads=50, output_dir=None, phrase_config=None,
defer_database=False, date_from=None, date_to=None) -> int:
"""Main download entry point (compatible with ImgInnDownloader).
Args:
username: Instagram username (or shortcode for content_type="post")
content_type: "posts", "stories", "tagged", "reels", or "post" (single)
days_back: How far back to download (0 = unlimited, no date cutoff)
max_downloads: Maximum files to download (0 = unlimited)
output_dir: Output directory path
phrase_config: Optional phrase filtering config
defer_database: Whether to defer database recording
date_from: Explicit start date (overrides days_back). Accepts:
UNIX timestamp, ISO string "YYYY-MM-DD", or datetime
date_to: Explicit end date. Same formats as date_from.
Returns:
Number of files downloaded
"""
self.downloaded_files = set()
self.download_count = 0
if output_dir is None:
output_dir = f"/opt/media-downloader/downloads/{username}"
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
self._scan_existing_files(output_path, username)
try:
if content_type == "post":
# Single post: username is treated as shortcode or URL
files = self.download_single_post(username, output_path, defer_database)
elif content_type == "posts":
files = self.download_posts(username, days_back, max_downloads,
output_path, phrase_config, defer_database,
date_from=date_from, date_to=date_to)
elif content_type == "stories":
files = self.download_stories(username, days_back, max_downloads,
output_path, defer_database)
elif content_type == "tagged":
files = self.download_tagged(username, days_back, max_downloads,
output_path, phrase_config, defer_database,
date_from=date_from, date_to=date_to)
elif content_type == "reels":
files = self.download_reels(username, days_back, max_downloads,
output_path, phrase_config, defer_database,
date_from=date_from, date_to=date_to)
else:
self.log(f"Unsupported content type: {content_type}", "warning")
return 0
count = len(files) if files else 0
self.download_count = count
return count
except Exception as e:
self.log(f"Download error for @{username} ({content_type}): {e}", "error")
import traceback
self.log(traceback.format_exc(), "debug")
return 0
# ==================== Single Post ====================
def download_single_post(self, shortcode_or_url: str, output_dir=None,
defer_database=False) -> List[str]:
"""Download a single Instagram post by shortcode or URL.
Args:
shortcode_or_url: Post shortcode (e.g. "DVL2WxBFGBT") or
Instagram/ImgInn URL containing the shortcode
output_dir: Output directory
defer_database: Whether to defer DB recording
Returns:
List of downloaded file paths
"""
# Extract shortcode from URL if needed
shortcode = shortcode_or_url
url_match = re.search(r'/p/([A-Za-z0-9_-]+)', shortcode_or_url)
if url_match:
shortcode = url_match.group(1)
self.activity_manager.update_status(f"Downloading post {shortcode}")
if not self._ensure_cookies():
return []
output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/posts")
output_path.mkdir(parents=True, exist_ok=True)
post = self._get_post_detail(shortcode)
if not post:
self.log(f"Could not fetch post /p/{shortcode}/", "error")
return []
username = post.get('author') or 'unknown'
self.log(f"Post {shortcode} by @{username}: {len(post['srcs'])} media items", "info")
# No date cutoff or phrase filter for single post
status, files = self._process_post_detail(
post, username, output_path, cutoff_ts=None,
phrase_config=None, defer_database=defer_database)
self.log(f"Single post complete: {len(files)} files", "info")
return files
# ==================== Posts ====================
    def download_posts(self, username, days_back=14, max_posts=50,
                       output_dir=None, phrase_config=None,
                       defer_database=False, date_from=None, date_to=None) -> List[str]:
        """Download posts using ImgInn API with full carousel support.

        Works in three phases:
          1. Collect candidate items via /api/posts/ cursor pagination,
             stopping after 5 consecutive too-old posts when a date cutoff
             is active. If the first API page fails, falls back to fetching
             the profile page's initial shortcodes individually (those
             detail pages are already full-res) and returns early.
          2. Batch-upgrade collected items to full-resolution URLs (and
             recover missing dates) via _batch_upgrade_to_hires.
          3. Download each remaining item via _process_api_post.

        Args:
            username: Instagram username.
            days_back: How far back to download. 0 = no date cutoff (all posts).
            max_posts: Maximum posts to process. 0 = unlimited.
            output_dir: Output directory (defaults to the per-user downloads dir).
            phrase_config: Optional caption phrase filter config.
            defer_database: Whether to defer database recording.
            date_from: Explicit start date (overrides days_back).
            date_to: Explicit end date (skip posts newer than this).

        Returns:
            List of downloaded file paths.
        """
        self.activity_manager.update_status(f"Checking posts for @{username}")
        if not self._ensure_cookies():
            self.activity_manager.update_status(f"Skipped - ImgInn unavailable")
            return []
        output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
        output_path.mkdir(parents=True, exist_ok=True)
        # Fetch profile page for user_id and cursor
        profile = self._get_profile_info(username)
        if not profile or not profile['user_id']:
            self.log(f"Could not resolve profile for @{username}", "warning")
            return []
        user_id = profile['user_id']
        cursor = profile['cursor']
        verified = '1' if profile['verified'] else '0'
        self.log(f"Profile @{username}: user_id={user_id}, {len(profile['shortcodes'])} initial posts", "info")
        cutoff_ts, date_to_ts = self._compute_cutoffs(days_back, date_from, date_to)
        has_date_cutoff = cutoff_ts is not None
        # max_posts=0 means unlimited
        effective_max = max_posts if max_posts > 0 else float('inf')
        # Without a date cutoff there is no "too old", so never stop early.
        max_consecutive_old = 5 if has_date_cutoff else float('inf')
        # ── Phase 1: Collect items via API pagination ──
        self.activity_manager.update_status(f"Scanning posts for @{username}")
        collected_items = []
        consecutive_old = 0
        first_page = self._call_api('/api/posts/', {
            'id': user_id,
            'cursor': '',
            'username': username,
            'verified': verified,
        })
        if first_page and first_page.get('items'):
            for item in first_page['items']:
                if len(collected_items) >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                post_date = item.get('date')
                # Pinned posts (isPind) can appear out of chronological
                # order, so an old pinned post does not count toward the
                # consecutive-old early stop.
                if post_date and cutoff_ts and post_date < cutoff_ts and not item.get('isPind', False):
                    consecutive_old += 1
                    continue
                if post_date and date_to_ts and post_date > date_to_ts:
                    continue
                consecutive_old = 0
                collected_items.append(item)
            if first_page.get('cursor'):
                cursor = first_page['cursor']
            elif not first_page.get('hasNext', True):
                cursor = None
        else:
            # First page API didn't work — fetch individual post pages (already full-res)
            self.log("First page API unavailable, fetching post pages...", "debug")
            all_downloaded = []
            total_processed = 0
            consecutive_old = 0
            for shortcode in profile['shortcodes']:
                if total_processed >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                if self._is_already_downloaded(shortcode, username):
                    total_processed += 1
                    continue
                post = self._get_post_detail(shortcode)
                if not post:
                    continue
                status, files = self._process_post_detail(
                    post, username, output_path, cutoff_ts, phrase_config, defer_database)
                total_processed += 1
                if status == 'old':
                    consecutive_old += 1
                elif status == 'downloaded':
                    consecutive_old = 0
                    all_downloaded.extend(files)
            # Fallback path is complete — phases 2/3 are API-only.
            self.log(f"Posts complete: {len(all_downloaded)} files for @{username}", "info")
            return all_downloaded
        # Continue API pagination
        while cursor and len(collected_items) < effective_max and consecutive_old < max_consecutive_old:
            self.log(f"Fetching posts page (collected={len(collected_items)})...", "debug")
            data = self._call_api('/api/posts/', {
                'id': user_id,
                'cursor': cursor,
                'username': username,
                'verified': verified,
            })
            if not data or not data.get('items'):
                break
            for item in data['items']:
                if len(collected_items) >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                post_date = item.get('date')
                if post_date and cutoff_ts and post_date < cutoff_ts and not item.get('isPind', False):
                    consecutive_old += 1
                    continue
                if post_date and date_to_ts and post_date > date_to_ts:
                    continue
                consecutive_old = 0
                collected_items.append(item)
            if data.get('hasNext') and data.get('cursor'):
                cursor = data['cursor']
            else:
                break
        if not collected_items:
            self.log(f"Posts complete: 0 files for @{username}", "info")
            return []
        # Filter out already-downloaded and out-of-range posts before expensive browser session
        needs_download = []
        for item in collected_items:
            code = item.get('code', '')
            post_date = item.get('date')
            # Date filters — same checks _process_api_post will do.
            # NOTE(review): unlike the collection loops above, there is no
            # isPind exemption here, so an old pinned post collected in
            # Phase 1 gets dropped at this stage — confirm intended.
            if post_date and cutoff_ts and post_date < cutoff_ts:
                continue
            if post_date and date_to_ts and post_date > date_to_ts:
                continue
            if self._is_already_downloaded(code, username):
                continue
            # Check if all srcs are already downloaded
            srcs = item.get('srcs', []) or ([item['src']] if item.get('src') else [])
            all_downloaded_flag = srcs and all(
                self._is_already_downloaded(self._extract_cdn_filename(s), username)
                for s in srcs
            )
            if all_downloaded_flag:
                continue
            needs_download.append(item)
        if not needs_download:
            self.log(f"Posts complete: 0 new files for @{username} ({len(collected_items)} already downloaded)", "info")
            return []
        self.log(f"Collected {len(collected_items)} posts, {len(needs_download)} need downloading, upgrading to full-res...", "info")
        # ── Phase 2: Batch upgrade to full-res via FlareSolverr session ──
        hires_map, dates_map = self._batch_upgrade_to_hires(needs_download)
        # Apply full-res URLs and missing dates to items
        for item in needs_download:
            code = item.get('code', '')
            if code in hires_map:
                item['srcs'] = hires_map[code]
                item['isSidecar'] = len(hires_map[code]) > 1
            # Fill in missing dates from detail pages
            if not item.get('date') and code in dates_map:
                item['date'] = dates_map[code]
                self.log(f"Recovered date for post {code} from detail page", "debug")
        # ── Phase 3: Process and download ──
        all_downloaded = []
        total_processed = 0
        for item in needs_download:
            status, files = self._process_api_post(
                item, username, output_path, cutoff_ts, phrase_config,
                defer_database, date_to_ts=date_to_ts)
            total_processed += 1
            if status == 'downloaded':
                all_downloaded.extend(files)
            if self.show_progress:
                # Progress bar has no denominator when max_posts is unlimited.
                progress_max = max_posts if max_posts > 0 else None
                self.activity_manager.update_status(
                    f"Downloading posts for @{username}",
                    len(all_downloaded), progress_max)
        self.log(f"Posts complete: {len(all_downloaded)} files for @{username}", "info")
        return all_downloaded
# ==================== Stories ====================
def download_stories(self, username, days_back=1, max_stories=50,
output_dir=None, defer_database=False) -> List[str]:
"""Download stories using ImgInn API."""
self.activity_manager.update_status(f"Checking stories for @{username}")
if not self._ensure_cookies():
return []
output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
output_path.mkdir(parents=True, exist_ok=True)
# Get story API params
params = self._get_stories_params(username)
if not params:
self.log(f"Cannot get story params for @{username}", "warning")
return []
self.log(f"Fetching stories for @{username} (uid={params['uid']})...", "info")
data = self._call_api('/api/story/', params)
items = []
if data and data.get('items'):
items = data['items']
else:
# API returned no data (404/empty) — fall back to parsing the stories HTML page
items = self._parse_stories_from_html(username)
if not items:
self.log(f"No stories found for @{username}", "info")
return []
self.log(f"Found {len(items)} stories for @{username}", "info")
downloaded = []
for idx, item in enumerate(items):
if len(downloaded) >= max_stories:
break
src = item.get('src', '')
if not src:
# Try proxy URL as fallback
src = item.get('proxy', '')
if not src:
continue
# Extract info
story_time = item.get('time') # UNIX timestamp
if not story_time:
# Stories are always recent, use current time as best approximation
story_time = int(time.time())
self.log(f"Story {idx + 1} for @{username} has no timestamp, using current time", "warning")
cdn_filename = self._extract_cdn_filename(src)
media_id = extract_instagram_media_id(cdn_filename) if cdn_filename else f"story_{idx}"
ext = self._extract_ext(src)
# Duplicate check
if self._is_already_downloaded(media_id, username):
continue
if self._is_already_downloaded(cdn_filename, username):
continue
# Generate filename with story suffix
dt = datetime.fromtimestamp(story_time)
date_str = dt.strftime('%Y%m%d_%H%M%S')
out_filename = f"{username}_{date_str}_{cdn_filename}_story{idx + 1}{ext}"
out_path = output_path / out_filename
if out_path.exists():
self.downloaded_files.add(media_id)
continue
if self._download_file(src, out_path):
self.downloaded_files.add(media_id)
self.downloaded_files.add(cdn_filename)
if story_time:
self._update_file_timestamps(out_path, story_time)
# Use per-story URL so each story gets a unique url_hash
story_url = f"https://www.instagram.com/stories/{username}/{media_id}/"
self._record_download(
media_id=media_id,
username=username,
filename=out_filename,
url=story_url,
post_date=story_time,
file_path=str(out_path),
content_type='stories',
metadata={'story_index': idx + 1, 'cdn_filename': cdn_filename},
deferred=defer_database
)
downloaded.append(str(out_path))
self.log(f"Downloaded story: {out_filename}", "info")
if self.show_progress:
self.activity_manager.update_status(
f"Downloading stories for @{username}",
len(downloaded), len(items))
self.log(f"Stories complete: {len(downloaded)} files for @{username}", "info")
return downloaded
def _parse_stories_from_html(self, username: str) -> list:
"""Parse stories from the /stories/{username}/ HTML page.
Fallback when /api/story/ returns 404. Extracts story media URLs
from the reels-media section of the stories page.
Returns:
List of dicts with 'src' and optionally 'time' keys,
matching the API response format.
"""
self.log(f"Trying HTML fallback for stories @{username}", "info")
html = self._fetch_html(f"{self.IMGINN_BASE}/stories/{username}/")
if not html:
return []
items = []
# Find the reels-media section containing story items
media_idx = html.find('class="reels-media"')
if media_idx < 0:
self.log("No reels-media section found in stories page", "debug")
return []
section = html[media_idx:]
# Each story is in a <div class="media"> container
# Videos have: <div class="media-video-wrap" data-src="VIDEO_URL">
# and <a class="download" href="VIDEO_URL&dl=1">
# Images have: <a class="download" href="IMAGE_URL&dl=1">
for m in re.finditer(r'<div class="media">', section):
# Get chunk until next media div or end
start = m.start()
next_media = section.find('<div class="media">', start + 1)
chunk = section[start:next_media] if next_media > 0 else section[start:start + 5000]
# Extract download URL (most reliable source)
dl_match = re.search(r'class="download"[^>]*href="([^"]+)"', chunk)
if not dl_match:
continue
import html as html_mod
src = html_mod.unescape(dl_match.group(1))
# Remove &dl=1 suffix if present
src = re.sub(r'[&?]dl=1$', '', src)
# Extract relative time (e.g. "5 hours ago") and convert to timestamp
item = {'src': src}
time_match = re.search(r'class="time">(\d+)\s+(second|minute|hour|day|week)s?\s+ago<', chunk)
if time_match:
amount = int(time_match.group(1))
unit = time_match.group(2)
seconds_map = {'second': 1, 'minute': 60, 'hour': 3600, 'day': 86400, 'week': 604800}
item['time'] = int(time.time()) - (amount * seconds_map.get(unit, 0))
items.append(item)
if items:
self.log(f"Parsed {len(items)} stories from HTML for @{username}", "info")
return items
def _parse_highlights_from_html(self, username: str) -> list:
"""Parse highlight IDs and titles from /stories/{username}/ page.
Returns list of dicts: [{id, title}, ...]
"""
html = self._fetch_html(f"{self.IMGINN_BASE}/stories/{username}/")
if not html:
return []
highlights = []
# Highlights are: <li class="reel swiper-slide" data-id="{id}">
# <div class="title">{title}</div>
for match in re.finditer(
r'<li[^>]*\bdata-id="(\d+)"[^>]*>.*?'
r'<div class="title">([^<]*)</div>',
html, re.DOTALL
):
highlights.append({
'id': match.group(1),
'title': match.group(2).strip(),
})
if highlights:
self.log(f"Found {len(highlights)} highlights for @{username}", "info")
return highlights
# ==================== Tagged ====================
    def download_tagged(self, username, days_back=14, max_posts=50,
                        output_dir=None, phrase_config=None,
                        defer_database=False, date_from=None, date_to=None) -> List[str]:
        """Download tagged posts. Uses API for pagination, post pages for media URLs.

        The tagged HTML page only exposes shortcodes, so first-page posts
        are fetched individually via their detail pages (which also yield
        the real author for the filename). Subsequent pages come from
        /api/tagged, whose items carry time/srcs/code directly and whose
        cursor is the last item's ID.

        Args:
            username: Profile whose tagged posts to download.
            days_back: How far back to download. 0 = no date cutoff.
            max_posts: Maximum posts to process. 0 = unlimited.
            output_dir: Output directory (defaults to the per-user dir).
            phrase_config: Optional caption phrase filter config.
            defer_database: Whether to defer database recording.
            date_from: Explicit start date (overrides days_back).
            date_to: Explicit end date.

        Returns:
            List of downloaded file paths.
        """
        self.activity_manager.update_status(f"Checking tagged for @{username}")
        if not self._ensure_cookies():
            return []
        output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
        output_path.mkdir(parents=True, exist_ok=True)
        # Fetch tagged page
        tagged_info = self._get_tagged_info(username)
        if not tagged_info or not tagged_info['user_id']:
            self.log(f"Could not load tagged page for @{username}", "error")
            return []
        user_id = tagged_info['user_id']
        cursor = tagged_info['cursor']
        cutoff_ts, date_to_ts = self._compute_cutoffs(days_back, date_from, date_to)
        has_date_cutoff = cutoff_ts is not None
        all_downloaded = []
        total_processed = 0
        consecutive_old = 0
        # Stop after 5 consecutive too-old posts, but only when a date
        # cutoff is in effect.
        max_consecutive_old = 5 if has_date_cutoff else float('inf')
        effective_max = max_posts if max_posts > 0 else float('inf')
        # Process first-page shortcodes via API to get full data (dates, srcs)
        # The HTML page only has shortcodes, so we fetch each via /api/posts/ for dates
        first_page_codes = tagged_info['shortcodes']
        self.log(f"Tagged: {len(first_page_codes)} posts on first page", "debug")
        for shortcode in first_page_codes:
            if total_processed >= effective_max or consecutive_old >= max_consecutive_old:
                break
            if self._is_already_downloaded(shortcode, username):
                total_processed += 1
                continue
            # Fetch post detail page for media URLs and author
            post = self._get_post_detail(shortcode)
            if not post:
                total_processed += 1
                continue
            status, files = self._process_post_detail(
                post, username, output_path, cutoff_ts, phrase_config, defer_database,
                content_type='tagged')
            total_processed += 1
            if status == 'old':
                consecutive_old += 1
            elif status == 'downloaded':
                consecutive_old = 0
                all_downloaded.extend(files)
            if self.show_progress:
                # Progress bar has no denominator when max_posts is unlimited.
                progress_max = max_posts if max_posts > 0 else None
                self.activity_manager.update_status(
                    f"Downloading tagged for @{username}",
                    len(all_downloaded), progress_max)
        # Paginate via tagged API — use API data directly (has time, srcs, code)
        while cursor and total_processed < effective_max and consecutive_old < max_consecutive_old:
            self.log(f"Fetching tagged page (processed={total_processed})...", "debug")
            data = self._call_api('/api/tagged', {
                'id': user_id,
                'cursor': cursor,
            })
            if not data or not data.get('items'):
                break
            items = data['items']
            for item in items:
                if total_processed >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                shortcode = item.get('code', '')
                if not shortcode:
                    # Derive the shortcode from the numeric media id when
                    # the API omits 'code'.
                    item_id = item.get('id', '')
                    if not item_id:
                        continue
                    shortcode = media_id_to_shortcode(item_id)
                if self._is_already_downloaded(shortcode, username):
                    total_processed += 1
                    continue
                # Use API data directly — normalize 'time' to 'date' for _process_api_post
                if 'time' in item and 'date' not in item:
                    item['date'] = item['time']
                # Use owner username for filename if available
                api_username = username
                owner = item.get('owner', {})
                if isinstance(owner, dict) and owner.get('username'):
                    api_username = owner['username']
                status, files = self._process_api_post(
                    item, api_username, output_path, cutoff_ts, phrase_config, defer_database,
                    content_type='tagged')
                total_processed += 1
                if status == 'old':
                    consecutive_old += 1
                elif status == 'downloaded':
                    consecutive_old = 0
                    all_downloaded.extend(files)
            # Tagged API uses last item's ID as next cursor; an unchanged
            # cursor means no further pages.
            if items:
                last_id = items[-1].get('id')
                if last_id and str(last_id) != str(cursor):
                    cursor = str(last_id)
                else:
                    break
            else:
                break
        self.log(f"Tagged complete: {len(all_downloaded)} files for @{username}", "info")
        return all_downloaded
# ==================== Reels ====================
    def download_reels(self, username, days_back=14, max_downloads=50,
                       output_dir=None, phrase_config=None,
                       defer_database=False, date_from=None, date_to=None) -> List[str]:
        """Download reels (video posts) using the posts API with video filter.

        Mirrors download_posts' three phases, but keeps only items with at
        least one .mp4 source during collection, and passes video_only=True
        when processing so image slides of mixed carousels are skipped.

        Args:
            username: Instagram username.
            days_back: How far back to download. 0 = no date cutoff.
            max_downloads: Maximum files to download. 0 = unlimited.
            output_dir: Output directory (defaults to the per-user dir).
            phrase_config: Optional caption phrase filter config.
            defer_database: Whether to defer database recording.
            date_from: Explicit start date (overrides days_back).
            date_to: Explicit end date.

        Returns:
            List of downloaded file paths.
        """
        self.activity_manager.update_status(f"Checking reels for @{username}")
        if not self._ensure_cookies():
            return []
        output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
        output_path.mkdir(parents=True, exist_ok=True)
        # Same as posts but with video_only flag
        profile = self._get_profile_info(username)
        if not profile or not profile['user_id']:
            self.log(f"Could not resolve profile for @{username}", "warning")
            return []
        user_id = profile['user_id']
        cursor = profile['cursor']
        verified = '1' if profile['verified'] else '0'
        cutoff_ts, date_to_ts = self._compute_cutoffs(days_back, date_from, date_to)
        has_date_cutoff = cutoff_ts is not None
        effective_max = max_downloads if max_downloads > 0 else float('inf')
        # Without a date cutoff there is no "too old", so never stop early.
        max_consecutive_old = 5 if has_date_cutoff else float('inf')
        # ── Phase 1: Collect video items via API ──
        collected_items = []
        consecutive_old = 0
        first_page = self._call_api('/api/posts/', {
            'id': user_id,
            'cursor': '',
            'username': username,
            'verified': verified,
        })
        if first_page and first_page.get('items'):
            for item in first_page['items']:
                if len(collected_items) >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                post_date = item.get('date')
                # Old pinned posts don't count toward the early stop.
                if post_date and cutoff_ts and post_date < cutoff_ts and not item.get('isPind', False):
                    consecutive_old += 1
                    continue
                if post_date and date_to_ts and post_date > date_to_ts:
                    continue
                # Only keep video items (.mp4 checked on the URL path,
                # ignoring the query string)
                srcs = item.get('srcs', []) or ([item['src']] if item.get('src') else [])
                video_srcs = [s for s in srcs if '.mp4' in s.split('?')[0]]
                if not video_srcs:
                    continue
                consecutive_old = 0
                collected_items.append(item)
            if first_page.get('cursor'):
                cursor = first_page['cursor']
            elif not first_page.get('hasNext', True):
                cursor = None
        while cursor and len(collected_items) < effective_max and consecutive_old < max_consecutive_old:
            data = self._call_api('/api/posts/', {
                'id': user_id,
                'cursor': cursor,
                'username': username,
                'verified': verified,
            })
            if not data or not data.get('items'):
                break
            for item in data['items']:
                if len(collected_items) >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                post_date = item.get('date')
                if post_date and cutoff_ts and post_date < cutoff_ts and not item.get('isPind', False):
                    consecutive_old += 1
                    continue
                if post_date and date_to_ts and post_date > date_to_ts:
                    continue
                srcs = item.get('srcs', []) or ([item['src']] if item.get('src') else [])
                video_srcs = [s for s in srcs if '.mp4' in s.split('?')[0]]
                if not video_srcs:
                    continue
                consecutive_old = 0
                collected_items.append(item)
            if data.get('hasNext') and data.get('cursor'):
                cursor = data['cursor']
            else:
                break
        if not collected_items:
            self.log(f"Reels complete: 0 files for @{username}", "info")
            return []
        # Filter out already-downloaded and out-of-range reels before expensive browser session
        needs_download = []
        for item in collected_items:
            code = item.get('code', '')
            post_date = item.get('date')
            # Date filters — same checks _process_api_post will do
            if post_date and cutoff_ts and post_date < cutoff_ts:
                continue
            if post_date and date_to_ts and post_date > date_to_ts:
                continue
            if self._is_already_downloaded(code, username):
                continue
            # Skip when every source file was already downloaded.
            srcs = item.get('srcs', []) or ([item['src']] if item.get('src') else [])
            all_downloaded_flag = srcs and all(
                self._is_already_downloaded(self._extract_cdn_filename(s), username)
                for s in srcs
            )
            if all_downloaded_flag:
                continue
            needs_download.append(item)
        if not needs_download:
            self.log(f"Reels complete: 0 new files for @{username} ({len(collected_items)} already downloaded)", "info")
            return []
        self.log(f"Collected {len(collected_items)} reels, {len(needs_download)} need downloading, upgrading to full-res...", "info")
        # ── Phase 2: Batch upgrade to full-res ──
        hires_map, dates_map = self._batch_upgrade_to_hires(needs_download)
        for item in needs_download:
            code = item.get('code', '')
            if code in hires_map:
                item['srcs'] = hires_map[code]
                item['isSidecar'] = len(hires_map[code]) > 1
            # Fill in missing dates from detail pages
            if not item.get('date') and code in dates_map:
                item['date'] = dates_map[code]
                self.log(f"Recovered date for reel {code} from detail page", "debug")
        # ── Phase 3: Process and download ──
        all_downloaded = []
        for item in needs_download:
            status, files = self._process_api_post(
                item, username, output_path, cutoff_ts, phrase_config,
                defer_database, video_only=True, date_to_ts=date_to_ts,
                content_type='reels')
            if status == 'downloaded':
                all_downloaded.extend(files)
        self.log(f"Reels complete: {len(all_downloaded)} files for @{username}", "info")
        return all_downloaded