#!/usr/bin/env python3
|
|
"""
|
|
ImgInn API-based downloader module.
|
|
|
|
Uses ImgInn's JSON API endpoints instead of DOM scraping for reliable,
|
|
structured Instagram content downloading.
|
|
|
|
API Endpoints:
|
|
/api/posts/ — Paginated posts with full carousel support via `srcs` array
|
|
/api/story/ — Stories with direct CDN URLs
|
|
/api/tagged — Tagged posts (minimal data, supplemented via post pages)
|
|
|
|
Advantages over DOM scraping:
|
|
- Carousel items grouped by post (srcs array)
|
|
- Exact UNIX timestamps for post dates
|
|
- Reliable cursor-based pagination
|
|
- No Playwright dependency (uses curl_cffi for TLS fingerprint matching)
|
|
- Pinned post detection (isPind flag)
|
|
|
|
Uses curl_cffi to impersonate Chrome's TLS fingerprint, which is required
|
|
for Cloudflare cf_clearance cookies to work outside a real browser.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import json
|
|
import time
|
|
import hashlib
|
|
from curl_cffi import requests as cf_requests
|
|
from curl_cffi.requests.exceptions import ImpersonateError
|
|
from pathlib import Path
|
|
|
|
|
|
def _create_cf_session(**kwargs):
    """Build a curl_cffi session, preferring newer Chrome impersonation targets.

    Tries progressively more generic browser identifiers; if none is known
    to this curl_cffi build, returns a plain session without impersonation.
    """
    for target in ("chrome131", "chrome136", "chrome"):
        try:
            return cf_requests.Session(impersonate=target, **kwargs)
        except Exception:
            # This build doesn't support the target — try the next one.
            pass
    # Last resort: no impersonation (may fail Cloudflare's TLS checks).
    return cf_requests.Session(**kwargs)
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, List, Optional, Set, Tuple
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from modules.cloudflare_handler import (
|
|
CloudflareHandler, SiteStatus,
|
|
get_flaresolverr_user_agent,
|
|
get_flaresolverr_fingerprint
|
|
)
|
|
from modules.instagram_utils import (
|
|
extract_instagram_media_id,
|
|
media_id_to_shortcode,
|
|
scan_existing_files_for_media_ids,
|
|
record_instagram_download,
|
|
is_instagram_downloaded
|
|
)
|
|
|
|
|
|
class ImgInnAPIDownloader(LoggingMixin):
|
|
"""ImgInn API-based downloader with full carousel grouping support."""
|
|
|
|
IMGINN_BASE = "https://imginn.com"
|
|
|
|
    def __init__(self, headless=True, cookie_file=None,
                 show_progress=True, use_database=True,
                 log_callback=None, unified_db=None):
        """Initialize downloader (compatible with ImgInnDownloader interface).

        Args:
            headless: Ignored (no browser needed), kept for interface compat
            cookie_file: Cookie file path (used only if no unified_db)
            show_progress: Whether to show progress updates
            use_database: Whether to use database for tracking
            log_callback: Optional log callback
            unified_db: UnifiedDatabase instance
        """
        self._init_logger('Instagram', log_callback, default_module='Download')

        self.headless = headless
        # Media IDs already seen this run (in-memory duplicate guard)
        self.downloaded_files: Set[str] = set()
        self.show_progress = show_progress
        self.use_database = use_database
        self.download_count = 0
        self.unified_db = unified_db
        self.scraper_id = 'imginn'
        # Records queued via _record_download(deferred=True), flushed by caller
        self.pending_downloads: List[dict] = []

        # DB tracking needs both a handle and the flag; otherwise disable both
        if unified_db and use_database:
            self.unified_db = unified_db
        else:
            self.unified_db = None
            self.use_database = False

        # Activity status manager
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)

        # Proxy config from database
        self.proxy_url = None
        if unified_db:
            scraper_config = unified_db.get_scraper(self.scraper_id)
            if scraper_config:
                if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")

        # User agent from FlareSolverr
        self.user_agent = get_flaresolverr_user_agent()

        # CloudflareHandler (no cookie file when using DB)
        self.cf_handler = CloudflareHandler(
            module_name="ImgInn",
            cookie_file=None if unified_db else (cookie_file or "/opt/media-downloader/cookies/imginn_cookies.json"),
            user_agent=self.user_agent,
            logger=self.logger,
            aggressive_expiry=True,
            proxy_url=self.proxy_url
        )

        self._load_cookies_from_db()

        # HTTP session (curl_cffi with Chrome TLS fingerprint)
        self.session = _create_cf_session()
        self._setup_session()

        # Rate limiting
        self._last_request_time = None
        self._min_request_interval = 2  # seconds between requests

        # Cookie refresh cooldown (don't re-fetch within 5 minutes)
        self._last_cookie_refresh = None
        self._cookie_refresh_interval = 300  # 5 minutes

        # User ID cache (username -> id)
        self._user_id_cache: Dict[str, str] = {}
|
|
|
|
# ==================== Cookie / Session ====================
|
|
|
|
def _recreate_session(self):
|
|
"""Recreate the curl_cffi session when impersonation fails at request time."""
|
|
self.log("Impersonation error, recreating curl_cffi session...", "warning")
|
|
try:
|
|
self.session.close()
|
|
except Exception:
|
|
pass
|
|
self.session = _create_cf_session()
|
|
self._setup_session()
|
|
self._refresh_session_cookies()
|
|
|
|
def _load_cookies_from_db(self):
|
|
if not self.unified_db:
|
|
return
|
|
try:
|
|
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
|
if cookies:
|
|
self.cf_handler._cookies = cookies
|
|
self.log(f"Loaded {len(cookies)} cookies from database", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies: {e}", "warning")
|
|
|
|
def _save_cookies_to_db(self, cookies, user_agent=None):
|
|
if not self.unified_db:
|
|
return
|
|
try:
|
|
ua = user_agent or self.user_agent
|
|
self.unified_db.save_scraper_cookies(self.scraper_id, cookies,
|
|
user_agent=ua, merge=True)
|
|
except Exception as e:
|
|
self.log(f"Error saving cookies: {e}", "warning")
|
|
|
|
def _setup_session(self):
|
|
"""Configure curl_cffi session with CF-matching headers."""
|
|
fingerprint = get_flaresolverr_fingerprint()
|
|
|
|
stored_ua = None
|
|
if self.unified_db:
|
|
try:
|
|
stored_ua = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
|
|
except Exception:
|
|
pass
|
|
|
|
self._stored_ua = stored_ua or fingerprint.get('user_agent', self.user_agent)
|
|
|
|
self._default_headers = {
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
|
'Accept-Language': fingerprint.get('accept_language', 'en-US,en;q=0.9'),
|
|
'Connection': 'keep-alive',
|
|
'User-Agent': self._stored_ua,
|
|
}
|
|
|
|
# Load CF cookies
|
|
self._refresh_session_cookies()
|
|
|
|
def _refresh_session_cookies(self):
|
|
"""Reload CF cookies into the curl_cffi session."""
|
|
cf_cookies = self.cf_handler.get_cookies_dict()
|
|
# curl_cffi session uses a cookies dict
|
|
for name, value in cf_cookies.items():
|
|
self.session.cookies.set(name, value, domain=".imginn.com")
|
|
|
|
    def _ensure_cookies(self, force: bool = False) -> bool:
        """Ensure valid CF cookies, refresh via FlareSolverr if needed.

        Uses a cooldown to avoid calling FlareSolverr too frequently.
        With aggressive_expiry=True, cookies_expired() returns True whenever
        cf_clearance expiry is within 7 days — but cf_clearance only lasts ~30 min,
        so without cooldown we'd call FlareSolverr on every single request.

        Args:
            force: If True, skip cooldown and expiry checks and always refresh.
                Used when a 403 proves the current cookies are invalid.

        Returns:
            True when cookies are believed valid (or were just refreshed),
            False when a refresh was attempted and failed.
        """
        if not force:
            # If we refreshed recently, skip the expiry check entirely
            if self._last_cookie_refresh:
                elapsed = time.time() - self._last_cookie_refresh
                if elapsed < self._cookie_refresh_interval:
                    return True

            if not self.cf_handler.cookies_expired():
                return True

        self.log("Cookies expired, refreshing via FlareSolverr...", "info")
        success = self.cf_handler.get_cookies_via_flaresolverr(f"{self.IMGINN_BASE}/")
        # Stamp the cooldown even on failure so a broken FlareSolverr
        # doesn't get hammered on every subsequent request.
        self._last_cookie_refresh = time.time()

        if success:
            cookies_list = self.cf_handler.get_cookies_list()
            flaresolverr_ua = self.cf_handler.get_user_agent()
            if cookies_list and self.unified_db:
                self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua)

            # Refresh session cookies and UA together — cf_clearance is bound
            # to the UA of the browser that solved the challenge.
            if flaresolverr_ua:
                self._stored_ua = flaresolverr_ua
                self._default_headers['User-Agent'] = flaresolverr_ua
            self._refresh_session_cookies()
            return True

        self.log("Failed to get fresh cookies", "warning")
        return False
|
|
|
|
# ==================== HTTP Helpers ====================
|
|
|
|
def _rate_limit(self):
|
|
if self._last_request_time:
|
|
elapsed = time.time() - self._last_request_time
|
|
if elapsed < self._min_request_interval:
|
|
time.sleep(self._min_request_interval - elapsed)
|
|
self._last_request_time = time.time()
|
|
|
|
def _is_cf_challenge(self, text: str) -> bool:
|
|
"""Check if response is a Cloudflare challenge page."""
|
|
if len(text) > 10000:
|
|
return False
|
|
lower = text[:2000].lower()
|
|
return any(ind in lower for ind in [
|
|
'just a moment', 'checking your browser',
|
|
'verify you are human', 'challenge-platform'
|
|
]) and '<form' not in lower[:500]
|
|
|
|
    def _fetch_html(self, url: str) -> Optional[str]:
        """Fetch a page via curl_cffi (Chrome TLS), handle CF challenges.

        Returns the page HTML, or None on 404 / unrecoverable errors.
        Falls back to FlareSolverr on 403/CF challenge and on repeated 5xx.
        """
        self._ensure_cookies()
        self._rate_limit()

        headers = {**self._default_headers}

        try:
            try:
                resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True)
            except ImpersonateError:
                # The impersonation target vanished from this curl_cffi build —
                # rebuild the session once and retry.
                self._recreate_session()
                resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True)

            if resp.status_code == 403 or self._is_cf_challenge(resp.text):
                self.log(f"CF challenge on {url}, trying FlareSolverr direct fetch...", "info")
                return self._fetch_html_via_flaresolverr(url)

            if resp.status_code == 404:
                self.log(f"Page not found: {url}", "warning")
                return None

            # ImgInn returns 410 for some valid profiles — treat as OK if body has content
            if resp.status_code == 410 and len(resp.text) > 1000:
                return resp.text

            # Retry on server errors (500/502/503) — often transient
            if resp.status_code >= 500:
                self.log(f"HTTP {resp.status_code} for {url}, retrying in 5s...", "warning")
                time.sleep(5)
                self._rate_limit()
                resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True)
                if resp.status_code >= 500:
                    self.log(f"HTTP {resp.status_code} for {url} on retry, trying FlareSolverr...", "warning")
                    return self._fetch_html_via_flaresolverr(url)
                if resp.status_code == 200:
                    return resp.text
                # Same 410-with-content allowance for the retried response
                if resp.status_code == 410 and len(resp.text) > 1000:
                    return resp.text

            if resp.status_code != 200:
                self.log(f"HTTP {resp.status_code} for {url}", "warning")
                return None

            return resp.text
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", "error")
            return None
|
|
|
|
    def _call_api(self, endpoint: str, params: dict) -> Optional[dict]:
        """Make an API call to ImgInn, return parsed JSON.

        Falls back to FlareSolverr if curl_cffi gets 403 (CF challenge).
        This is needed for endpoints like /api/story/ where Cloudflare
        applies stricter path-based rules.

        Args:
            endpoint: Path portion of the API URL (e.g. '/api/posts/').
            params: Query-string parameters.

        Returns:
            Parsed JSON dict, or None on HTTP, JSON, or network errors.
        """
        self._ensure_cookies()
        self._rate_limit()

        url = f"{self.IMGINN_BASE}{endpoint}"
        # XHR-style headers so the request matches the site's own fetch() calls
        headers = {
            **self._default_headers,
            'Accept': '*/*',
            'Referer': f'{self.IMGINN_BASE}/',
            'X-Requested-With': 'XMLHttpRequest',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
        }

        try:
            try:
                resp = self.session.get(url, params=params, headers=headers, timeout=30)
            except ImpersonateError:
                # Rebuild the session once if the impersonation target is gone
                self._recreate_session()
                resp = self.session.get(url, params=params, headers=headers, timeout=30)

            if resp.status_code == 429:
                # Single retry after a fixed backoff on rate limiting
                self.log("Rate limited (429), waiting 30s...", "warning")
                time.sleep(30)
                self._rate_limit()
                resp = self.session.get(url, params=params, headers=headers, timeout=30)

            if resp.status_code == 403 or self._is_cf_challenge(resp.text):
                self.log(f"CF challenge on {endpoint}, trying FlareSolverr...", "info")
                return self._call_api_via_flaresolverr(url, params)

            if resp.status_code != 200:
                self.log(f"API {resp.status_code} for {endpoint}", "warning")
                return None

            return resp.json()
        except (ValueError, json.JSONDecodeError):
            # json.JSONDecodeError subclasses ValueError; both listed for clarity
            self.log(f"Invalid JSON from {endpoint}", "warning")
            return None
        except Exception as e:
            self.log(f"API error {endpoint}: {e}", "error")
            return None
|
|
|
|
    def _call_api_via_flaresolverr(self, url: str, params: dict) -> Optional[dict]:
        """Fetch an API endpoint through FlareSolverr's browser.

        Used as fallback when curl_cffi gets 403 from Cloudflare on
        certain API endpoints (e.g. /api/story/).

        Side effects: persists any cookies FlareSolverr obtained (handler,
        DB, and live session) so later curl_cffi requests can skip the
        challenge.

        Returns:
            Parsed JSON dict, or None on FlareSolverr or parse failure.
        """
        import html as html_mod

        # Build full URL with query params
        from urllib.parse import urlencode
        full_url = f"{url}?{urlencode(params)}" if params else url

        try:
            # Plain requests is fine here: FlareSolverr is a local service and
            # performs the fingerprinted fetch in its own browser.
            import requests as std_requests
            payload = {
                'cmd': 'request.get',
                'url': full_url,
                'maxTimeout': 60000,
            }
            resp = std_requests.post('http://localhost:8191/v1', json=payload, timeout=70)
            data = resp.json()

            if data.get('status') != 'ok':
                self.log(f"FlareSolverr error: {data.get('message', 'unknown')}", "warning")
                return None

            solution = data.get('solution', {})
            response_text = solution.get('response', '')

            # Save cookies from FlareSolverr for future curl_cffi requests
            cookies_list = solution.get('cookies', [])
            if cookies_list:
                flaresolverr_ua = solution.get('userAgent', self.cf_handler.get_user_agent())
                self.cf_handler.save_cookies(cookies_list, user_agent=flaresolverr_ua)
                if flaresolverr_ua:
                    # cf_clearance is bound to the solving browser's UA
                    self._stored_ua = flaresolverr_ua
                    self._default_headers['User-Agent'] = flaresolverr_ua
                if self.unified_db:
                    self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua)
                self._refresh_session_cookies()

            if not response_text:
                return None

            # FlareSolverr wraps JSON responses in HTML <pre> tags
            pre_match = re.search(r'<pre[^>]*>(.*?)</pre>', response_text, re.DOTALL)
            if pre_match:
                # Entities like &quot; are escaped by the browser; undo that
                json_text = html_mod.unescape(pre_match.group(1))
                return json.loads(json_text)

            # Try parsing raw response as JSON
            return json.loads(response_text)

        except (ValueError, json.JSONDecodeError) as e:
            self.log(f"FlareSolverr JSON parse error: {e}", "warning")
            return None
        except Exception as e:
            self.log(f"FlareSolverr fetch error: {e}", "error")
            return None
|
|
|
|
    def _fetch_html_via_flaresolverr(self, url: str) -> Optional[str]:
        """Fetch an HTML page through FlareSolverr's browser.

        Used as fallback when curl_cffi gets 403 from Cloudflare on
        HTML pages (e.g. /tagged/) that have stricter path-based rules.

        Side effects: persists any cookies FlareSolverr obtained so later
        curl_cffi requests can skip the challenge.

        Returns:
            Page HTML, or None on FlareSolverr failure / unbypassed challenge.
        """
        try:
            import requests as std_requests
            payload = {
                'cmd': 'request.get',
                'url': url,
                'maxTimeout': 120000,
            }
            # timeout=130 > maxTimeout so FlareSolverr, not us, times out first
            resp = std_requests.post('http://localhost:8191/v1', json=payload, timeout=130)
            data = resp.json()

            if data.get('status') != 'ok':
                self.log(f"FlareSolverr error: {data.get('message', 'unknown')}", "warning")
                return None

            solution = data.get('solution', {})
            response_text = solution.get('response', '')

            if not response_text:
                self.log("FlareSolverr returned empty response", "warning")
                return None

            if self._is_cf_challenge(response_text):
                self.log("FlareSolverr could not bypass CF challenge", "warning")
                return None

            # Save cookies from FlareSolverr for future curl_cffi requests
            cookies_list = solution.get('cookies', [])
            if cookies_list:
                flaresolverr_ua = solution.get('userAgent', self.cf_handler.get_user_agent())
                self.cf_handler.save_cookies(cookies_list, user_agent=flaresolverr_ua)
                if flaresolverr_ua:
                    # cf_clearance is bound to the solving browser's UA
                    self._stored_ua = flaresolverr_ua
                    self._default_headers['User-Agent'] = flaresolverr_ua
                if self.unified_db:
                    self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua)
                self._refresh_session_cookies()

            return response_text

        except Exception as e:
            self.log(f"FlareSolverr HTML fetch error: {e}", "error")
            return None
|
|
|
|
def _download_file(self, url: str, output_path: Path) -> bool:
|
|
"""Download a file from CDN URL (no rate limit — goes to Instagram CDN, not ImgInn)."""
|
|
try:
|
|
# CDN downloads don't need ImgInn CF cookies - use session for TLS fingerprint
|
|
resp = self.session.get(
|
|
url,
|
|
headers={'Referer': f'{self.IMGINN_BASE}/'},
|
|
timeout=120,
|
|
)
|
|
if resp.status_code != 200:
|
|
self.log(f"Download HTTP {resp.status_code}: {output_path.name}", "warning")
|
|
return False
|
|
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(output_path, 'wb') as f:
|
|
f.write(resp.content)
|
|
|
|
size = output_path.stat().st_size
|
|
if size < 1000:
|
|
self.log(f"File too small ({size}B), discarding: {output_path.name}", "warning")
|
|
output_path.unlink()
|
|
return False
|
|
|
|
return True
|
|
except Exception as e:
|
|
self.log(f"Download error: {e}", "error")
|
|
if output_path.exists():
|
|
try:
|
|
output_path.unlink()
|
|
except OSError:
|
|
pass
|
|
return False
|
|
|
|
# ==================== Profile / Page Data Extraction ====================
|
|
|
|
def _get_profile_info(self, username: str) -> Optional[dict]:
|
|
"""Fetch profile page and extract user_id, cursor, shortcodes."""
|
|
url = f"{self.IMGINN_BASE}/{username}/"
|
|
html = self._fetch_html(url)
|
|
if not html:
|
|
# Retry once with forced cookie refresh (handles expired CF cookies)
|
|
if self._ensure_cookies(force=True):
|
|
html = self._fetch_html(url)
|
|
if not html:
|
|
return None
|
|
|
|
info = {
|
|
'user_id': None,
|
|
'cursor': None,
|
|
'verified': False,
|
|
'shortcodes': [],
|
|
}
|
|
|
|
# data-id on container or load-more button
|
|
id_match = re.search(r'data-id="(\d+)"', html)
|
|
if id_match:
|
|
info['user_id'] = id_match.group(1)
|
|
self._user_id_cache[username] = info['user_id']
|
|
|
|
# data-cursor on load-more button
|
|
cursor_match = re.search(r'data-cursor="([^"]+)"', html)
|
|
if cursor_match:
|
|
info['cursor'] = cursor_match.group(1)
|
|
|
|
# verified flag
|
|
if 'data-verified="true"' in html or 'data-verified="1"' in html:
|
|
info['verified'] = True
|
|
|
|
# Extract post shortcodes from grid links
|
|
shortcodes = re.findall(r'href="/p/([A-Za-z0-9_-]+)/"', html)
|
|
seen = set()
|
|
for sc in shortcodes:
|
|
if sc not in seen:
|
|
seen.add(sc)
|
|
info['shortcodes'].append(sc)
|
|
|
|
return info
|
|
|
|
    def get_user_profile(self, username: str) -> Optional[dict]:
        """Fetch public profile info: avatar, display name, bio, stats.

        Args:
            username: Instagram username

        Returns:
            Dict with keys: username, user_id, display_name, avatar_url,
            bio, followers, following, posts_count, verified
            Returns None if profile cannot be fetched.
        """
        import html as html_mod

        if not self._ensure_cookies():
            return None

        html = self._fetch_html(f"{self.IMGINN_BASE}/{username}/")
        if not html:
            return None

        profile = {
            'username': username,
            'user_id': None,
            'display_name': None,
            'avatar_url': None,
            'bio': None,
            'followers': None,       # kept as display strings, e.g. "1.2M"
            'following': None,
            'posts_count': None,
            'verified': False,
        }

        # User ID
        id_match = re.search(r'data-id="(\d+)"', html)
        if id_match:
            profile['user_id'] = id_match.group(1)
            # Cache it so the stories API doesn't need another page fetch
            self._user_id_cache[username] = profile['user_id']

        # Verified
        if 'data-verified="true"' in html or 'data-verified="1"' in html:
            profile['verified'] = True

        # Avatar from og:image
        og_img = re.search(r'property="og:image"\s*content="([^"]+)"', html)
        if og_img:
            profile['avatar_url'] = html_mod.unescape(og_img.group(1))

        # Display name from og:title: "View Display Name(@username)..."
        og_title = re.search(r'property="og:title"\s*content="([^"]+)"', html)
        if og_title:
            title_text = html_mod.unescape(og_title.group(1))
            name_match = re.match(r'View\s+(.+?)\s*\(@', title_text)
            if name_match:
                profile['display_name'] = name_match.group(1).strip()

        # Bio and stats from og:description
        # Format: "Bio text here Followers_count Followers, Following_count Following, Posts_count Posts"
        og_desc = re.search(r'property="og:description"\s*content="([^"]+)"', html)
        if og_desc:
            desc = html_mod.unescape(og_desc.group(1))

            # Extract stats from end of description (counts may be "1.2M"/"34K" style)
            stats_match = re.search(
                r'([\d,.]+[MKk]?)\s*Followers?,\s*([\d,.]+[MKk]?)\s*Following,\s*([\d,.]+[MKk]?)\s*Posts?',
                desc
            )
            if stats_match:
                profile['followers'] = stats_match.group(1)
                profile['following'] = stats_match.group(2)
                profile['posts_count'] = stats_match.group(3)

                # Bio is everything before the stats
                bio = desc[:stats_match.start()].strip().rstrip(',').strip()
                if bio:
                    profile['bio'] = bio

        return profile
|
|
|
|
    def _get_stories_params(self, username: str, user_id: str = None) -> Optional[dict]:
        """Get parameters for the stories API call.

        The stories API requires uid, name, and hash parameters.
        - uid: Instagram numeric user ID (from profile page or cache)
        - name: Instagram username
        - hash: floor(current_time / 100000) + 1 — time-based cache-period hash

        Note: We don't fetch the /stories/ HTML page because Cloudflare applies
        stricter challenge rules to that path. Instead, we get the uid from the
        profile page (which works fine) and compute the hash directly.
        """
        # Prefer explicit arg, then the per-run cache filled by profile fetches
        uid = user_id or self._user_id_cache.get(username)

        if not uid:
            # Fetch profile page to get user_id
            profile = self._get_profile_info(username)
            if profile:
                uid = profile['user_id']

        if not uid:
            self.log(f"Cannot resolve user_id for @{username}", "warning")
            return None

        # Hash computation: floor(current_time / 100000) + 1.
        # The +1 selects the current cache period; without it the server
        # returns stale story data from the previous period.
        story_hash = str(int(time.time()) // 100000 + 1)

        return {'uid': uid, 'name': username, 'hash': story_hash}
|
|
|
|
def _get_tagged_info(self, username: str) -> Optional[dict]:
|
|
"""Fetch tagged page and extract user_id + cursor."""
|
|
html = self._fetch_html(f"{self.IMGINN_BASE}/tagged/{username}/")
|
|
if not html:
|
|
return None
|
|
|
|
info = {'user_id': None, 'cursor': None, 'shortcodes': []}
|
|
|
|
id_match = re.search(r'data-id="(\d+)"', html)
|
|
if id_match:
|
|
info['user_id'] = id_match.group(1)
|
|
self._user_id_cache[username] = info['user_id']
|
|
|
|
cursor_match = re.search(r'data-cursor="([^"]+)"', html)
|
|
if cursor_match:
|
|
info['cursor'] = cursor_match.group(1)
|
|
|
|
# Extract tagged post shortcodes
|
|
shortcodes = re.findall(r'href="/p/([A-Za-z0-9_-]+)/"', html)
|
|
seen = set()
|
|
for sc in shortcodes:
|
|
if sc not in seen:
|
|
seen.add(sc)
|
|
info['shortcodes'].append(sc)
|
|
|
|
return info
|
|
|
|
    def _get_post_detail(self, shortcode: str) -> Optional[dict]:
        """Fetch individual post page and extract media URLs + metadata.

        Returns a post dict shaped like the /api/posts/ items (code, date,
        alt, author, srcs, isSidecar, isPind), or None when no CDN media
        URLs could be extracted.
        """
        html = self._fetch_html(f"{self.IMGINN_BASE}/p/{shortcode}/")
        if not html:
            return None

        post = {
            'code': shortcode,
            'date': None,        # UNIX timestamp, filled by fallbacks below
            'alt': '',           # caption text
            'author': None,
            'srcs': [],          # full-res CDN URLs (one per carousel slide)
            'isSidecar': False,
            'isPind': False,     # detail pages don't expose the pinned flag
        }

        import html as html_mod

        # Extract date from data-created (UNIX timestamp)
        date_match = re.search(r'data-created="(\d+)"', html)
        if date_match:
            post['date'] = int(date_match.group(1))
        else:
            # Fallback: try datetime attribute on <time> elements
            time_match = re.search(r'<time[^>]*datetime="([^"]+)"', html)
            if time_match:
                try:
                    from datetime import timezone
                    # fromisoformat (pre-3.11) rejects a trailing 'Z' — normalize it
                    dt = datetime.fromisoformat(time_match.group(1).replace('Z', '+00:00'))
                    post['date'] = int(dt.timestamp())
                except Exception:
                    pass
        if not post['date']:
            # Fallback: try data-date or data-timestamp attributes
            alt_date = re.search(r'data-(?:date|timestamp|time)="(\d{10,13})"', html)
            if alt_date:
                ts = int(alt_date.group(1))
                if ts > 1e12:  # milliseconds
                    ts = ts // 1000
                post['date'] = ts
        if not post['date']:
            self.log(f"Could not extract date for post {shortcode}", "warning")

        # Extract author username from div.username link (most reliable)
        # Format: <div class="username"><a href="/username/">...</a></div>
        username_link = re.search(r'class="username"[^>]*>\s*<a\s+href="/([^"]+?)/"', html)
        if username_link:
            author_candidate = username_link.group(1).strip().lower()
            # Sanity-check against Instagram's username charset/length rules
            if re.match(r'^[a-zA-Z0-9_.]{1,30}$', author_candidate):
                post['author'] = author_candidate

        # Extract caption from og:description (format: "username: caption text")
        cap_match = re.search(r'<meta\s+property="og:description"\s+content="([^"]*)"', html)
        if cap_match:
            full_text = html_mod.unescape(cap_match.group(1))
            # Fallback: extract author from caption if not found above
            if not post['author'] and ':' in full_text:
                author_candidate = full_text.split(':')[0].strip()
                if re.match(r'^[a-zA-Z0-9_.]{1,30}$', author_candidate):
                    post['author'] = author_candidate
            post['alt'] = full_text

        # Extract media URLs from swiper slides
        # Each swiper-slide has a data-src with the full-res CDN URL
        # Only grab data-src from within swiper-slide divs (not profile pics etc.)
        slide_pattern = re.compile(
            r'class="swiper-slide[^"]*"[^>]*data-src="([^"]+)"', re.DOTALL)
        slide_srcs = slide_pattern.findall(html)

        # Also check for plain data-src within the main post area (non-carousel)
        if not slide_srcs:
            # Look for the main download button link with scontent URL
            dl_pattern = re.compile(
                r'class="[^"]*downloads[^"]*"[^>]*href="(https://scontent[^"]+)"', re.DOTALL)
            dl_srcs = dl_pattern.findall(html)
            if not dl_srcs:
                # Broader: any scontent link with dl=1
                dl_srcs = re.findall(r'href="(https://scontent[^"]*dl=1[^"]*)"', html)

            slide_srcs = dl_srcs

        # Clean URLs and filter to CDN only
        urls = []
        seen_urls = set()
        for src in slide_srcs:
            src = html_mod.unescape(src)
            # Only keep Instagram CDN URLs
            if 'scontent' not in src and 'cdninstagram' not in src:
                continue
            # Deduplicate on the path, ignoring query-string differences
            base = src.split('?')[0]
            if base in seen_urls:
                continue
            seen_urls.add(base)
            urls.append(src)

        post['srcs'] = urls
        post['isSidecar'] = len(urls) > 1

        # A post with no extractable media is useless downstream
        return post if urls else None
|
|
|
|
# ==================== File Naming ====================
|
|
|
|
def _extract_cdn_filename(self, url: str) -> str:
|
|
"""Extract the base filename from a CDN URL (without extension)."""
|
|
path = url.split('?')[0]
|
|
filename = path.split('/')[-1]
|
|
name = filename.rsplit('.', 1)[0] if '.' in filename else filename
|
|
return name
|
|
|
|
def _extract_ext(self, url: str) -> str:
|
|
"""Extract file extension from CDN URL."""
|
|
path = url.split('?')[0]
|
|
if '.mp4' in path:
|
|
return '.mp4'
|
|
elif '.webp' in path:
|
|
return '.webp'
|
|
elif '.png' in path:
|
|
return '.png'
|
|
elif '.jpeg' in path:
|
|
return '.jpeg'
|
|
return '.jpg'
|
|
|
|
def _make_filename(self, profile: str, date_ts: int, cdn_filename: str,
|
|
ext: str, slide_index: int = None) -> str:
|
|
"""Generate filename: {profile}_{YYYYMMDD_HHMMSS}_{cdn_filename}[_{idx}]{ext}"""
|
|
dt = datetime.fromtimestamp(date_ts) if date_ts else datetime.now()
|
|
date_str = dt.strftime('%Y%m%d_%H%M%S')
|
|
|
|
if slide_index is not None and slide_index > 0:
|
|
return f"{profile}_{date_str}_{cdn_filename}_{slide_index}{ext}"
|
|
return f"{profile}_{date_str}_{cdn_filename}{ext}"
|
|
|
|
def _update_file_timestamps(self, filepath: Path, post_date_ts: int):
|
|
"""Set file modification time to match post date."""
|
|
if not post_date_ts:
|
|
return
|
|
try:
|
|
os.utime(str(filepath), (post_date_ts, post_date_ts))
|
|
except Exception:
|
|
pass
|
|
|
|
# ==================== Duplicate Detection ====================
|
|
|
|
def _is_already_downloaded(self, media_id: str, username: str = None) -> bool:
|
|
if media_id in self.downloaded_files:
|
|
return True
|
|
if self.unified_db and self.use_database:
|
|
return is_instagram_downloaded(self.unified_db, media_id, username)
|
|
return False
|
|
|
|
def _scan_existing_files(self, output_dir: Path, profile: str):
|
|
existing = scan_existing_files_for_media_ids(
|
|
output_dir, profile, min_file_size=1000
|
|
)
|
|
self.downloaded_files.update(existing)
|
|
if existing:
|
|
self.log(f"Found {len(existing)} existing files", "debug")
|
|
|
|
# ==================== Phrase Filtering ====================
|
|
|
|
def _check_phrases(self, caption: str, phrase_config: dict) -> bool:
|
|
"""Check if caption matches phrase filter. Returns True if post should be downloaded."""
|
|
if not phrase_config or not phrase_config.get('enabled'):
|
|
return True
|
|
phrases = phrase_config.get('phrases', [])
|
|
if not phrases:
|
|
return True
|
|
|
|
case_sensitive = phrase_config.get('case_sensitive', False)
|
|
match_all = phrase_config.get('match_all', False)
|
|
|
|
text = caption if case_sensitive else caption.lower()
|
|
|
|
if match_all:
|
|
return all((p if case_sensitive else p.lower()) in text for p in phrases)
|
|
else:
|
|
return any((p if case_sensitive else p.lower()) in text for p in phrases)
|
|
|
|
# ==================== Database Recording ====================
|
|
|
|
def _record_download(self, media_id, username, filename, url=None,
|
|
post_date=None, file_path=None, content_type='post',
|
|
metadata=None, deferred=False):
|
|
record = {
|
|
'media_id': media_id,
|
|
'username': username,
|
|
'filename': filename,
|
|
'url': url or f'instagram://{media_id}',
|
|
'post_date': post_date,
|
|
'file_path': file_path,
|
|
'content_type': content_type,
|
|
'metadata': metadata or {},
|
|
}
|
|
|
|
if deferred:
|
|
self.pending_downloads.append(record)
|
|
return True
|
|
|
|
if self.unified_db and self.use_database:
|
|
try:
|
|
return record_instagram_download(
|
|
self.unified_db,
|
|
media_id=media_id,
|
|
username=username,
|
|
content_type=content_type,
|
|
filename=filename,
|
|
file_path=file_path,
|
|
url=url,
|
|
post_date=datetime.fromtimestamp(post_date) if isinstance(post_date, (int, float)) else post_date,
|
|
method='imginn',
|
|
extra_metadata=metadata
|
|
)
|
|
except Exception as e:
|
|
self.log(f"Error recording download: {e}", "warning")
|
|
return False
|
|
|
|
def get_pending_downloads(self) -> list:
|
|
return self.pending_downloads.copy()
|
|
|
|
def clear_pending_downloads(self):
|
|
self.pending_downloads = []
|
|
|
|
# ==================== Post Processing ====================
|
|
|
|
def _batch_upgrade_to_hires(self, items: list) -> Tuple[dict, dict]:
    """Batch fetch post detail pages via FlareSolverr session for full-res URLs.

    Creates a persistent browser session that solves CF once, then
    reuses it for all subsequent requests (~0.5s each instead of ~10s).

    Args:
        items: List of API post items with 'code' keys

    Returns:
        Tuple of (srcs_map, dates_map) where:
        - srcs_map: Dict mapping shortcode -> list of full-res src URLs
        - dates_map: Dict mapping shortcode -> UNIX timestamp (for items missing dates)
        Both dicts are empty when no session could be created.
    """
    # Plain requests is fine here: FlareSolverr runs locally, so no TLS
    # fingerprint impersonation is needed.
    import requests as std_requests

    shortcodes = [item.get('code', '') for item in items if item.get('code')]
    if not shortcodes:
        return {}, {}

    results = {}
    dates = {}
    session_id = None
    total = len(shortcodes)

    try:
        resp = std_requests.post('http://localhost:8191/v1', json={
            'cmd': 'sessions.create'
        }, timeout=30)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log("Failed to create FlareSolverr session for full-res, using API URLs", 'warning')
            # BUG FIX: this function's contract (and both callers, which
            # unpack two values) requires a 2-tuple; returning a bare {}
            # raised "not enough values to unpack" on this failure path.
            return {}, {}
        session_id = data.get('session')

        self.log(f"Fetching full-res URLs for {total} posts via browser session...", 'info')

        for i, code in enumerate(shortcodes):
            if self.show_progress and i % 10 == 0:
                self.activity_manager.update_status(
                    f"Fetching full-res {i + 1}/{total}")

            try:
                resp = std_requests.post('http://localhost:8191/v1', json={
                    'cmd': 'request.get',
                    'url': f'{self.IMGINN_BASE}/p/{code}/',
                    'session': session_id,
                    'maxTimeout': 60000,
                }, timeout=70)
                page_data = resp.json()

                if page_data.get('status') != 'ok':
                    continue

                html = page_data.get('solution', {}).get('response', '')
                if not html:
                    continue

                srcs = self._parse_detail_srcs(html)
                if srcs:
                    results[code] = srcs

                # Also extract date from detail page (for items missing dates)
                date_match = re.search(r'data-created="(\d+)"', html)
                if date_match:
                    dates[code] = int(date_match.group(1))

            except Exception as e:
                # One bad post should not abort the whole batch.
                self.log(f"Detail fetch failed for {code}: {e}", 'debug')
                continue

    except Exception as e:
        self.log(f"FlareSolverr session error: {e}", 'warning')
    finally:
        # Always tear down the browser session so FlareSolverr does not
        # leak headless-browser instances.
        if session_id:
            try:
                std_requests.post('http://localhost:8191/v1', json={
                    'cmd': 'sessions.destroy',
                    'session': session_id,
                }, timeout=10)
            except Exception:
                pass

    self.log(f"Got full-res URLs for {len(results)}/{total} posts, dates for {len(dates)}", 'info')
    return results, dates
|
|
|
|
@staticmethod
|
|
def _parse_detail_srcs(html: str) -> list:
|
|
"""Extract full-res CDN URLs from a post detail page HTML."""
|
|
import html as html_mod
|
|
|
|
slide_pattern = re.compile(
|
|
r'class="swiper-slide[^"]*"[^>]*data-src="([^"]+)"', re.DOTALL)
|
|
slide_srcs = slide_pattern.findall(html)
|
|
|
|
if not slide_srcs:
|
|
dl_pattern = re.compile(
|
|
r'class="[^"]*downloads[^"]*"[^>]*href="(https://scontent[^"]+)"', re.DOTALL)
|
|
slide_srcs = dl_pattern.findall(html)
|
|
if not slide_srcs:
|
|
slide_srcs = re.findall(r'href="(https://scontent[^"]*dl=1[^"]*)"', html)
|
|
|
|
urls = []
|
|
seen = set()
|
|
for src in slide_srcs:
|
|
src = html_mod.unescape(src)
|
|
if 'scontent' not in src and 'cdninstagram' not in src:
|
|
continue
|
|
base = src.split('?')[0]
|
|
if base in seen:
|
|
continue
|
|
seen.add(base)
|
|
urls.append(src)
|
|
|
|
return urls
|
|
|
|
def _process_api_post(self, item: dict, username: str, output_dir: Path,
                      cutoff_ts: int, phrase_config: dict,
                      defer_database: bool, video_only: bool = False,
                      date_to_ts: int = None,
                      content_type: str = 'post') -> Tuple[str, List[str]]:
    """Process a single post from the API response.

    Applies (in order) media-presence, date-range, phrase, and video-only
    filters, then downloads every carousel slide that is not already
    present locally or in the duplicate cache, stamping file timestamps
    and recording each file.

    Args:
        item: API post item dict
        username: Instagram username
        output_dir: Download directory
        cutoff_ts: Oldest allowed post timestamp (0/None = no lower bound)
        phrase_config: Phrase filter config
        defer_database: Whether to defer DB recording
        video_only: If True, only download video items (for reels mode)
        date_to_ts: Newest allowed post timestamp (None = no upper bound)
        content_type: Label recorded with each download ('post', 'tagged', ...)

    Returns:
        Tuple of (status, downloaded_files) where status is:
        'downloaded', 'old', 'skipped', 'duplicate', 'filtered', 'future'
    """
    shortcode = item.get('code', '')
    post_date = item.get('date')  # UNIX timestamp
    if not post_date:
        self.log(f"Post {shortcode} has no date - timestamps will default to download time", "warning")
    caption = item.get('alt', '') or ''
    is_sidecar = item.get('isSidecar', False)
    srcs = item.get('srcs', [])

    # If no srcs, use src as fallback (single item)
    if not srcs:
        src = item.get('src', '')
        if src:
            srcs = [src]

    if not srcs:
        self.log(f"No media URLs for post {shortcode}", "debug")
        return ('skipped', [])

    # Date range: skip posts newer than date_to
    if post_date and date_to_ts and post_date > date_to_ts:
        return ('future', [])

    # Age check (cutoff_ts=0 or None means no lower bound).
    # This first branch only logs the comparison for debugging; the actual
    # rejection happens in the standalone check below.
    if cutoff_ts:
        if post_date:
            from datetime import datetime as _dt
            post_dt = _dt.fromtimestamp(post_date)
            cutoff_dt = _dt.fromtimestamp(cutoff_ts)
            self.log(f"Age check: post {shortcode} date={post_dt.isoformat()} cutoff={cutoff_dt.isoformat()} old={post_date < cutoff_ts}", "debug")
        else:
            self.log(f"Age check: post {shortcode} has no date (post_date={post_date}), skipping age filter", "debug")

    if post_date and cutoff_ts and post_date < cutoff_ts:
        return ('old', [])

    # Phrase check
    if not self._check_phrases(caption, phrase_config):
        self.log(f"Post {shortcode} filtered by phrase config", "debug")
        return ('filtered', [])

    # Video-only filter for reels mode: keep only .mp4 sources (the query
    # string is stripped before checking the extension).
    if video_only:
        video_srcs = [s for s in srcs if '.mp4' in s.split('?')[0]]
        if not video_srcs:
            return ('skipped', [])
        srcs = video_srcs

    downloaded = []

    for idx, src_url in enumerate(srcs):
        cdn_filename = self._extract_cdn_filename(src_url)
        media_id = extract_instagram_media_id(cdn_filename)
        ext = self._extract_ext(src_url)

        # Duplicate check against both identifiers, since older files may
        # be cached under either key.
        if self._is_already_downloaded(media_id, username):
            continue
        if self._is_already_downloaded(cdn_filename, username):
            continue

        # Build output filename; slide index only applies to multi-slide
        # sidecars (carousels).
        slide_index = idx if is_sidecar and len(srcs) > 1 else None
        out_filename = self._make_filename(username, post_date, cdn_filename, ext, slide_index)
        out_path = output_dir / out_filename

        # Skip if file already exists, but remember both IDs so later
        # duplicate checks short-circuit.
        if out_path.exists():
            self.downloaded_files.add(media_id)
            self.downloaded_files.add(cdn_filename)
            continue

        # Download
        if self._download_file(src_url, out_path):
            self.downloaded_files.add(media_id)
            self.downloaded_files.add(cdn_filename)

            # Set file timestamps to the post date when known
            if post_date:
                self._update_file_timestamps(out_path, post_date)

            # Record - use per-slide URL for sidecars so each slide gets a unique url_hash
            if shortcode:
                if is_sidecar and len(srcs) > 1:
                    instagram_url = f"https://www.instagram.com/p/{shortcode}/?img_index={idx + 1}"
                else:
                    instagram_url = f"https://www.instagram.com/p/{shortcode}/"
            else:
                instagram_url = None
            self._record_download(
                media_id=media_id,
                username=username,
                filename=out_filename,
                url=instagram_url,
                post_date=post_date,
                file_path=str(out_path),
                content_type=content_type,
                metadata={
                    'shortcode': shortcode,
                    'is_sidecar': is_sidecar,
                    'slide_index': idx if is_sidecar else None,
                    'total_slides': len(srcs) if is_sidecar else 1,
                    'cdn_filename': cdn_filename,
                },
                deferred=defer_database
            )

            downloaded.append(str(out_path))
            self.log(f"Downloaded: {out_filename}", "info")

    # 'duplicate' covers "nothing new": every slide was filtered by the
    # per-slide checks above.
    if downloaded:
        return ('downloaded', downloaded)
    return ('duplicate', [])
|
|
|
|
def _process_post_detail(self, post: dict, username: str, output_dir: Path,
|
|
cutoff_ts: int, phrase_config: dict,
|
|
defer_database: bool,
|
|
content_type: str = 'post') -> Tuple[str, List[str]]:
|
|
"""Process a post from _get_post_detail (HTML-extracted data).
|
|
|
|
Uses the post's author for the filename if available (important for
|
|
tagged posts where the author differs from the searched profile).
|
|
"""
|
|
# Use post author for filename if available, fall back to searched username
|
|
file_username = post.get('author') or username
|
|
|
|
# Convert to API-like format and delegate
|
|
api_item = {
|
|
'code': post.get('code', ''),
|
|
'date': post.get('date'),
|
|
'alt': post.get('alt', ''),
|
|
'isPind': post.get('isPind', False),
|
|
'isSidecar': post.get('isSidecar', False),
|
|
'srcs': post.get('srcs', []),
|
|
}
|
|
return self._process_api_post(api_item, file_username, output_dir,
|
|
cutoff_ts, phrase_config, defer_database,
|
|
content_type=content_type)
|
|
|
|
# ==================== Date Range Helpers ====================
|
|
|
|
def _parse_date(self, date_val) -> Optional[int]:
|
|
"""Parse a date value to UNIX timestamp.
|
|
|
|
Accepts: int/float (UNIX timestamp), ISO date string, datetime object, None.
|
|
"""
|
|
if date_val is None:
|
|
return None
|
|
if isinstance(date_val, (int, float)):
|
|
return int(date_val)
|
|
if isinstance(date_val, datetime):
|
|
return int(date_val.timestamp())
|
|
if isinstance(date_val, str):
|
|
for fmt in ('%Y-%m-%d', '%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S'):
|
|
try:
|
|
return int(datetime.strptime(date_val, fmt).timestamp())
|
|
except ValueError:
|
|
continue
|
|
return None
|
|
|
|
def _compute_cutoffs(self, days_back, date_from=None, date_to=None) -> Tuple[Optional[int], Optional[int]]:
|
|
"""Compute (cutoff_ts, date_to_ts) from days_back and explicit date range.
|
|
|
|
- days_back=0 means no lower bound (download everything)
|
|
- date_from overrides days_back if provided
|
|
- date_to sets an upper bound (skip posts newer than this)
|
|
"""
|
|
cutoff_ts = None
|
|
if date_from is not None:
|
|
cutoff_ts = self._parse_date(date_from)
|
|
elif days_back and days_back > 0:
|
|
cutoff_ts = int((datetime.now() - timedelta(days=days_back)).timestamp())
|
|
|
|
date_to_ts = self._parse_date(date_to) if date_to is not None else None
|
|
|
|
self.log(f"Cutoffs: days_back={days_back} cutoff_ts={cutoff_ts} ({datetime.fromtimestamp(cutoff_ts).isoformat() if cutoff_ts else 'None'}) date_to_ts={date_to_ts}", "info")
|
|
|
|
return cutoff_ts, date_to_ts
|
|
|
|
# ==================== Main Download Entry ====================
|
|
|
|
def download(self, username, content_type="posts", days_back=14,
             max_downloads=50, output_dir=None, phrase_config=None,
             defer_database=False, date_from=None, date_to=None) -> int:
    """Main download entry point (compatible with ImgInnDownloader).

    Args:
        username: Instagram username (or shortcode for content_type="post")
        content_type: "posts", "stories", "tagged", "reels", or "post" (single)
        days_back: How far back to download (0 = unlimited, no date cutoff)
        max_downloads: Maximum files to download (0 = unlimited)
        output_dir: Output directory path
        phrase_config: Optional phrase filtering config
        defer_database: Whether to defer database recording
        date_from: Explicit start date (overrides days_back). Accepts a
            UNIX timestamp, ISO string "YYYY-MM-DD", or datetime
        date_to: Explicit end date. Same formats as date_from.

    Returns:
        Number of files downloaded
    """
    # Reset per-run state before dispatching.
    self.downloaded_files = set()
    self.download_count = 0

    if output_dir is None:
        output_dir = f"/opt/media-downloader/downloads/{username}"
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Prime the duplicate cache from files already on disk.
    self._scan_existing_files(output_path, username)

    try:
        if content_type == "post":
            # Single post: username is treated as shortcode or URL
            files = self.download_single_post(username, output_path, defer_database)
        elif content_type == "posts":
            files = self.download_posts(username, days_back, max_downloads,
                                        output_path, phrase_config, defer_database,
                                        date_from=date_from, date_to=date_to)
        elif content_type == "stories":
            files = self.download_stories(username, days_back, max_downloads,
                                          output_path, defer_database)
        elif content_type == "tagged":
            files = self.download_tagged(username, days_back, max_downloads,
                                         output_path, phrase_config, defer_database,
                                         date_from=date_from, date_to=date_to)
        elif content_type == "reels":
            files = self.download_reels(username, days_back, max_downloads,
                                        output_path, phrase_config, defer_database,
                                        date_from=date_from, date_to=date_to)
        else:
            self.log(f"Unsupported content type: {content_type}", "warning")
            return 0
    except Exception as e:
        # Any failure in a sub-downloader is logged and reported as zero
        # downloads rather than propagated.
        self.log(f"Download error for @{username} ({content_type}): {e}", "error")
        import traceback
        self.log(traceback.format_exc(), "debug")
        return 0

    self.download_count = len(files) if files else 0
    return self.download_count
|
|
|
|
# ==================== Single Post ====================
|
|
|
|
def download_single_post(self, shortcode_or_url: str, output_dir=None,
                         defer_database=False) -> List[str]:
    """Download a single Instagram post by shortcode or URL.

    Args:
        shortcode_or_url: Post shortcode (e.g. "DVL2WxBFGBT") or an
            Instagram/ImgInn URL containing the shortcode
        output_dir: Output directory
        defer_database: Whether to defer DB recording

    Returns:
        List of downloaded file paths
    """
    # Accept full URLs: pull the shortcode out of a /p/<code>/ path.
    m = re.search(r'/p/([A-Za-z0-9_-]+)', shortcode_or_url)
    shortcode = m.group(1) if m else shortcode_or_url

    self.activity_manager.update_status(f"Downloading post {shortcode}")

    if not self._ensure_cookies():
        return []

    output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/posts")
    output_path.mkdir(parents=True, exist_ok=True)

    post = self._get_post_detail(shortcode)
    if not post:
        self.log(f"Could not fetch post /p/{shortcode}/", "error")
        return []

    username = post.get('author') or 'unknown'
    self.log(f"Post {shortcode} by @{username}: {len(post['srcs'])} media items", "info")

    # Single-post mode applies no date cutoff and no phrase filter.
    status, files = self._process_post_detail(
        post, username, output_path, cutoff_ts=None,
        phrase_config=None, defer_database=defer_database)

    self.log(f"Single post complete: {len(files)} files", "info")
    return files
|
|
|
|
# ==================== Posts ====================
|
|
|
|
def download_posts(self, username, days_back=14, max_posts=50,
                   output_dir=None, phrase_config=None,
                   defer_database=False, date_from=None, date_to=None) -> List[str]:
    """Download posts using ImgInn API with full carousel support.

    Three-phase flow: (1) collect candidate items via cursor-based API
    pagination, (2) batch-upgrade them to full-resolution URLs through a
    FlareSolverr browser session, (3) process and download each item.
    Falls back to fetching individual post pages when the first API page
    is unavailable.

    Args:
        days_back: How far back to download. 0 = no date cutoff (all posts).
        max_posts: Maximum posts to process. 0 = unlimited.
        date_from: Explicit start date (overrides days_back).
        date_to: Explicit end date (skip posts newer than this).

    Returns:
        List of downloaded file paths.
    """
    self.activity_manager.update_status(f"Checking posts for @{username}")

    if not self._ensure_cookies():
        self.activity_manager.update_status(f"Skipped - ImgInn unavailable")
        return []

    output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
    output_path.mkdir(parents=True, exist_ok=True)

    # Fetch profile page for user_id and cursor
    profile = self._get_profile_info(username)
    if not profile or not profile['user_id']:
        self.log(f"Could not resolve profile for @{username}", "warning")
        return []

    user_id = profile['user_id']
    cursor = profile['cursor']
    verified = '1' if profile['verified'] else '0'

    self.log(f"Profile @{username}: user_id={user_id}, {len(profile['shortcodes'])} initial posts", "info")

    cutoff_ts, date_to_ts = self._compute_cutoffs(days_back, date_from, date_to)
    has_date_cutoff = cutoff_ts is not None

    # max_posts=0 means unlimited
    effective_max = max_posts if max_posts > 0 else float('inf')
    # With a date cutoff, stop paginating after 5 consecutive too-old posts;
    # without one, never stop early for age.
    max_consecutive_old = 5 if has_date_cutoff else float('inf')

    # ── Phase 1: Collect items via API pagination ──
    self.activity_manager.update_status(f"Scanning posts for @{username}")
    collected_items = []
    consecutive_old = 0

    first_page = self._call_api('/api/posts/', {
        'id': user_id,
        'cursor': '',
        'username': username,
        'verified': verified,
    })

    if first_page and first_page.get('items'):
        for item in first_page['items']:
            if len(collected_items) >= effective_max or consecutive_old >= max_consecutive_old:
                break
            post_date = item.get('date')
            # Pinned posts (isPind) can be old yet listed first — they never
            # count toward the consecutive-old stop condition.
            if post_date and cutoff_ts and post_date < cutoff_ts and not item.get('isPind', False):
                consecutive_old += 1
                continue
            if post_date and date_to_ts and post_date > date_to_ts:
                continue
            consecutive_old = 0
            collected_items.append(item)

        if first_page.get('cursor'):
            cursor = first_page['cursor']
        elif not first_page.get('hasNext', True):
            cursor = None
    else:
        # First page API didn't work — fetch individual post pages (already full-res)
        self.log("First page API unavailable, fetching post pages...", "debug")
        all_downloaded = []
        total_processed = 0
        consecutive_old = 0
        for shortcode in profile['shortcodes']:
            if total_processed >= effective_max or consecutive_old >= max_consecutive_old:
                break
            if self._is_already_downloaded(shortcode, username):
                total_processed += 1
                continue
            post = self._get_post_detail(shortcode)
            if not post:
                continue
            status, files = self._process_post_detail(
                post, username, output_path, cutoff_ts, phrase_config, defer_database)
            total_processed += 1
            if status == 'old':
                consecutive_old += 1
            elif status == 'downloaded':
                consecutive_old = 0
                all_downloaded.extend(files)
        self.log(f"Posts complete: {len(all_downloaded)} files for @{username}", "info")
        return all_downloaded

    # Continue API pagination
    while cursor and len(collected_items) < effective_max and consecutive_old < max_consecutive_old:
        self.log(f"Fetching posts page (collected={len(collected_items)})...", "debug")
        data = self._call_api('/api/posts/', {
            'id': user_id,
            'cursor': cursor,
            'username': username,
            'verified': verified,
        })
        if not data or not data.get('items'):
            break

        for item in data['items']:
            if len(collected_items) >= effective_max or consecutive_old >= max_consecutive_old:
                break
            post_date = item.get('date')
            if post_date and cutoff_ts and post_date < cutoff_ts and not item.get('isPind', False):
                consecutive_old += 1
                continue
            if post_date and date_to_ts and post_date > date_to_ts:
                continue
            consecutive_old = 0
            collected_items.append(item)

        if data.get('hasNext') and data.get('cursor'):
            cursor = data['cursor']
        else:
            break

    if not collected_items:
        self.log(f"Posts complete: 0 files for @{username}", "info")
        return []

    # Filter out already-downloaded and out-of-range posts before expensive browser session
    needs_download = []
    for item in collected_items:
        code = item.get('code', '')
        post_date = item.get('date')
        # Date filters — same checks _process_api_post will do
        if post_date and cutoff_ts and post_date < cutoff_ts:
            continue
        if post_date and date_to_ts and post_date > date_to_ts:
            continue
        if self._is_already_downloaded(code, username):
            continue
        # Check if all srcs are already downloaded
        srcs = item.get('srcs', []) or ([item['src']] if item.get('src') else [])
        all_downloaded_flag = srcs and all(
            self._is_already_downloaded(self._extract_cdn_filename(s), username)
            for s in srcs
        )
        if all_downloaded_flag:
            continue
        needs_download.append(item)

    if not needs_download:
        self.log(f"Posts complete: 0 new files for @{username} ({len(collected_items)} already downloaded)", "info")
        return []

    self.log(f"Collected {len(collected_items)} posts, {len(needs_download)} need downloading, upgrading to full-res...", "info")

    # ── Phase 2: Batch upgrade to full-res via FlareSolverr session ──
    hires_map, dates_map = self._batch_upgrade_to_hires(needs_download)

    # Apply full-res URLs and missing dates to items
    for item in needs_download:
        code = item.get('code', '')
        if code in hires_map:
            item['srcs'] = hires_map[code]
            item['isSidecar'] = len(hires_map[code]) > 1
        # Fill in missing dates from detail pages
        if not item.get('date') and code in dates_map:
            item['date'] = dates_map[code]
            self.log(f"Recovered date for post {code} from detail page", "debug")

    # ── Phase 3: Process and download ──
    all_downloaded = []
    total_processed = 0

    for item in needs_download:
        status, files = self._process_api_post(
            item, username, output_path, cutoff_ts, phrase_config,
            defer_database, date_to_ts=date_to_ts)
        total_processed += 1

        if status == 'downloaded':
            all_downloaded.extend(files)

        if self.show_progress:
            progress_max = max_posts if max_posts > 0 else None
            self.activity_manager.update_status(
                f"Downloading posts for @{username}",
                len(all_downloaded), progress_max)

    self.log(f"Posts complete: {len(all_downloaded)} files for @{username}", "info")
    return all_downloaded
|
|
|
|
# ==================== Stories ====================
|
|
|
|
def download_stories(self, username, days_back=1, max_stories=50,
                     output_dir=None, defer_database=False) -> List[str]:
    """Download stories using ImgInn API.

    Falls back to parsing the /stories/ HTML page when the story API
    returns no items. Each item is de-duplicated, timestamped from the
    story's own time (or the current time when missing), and recorded
    with content_type='stories'. Note: days_back is accepted for
    interface compatibility but not referenced in this method.

    Returns:
        List of downloaded file paths.
    """
    self.activity_manager.update_status(f"Checking stories for @{username}")

    if not self._ensure_cookies():
        return []

    output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
    output_path.mkdir(parents=True, exist_ok=True)

    # Get story API params
    params = self._get_stories_params(username)
    if not params:
        self.log(f"Cannot get story params for @{username}", "warning")
        return []

    self.log(f"Fetching stories for @{username} (uid={params['uid']})...", "info")

    data = self._call_api('/api/story/', params)

    items = []
    if data and data.get('items'):
        items = data['items']
    else:
        # API returned no data (404/empty) — fall back to parsing the stories HTML page
        items = self._parse_stories_from_html(username)

    if not items:
        self.log(f"No stories found for @{username}", "info")
        return []

    self.log(f"Found {len(items)} stories for @{username}", "info")

    downloaded = []

    for idx, item in enumerate(items):
        if len(downloaded) >= max_stories:
            break

        src = item.get('src', '')
        if not src:
            # Try proxy URL as fallback
            src = item.get('proxy', '')
            if not src:
                continue

        # Extract info
        story_time = item.get('time')  # UNIX timestamp
        if not story_time:
            # Stories are always recent, use current time as best approximation
            story_time = int(time.time())
            self.log(f"Story {idx + 1} for @{username} has no timestamp, using current time", "warning")
        cdn_filename = self._extract_cdn_filename(src)
        media_id = extract_instagram_media_id(cdn_filename) if cdn_filename else f"story_{idx}"
        ext = self._extract_ext(src)

        # Duplicate check (both identifiers, as files may be cached under either)
        if self._is_already_downloaded(media_id, username):
            continue
        if self._is_already_downloaded(cdn_filename, username):
            continue

        # Generate filename with story suffix
        dt = datetime.fromtimestamp(story_time)
        date_str = dt.strftime('%Y%m%d_%H%M%S')
        out_filename = f"{username}_{date_str}_{cdn_filename}_story{idx + 1}{ext}"
        out_path = output_path / out_filename

        if out_path.exists():
            self.downloaded_files.add(media_id)
            continue

        if self._download_file(src, out_path):
            self.downloaded_files.add(media_id)
            self.downloaded_files.add(cdn_filename)

            if story_time:
                self._update_file_timestamps(out_path, story_time)

            # Use per-story URL so each story gets a unique url_hash
            story_url = f"https://www.instagram.com/stories/{username}/{media_id}/"
            self._record_download(
                media_id=media_id,
                username=username,
                filename=out_filename,
                url=story_url,
                post_date=story_time,
                file_path=str(out_path),
                content_type='stories',
                metadata={'story_index': idx + 1, 'cdn_filename': cdn_filename},
                deferred=defer_database
            )

            downloaded.append(str(out_path))
            self.log(f"Downloaded story: {out_filename}", "info")

        if self.show_progress:
            self.activity_manager.update_status(
                f"Downloading stories for @{username}",
                len(downloaded), len(items))

    self.log(f"Stories complete: {len(downloaded)} files for @{username}", "info")
    return downloaded
|
|
|
|
def _parse_stories_from_html(self, username: str) -> list:
|
|
"""Parse stories from the /stories/{username}/ HTML page.
|
|
|
|
Fallback when /api/story/ returns 404. Extracts story media URLs
|
|
from the reels-media section of the stories page.
|
|
|
|
Returns:
|
|
List of dicts with 'src' and optionally 'time' keys,
|
|
matching the API response format.
|
|
"""
|
|
self.log(f"Trying HTML fallback for stories @{username}", "info")
|
|
html = self._fetch_html(f"{self.IMGINN_BASE}/stories/{username}/")
|
|
if not html:
|
|
return []
|
|
|
|
items = []
|
|
|
|
# Find the reels-media section containing story items
|
|
media_idx = html.find('class="reels-media"')
|
|
if media_idx < 0:
|
|
self.log("No reels-media section found in stories page", "debug")
|
|
return []
|
|
|
|
section = html[media_idx:]
|
|
|
|
# Each story is in a <div class="media"> container
|
|
# Videos have: <div class="media-video-wrap" data-src="VIDEO_URL">
|
|
# and <a class="download" href="VIDEO_URL&dl=1">
|
|
# Images have: <a class="download" href="IMAGE_URL&dl=1">
|
|
for m in re.finditer(r'<div class="media">', section):
|
|
# Get chunk until next media div or end
|
|
start = m.start()
|
|
next_media = section.find('<div class="media">', start + 1)
|
|
chunk = section[start:next_media] if next_media > 0 else section[start:start + 5000]
|
|
|
|
# Extract download URL (most reliable source)
|
|
dl_match = re.search(r'class="download"[^>]*href="([^"]+)"', chunk)
|
|
if not dl_match:
|
|
continue
|
|
|
|
import html as html_mod
|
|
src = html_mod.unescape(dl_match.group(1))
|
|
# Remove &dl=1 suffix if present
|
|
src = re.sub(r'[&?]dl=1$', '', src)
|
|
|
|
# Extract relative time (e.g. "5 hours ago") and convert to timestamp
|
|
item = {'src': src}
|
|
time_match = re.search(r'class="time">(\d+)\s+(second|minute|hour|day|week)s?\s+ago<', chunk)
|
|
if time_match:
|
|
amount = int(time_match.group(1))
|
|
unit = time_match.group(2)
|
|
seconds_map = {'second': 1, 'minute': 60, 'hour': 3600, 'day': 86400, 'week': 604800}
|
|
item['time'] = int(time.time()) - (amount * seconds_map.get(unit, 0))
|
|
|
|
items.append(item)
|
|
|
|
if items:
|
|
self.log(f"Parsed {len(items)} stories from HTML for @{username}", "info")
|
|
|
|
return items
|
|
|
|
def _parse_highlights_from_html(self, username: str) -> list:
|
|
"""Parse highlight IDs and titles from /stories/{username}/ page.
|
|
|
|
Returns list of dicts: [{id, title}, ...]
|
|
"""
|
|
html = self._fetch_html(f"{self.IMGINN_BASE}/stories/{username}/")
|
|
if not html:
|
|
return []
|
|
|
|
highlights = []
|
|
# Highlights are: <li class="reel swiper-slide" data-id="{id}">
|
|
# <div class="title">{title}</div>
|
|
for match in re.finditer(
|
|
r'<li[^>]*\bdata-id="(\d+)"[^>]*>.*?'
|
|
r'<div class="title">([^<]*)</div>',
|
|
html, re.DOTALL
|
|
):
|
|
highlights.append({
|
|
'id': match.group(1),
|
|
'title': match.group(2).strip(),
|
|
})
|
|
|
|
if highlights:
|
|
self.log(f"Found {len(highlights)} highlights for @{username}", "info")
|
|
|
|
return highlights
|
|
|
|
# ==================== Tagged ====================
|
|
|
|
def download_tagged(self, username, days_back=14, max_posts=50,
                    output_dir=None, phrase_config=None,
                    defer_database=False, date_from=None, date_to=None) -> List[str]:
    """Download tagged posts. Uses API for pagination, post pages for media URLs.

    The tagged HTML page only yields shortcodes, so first-page posts are
    resolved via individual detail pages; subsequent pages come from the
    /api/tagged endpoint, whose items already carry time/srcs/code.

    Args:
        days_back: How far back to download. 0 = no date cutoff.
        max_posts: Maximum posts to process. 0 = unlimited.
        date_from: Explicit start date (overrides days_back).
        date_to: Explicit end date.

    Returns:
        List of downloaded file paths.
    """
    self.activity_manager.update_status(f"Checking tagged for @{username}")

    if not self._ensure_cookies():
        return []

    output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
    output_path.mkdir(parents=True, exist_ok=True)

    # Fetch tagged page
    tagged_info = self._get_tagged_info(username)
    if not tagged_info or not tagged_info['user_id']:
        self.log(f"Could not load tagged page for @{username}", "error")
        return []

    user_id = tagged_info['user_id']
    cursor = tagged_info['cursor']

    cutoff_ts, date_to_ts = self._compute_cutoffs(days_back, date_from, date_to)
    has_date_cutoff = cutoff_ts is not None

    all_downloaded = []
    total_processed = 0
    consecutive_old = 0
    # With a date cutoff, stop after 5 consecutive too-old posts.
    max_consecutive_old = 5 if has_date_cutoff else float('inf')
    effective_max = max_posts if max_posts > 0 else float('inf')

    # Process first-page shortcodes via API to get full data (dates, srcs)
    # The HTML page only has shortcodes, so we fetch each via /api/posts/ for dates
    first_page_codes = tagged_info['shortcodes']
    self.log(f"Tagged: {len(first_page_codes)} posts on first page", "debug")

    for shortcode in first_page_codes:
        if total_processed >= effective_max or consecutive_old >= max_consecutive_old:
            break

        if self._is_already_downloaded(shortcode, username):
            total_processed += 1
            continue

        # Fetch post detail page for media URLs and author
        post = self._get_post_detail(shortcode)
        if not post:
            total_processed += 1
            continue

        status, files = self._process_post_detail(
            post, username, output_path, cutoff_ts, phrase_config, defer_database,
            content_type='tagged')
        total_processed += 1

        if status == 'old':
            consecutive_old += 1
        elif status == 'downloaded':
            consecutive_old = 0
            all_downloaded.extend(files)

        if self.show_progress:
            progress_max = max_posts if max_posts > 0 else None
            self.activity_manager.update_status(
                f"Downloading tagged for @{username}",
                len(all_downloaded), progress_max)

    # Paginate via tagged API — use API data directly (has time, srcs, code)
    while cursor and total_processed < effective_max and consecutive_old < max_consecutive_old:
        self.log(f"Fetching tagged page (processed={total_processed})...", "debug")

        data = self._call_api('/api/tagged', {
            'id': user_id,
            'cursor': cursor,
        })

        if not data or not data.get('items'):
            break

        items = data['items']

        for item in items:
            if total_processed >= effective_max or consecutive_old >= max_consecutive_old:
                break

            shortcode = item.get('code', '')
            if not shortcode:
                # No shortcode in the item — derive it from the media ID.
                item_id = item.get('id', '')
                if not item_id:
                    continue
                shortcode = media_id_to_shortcode(item_id)

            if self._is_already_downloaded(shortcode, username):
                total_processed += 1
                continue

            # Use API data directly — normalize 'time' to 'date' for _process_api_post
            if 'time' in item and 'date' not in item:
                item['date'] = item['time']

            # Use owner username for filename if available (tagged posts are
            # authored by someone other than the searched profile)
            api_username = username
            owner = item.get('owner', {})
            if isinstance(owner, dict) and owner.get('username'):
                api_username = owner['username']

            status, files = self._process_api_post(
                item, api_username, output_path, cutoff_ts, phrase_config, defer_database,
                content_type='tagged')
            total_processed += 1

            if status == 'old':
                consecutive_old += 1
            elif status == 'downloaded':
                consecutive_old = 0
                all_downloaded.extend(files)

        # Tagged API uses last item's ID as next cursor; stop when the
        # cursor does not advance (would loop forever otherwise).
        if items:
            last_id = items[-1].get('id')
            if last_id and str(last_id) != str(cursor):
                cursor = str(last_id)
            else:
                break
        else:
            break

    self.log(f"Tagged complete: {len(all_downloaded)} files for @{username}", "info")
    return all_downloaded
|
|
|
|
# ==================== Reels ====================
|
|
|
|
def download_reels(self, username, days_back=14, max_downloads=50,
                   output_dir=None, phrase_config=None,
                   defer_database=False, date_from=None, date_to=None) -> List[str]:
    """Download reels (video posts) using the posts API with video filter.

    Three phases:
      1. Paginate ``/api/posts/`` and keep only items whose CDN sources
         include an ``.mp4`` (page filtering shared via ``_collect_video_items``).
      2. Batch-upgrade the surviving items to full resolution.
      3. Download each item through ``_process_api_post`` with ``video_only=True``.

    Args:
        username: Instagram username whose reels to download.
        days_back: How far back to download. 0 = no date cutoff.
        max_downloads: Maximum files to download. 0 = unlimited.
        output_dir: Destination directory; defaults to the shared downloads tree.
        phrase_config: Optional phrase-filter config forwarded to ``_process_api_post``.
        defer_database: When True, DB recording is deferred (caller batches it).
        date_from: Explicit start date (overrides days_back).
        date_to: Explicit end date.

    Returns:
        List of downloaded file paths (empty on cookie/profile failure).
    """
    self.activity_manager.update_status(f"Checking reels for @{username}")

    if not self._ensure_cookies():
        return []

    output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
    output_path.mkdir(parents=True, exist_ok=True)

    # Same as posts but with video_only flag
    profile = self._get_profile_info(username)
    if not profile or not profile['user_id']:
        self.log(f"Could not resolve profile for @{username}", "warning")
        return []

    user_id = profile['user_id']
    cursor = profile['cursor']
    verified = '1' if profile['verified'] else '0'

    cutoff_ts, date_to_ts = self._compute_cutoffs(days_back, date_from, date_to)
    has_date_cutoff = cutoff_ts is not None

    effective_max = max_downloads if max_downloads > 0 else float('inf')
    # Only stop early on a run of too-old posts when a date cutoff exists.
    max_consecutive_old = 5 if has_date_cutoff else float('inf')

    # ── Phase 1: Collect video items via API ──
    collected_items: List[dict] = []
    consecutive_old = 0

    first_page = self._call_api('/api/posts/', {
        'id': user_id,
        'cursor': '',
        'username': username,
        'verified': verified,
    })

    if first_page and first_page.get('items'):
        consecutive_old = self._collect_video_items(
            first_page['items'], collected_items, consecutive_old,
            cutoff_ts, date_to_ts, effective_max, max_consecutive_old)

        # Advance pagination: prefer the explicit cursor; a missing cursor
        # with hasNext == False means there are no further pages.
        if first_page.get('cursor'):
            cursor = first_page['cursor']
        elif not first_page.get('hasNext', True):
            cursor = None

    while cursor and len(collected_items) < effective_max and consecutive_old < max_consecutive_old:
        data = self._call_api('/api/posts/', {
            'id': user_id,
            'cursor': cursor,
            'username': username,
            'verified': verified,
        })
        if not data or not data.get('items'):
            break

        consecutive_old = self._collect_video_items(
            data['items'], collected_items, consecutive_old,
            cutoff_ts, date_to_ts, effective_max, max_consecutive_old)

        if data.get('hasNext') and data.get('cursor'):
            cursor = data['cursor']
        else:
            break

    if not collected_items:
        self.log(f"Reels complete: 0 files for @{username}", "info")
        return []

    # Filter out already-downloaded and out-of-range reels before expensive browser session
    needs_download = []
    for item in collected_items:
        code = item.get('code', '')
        post_date = item.get('date')
        # Date filters — same checks _process_api_post will do
        if post_date and cutoff_ts and post_date < cutoff_ts:
            continue
        if post_date and date_to_ts and post_date > date_to_ts:
            continue
        if self._is_already_downloaded(code, username):
            continue
        srcs = item.get('srcs', []) or ([item['src']] if item.get('src') else [])
        # Also skip when every individual CDN file is already recorded on disk.
        all_downloaded_flag = srcs and all(
            self._is_already_downloaded(self._extract_cdn_filename(s), username)
            for s in srcs
        )
        if all_downloaded_flag:
            continue
        needs_download.append(item)

    if not needs_download:
        self.log(f"Reels complete: 0 new files for @{username} ({len(collected_items)} already downloaded)", "info")
        return []

    self.log(f"Collected {len(collected_items)} reels, {len(needs_download)} need downloading, upgrading to full-res...", "info")

    # ── Phase 2: Batch upgrade to full-res ──
    hires_map, dates_map = self._batch_upgrade_to_hires(needs_download)
    for item in needs_download:
        code = item.get('code', '')
        if code in hires_map:
            item['srcs'] = hires_map[code]
            item['isSidecar'] = len(hires_map[code]) > 1
        # Fill in missing dates from detail pages
        if not item.get('date') and code in dates_map:
            item['date'] = dates_map[code]
            self.log(f"Recovered date for reel {code} from detail page", "debug")

    # ── Phase 3: Process and download ──
    all_downloaded = []
    for item in needs_download:
        status, files = self._process_api_post(
            item, username, output_path, cutoff_ts, phrase_config,
            defer_database, video_only=True, date_to_ts=date_to_ts,
            content_type='reels')
        if status == 'downloaded':
            all_downloaded.extend(files)

    self.log(f"Reels complete: {len(all_downloaded)} files for @{username}", "info")
    return all_downloaded


def _collect_video_items(self, items, collected, consecutive_old,
                         cutoff_ts, date_to_ts, effective_max, max_consecutive_old):
    """Filter one API page down to video (reel) items, honoring date cutoffs.

    Appends qualifying items to ``collected`` in place. An item qualifies when
    at least one of its CDN sources is an ``.mp4`` (query string stripped before
    the extension check) and it passes the date window.

    Args:
        items: Raw item dicts from one ``/api/posts/`` page.
        collected: Accumulator list, mutated in place.
        consecutive_old: Current run-length of too-old posts seen so far.
        cutoff_ts: Oldest acceptable UNIX timestamp, or None for no lower bound.
        date_to_ts: Newest acceptable UNIX timestamp, or None for no upper bound.
        effective_max: Stop once ``collected`` reaches this size.
        max_consecutive_old: Stop once this many too-old posts occur in a row.

    Returns:
        The updated consecutive-old counter, for the caller's early-stop check.
    """
    for item in items:
        if len(collected) >= effective_max or consecutive_old >= max_consecutive_old:
            break
        post_date = item.get('date')
        # Pinned posts (isPind) may be arbitrarily old; they neither count
        # toward nor reset the consecutive-old stop window.
        if post_date and cutoff_ts and post_date < cutoff_ts and not item.get('isPind', False):
            consecutive_old += 1
            continue
        if post_date and date_to_ts and post_date > date_to_ts:
            continue
        # Only keep video items
        srcs = item.get('srcs', []) or ([item['src']] if item.get('src') else [])
        video_srcs = [s for s in srcs if '.mp4' in s.split('?')[0]]
        if not video_srcs:
            continue
        consecutive_old = 0
        collected.append(item)
    return consecutive_old
|