Files
media-downloader/modules/imginn_api_module.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

2020 lines
80 KiB
Python

#!/usr/bin/env python3
"""
ImgInn API-based downloader module.
Uses ImgInn's JSON API endpoints instead of DOM scraping for reliable,
structured Instagram content downloading.
API Endpoints:
/api/posts/ — Paginated posts with full carousel support via `srcs` array
/api/story/ — Stories with direct CDN URLs
/api/tagged — Tagged posts (minimal data, supplemented via post pages)
Advantages over DOM scraping:
- Carousel items grouped by post (srcs array)
- Exact UNIX timestamps for post dates
- Reliable cursor-based pagination
- No Playwright dependency (uses curl_cffi for TLS fingerprint matching)
- Pinned post detection (isPind flag)
Uses curl_cffi to impersonate Chrome's TLS fingerprint, which is required
for Cloudflare cf_clearance cookies to work outside a real browser.
"""
import os
import re
import json
import time
import hashlib
from curl_cffi import requests as cf_requests
from curl_cffi.requests.exceptions import ImpersonateError
from pathlib import Path
def _create_cf_session(**kwargs):
    """Build a curl_cffi session, falling back through impersonation targets.

    Tries specific Chrome fingerprints first; if the installed curl_cffi
    build supports none of them, returns a session without impersonation.
    """
    for target in ("chrome131", "chrome136", "chrome"):
        try:
            return cf_requests.Session(impersonate=target, **kwargs)
        except Exception:
            pass  # target unsupported by this curl_cffi build; try the next
    return cf_requests.Session(**kwargs)
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Set, Tuple
from modules.base_module import LoggingMixin
from modules.cloudflare_handler import (
CloudflareHandler, SiteStatus,
get_flaresolverr_user_agent,
get_flaresolverr_fingerprint
)
from modules.instagram_utils import (
extract_instagram_media_id,
media_id_to_shortcode,
scan_existing_files_for_media_ids,
record_instagram_download,
is_instagram_downloaded
)
class ImgInnAPIDownloader(LoggingMixin):
"""ImgInn API-based downloader with full carousel grouping support."""
IMGINN_BASE = "https://imginn.com"
def __init__(self, headless=True, cookie_file=None,
             show_progress=True, use_database=True,
             log_callback=None, unified_db=None):
    """Initialize downloader (compatible with ImgInnDownloader interface).

    Args:
        headless: Ignored (no browser needed), kept for interface compat
        cookie_file: Cookie file path (used only if no unified_db)
        show_progress: Whether to show progress updates
        use_database: Whether to use database for tracking
        log_callback: Optional log callback
        unified_db: UnifiedDatabase instance
    """
    self._init_logger('Instagram', log_callback, default_module='Download')
    self.headless = headless
    self.downloaded_files: Set[str] = set()
    self.show_progress = show_progress
    self.use_database = use_database
    self.download_count = 0
    self.scraper_id = 'imginn'
    self.pending_downloads: List[dict] = []
    # DB tracking requires both a handle and the flag; otherwise disable it.
    # (Removed the redundant unconditional self.unified_db assignment that
    # was immediately overwritten by this branch.)
    if unified_db and use_database:
        self.unified_db = unified_db
    else:
        self.unified_db = None
        self.use_database = False
    # Activity status manager (gets the raw handle even when tracking is off)
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)
    # Proxy config from database — reads the unified_db *parameter*, not
    # self.unified_db, so a proxy still applies when use_database=False.
    self.proxy_url = None
    if unified_db:
        scraper_config = unified_db.get_scraper(self.scraper_id)
        if scraper_config and scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
            self.proxy_url = scraper_config['proxy_url']
            self.log(f"Using proxy: {self.proxy_url}", "info")
    # User agent from FlareSolverr
    self.user_agent = get_flaresolverr_user_agent()
    # CloudflareHandler (no cookie file when using DB)
    self.cf_handler = CloudflareHandler(
        module_name="ImgInn",
        cookie_file=None if unified_db else (cookie_file or "/opt/media-downloader/cookies/imginn_cookies.json"),
        user_agent=self.user_agent,
        logger=self.logger,
        aggressive_expiry=True,
        proxy_url=self.proxy_url
    )
    self._load_cookies_from_db()
    # HTTP session (curl_cffi with Chrome TLS fingerprint)
    self.session = _create_cf_session()
    self._setup_session()
    # Rate limiting
    self._last_request_time = None
    self._min_request_interval = 2  # seconds between ImgInn requests
    # Cookie refresh cooldown (don't re-hit FlareSolverr within 5 minutes)
    self._last_cookie_refresh = None
    self._cookie_refresh_interval = 300  # 5 minutes
    # User ID cache (username -> numeric Instagram id)
    self._user_id_cache: Dict[str, str] = {}
# ==================== Cookie / Session ====================
def _recreate_session(self):
    """Tear down and rebuild the curl_cffi session after an impersonation failure."""
    self.log("Impersonation error, recreating curl_cffi session...", "warning")
    try:
        self.session.close()
    except Exception:
        pass  # best-effort: the old session may already be unusable
    self.session = _create_cf_session()
    self._setup_session()
    self._refresh_session_cookies()
def _load_cookies_from_db(self):
    """Seed the Cloudflare handler with cookies persisted in the unified DB."""
    if not self.unified_db:
        return
    try:
        stored = self.unified_db.get_scraper_cookies(self.scraper_id)
        if stored:
            self.cf_handler._cookies = stored
            self.log(f"Loaded {len(stored)} cookies from database", "debug")
    except Exception as e:
        self.log(f"Error loading cookies: {e}", "warning")
def _save_cookies_to_db(self, cookies, user_agent=None):
    """Persist cookies (merged) to the unified DB alongside their user agent."""
    if not self.unified_db:
        return
    try:
        self.unified_db.save_scraper_cookies(
            self.scraper_id, cookies,
            user_agent=user_agent or self.user_agent, merge=True)
    except Exception as e:
        self.log(f"Error saving cookies: {e}", "warning")
def _setup_session(self):
    """Configure the curl_cffi session with Cloudflare-matching headers."""
    fingerprint = get_flaresolverr_fingerprint()
    stored_ua = None
    if self.unified_db:
        try:
            stored_ua = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
        except Exception:
            stored_ua = None
    # Prefer the UA the stored cookies were issued under, then the
    # FlareSolverr fingerprint, then the instance default.
    self._stored_ua = stored_ua or fingerprint.get('user_agent', self.user_agent)
    self._default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': fingerprint.get('accept_language', 'en-US,en;q=0.9'),
        'Connection': 'keep-alive',
        'User-Agent': self._stored_ua,
    }
    # Load CF cookies into the fresh session
    self._refresh_session_cookies()
def _refresh_session_cookies(self):
    """Copy the handler's Cloudflare cookies into the curl_cffi session jar."""
    for cookie_name, cookie_value in self.cf_handler.get_cookies_dict().items():
        self.session.cookies.set(cookie_name, cookie_value, domain=".imginn.com")
def _ensure_cookies(self, force: bool = False) -> bool:
    """Ensure valid CF cookies, refreshing via FlareSolverr when needed.

    A cooldown prevents hammering FlareSolverr: with aggressive_expiry=True,
    cookies_expired() reports True whenever cf_clearance expires within
    7 days, yet cf_clearance only lasts ~30 minutes — without the cooldown
    every request would trigger a refresh.

    Args:
        force: Skip cooldown/expiry checks and always refresh (used when a
            403 proves the current cookies are invalid).
    Returns:
        True if cookies are believed valid after this call.
    """
    if not force:
        last = self._last_cookie_refresh
        # Refreshed recently? Trust the cookies without an expiry check.
        if last is not None and (time.time() - last) < self._cookie_refresh_interval:
            return True
        if not self.cf_handler.cookies_expired():
            return True
    self.log("Cookies expired, refreshing via FlareSolverr...", "info")
    refreshed = self.cf_handler.get_cookies_via_flaresolverr(f"{self.IMGINN_BASE}/")
    self._last_cookie_refresh = time.time()
    if not refreshed:
        self.log("Failed to get fresh cookies", "warning")
        return False
    cookie_list = self.cf_handler.get_cookies_list()
    flaresolverr_ua = self.cf_handler.get_user_agent()
    if cookie_list and self.unified_db:
        self._save_cookies_to_db(cookie_list, user_agent=flaresolverr_ua)
    # Keep the session UA in sync with the cookies
    if flaresolverr_ua:
        self._stored_ua = flaresolverr_ua
        self._default_headers['User-Agent'] = flaresolverr_ua
    self._refresh_session_cookies()
    return True
# ==================== HTTP Helpers ====================
def _rate_limit(self):
    """Sleep so consecutive ImgInn requests stay at least the minimum interval apart."""
    previous = self._last_request_time
    if previous:
        remaining = self._min_request_interval - (time.time() - previous)
        if remaining > 0:
            time.sleep(remaining)
    self._last_request_time = time.time()
def _is_cf_challenge(self, text: str) -> bool:
    """Heuristically detect a Cloudflare interstitial challenge page."""
    # Challenge pages are small; anything big is real content.
    if len(text) > 10000:
        return False
    head = text[:2000].lower()
    markers = ('just a moment', 'checking your browser',
               'verify you are human', 'challenge-platform')
    if not any(marker in head for marker in markers):
        return False
    # An early <form> suggests a normal page rather than an interstitial.
    return '<form' not in head[:500]
def _fetch_html(self, url: str) -> Optional[str]:
    """Fetch a page via curl_cffi (Chrome TLS), handling CF challenges.

    Fallback chain: curl_cffi -> one retry on 5xx -> FlareSolverr browser.

    Args:
        url: Absolute page URL.
    Returns:
        Page HTML, or None on 404 / unrecoverable errors.
    """
    self._ensure_cookies()
    self._rate_limit()
    headers = {**self._default_headers}
    try:
        try:
            resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True)
        except ImpersonateError:
            # Impersonation broke at request time; rebuild session and retry once
            self._recreate_session()
            resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True)
        if resp.status_code == 403 or self._is_cf_challenge(resp.text):
            self.log(f"CF challenge on {url}, trying FlareSolverr direct fetch...", "info")
            return self._fetch_html_via_flaresolverr(url)
        if resp.status_code == 404:
            self.log(f"Page not found: {url}", "warning")
            return None
        # Retry on server errors (500/502/503) — often transient
        if resp.status_code >= 500:
            self.log(f"HTTP {resp.status_code} for {url}, retrying in 5s...", "warning")
            time.sleep(5)
            self._rate_limit()
            resp = self.session.get(url, headers=headers, timeout=30, allow_redirects=True)
            if resp.status_code >= 500:
                self.log(f"HTTP {resp.status_code} for {url} on retry, trying FlareSolverr...", "warning")
                return self._fetch_html_via_flaresolverr(url)
        if resp.status_code == 200:
            return resp.text
        # ImgInn returns 410 for some valid profiles — treat as OK if body has
        # content. Checked after the 5xx retry too, so a 5xx-then-410 sequence
        # still succeeds. (Original duplicated this check and had an
        # unreachable trailing `return resp.text` — both cleaned up.)
        if resp.status_code == 410 and len(resp.text) > 1000:
            return resp.text
        self.log(f"HTTP {resp.status_code} for {url}", "warning")
        return None
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", "error")
        return None
def _call_api(self, endpoint: str, params: dict) -> Optional[dict]:
    """Make an API call to ImgInn and return the parsed JSON body.

    Falls back to FlareSolverr when curl_cffi gets a 403 (CF challenge) —
    needed for endpoints like /api/story/ where Cloudflare applies
    stricter path-based rules.
    """
    self._ensure_cookies()
    self._rate_limit()
    url = f"{self.IMGINN_BASE}{endpoint}"
    # XHR-style headers layered over the session defaults
    headers = dict(self._default_headers)
    headers['Accept'] = '*/*'
    headers['Referer'] = f'{self.IMGINN_BASE}/'
    headers['X-Requested-With'] = 'XMLHttpRequest'
    headers['Sec-Fetch-Dest'] = 'empty'
    headers['Sec-Fetch-Mode'] = 'cors'
    headers['Sec-Fetch-Site'] = 'same-origin'
    try:
        try:
            response = self.session.get(url, params=params, headers=headers, timeout=30)
        except ImpersonateError:
            self._recreate_session()
            response = self.session.get(url, params=params, headers=headers, timeout=30)
        if response.status_code == 429:
            # Back off once on rate limiting, then retry
            self.log("Rate limited (429), waiting 30s...", "warning")
            time.sleep(30)
            self._rate_limit()
            response = self.session.get(url, params=params, headers=headers, timeout=30)
        if response.status_code == 403 or self._is_cf_challenge(response.text):
            self.log(f"CF challenge on {endpoint}, trying FlareSolverr...", "info")
            return self._call_api_via_flaresolverr(url, params)
        if response.status_code != 200:
            self.log(f"API {response.status_code} for {endpoint}", "warning")
            return None
        return response.json()
    except (ValueError, json.JSONDecodeError):
        self.log(f"Invalid JSON from {endpoint}", "warning")
        return None
    except Exception as e:
        self.log(f"API error {endpoint}: {e}", "error")
        return None
def _call_api_via_flaresolverr(self, url: str, params: dict) -> Optional[dict]:
    """Fetch an API endpoint through FlareSolverr's browser.

    Used as fallback when curl_cffi gets 403 from Cloudflare on
    certain API endpoints (e.g. /api/story/).

    Args:
        url: Absolute endpoint URL (without query string).
        params: Query parameters to encode onto the URL.
    Returns:
        Parsed JSON dict, or None on FlareSolverr failure / unparseable body.
    """
    import html as html_mod
    # Build full URL with query params
    from urllib.parse import urlencode
    full_url = f"{url}?{urlencode(params)}" if params else url
    try:
        # Plain requests here: FlareSolverr is a local service, so no TLS
        # fingerprint matching is needed for this hop.
        import requests as std_requests
        payload = {
            'cmd': 'request.get',
            'url': full_url,
            'maxTimeout': 60000,
        }
        resp = std_requests.post('http://localhost:8191/v1', json=payload, timeout=70)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log(f"FlareSolverr error: {data.get('message', 'unknown')}", "warning")
            return None
        solution = data.get('solution', {})
        response_text = solution.get('response', '')
        # Save cookies from FlareSolverr for future curl_cffi requests
        cookies_list = solution.get('cookies', [])
        if cookies_list:
            flaresolverr_ua = solution.get('userAgent', self.cf_handler.get_user_agent())
            self.cf_handler.save_cookies(cookies_list, user_agent=flaresolverr_ua)
            if flaresolverr_ua:
                # Keep UA in sync with the cookies that were just solved
                self._stored_ua = flaresolverr_ua
                self._default_headers['User-Agent'] = flaresolverr_ua
            if self.unified_db:
                self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua)
            self._refresh_session_cookies()
        if not response_text:
            return None
        # FlareSolverr wraps JSON responses in HTML <pre> tags
        pre_match = re.search(r'<pre[^>]*>(.*?)</pre>', response_text, re.DOTALL)
        if pre_match:
            json_text = html_mod.unescape(pre_match.group(1))
            return json.loads(json_text)
        # Try parsing raw response as JSON
        return json.loads(response_text)
    except (ValueError, json.JSONDecodeError) as e:
        self.log(f"FlareSolverr JSON parse error: {e}", "warning")
        return None
    except Exception as e:
        self.log(f"FlareSolverr fetch error: {e}", "error")
        return None
def _fetch_html_via_flaresolverr(self, url: str) -> Optional[str]:
    """Fetch an HTML page through FlareSolverr's browser.

    Used as fallback when curl_cffi gets 403 from Cloudflare on
    HTML pages (e.g. /tagged/) that have stricter path-based rules.

    Args:
        url: Absolute page URL.
    Returns:
        Page HTML, or None on FlareSolverr failure / unsolved challenge.
    """
    try:
        # Plain requests here: FlareSolverr is a local service, so no TLS
        # fingerprint matching is needed for this hop.
        import requests as std_requests
        payload = {
            'cmd': 'request.get',
            'url': url,
            'maxTimeout': 120000,
        }
        resp = std_requests.post('http://localhost:8191/v1', json=payload, timeout=130)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log(f"FlareSolverr error: {data.get('message', 'unknown')}", "warning")
            return None
        solution = data.get('solution', {})
        response_text = solution.get('response', '')
        if not response_text:
            self.log("FlareSolverr returned empty response", "warning")
            return None
        if self._is_cf_challenge(response_text):
            self.log("FlareSolverr could not bypass CF challenge", "warning")
            return None
        # Save cookies from FlareSolverr for future curl_cffi requests
        cookies_list = solution.get('cookies', [])
        if cookies_list:
            flaresolverr_ua = solution.get('userAgent', self.cf_handler.get_user_agent())
            self.cf_handler.save_cookies(cookies_list, user_agent=flaresolverr_ua)
            if flaresolverr_ua:
                # Keep UA in sync with the cookies that were just solved
                self._stored_ua = flaresolverr_ua
                self._default_headers['User-Agent'] = flaresolverr_ua
            if self.unified_db:
                self._save_cookies_to_db(cookies_list, user_agent=flaresolverr_ua)
            self._refresh_session_cookies()
        return response_text
    except Exception as e:
        self.log(f"FlareSolverr HTML fetch error: {e}", "error")
        return None
def _download_file(self, url: str, output_path: Path) -> bool:
    """Download one CDN asset to output_path; return True on success.

    No ImgInn rate limiting here — the request goes to the Instagram CDN,
    not imginn.com. The shared session is reused only for its TLS fingerprint.
    """
    try:
        resp = self.session.get(
            url,
            headers={'Referer': f'{self.IMGINN_BASE}/'},
            timeout=120,
        )
        if resp.status_code != 200:
            self.log(f"Download HTTP {resp.status_code}: {output_path.name}", "warning")
            return False
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(resp.content)
        # Tiny bodies are CDN error pages, not real media — discard them
        size = output_path.stat().st_size
        if size < 1000:
            self.log(f"File too small ({size}B), discarding: {output_path.name}", "warning")
            output_path.unlink()
            return False
        return True
    except Exception as e:
        self.log(f"Download error: {e}", "error")
        # Don't leave a partial file behind
        if output_path.exists():
            try:
                output_path.unlink()
            except OSError:
                pass
        return False
# ==================== Profile / Page Data Extraction ====================
def _get_profile_info(self, username: str) -> Optional[dict]:
    """Fetch a profile page and extract user_id, pagination cursor, shortcodes."""
    url = f"{self.IMGINN_BASE}/{username}/"
    html = self._fetch_html(url)
    if not html and self._ensure_cookies(force=True):
        # One retry with forced cookie refresh (handles expired CF cookies)
        html = self._fetch_html(url)
    if not html:
        return None
    info = {
        'user_id': None,
        'cursor': None,
        'verified': False,
        'shortcodes': [],
    }
    # data-id on container or load-more button
    m = re.search(r'data-id="(\d+)"', html)
    if m:
        info['user_id'] = m.group(1)
        self._user_id_cache[username] = info['user_id']
    # data-cursor on load-more button
    m = re.search(r'data-cursor="([^"]+)"', html)
    if m:
        info['cursor'] = m.group(1)
    # verified flag
    info['verified'] = ('data-verified="true"' in html or 'data-verified="1"' in html)
    # Post shortcodes from grid links, de-duplicated but order-preserving
    seen = set()
    for shortcode in re.findall(r'href="/p/([A-Za-z0-9_-]+)/"', html):
        if shortcode not in seen:
            seen.add(shortcode)
            info['shortcodes'].append(shortcode)
    return info
def get_user_profile(self, username: str) -> Optional[dict]:
    """Fetch public profile info: avatar, display name, bio, stats.

    Parsed entirely from the profile page's HTML meta tags (og:image,
    og:title, og:description) plus data-* attributes.

    Args:
        username: Instagram username
    Returns:
        Dict with keys: username, user_id, display_name, avatar_url,
        bio, followers, following, posts_count, verified
        (fields not found remain None / False).
        Returns None if profile cannot be fetched.
    """
    import html as html_mod
    if not self._ensure_cookies():
        return None
    html = self._fetch_html(f"{self.IMGINN_BASE}/{username}/")
    if not html:
        return None
    profile = {
        'username': username,
        'user_id': None,
        'display_name': None,
        'avatar_url': None,
        'bio': None,
        'followers': None,
        'following': None,
        'posts_count': None,
        'verified': False,
    }
    # User ID (data-id attribute on the posts grid / load-more button)
    id_match = re.search(r'data-id="(\d+)"', html)
    if id_match:
        profile['user_id'] = id_match.group(1)
        self._user_id_cache[username] = profile['user_id']
    # Verified
    if 'data-verified="true"' in html or 'data-verified="1"' in html:
        profile['verified'] = True
    # Avatar from og:image
    og_img = re.search(r'property="og:image"\s*content="([^"]+)"', html)
    if og_img:
        profile['avatar_url'] = html_mod.unescape(og_img.group(1))
    # Display name from og:title: "View Display Name(@username)..."
    og_title = re.search(r'property="og:title"\s*content="([^"]+)"', html)
    if og_title:
        title_text = html_mod.unescape(og_title.group(1))
        name_match = re.match(r'View\s+(.+?)\s*\(@', title_text)
        if name_match:
            profile['display_name'] = name_match.group(1).strip()
    # Bio and stats from og:description
    # Format: "Bio text here Followers_count Followers, Following_count Following, Posts_count Posts"
    og_desc = re.search(r'property="og:description"\s*content="([^"]+)"', html)
    if og_desc:
        desc = html_mod.unescape(og_desc.group(1))
        # Extract stats from end of description; counts may be abbreviated ("1.2M")
        stats_match = re.search(
            r'([\d,.]+[MKk]?)\s*Followers?,\s*([\d,.]+[MKk]?)\s*Following,\s*([\d,.]+[MKk]?)\s*Posts?',
            desc
        )
        if stats_match:
            # Stored as display strings (e.g. "1.2M"), not parsed integers
            profile['followers'] = stats_match.group(1)
            profile['following'] = stats_match.group(2)
            profile['posts_count'] = stats_match.group(3)
            # Bio is everything before the stats
            bio = desc[:stats_match.start()].strip().rstrip(',').strip()
            if bio:
                profile['bio'] = bio
    return profile
def _get_stories_params(self, username: str, user_id: str = None) -> Optional[dict]:
    """Get parameters for the stories API call.

    The stories API requires uid, name, and hash parameters.
    - uid: Instagram numeric user ID (from profile page or cache)
    - name: Instagram username
    - hash: floor(current_time / 100000) + 1 — time-based cache-period hash
      (the docstring previously said plain floor; the code uses floor + 1,
      see the comment below for why)

    Note: We don't fetch the /stories/ HTML page because Cloudflare applies
    stricter challenge rules to that path. Instead, we get the uid from the
    profile page (which works fine) and compute the hash directly.

    Returns:
        Dict with 'uid', 'name', 'hash' keys, or None if the user_id
        cannot be resolved.
    """
    uid = user_id or self._user_id_cache.get(username)
    if not uid:
        # Fetch profile page to get user_id
        profile = self._get_profile_info(username)
        if profile:
            uid = profile['user_id']
    if not uid:
        self.log(f"Cannot resolve user_id for @{username}", "warning")
        return None
    # Hash computation: ceil(current_time / 100000)
    # Using ceil (floor + 1) to get the current cache period instead of the
    # previous one, which returns stale story data.
    story_hash = str(int(time.time()) // 100000 + 1)
    return {'uid': uid, 'name': username, 'hash': story_hash}
def _get_tagged_info(self, username: str) -> Optional[dict]:
    """Fetch the /tagged/ page and extract user_id, cursor, and shortcodes."""
    html = self._fetch_html(f"{self.IMGINN_BASE}/tagged/{username}/")
    if not html:
        return None
    info = {'user_id': None, 'cursor': None, 'shortcodes': []}
    id_m = re.search(r'data-id="(\d+)"', html)
    if id_m:
        info['user_id'] = id_m.group(1)
        self._user_id_cache[username] = info['user_id']
    cursor_m = re.search(r'data-cursor="([^"]+)"', html)
    if cursor_m:
        info['cursor'] = cursor_m.group(1)
    # Tagged post shortcodes, de-duplicated but order-preserving
    seen = set()
    for shortcode in re.findall(r'href="/p/([A-Za-z0-9_-]+)/"', html):
        if shortcode not in seen:
            seen.add(shortcode)
            info['shortcodes'].append(shortcode)
    return info
def _get_post_detail(self, shortcode: str) -> Optional[dict]:
    """Fetch individual post page and extract media URLs + metadata.

    Args:
        shortcode: Instagram post shortcode (the /p/<code>/ path segment).
    Returns:
        Dict shaped like an /api/posts/ item (code, date, alt, author,
        srcs, isSidecar, isPind), or None if the page can't be fetched
        or yields no Instagram CDN media URLs.
    """
    html = self._fetch_html(f"{self.IMGINN_BASE}/p/{shortcode}/")
    if not html:
        return None
    post = {
        'code': shortcode,
        'date': None,           # UNIX timestamp, filled by the cascade below
        'alt': '',              # caption text
        'author': None,
        'srcs': [],             # full-res CDN URLs
        'isSidecar': False,     # True for multi-slide carousels
        'isPind': False,        # pinned flag (API spelling), not set from detail pages
    }
    import html as html_mod
    # Extract date from data-created (UNIX timestamp)
    date_match = re.search(r'data-created="(\d+)"', html)
    if date_match:
        post['date'] = int(date_match.group(1))
    else:
        # Fallback: try datetime attribute on <time> elements (ISO 8601)
        time_match = re.search(r'<time[^>]*datetime="([^"]+)"', html)
        if time_match:
            try:
                from datetime import timezone
                dt = datetime.fromisoformat(time_match.group(1).replace('Z', '+00:00'))
                post['date'] = int(dt.timestamp())
            except Exception:
                pass
    if not post['date']:
        # Fallback: try data-date or data-timestamp attributes
        alt_date = re.search(r'data-(?:date|timestamp|time)="(\d{10,13})"', html)
        if alt_date:
            ts = int(alt_date.group(1))
            if ts > 1e12:  # milliseconds
                ts = ts // 1000
            post['date'] = ts
    if not post['date']:
        self.log(f"Could not extract date for post {shortcode}", "warning")
    # Extract author username from div.username link (most reliable)
    # Format: <div class="username"><a href="/username/">...</a></div>
    username_link = re.search(r'class="username"[^>]*>\s*<a\s+href="/([^"]+?)/"', html)
    if username_link:
        author_candidate = username_link.group(1).strip().lower()
        # Sanity check: must look like a valid Instagram handle
        if re.match(r'^[a-zA-Z0-9_.]{1,30}$', author_candidate):
            post['author'] = author_candidate
    # Extract caption from og:description (format: "username: caption text")
    cap_match = re.search(r'<meta\s+property="og:description"\s+content="([^"]*)"', html)
    if cap_match:
        full_text = html_mod.unescape(cap_match.group(1))
        # Fallback: extract author from caption if not found above
        if not post['author'] and ':' in full_text:
            author_candidate = full_text.split(':')[0].strip()
            if re.match(r'^[a-zA-Z0-9_.]{1,30}$', author_candidate):
                post['author'] = author_candidate
        post['alt'] = full_text
    # Extract media URLs from swiper slides
    # Each swiper-slide has a data-src with the full-res CDN URL
    # Only grab data-src from within swiper-slide divs (not profile pics etc.)
    slide_pattern = re.compile(
        r'class="swiper-slide[^"]*"[^>]*data-src="([^"]+)"', re.DOTALL)
    slide_srcs = slide_pattern.findall(html)
    # Also check for plain data-src within the main post area (non-carousel)
    if not slide_srcs:
        # Look for the main download button link with scontent URL
        dl_pattern = re.compile(
            r'class="[^"]*downloads[^"]*"[^>]*href="(https://scontent[^"]+)"', re.DOTALL)
        dl_srcs = dl_pattern.findall(html)
        if not dl_srcs:
            # Broader: any scontent link with dl=1
            dl_srcs = re.findall(r'href="(https://scontent[^"]*dl=1[^"]*)"', html)
        slide_srcs = dl_srcs
    # Clean URLs and filter to CDN only
    urls = []
    seen_urls = set()
    for src in slide_srcs:
        src = html_mod.unescape(src)
        # Only keep Instagram CDN URLs
        if 'scontent' not in src and 'cdninstagram' not in src:
            continue
        # Deduplicate by path (ignore differing query strings)
        base = src.split('?')[0]
        if base in seen_urls:
            continue
        seen_urls.add(base)
        urls.append(src)
    post['srcs'] = urls
    post['isSidecar'] = len(urls) > 1
    # A post with no CDN media is useless to callers
    return post if urls else None
# ==================== File Naming ====================
def _extract_cdn_filename(self, url: str) -> str:
    """Return the CDN URL's final path component, extension stripped."""
    basename = url.split('?')[0].rsplit('/', 1)[-1]
    if '.' in basename:
        return basename.rsplit('.', 1)[0]
    return basename
def _extract_ext(self, url: str) -> str:
    """Guess the media file extension from a CDN URL (defaults to .jpg)."""
    path = url.split('?')[0]
    # Check in the same priority order as before; first match wins
    for candidate in ('.mp4', '.webp', '.png', '.jpeg'):
        if candidate in path:
            return candidate
    return '.jpg'
def _make_filename(self, profile: str, date_ts: int, cdn_filename: str,
                   ext: str, slide_index: int = None) -> str:
    """Build '{profile}_{YYYYMMDD_HHMMSS}_{cdn_filename}[_{idx}]{ext}'.

    Falls back to the current time when the post has no timestamp.
    """
    when = datetime.fromtimestamp(date_ts) if date_ts else datetime.now()
    parts = [profile, when.strftime('%Y%m%d_%H%M%S'), cdn_filename]
    # Slide 0 (or None) gets no suffix; later carousel slides are numbered
    if slide_index is not None and slide_index > 0:
        parts.append(str(slide_index))
    return '_'.join(parts) + ext
def _update_file_timestamps(self, filepath: Path, post_date_ts: int):
    """Best-effort: make the file's atime/mtime match the post date."""
    if not post_date_ts:
        return
    try:
        os.utime(str(filepath), (post_date_ts, post_date_ts))
    except Exception:
        pass  # cosmetic only; never fail a download over timestamps
# ==================== Duplicate Detection ====================
def _is_already_downloaded(self, media_id: str, username: str = None) -> bool:
    """True if this media id was seen this run or is recorded in the DB."""
    if media_id in self.downloaded_files:
        return True
    if not (self.unified_db and self.use_database):
        return False
    return is_instagram_downloaded(self.unified_db, media_id, username)
def _scan_existing_files(self, output_dir: Path, profile: str):
    """Prime the in-memory duplicate set with media ids already on disk."""
    found = scan_existing_files_for_media_ids(
        output_dir, profile, min_file_size=1000
    )
    self.downloaded_files.update(found)
    if found:
        self.log(f"Found {len(found)} existing files", "debug")
# ==================== Phrase Filtering ====================
def _check_phrases(self, caption: str, phrase_config: dict) -> bool:
    """Return True if the caption passes the phrase filter (or filtering is off)."""
    if not phrase_config or not phrase_config.get('enabled'):
        return True
    phrases = phrase_config.get('phrases', [])
    if not phrases:
        return True
    case_sensitive = phrase_config.get('case_sensitive', False)
    haystack = caption if case_sensitive else caption.lower()
    needles = [p if case_sensitive else p.lower() for p in phrases]
    # match_all -> every phrase must appear; otherwise any one suffices
    combine = all if phrase_config.get('match_all', False) else any
    return combine(needle in haystack for needle in needles)
# ==================== Database Recording ====================
def _record_download(self, media_id, username, filename, url=None,
                     post_date=None, file_path=None, content_type='post',
                     metadata=None, deferred=False):
    """Record a completed download, either queued in memory or written to the DB.

    With deferred=True the record is appended to pending_downloads and True
    is returned; otherwise it is written via record_instagram_download when
    DB tracking is enabled.
    """
    if deferred:
        self.pending_downloads.append({
            'media_id': media_id,
            'username': username,
            'filename': filename,
            'url': url or f'instagram://{media_id}',
            'post_date': post_date,
            'file_path': file_path,
            'content_type': content_type,
            'metadata': metadata or {},
        })
        return True
    if self.unified_db and self.use_database:
        # Numeric timestamps become datetimes for the DB layer
        when = post_date
        if isinstance(when, (int, float)):
            when = datetime.fromtimestamp(when)
        try:
            return record_instagram_download(
                self.unified_db,
                media_id=media_id,
                username=username,
                content_type=content_type,
                filename=filename,
                file_path=file_path,
                url=url,
                post_date=when,
                method='imginn',
                extra_metadata=metadata
            )
        except Exception as e:
            self.log(f"Error recording download: {e}", "warning")
            return False
def get_pending_downloads(self) -> list:
    """Return a shallow copy of records queued by deferred recording."""
    return list(self.pending_downloads)
def clear_pending_downloads(self):
    """Drop all deferred download records."""
    self.pending_downloads = []
# ==================== Post Processing ====================
def _batch_upgrade_to_hires(self, items: list) -> Tuple[dict, dict]:
    """Batch fetch post detail pages via FlareSolverr session for full-res URLs.

    Creates a persistent browser session that solves CF once, then
    reuses it for all subsequent requests (~0.5s each instead of ~10s).

    Args:
        items: List of API post items with 'code' keys
    Returns:
        Tuple of (srcs_map, dates_map) where:
        - srcs_map: Dict mapping shortcode -> list of full-res src URLs
        - dates_map: Dict mapping shortcode -> UNIX timestamp (for items
          missing dates)
        Both dicts are empty when the session cannot be created.
    """
    import requests as std_requests
    shortcodes = [item.get('code', '') for item in items if item.get('code')]
    if not shortcodes:
        return {}, {}
    results = {}
    dates = {}
    session_id = None
    total = len(shortcodes)
    try:
        resp = std_requests.post('http://localhost:8191/v1', json={
            'cmd': 'sessions.create'
        }, timeout=30)
        data = resp.json()
        if data.get('status') != 'ok':
            self.log("Failed to create FlareSolverr session for full-res, using API URLs", 'warning')
            # BUGFIX: previously returned a bare {} here, which raised
            # ValueError in callers that unpack (srcs_map, dates_map).
            return {}, {}
        session_id = data.get('session')
        self.log(f"Fetching full-res URLs for {total} posts via browser session...", 'info')
        for i, code in enumerate(shortcodes):
            if self.show_progress and i % 10 == 0:
                self.activity_manager.update_status(
                    f"Fetching full-res {i + 1}/{total}")
            try:
                resp = std_requests.post('http://localhost:8191/v1', json={
                    'cmd': 'request.get',
                    'url': f'{self.IMGINN_BASE}/p/{code}/',
                    'session': session_id,
                    'maxTimeout': 60000,
                }, timeout=70)
                page_data = resp.json()
                if page_data.get('status') != 'ok':
                    continue
                html = page_data.get('solution', {}).get('response', '')
                if not html:
                    continue
                srcs = self._parse_detail_srcs(html)
                if srcs:
                    results[code] = srcs
                # Also extract date from detail page (for items missing dates)
                date_match = re.search(r'data-created="(\d+)"', html)
                if date_match:
                    dates[code] = int(date_match.group(1))
            except Exception as e:
                # One bad post shouldn't abort the whole batch
                self.log(f"Detail fetch failed for {code}: {e}", 'debug')
                continue
    except Exception as e:
        self.log(f"FlareSolverr session error: {e}", 'warning')
    finally:
        # Always destroy the browser session to free FlareSolverr resources
        if session_id:
            try:
                std_requests.post('http://localhost:8191/v1', json={
                    'cmd': 'sessions.destroy',
                    'session': session_id,
                }, timeout=10)
            except Exception:
                pass
    self.log(f"Got full-res URLs for {len(results)}/{total} posts, dates for {len(dates)}", 'info')
    return results, dates
@staticmethod
def _parse_detail_srcs(html: str) -> list:
    """Extract full-res Instagram CDN URLs from a post detail page's HTML."""
    import html as html_mod
    # Preferred source: data-src attributes on carousel swiper slides
    candidates = re.findall(
        r'class="swiper-slide[^"]*"[^>]*data-src="([^"]+)"', html, re.DOTALL)
    if not candidates:
        # Non-carousel posts: the main download button's scontent link
        candidates = re.findall(
            r'class="[^"]*downloads[^"]*"[^>]*href="(https://scontent[^"]+)"', html, re.DOTALL)
    if not candidates:
        # Last resort: any scontent link carrying dl=1
        candidates = re.findall(r'href="(https://scontent[^"]*dl=1[^"]*)"', html)
    urls = []
    seen = set()
    for raw in candidates:
        src = html_mod.unescape(raw)
        if 'scontent' not in src and 'cdninstagram' not in src:
            continue
        # Deduplicate by path, ignoring differing query strings
        key = src.split('?')[0]
        if key not in seen:
            seen.add(key)
            urls.append(src)
    return urls
    def _process_api_post(self, item: dict, username: str, output_dir: Path,
                          cutoff_ts: int, phrase_config: dict,
                          defer_database: bool, video_only: bool = False,
                          date_to_ts: int = None,
                          content_type: str = 'post') -> Tuple[str, List[str]]:
        """Process a single post from the API response.

        Applies date-range, phrase, and (optionally) video-only filters,
        deduplicates each media item, downloads every remaining slide of
        the post, stamps file timestamps from the post date, and records
        each download in the database.

        Args:
            item: API post item dict (keys used: code, date, alt,
                isSidecar, srcs, src)
            username: Instagram username (used in filenames and dedup keys)
            output_dir: Download directory
            cutoff_ts: Oldest allowed post timestamp (0/None = no lower bound)
            phrase_config: Phrase filter config
            defer_database: Whether to defer DB recording
            video_only: If True, only download video items (for reels mode)
            date_to_ts: Newest allowed post timestamp (None = no upper bound)
            content_type: DB content-type label ('post', 'tagged', 'reels', ...)
        Returns:
            Tuple of (status, downloaded_files) where status is:
            'downloaded', 'old', 'skipped', 'duplicate', 'filtered', 'future'
        """
        shortcode = item.get('code', '')
        post_date = item.get('date')  # UNIX timestamp (may be missing)
        if not post_date:
            self.log(f"Post {shortcode} has no date - timestamps will default to download time", "warning")
        caption = item.get('alt', '') or ''
        is_sidecar = item.get('isSidecar', False)
        srcs = item.get('srcs', [])
        # If no srcs, use src as fallback (single item)
        if not srcs:
            src = item.get('src', '')
            if src:
                srcs = [src]
        if not srcs:
            self.log(f"No media URLs for post {shortcode}", "debug")
            return ('skipped', [])
        # Date range: skip posts newer than date_to
        if post_date and date_to_ts and post_date > date_to_ts:
            return ('future', [])
        # Age check (cutoff_ts=0 or None means no lower bound).
        # This block only logs the comparison; the actual rejection happens
        # below. Undated posts bypass the age filter entirely.
        if cutoff_ts:
            if post_date:
                from datetime import datetime as _dt
                post_dt = _dt.fromtimestamp(post_date)
                cutoff_dt = _dt.fromtimestamp(cutoff_ts)
                self.log(f"Age check: post {shortcode} date={post_dt.isoformat()} cutoff={cutoff_dt.isoformat()} old={post_date < cutoff_ts}", "debug")
            else:
                self.log(f"Age check: post {shortcode} has no date (post_date={post_date}), skipping age filter", "debug")
        if post_date and cutoff_ts and post_date < cutoff_ts:
            return ('old', [])
        # Phrase check
        if not self._check_phrases(caption, phrase_config):
            self.log(f"Post {shortcode} filtered by phrase config", "debug")
            return ('filtered', [])
        # Video-only filter for reels mode: keep only .mp4 URLs
        # (extension checked on the path, ignoring query string)
        if video_only:
            video_srcs = [s for s in srcs if '.mp4' in s.split('?')[0]]
            if not video_srcs:
                return ('skipped', [])
            srcs = video_srcs
        downloaded = []
        for idx, src_url in enumerate(srcs):
            cdn_filename = self._extract_cdn_filename(src_url)
            media_id = extract_instagram_media_id(cdn_filename)
            ext = self._extract_ext(src_url)
            # Duplicate check — both by Instagram media id and raw CDN filename
            if self._is_already_downloaded(media_id, username):
                continue
            if self._is_already_downloaded(cdn_filename, username):
                continue
            # Build output filename; slide index only for real carousels
            slide_index = idx if is_sidecar and len(srcs) > 1 else None
            out_filename = self._make_filename(username, post_date, cdn_filename, ext, slide_index)
            out_path = output_dir / out_filename
            # Skip if file already exists on disk; remember both dedup keys
            if out_path.exists():
                self.downloaded_files.add(media_id)
                self.downloaded_files.add(cdn_filename)
                continue
            # Download
            if self._download_file(src_url, out_path):
                self.downloaded_files.add(media_id)
                self.downloaded_files.add(cdn_filename)
                # Set file timestamps to match the post date
                if post_date:
                    self._update_file_timestamps(out_path, post_date)
                # Record - use per-slide URL for sidecars so each slide gets a unique url_hash
                if shortcode:
                    if is_sidecar and len(srcs) > 1:
                        instagram_url = f"https://www.instagram.com/p/{shortcode}/?img_index={idx + 1}"
                    else:
                        instagram_url = f"https://www.instagram.com/p/{shortcode}/"
                else:
                    instagram_url = None
                self._record_download(
                    media_id=media_id,
                    username=username,
                    filename=out_filename,
                    url=instagram_url,
                    post_date=post_date,
                    file_path=str(out_path),
                    content_type=content_type,
                    metadata={
                        'shortcode': shortcode,
                        'is_sidecar': is_sidecar,
                        'slide_index': idx if is_sidecar else None,
                        'total_slides': len(srcs) if is_sidecar else 1,
                        'cdn_filename': cdn_filename,
                    },
                    deferred=defer_database
                )
                downloaded.append(str(out_path))
                self.log(f"Downloaded: {out_filename}", "info")
        # At least one new file means 'downloaded'; otherwise every slide
        # was deduplicated or its download failed.
        if downloaded:
            return ('downloaded', downloaded)
        return ('duplicate', [])
def _process_post_detail(self, post: dict, username: str, output_dir: Path,
cutoff_ts: int, phrase_config: dict,
defer_database: bool,
content_type: str = 'post') -> Tuple[str, List[str]]:
"""Process a post from _get_post_detail (HTML-extracted data).
Uses the post's author for the filename if available (important for
tagged posts where the author differs from the searched profile).
"""
# Use post author for filename if available, fall back to searched username
file_username = post.get('author') or username
# Convert to API-like format and delegate
api_item = {
'code': post.get('code', ''),
'date': post.get('date'),
'alt': post.get('alt', ''),
'isPind': post.get('isPind', False),
'isSidecar': post.get('isSidecar', False),
'srcs': post.get('srcs', []),
}
return self._process_api_post(api_item, file_username, output_dir,
cutoff_ts, phrase_config, defer_database,
content_type=content_type)
# ==================== Date Range Helpers ====================
def _parse_date(self, date_val) -> Optional[int]:
"""Parse a date value to UNIX timestamp.
Accepts: int/float (UNIX timestamp), ISO date string, datetime object, None.
"""
if date_val is None:
return None
if isinstance(date_val, (int, float)):
return int(date_val)
if isinstance(date_val, datetime):
return int(date_val.timestamp())
if isinstance(date_val, str):
for fmt in ('%Y-%m-%d', '%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S'):
try:
return int(datetime.strptime(date_val, fmt).timestamp())
except ValueError:
continue
return None
def _compute_cutoffs(self, days_back, date_from=None, date_to=None) -> Tuple[Optional[int], Optional[int]]:
"""Compute (cutoff_ts, date_to_ts) from days_back and explicit date range.
- days_back=0 means no lower bound (download everything)
- date_from overrides days_back if provided
- date_to sets an upper bound (skip posts newer than this)
"""
cutoff_ts = None
if date_from is not None:
cutoff_ts = self._parse_date(date_from)
elif days_back and days_back > 0:
cutoff_ts = int((datetime.now() - timedelta(days=days_back)).timestamp())
date_to_ts = self._parse_date(date_to) if date_to is not None else None
self.log(f"Cutoffs: days_back={days_back} cutoff_ts={cutoff_ts} ({datetime.fromtimestamp(cutoff_ts).isoformat() if cutoff_ts else 'None'}) date_to_ts={date_to_ts}", "info")
return cutoff_ts, date_to_ts
# ==================== Main Download Entry ====================
def download(self, username, content_type="posts", days_back=14,
max_downloads=50, output_dir=None, phrase_config=None,
defer_database=False, date_from=None, date_to=None) -> int:
"""Main download entry point (compatible with ImgInnDownloader).
Args:
username: Instagram username (or shortcode for content_type="post")
content_type: "posts", "stories", "tagged", "reels", or "post" (single)
days_back: How far back to download (0 = unlimited, no date cutoff)
max_downloads: Maximum files to download (0 = unlimited)
output_dir: Output directory path
phrase_config: Optional phrase filtering config
defer_database: Whether to defer database recording
date_from: Explicit start date (overrides days_back). Accepts:
UNIX timestamp, ISO string "YYYY-MM-DD", or datetime
date_to: Explicit end date. Same formats as date_from.
Returns:
Number of files downloaded
"""
self.downloaded_files = set()
self.download_count = 0
if output_dir is None:
output_dir = f"/opt/media-downloader/downloads/{username}"
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
self._scan_existing_files(output_path, username)
try:
if content_type == "post":
# Single post: username is treated as shortcode or URL
files = self.download_single_post(username, output_path, defer_database)
elif content_type == "posts":
files = self.download_posts(username, days_back, max_downloads,
output_path, phrase_config, defer_database,
date_from=date_from, date_to=date_to)
elif content_type == "stories":
files = self.download_stories(username, days_back, max_downloads,
output_path, defer_database)
elif content_type == "tagged":
files = self.download_tagged(username, days_back, max_downloads,
output_path, phrase_config, defer_database,
date_from=date_from, date_to=date_to)
elif content_type == "reels":
files = self.download_reels(username, days_back, max_downloads,
output_path, phrase_config, defer_database,
date_from=date_from, date_to=date_to)
else:
self.log(f"Unsupported content type: {content_type}", "warning")
return 0
count = len(files) if files else 0
self.download_count = count
return count
except Exception as e:
self.log(f"Download error for @{username} ({content_type}): {e}", "error")
import traceback
self.log(traceback.format_exc(), "debug")
return 0
# ==================== Single Post ====================
def download_single_post(self, shortcode_or_url: str, output_dir=None,
defer_database=False) -> List[str]:
"""Download a single Instagram post by shortcode or URL.
Args:
shortcode_or_url: Post shortcode (e.g. "DVL2WxBFGBT") or
Instagram/ImgInn URL containing the shortcode
output_dir: Output directory
defer_database: Whether to defer DB recording
Returns:
List of downloaded file paths
"""
# Extract shortcode from URL if needed
shortcode = shortcode_or_url
url_match = re.search(r'/p/([A-Za-z0-9_-]+)', shortcode_or_url)
if url_match:
shortcode = url_match.group(1)
self.activity_manager.update_status(f"Downloading post {shortcode}")
if not self._ensure_cookies():
return []
output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/posts")
output_path.mkdir(parents=True, exist_ok=True)
post = self._get_post_detail(shortcode)
if not post:
self.log(f"Could not fetch post /p/{shortcode}/", "error")
return []
username = post.get('author') or 'unknown'
self.log(f"Post {shortcode} by @{username}: {len(post['srcs'])} media items", "info")
# No date cutoff or phrase filter for single post
status, files = self._process_post_detail(
post, username, output_path, cutoff_ts=None,
phrase_config=None, defer_database=defer_database)
self.log(f"Single post complete: {len(files)} files", "info")
return files
# ==================== Posts ====================
    def download_posts(self, username, days_back=14, max_posts=50,
                       output_dir=None, phrase_config=None,
                       defer_database=False, date_from=None, date_to=None) -> List[str]:
        """Download posts using ImgInn API with full carousel support.

        Works in three phases:
          1. Collect candidate items via /api/posts/ cursor pagination,
             stopping after 5 consecutive too-old posts when a date cutoff
             is active. If the first API page fails, falls back to fetching
             the profile page's initial shortcodes individually (those
             detail pages are already full-res) and returns early.
          2. Batch-upgrade collected items to full-resolution URLs (and
             recover missing dates) via _batch_upgrade_to_hires.
          3. Download each remaining item via _process_api_post.

        Args:
            username: Instagram username.
            days_back: How far back to download. 0 = no date cutoff (all posts).
            max_posts: Maximum posts to process. 0 = unlimited.
            output_dir: Output directory (defaults to the per-user downloads dir).
            phrase_config: Optional caption phrase filter config.
            defer_database: Whether to defer database recording.
            date_from: Explicit start date (overrides days_back).
            date_to: Explicit end date (skip posts newer than this).

        Returns:
            List of downloaded file paths.
        """
        self.activity_manager.update_status(f"Checking posts for @{username}")
        if not self._ensure_cookies():
            self.activity_manager.update_status(f"Skipped - ImgInn unavailable")
            return []
        output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
        output_path.mkdir(parents=True, exist_ok=True)
        # Fetch profile page for user_id and cursor
        profile = self._get_profile_info(username)
        if not profile or not profile['user_id']:
            self.log(f"Could not resolve profile for @{username}", "warning")
            return []
        user_id = profile['user_id']
        cursor = profile['cursor']
        verified = '1' if profile['verified'] else '0'
        self.log(f"Profile @{username}: user_id={user_id}, {len(profile['shortcodes'])} initial posts", "info")
        cutoff_ts, date_to_ts = self._compute_cutoffs(days_back, date_from, date_to)
        has_date_cutoff = cutoff_ts is not None
        # max_posts=0 means unlimited
        effective_max = max_posts if max_posts > 0 else float('inf')
        # Without a date cutoff there is no "too old", so never stop early.
        max_consecutive_old = 5 if has_date_cutoff else float('inf')
        # ── Phase 1: Collect items via API pagination ──
        self.activity_manager.update_status(f"Scanning posts for @{username}")
        collected_items = []
        consecutive_old = 0
        first_page = self._call_api('/api/posts/', {
            'id': user_id,
            'cursor': '',
            'username': username,
            'verified': verified,
        })
        if first_page and first_page.get('items'):
            for item in first_page['items']:
                if len(collected_items) >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                post_date = item.get('date')
                # Pinned posts (isPind) can appear out of chronological
                # order, so an old pinned post does not count toward the
                # consecutive-old early stop.
                if post_date and cutoff_ts and post_date < cutoff_ts and not item.get('isPind', False):
                    consecutive_old += 1
                    continue
                if post_date and date_to_ts and post_date > date_to_ts:
                    continue
                consecutive_old = 0
                collected_items.append(item)
            if first_page.get('cursor'):
                cursor = first_page['cursor']
            elif not first_page.get('hasNext', True):
                cursor = None
        else:
            # First page API didn't work — fetch individual post pages (already full-res)
            self.log("First page API unavailable, fetching post pages...", "debug")
            all_downloaded = []
            total_processed = 0
            consecutive_old = 0
            for shortcode in profile['shortcodes']:
                if total_processed >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                if self._is_already_downloaded(shortcode, username):
                    total_processed += 1
                    continue
                post = self._get_post_detail(shortcode)
                if not post:
                    continue
                status, files = self._process_post_detail(
                    post, username, output_path, cutoff_ts, phrase_config, defer_database)
                total_processed += 1
                if status == 'old':
                    consecutive_old += 1
                elif status == 'downloaded':
                    consecutive_old = 0
                    all_downloaded.extend(files)
            # Fallback path is complete — phases 2/3 are API-only.
            self.log(f"Posts complete: {len(all_downloaded)} files for @{username}", "info")
            return all_downloaded
        # Continue API pagination
        while cursor and len(collected_items) < effective_max and consecutive_old < max_consecutive_old:
            self.log(f"Fetching posts page (collected={len(collected_items)})...", "debug")
            data = self._call_api('/api/posts/', {
                'id': user_id,
                'cursor': cursor,
                'username': username,
                'verified': verified,
            })
            if not data or not data.get('items'):
                break
            for item in data['items']:
                if len(collected_items) >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                post_date = item.get('date')
                if post_date and cutoff_ts and post_date < cutoff_ts and not item.get('isPind', False):
                    consecutive_old += 1
                    continue
                if post_date and date_to_ts and post_date > date_to_ts:
                    continue
                consecutive_old = 0
                collected_items.append(item)
            if data.get('hasNext') and data.get('cursor'):
                cursor = data['cursor']
            else:
                break
        if not collected_items:
            self.log(f"Posts complete: 0 files for @{username}", "info")
            return []
        # Filter out already-downloaded and out-of-range posts before expensive browser session
        needs_download = []
        for item in collected_items:
            code = item.get('code', '')
            post_date = item.get('date')
            # Date filters — same checks _process_api_post will do.
            # NOTE(review): unlike the collection loops above, there is no
            # isPind exemption here, so an old pinned post collected in
            # Phase 1 gets dropped at this stage — confirm intended.
            if post_date and cutoff_ts and post_date < cutoff_ts:
                continue
            if post_date and date_to_ts and post_date > date_to_ts:
                continue
            if self._is_already_downloaded(code, username):
                continue
            # Check if all srcs are already downloaded
            srcs = item.get('srcs', []) or ([item['src']] if item.get('src') else [])
            all_downloaded_flag = srcs and all(
                self._is_already_downloaded(self._extract_cdn_filename(s), username)
                for s in srcs
            )
            if all_downloaded_flag:
                continue
            needs_download.append(item)
        if not needs_download:
            self.log(f"Posts complete: 0 new files for @{username} ({len(collected_items)} already downloaded)", "info")
            return []
        self.log(f"Collected {len(collected_items)} posts, {len(needs_download)} need downloading, upgrading to full-res...", "info")
        # ── Phase 2: Batch upgrade to full-res via FlareSolverr session ──
        hires_map, dates_map = self._batch_upgrade_to_hires(needs_download)
        # Apply full-res URLs and missing dates to items
        for item in needs_download:
            code = item.get('code', '')
            if code in hires_map:
                item['srcs'] = hires_map[code]
                item['isSidecar'] = len(hires_map[code]) > 1
            # Fill in missing dates from detail pages
            if not item.get('date') and code in dates_map:
                item['date'] = dates_map[code]
                self.log(f"Recovered date for post {code} from detail page", "debug")
        # ── Phase 3: Process and download ──
        all_downloaded = []
        total_processed = 0
        for item in needs_download:
            status, files = self._process_api_post(
                item, username, output_path, cutoff_ts, phrase_config,
                defer_database, date_to_ts=date_to_ts)
            total_processed += 1
            if status == 'downloaded':
                all_downloaded.extend(files)
            if self.show_progress:
                # Progress bar has no denominator when max_posts is unlimited.
                progress_max = max_posts if max_posts > 0 else None
                self.activity_manager.update_status(
                    f"Downloading posts for @{username}",
                    len(all_downloaded), progress_max)
        self.log(f"Posts complete: {len(all_downloaded)} files for @{username}", "info")
        return all_downloaded
# ==================== Stories ====================
def download_stories(self, username, days_back=1, max_stories=50,
output_dir=None, defer_database=False) -> List[str]:
"""Download stories using ImgInn API."""
self.activity_manager.update_status(f"Checking stories for @{username}")
if not self._ensure_cookies():
return []
output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
output_path.mkdir(parents=True, exist_ok=True)
# Get story API params
params = self._get_stories_params(username)
if not params:
self.log(f"Cannot get story params for @{username}", "warning")
return []
self.log(f"Fetching stories for @{username} (uid={params['uid']})...", "info")
data = self._call_api('/api/story/', params)
items = []
if data and data.get('items'):
items = data['items']
else:
# API returned no data (404/empty) — fall back to parsing the stories HTML page
items = self._parse_stories_from_html(username)
if not items:
self.log(f"No stories found for @{username}", "info")
return []
self.log(f"Found {len(items)} stories for @{username}", "info")
downloaded = []
for idx, item in enumerate(items):
if len(downloaded) >= max_stories:
break
src = item.get('src', '')
if not src:
# Try proxy URL as fallback
src = item.get('proxy', '')
if not src:
continue
# Extract info
story_time = item.get('time') # UNIX timestamp
if not story_time:
# Stories are always recent, use current time as best approximation
story_time = int(time.time())
self.log(f"Story {idx + 1} for @{username} has no timestamp, using current time", "warning")
cdn_filename = self._extract_cdn_filename(src)
media_id = extract_instagram_media_id(cdn_filename) if cdn_filename else f"story_{idx}"
ext = self._extract_ext(src)
# Duplicate check
if self._is_already_downloaded(media_id, username):
continue
if self._is_already_downloaded(cdn_filename, username):
continue
# Generate filename with story suffix
dt = datetime.fromtimestamp(story_time)
date_str = dt.strftime('%Y%m%d_%H%M%S')
out_filename = f"{username}_{date_str}_{cdn_filename}_story{idx + 1}{ext}"
out_path = output_path / out_filename
if out_path.exists():
self.downloaded_files.add(media_id)
continue
if self._download_file(src, out_path):
self.downloaded_files.add(media_id)
self.downloaded_files.add(cdn_filename)
if story_time:
self._update_file_timestamps(out_path, story_time)
# Use per-story URL so each story gets a unique url_hash
story_url = f"https://www.instagram.com/stories/{username}/{media_id}/"
self._record_download(
media_id=media_id,
username=username,
filename=out_filename,
url=story_url,
post_date=story_time,
file_path=str(out_path),
content_type='stories',
metadata={'story_index': idx + 1, 'cdn_filename': cdn_filename},
deferred=defer_database
)
downloaded.append(str(out_path))
self.log(f"Downloaded story: {out_filename}", "info")
if self.show_progress:
self.activity_manager.update_status(
f"Downloading stories for @{username}",
len(downloaded), len(items))
self.log(f"Stories complete: {len(downloaded)} files for @{username}", "info")
return downloaded
def _parse_stories_from_html(self, username: str) -> list:
"""Parse stories from the /stories/{username}/ HTML page.
Fallback when /api/story/ returns 404. Extracts story media URLs
from the reels-media section of the stories page.
Returns:
List of dicts with 'src' and optionally 'time' keys,
matching the API response format.
"""
self.log(f"Trying HTML fallback for stories @{username}", "info")
html = self._fetch_html(f"{self.IMGINN_BASE}/stories/{username}/")
if not html:
return []
items = []
# Find the reels-media section containing story items
media_idx = html.find('class="reels-media"')
if media_idx < 0:
self.log("No reels-media section found in stories page", "debug")
return []
section = html[media_idx:]
# Each story is in a <div class="media"> container
# Videos have: <div class="media-video-wrap" data-src="VIDEO_URL">
# and <a class="download" href="VIDEO_URL&dl=1">
# Images have: <a class="download" href="IMAGE_URL&dl=1">
for m in re.finditer(r'<div class="media">', section):
# Get chunk until next media div or end
start = m.start()
next_media = section.find('<div class="media">', start + 1)
chunk = section[start:next_media] if next_media > 0 else section[start:start + 5000]
# Extract download URL (most reliable source)
dl_match = re.search(r'class="download"[^>]*href="([^"]+)"', chunk)
if not dl_match:
continue
import html as html_mod
src = html_mod.unescape(dl_match.group(1))
# Remove &dl=1 suffix if present
src = re.sub(r'[&?]dl=1$', '', src)
# Extract relative time (e.g. "5 hours ago") and convert to timestamp
item = {'src': src}
time_match = re.search(r'class="time">(\d+)\s+(second|minute|hour|day|week)s?\s+ago<', chunk)
if time_match:
amount = int(time_match.group(1))
unit = time_match.group(2)
seconds_map = {'second': 1, 'minute': 60, 'hour': 3600, 'day': 86400, 'week': 604800}
item['time'] = int(time.time()) - (amount * seconds_map.get(unit, 0))
items.append(item)
if items:
self.log(f"Parsed {len(items)} stories from HTML for @{username}", "info")
return items
def _parse_highlights_from_html(self, username: str) -> list:
"""Parse highlight IDs and titles from /stories/{username}/ page.
Returns list of dicts: [{id, title}, ...]
"""
html = self._fetch_html(f"{self.IMGINN_BASE}/stories/{username}/")
if not html:
return []
highlights = []
# Highlights are: <li class="reel swiper-slide" data-id="{id}">
# <div class="title">{title}</div>
for match in re.finditer(
r'<li[^>]*\bdata-id="(\d+)"[^>]*>.*?'
r'<div class="title">([^<]*)</div>',
html, re.DOTALL
):
highlights.append({
'id': match.group(1),
'title': match.group(2).strip(),
})
if highlights:
self.log(f"Found {len(highlights)} highlights for @{username}", "info")
return highlights
# ==================== Tagged ====================
    def download_tagged(self, username, days_back=14, max_posts=50,
                        output_dir=None, phrase_config=None,
                        defer_database=False, date_from=None, date_to=None) -> List[str]:
        """Download tagged posts. Uses API for pagination, post pages for media URLs.

        The tagged HTML page only exposes shortcodes, so first-page posts
        are fetched individually via their detail pages (which also yield
        the real author for the filename). Subsequent pages come from
        /api/tagged, whose items carry time/srcs/code directly and whose
        cursor is the last item's ID.

        Args:
            username: Profile whose tagged posts to download.
            days_back: How far back to download. 0 = no date cutoff.
            max_posts: Maximum posts to process. 0 = unlimited.
            output_dir: Output directory (defaults to the per-user dir).
            phrase_config: Optional caption phrase filter config.
            defer_database: Whether to defer database recording.
            date_from: Explicit start date (overrides days_back).
            date_to: Explicit end date.

        Returns:
            List of downloaded file paths.
        """
        self.activity_manager.update_status(f"Checking tagged for @{username}")
        if not self._ensure_cookies():
            return []
        output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
        output_path.mkdir(parents=True, exist_ok=True)
        # Fetch tagged page
        tagged_info = self._get_tagged_info(username)
        if not tagged_info or not tagged_info['user_id']:
            self.log(f"Could not load tagged page for @{username}", "error")
            return []
        user_id = tagged_info['user_id']
        cursor = tagged_info['cursor']
        cutoff_ts, date_to_ts = self._compute_cutoffs(days_back, date_from, date_to)
        has_date_cutoff = cutoff_ts is not None
        all_downloaded = []
        total_processed = 0
        consecutive_old = 0
        # Stop after 5 consecutive too-old posts, but only when a date
        # cutoff is in effect.
        max_consecutive_old = 5 if has_date_cutoff else float('inf')
        effective_max = max_posts if max_posts > 0 else float('inf')
        # Process first-page shortcodes via API to get full data (dates, srcs)
        # The HTML page only has shortcodes, so we fetch each via /api/posts/ for dates
        first_page_codes = tagged_info['shortcodes']
        self.log(f"Tagged: {len(first_page_codes)} posts on first page", "debug")
        for shortcode in first_page_codes:
            if total_processed >= effective_max or consecutive_old >= max_consecutive_old:
                break
            if self._is_already_downloaded(shortcode, username):
                total_processed += 1
                continue
            # Fetch post detail page for media URLs and author
            post = self._get_post_detail(shortcode)
            if not post:
                total_processed += 1
                continue
            status, files = self._process_post_detail(
                post, username, output_path, cutoff_ts, phrase_config, defer_database,
                content_type='tagged')
            total_processed += 1
            if status == 'old':
                consecutive_old += 1
            elif status == 'downloaded':
                consecutive_old = 0
                all_downloaded.extend(files)
            if self.show_progress:
                # Progress bar has no denominator when max_posts is unlimited.
                progress_max = max_posts if max_posts > 0 else None
                self.activity_manager.update_status(
                    f"Downloading tagged for @{username}",
                    len(all_downloaded), progress_max)
        # Paginate via tagged API — use API data directly (has time, srcs, code)
        while cursor and total_processed < effective_max and consecutive_old < max_consecutive_old:
            self.log(f"Fetching tagged page (processed={total_processed})...", "debug")
            data = self._call_api('/api/tagged', {
                'id': user_id,
                'cursor': cursor,
            })
            if not data or not data.get('items'):
                break
            items = data['items']
            for item in items:
                if total_processed >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                shortcode = item.get('code', '')
                if not shortcode:
                    # Derive the shortcode from the numeric media id when
                    # the API omits 'code'.
                    item_id = item.get('id', '')
                    if not item_id:
                        continue
                    shortcode = media_id_to_shortcode(item_id)
                if self._is_already_downloaded(shortcode, username):
                    total_processed += 1
                    continue
                # Use API data directly — normalize 'time' to 'date' for _process_api_post
                if 'time' in item and 'date' not in item:
                    item['date'] = item['time']
                # Use owner username for filename if available
                api_username = username
                owner = item.get('owner', {})
                if isinstance(owner, dict) and owner.get('username'):
                    api_username = owner['username']
                status, files = self._process_api_post(
                    item, api_username, output_path, cutoff_ts, phrase_config, defer_database,
                    content_type='tagged')
                total_processed += 1
                if status == 'old':
                    consecutive_old += 1
                elif status == 'downloaded':
                    consecutive_old = 0
                    all_downloaded.extend(files)
            # Tagged API uses last item's ID as next cursor; an unchanged
            # cursor means no further pages.
            if items:
                last_id = items[-1].get('id')
                if last_id and str(last_id) != str(cursor):
                    cursor = str(last_id)
                else:
                    break
            else:
                break
        self.log(f"Tagged complete: {len(all_downloaded)} files for @{username}", "info")
        return all_downloaded
# ==================== Reels ====================
    def download_reels(self, username, days_back=14, max_downloads=50,
                       output_dir=None, phrase_config=None,
                       defer_database=False, date_from=None, date_to=None) -> List[str]:
        """Download reels (video posts) using the posts API with video filter.

        Mirrors download_posts' three phases, but keeps only items with at
        least one .mp4 source during collection, and passes video_only=True
        when processing so image slides of mixed carousels are skipped.

        Args:
            username: Instagram username.
            days_back: How far back to download. 0 = no date cutoff.
            max_downloads: Maximum files to download. 0 = unlimited.
            output_dir: Output directory (defaults to the per-user dir).
            phrase_config: Optional caption phrase filter config.
            defer_database: Whether to defer database recording.
            date_from: Explicit start date (overrides days_back).
            date_to: Explicit end date.

        Returns:
            List of downloaded file paths.
        """
        self.activity_manager.update_status(f"Checking reels for @{username}")
        if not self._ensure_cookies():
            return []
        output_path = Path(output_dir) if output_dir else Path(f"/opt/media-downloader/downloads/{username}")
        output_path.mkdir(parents=True, exist_ok=True)
        # Same as posts but with video_only flag
        profile = self._get_profile_info(username)
        if not profile or not profile['user_id']:
            self.log(f"Could not resolve profile for @{username}", "warning")
            return []
        user_id = profile['user_id']
        cursor = profile['cursor']
        verified = '1' if profile['verified'] else '0'
        cutoff_ts, date_to_ts = self._compute_cutoffs(days_back, date_from, date_to)
        has_date_cutoff = cutoff_ts is not None
        effective_max = max_downloads if max_downloads > 0 else float('inf')
        # Without a date cutoff there is no "too old", so never stop early.
        max_consecutive_old = 5 if has_date_cutoff else float('inf')
        # ── Phase 1: Collect video items via API ──
        collected_items = []
        consecutive_old = 0
        first_page = self._call_api('/api/posts/', {
            'id': user_id,
            'cursor': '',
            'username': username,
            'verified': verified,
        })
        if first_page and first_page.get('items'):
            for item in first_page['items']:
                if len(collected_items) >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                post_date = item.get('date')
                # Old pinned posts don't count toward the early stop.
                if post_date and cutoff_ts and post_date < cutoff_ts and not item.get('isPind', False):
                    consecutive_old += 1
                    continue
                if post_date and date_to_ts and post_date > date_to_ts:
                    continue
                # Only keep video items (.mp4 checked on the URL path,
                # ignoring the query string)
                srcs = item.get('srcs', []) or ([item['src']] if item.get('src') else [])
                video_srcs = [s for s in srcs if '.mp4' in s.split('?')[0]]
                if not video_srcs:
                    continue
                consecutive_old = 0
                collected_items.append(item)
            if first_page.get('cursor'):
                cursor = first_page['cursor']
            elif not first_page.get('hasNext', True):
                cursor = None
        while cursor and len(collected_items) < effective_max and consecutive_old < max_consecutive_old:
            data = self._call_api('/api/posts/', {
                'id': user_id,
                'cursor': cursor,
                'username': username,
                'verified': verified,
            })
            if not data or not data.get('items'):
                break
            for item in data['items']:
                if len(collected_items) >= effective_max or consecutive_old >= max_consecutive_old:
                    break
                post_date = item.get('date')
                if post_date and cutoff_ts and post_date < cutoff_ts and not item.get('isPind', False):
                    consecutive_old += 1
                    continue
                if post_date and date_to_ts and post_date > date_to_ts:
                    continue
                srcs = item.get('srcs', []) or ([item['src']] if item.get('src') else [])
                video_srcs = [s for s in srcs if '.mp4' in s.split('?')[0]]
                if not video_srcs:
                    continue
                consecutive_old = 0
                collected_items.append(item)
            if data.get('hasNext') and data.get('cursor'):
                cursor = data['cursor']
            else:
                break
        if not collected_items:
            self.log(f"Reels complete: 0 files for @{username}", "info")
            return []
        # Filter out already-downloaded and out-of-range reels before expensive browser session
        needs_download = []
        for item in collected_items:
            code = item.get('code', '')
            post_date = item.get('date')
            # Date filters — same checks _process_api_post will do
            if post_date and cutoff_ts and post_date < cutoff_ts:
                continue
            if post_date and date_to_ts and post_date > date_to_ts:
                continue
            if self._is_already_downloaded(code, username):
                continue
            # Skip when every source file was already downloaded.
            srcs = item.get('srcs', []) or ([item['src']] if item.get('src') else [])
            all_downloaded_flag = srcs and all(
                self._is_already_downloaded(self._extract_cdn_filename(s), username)
                for s in srcs
            )
            if all_downloaded_flag:
                continue
            needs_download.append(item)
        if not needs_download:
            self.log(f"Reels complete: 0 new files for @{username} ({len(collected_items)} already downloaded)", "info")
            return []
        self.log(f"Collected {len(collected_items)} reels, {len(needs_download)} need downloading, upgrading to full-res...", "info")
        # ── Phase 2: Batch upgrade to full-res ──
        hires_map, dates_map = self._batch_upgrade_to_hires(needs_download)
        for item in needs_download:
            code = item.get('code', '')
            if code in hires_map:
                item['srcs'] = hires_map[code]
                item['isSidecar'] = len(hires_map[code]) > 1
            # Fill in missing dates from detail pages
            if not item.get('date') and code in dates_map:
                item['date'] = dates_map[code]
                self.log(f"Recovered date for reel {code} from detail page", "debug")
        # ── Phase 3: Process and download ──
        all_downloaded = []
        for item in needs_download:
            status, files = self._process_api_post(
                item, username, output_path, cutoff_ts, phrase_config,
                defer_database, video_only=True, date_to_ts=date_to_ts,
                content_type='reels')
            if status == 'downloaded':
                all_downloaded.extend(files)
        self.log(f"Reels complete: {len(all_downloaded)} files for @{username}", "info")
        return all_downloaded