diff --git a/modules/paid_content/instagram_adapter.py b/modules/paid_content/instagram_adapter.py index c748916..b3792ff 100644 --- a/modules/paid_content/instagram_adapter.py +++ b/modules/paid_content/instagram_adapter.py @@ -548,27 +548,49 @@ class InstagramAdapter(LoggingMixin): self.log(f"Authenticated API: fetching feed for @{username}{' (full backfill)' if paginate_all else ''}...", 'info') try: + import random as _random from curl_cffi.requests import Session as CurlSession - session = CurlSession(impersonate='edge101') - session.headers.update({ - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0', - 'X-IG-App-ID': '936619743392459', - 'X-Requested-With': 'XMLHttpRequest', - 'Referer': 'https://www.instagram.com/', - 'Origin': 'https://www.instagram.com', - 'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"', - 'Sec-CH-UA-Mobile': '?0', - 'Sec-CH-UA-Platform': '"Windows"', - 'Sec-Fetch-Dest': 'empty', - 'Sec-Fetch-Mode': 'cors', - 'Sec-Fetch-Site': 'same-origin', - }) + session = CurlSession(impersonate='chrome136') + + # Load cookies first so we can extract csrftoken for the header + csrf_token = '' for c in cookie_list: name = c.get('name', '') value = c.get('value', '') domain = c.get('domain', '.instagram.com') if name and value: session.cookies.set(name, value, domain=domain) + if name == 'csrftoken': + csrf_token = value + + # Extract ig_www_claim from cookies if present (set by IG in responses) + ig_claim = '0' + for c in cookie_list: + if c.get('name') == 'ig_www_claim': + ig_claim = c.get('value', '0') + break + + # Override UA/platform headers to match Edge on macOS (where + # the cookies were created). The chrome136 TLS fingerprint is + # identical to Edge 136 since both use Chromium's BoringSSL. + session.headers.update({ + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0', + 'Sec-CH-UA': '"Microsoft Edge";v="136", "Chromium";v="136", "Not_A Brand";v="24"', + 'Sec-CH-UA-Mobile': '?0', + 'Sec-CH-UA-Platform': '"macOS"', + 'X-CSRFToken': csrf_token, + 'X-IG-App-ID': '936619743392459', + 'X-IG-WWW-Claim': ig_claim, + 'X-ASBD-ID': '129477', + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': f'https://www.instagram.com/{username}/', + 'Origin': 'https://www.instagram.com', + 'Accept': '*/*', + 'Accept-Language': 'en-US,en;q=0.9', + 'Sec-Fetch-Dest': 'empty', + 'Sec-Fetch-Mode': 'cors', + 'Sec-Fetch-Site': 'same-origin', + }) def _save_cookies(): try: @@ -590,13 +612,22 @@ class InstagramAdapter(LoggingMixin): except Exception as e: self.log(f"Authenticated API: failed to save cookies: {e}", 'debug') + def _update_claim_header(resp): + """Update X-IG-WWW-Claim from response headers (IG rotates this).""" + claim = resp.headers.get('x-ig-set-www-claim') + if claim: + session.headers['X-IG-WWW-Claim'] = claim + # ── Single-page mode (normal sync) ── if not paginate_all: + # Randomized pre-request delay to avoid machine-like timing + _time.sleep(_random.uniform(1.0, 3.0)) resp = session.get( f'https://www.instagram.com/api/v1/feed/user/{user_id}/', - params={'count': 33}, + params={'count': 12}, timeout=15 ) + _update_claim_header(resp) _save_cookies() if resp.status_code == 401: @@ -656,23 +687,30 @@ class InstagramAdapter(LoggingMixin): while True: page += 1 - params = {'count': 33} + params = {'count': 12} if max_id: params['max_id'] = max_id + # Human-like delay between pages (randomized) + if page > 1: + _time.sleep(_random.uniform(2.0, 5.0)) + else: + _time.sleep(_random.uniform(1.0, 3.0)) + try: resp = session.get( f'https://www.instagram.com/api/v1/feed/user/{user_id}/', params=params, timeout=15 ) + _update_claim_header(resp) except Exception as e: self.log(f"Backfill page {page}: request error: {e}", 'warning') consecutive_errors += 1 if consecutive_errors >= 3: self.log("Backfill: too many consecutive errors, stopping.", 'warning') break - _time.sleep(5) + _time.sleep(_random.uniform(5.0, 10.0)) continue if resp.status_code == 401: @@ -682,8 +720,8 @@ class InstagramAdapter(LoggingMixin): break if resp.status_code == 429: - self.log("Backfill: rate limited, waiting 60s...", 'warning') - _time.sleep(60) + self.log("Backfill: rate limited, waiting 60-120s...", 'warning') + _time.sleep(_random.uniform(60, 120)) continue if resp.status_code != 200: @@ -692,7 +730,7 @@ class InstagramAdapter(LoggingMixin): if consecutive_errors >= 3: self.log("Backfill: too many consecutive errors, stopping.", 'warning') break - _time.sleep(5) + _time.sleep(_random.uniform(5.0, 10.0)) continue consecutive_errors = 0 @@ -743,7 +781,6 @@ class InstagramAdapter(LoggingMixin): break max_id = next_max_id - _time.sleep(2) _save_cookies() self.log(f"Backfill complete: {total_fetched} fetched, {total_new} new posts for @{username}", 'info') diff --git a/modules/paid_content/scraper.py b/modules/paid_content/scraper.py index 184e616..b3cf692 100644 --- a/modules/paid_content/scraper.py +++ b/modules/paid_content/scraper.py @@ -2885,6 +2885,11 @@ class PaidContentScraper(LoggingMixin, DeferredDownloadsMixin): }) return SyncResult(success=False, error=str(e)) finally: + # Inter-creator cooldown: wait 8-15s before releasing the lock so + # the next Instagram creator doesn't fire immediately. This makes + # the request pattern look like a human browsing between profiles. + import random as _random + await asyncio.sleep(_random.uniform(8.0, 15.0)) ig_rate_limiter.operation_lock.release() async def _fetch_stories_via_fastdl(self, username: str) -> Tuple[List, Dict]: @@ -8148,17 +8153,8 @@ class PaidContentScraper(LoggingMixin, DeferredDownloadsMixin): placeholders = ','.join(['?'] * len(filter_users)) with self.db.unified_db.get_connection() as conn: cursor = conn.cursor() - # Count new posts that have at least one matching tagged user - cursor.execute(f""" - SELECT COUNT(DISTINCT p.id) - FROM paid_content_posts p - JOIN paid_content_post_tagged_users tu ON tu.post_id = p.id - WHERE p.creator_id = ? - AND tu.username IN ({placeholders}) - AND p.created_at >= datetime('now', '-1 hour') - """, (creator['id'], *filter_users)) - filtered_new_posts = cursor.fetchone()[0] - # Count downloaded attachments from matching posts + # Count downloaded attachments from posts matching the tagged-user filter. + # Use downloaded_at as the time anchor (added_at is often NULL). cursor.execute(f""" SELECT COUNT(DISTINCT a.id) FROM paid_content_attachments a @@ -8167,9 +8163,21 @@ class PaidContentScraper(LoggingMixin, DeferredDownloadsMixin): WHERE p.creator_id = ? AND tu.username IN ({placeholders}) AND a.status = 'downloaded' - AND a.updated_at >= datetime('now', '-1 hour') + AND a.downloaded_at >= datetime('now', '-1 hour') """, (creator['id'], *filter_users)) filtered_downloaded = cursor.fetchone()[0] + # Count distinct posts that had matching downloads + cursor.execute(f""" + SELECT COUNT(DISTINCT p.id) + FROM paid_content_posts p + JOIN paid_content_post_tagged_users tu ON tu.post_id = p.id + JOIN paid_content_attachments a ON a.post_id = p.id + WHERE p.creator_id = ? + AND tu.username IN ({placeholders}) + AND a.status = 'downloaded' + AND a.downloaded_at >= datetime('now', '-1 hour') + """, (creator['id'], *filter_users)) + filtered_new_posts = cursor.fetchone()[0] self.log(f"Notification filter: {new_posts} posts -> {filtered_new_posts}, " f"{downloaded} downloads -> {filtered_downloaded} (filter: {filter_users})", 'debug') new_posts = filtered_new_posts