Fix Instagram detection and notification filter bugs

- Update browser fingerprint from Edge 101 to Chrome 136 with Edge/macOS headers
- Add missing headers: X-CSRFToken, X-IG-WWW-Claim, X-ASBD-ID
- Reduce page size from 33 to 12 (matches real browser behavior)
- Add randomized delays between requests and between creators (8-15s cooldown)
- Update X-IG-WWW-Claim dynamically from response headers
- Fix notification tagged-user filter that referenced nonexistent columns (p.created_at, a.updated_at); it now anchors on a.downloaded_at

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-30 13:28:21 -04:00
parent c5781197cc
commit 7c87fc1ff4
2 changed files with 78 additions and 33 deletions

View File

@@ -548,27 +548,49 @@ class InstagramAdapter(LoggingMixin):
self.log(f"Authenticated API: fetching feed for @{username}{' (full backfill)' if paginate_all else ''}...", 'info') self.log(f"Authenticated API: fetching feed for @{username}{' (full backfill)' if paginate_all else ''}...", 'info')
try: try:
import random as _random
from curl_cffi.requests import Session as CurlSession from curl_cffi.requests import Session as CurlSession
session = CurlSession(impersonate='edge101') session = CurlSession(impersonate='chrome136')
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0', # Load cookies first so we can extract csrftoken for the header
'X-IG-App-ID': '936619743392459', csrf_token = ''
'X-Requested-With': 'XMLHttpRequest',
'Referer': 'https://www.instagram.com/',
'Origin': 'https://www.instagram.com',
'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
'Sec-CH-UA-Mobile': '?0',
'Sec-CH-UA-Platform': '"Windows"',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
})
for c in cookie_list: for c in cookie_list:
name = c.get('name', '') name = c.get('name', '')
value = c.get('value', '') value = c.get('value', '')
domain = c.get('domain', '.instagram.com') domain = c.get('domain', '.instagram.com')
if name and value: if name and value:
session.cookies.set(name, value, domain=domain) session.cookies.set(name, value, domain=domain)
if name == 'csrftoken':
csrf_token = value
# Extract ig_www_claim from cookies if present (set by IG in responses)
ig_claim = '0'
for c in cookie_list:
if c.get('name') == 'ig_www_claim':
ig_claim = c.get('value', '0')
break
# Override UA/platform headers to match Edge on macOS (where
# the cookies were created). The chrome136 TLS fingerprint is
# identical to Edge 136 since both use Chromium's BoringSSL.
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
'Sec-CH-UA': '"Microsoft Edge";v="136", "Chromium";v="136", "Not_A Brand";v="24"',
'Sec-CH-UA-Mobile': '?0',
'Sec-CH-UA-Platform': '"macOS"',
'X-CSRFToken': csrf_token,
'X-IG-App-ID': '936619743392459',
'X-IG-WWW-Claim': ig_claim,
'X-ASBD-ID': '129477',
'X-Requested-With': 'XMLHttpRequest',
'Referer': f'https://www.instagram.com/{username}/',
'Origin': 'https://www.instagram.com',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
})
def _save_cookies(): def _save_cookies():
try: try:
@@ -590,13 +612,22 @@ class InstagramAdapter(LoggingMixin):
except Exception as e: except Exception as e:
self.log(f"Authenticated API: failed to save cookies: {e}", 'debug') self.log(f"Authenticated API: failed to save cookies: {e}", 'debug')
def _update_claim_header(resp):
"""Update X-IG-WWW-Claim from response headers (IG rotates this)."""
claim = resp.headers.get('x-ig-set-www-claim')
if claim:
session.headers['X-IG-WWW-Claim'] = claim
# ── Single-page mode (normal sync) ── # ── Single-page mode (normal sync) ──
if not paginate_all: if not paginate_all:
# Randomized pre-request delay to avoid machine-like timing
_time.sleep(_random.uniform(1.0, 3.0))
resp = session.get( resp = session.get(
f'https://www.instagram.com/api/v1/feed/user/{user_id}/', f'https://www.instagram.com/api/v1/feed/user/{user_id}/',
params={'count': 33}, params={'count': 12},
timeout=15 timeout=15
) )
_update_claim_header(resp)
_save_cookies() _save_cookies()
if resp.status_code == 401: if resp.status_code == 401:
@@ -656,23 +687,30 @@ class InstagramAdapter(LoggingMixin):
while True: while True:
page += 1 page += 1
params = {'count': 33} params = {'count': 12}
if max_id: if max_id:
params['max_id'] = max_id params['max_id'] = max_id
# Human-like delay between pages (randomized)
if page > 1:
_time.sleep(_random.uniform(2.0, 5.0))
else:
_time.sleep(_random.uniform(1.0, 3.0))
try: try:
resp = session.get( resp = session.get(
f'https://www.instagram.com/api/v1/feed/user/{user_id}/', f'https://www.instagram.com/api/v1/feed/user/{user_id}/',
params=params, params=params,
timeout=15 timeout=15
) )
_update_claim_header(resp)
except Exception as e: except Exception as e:
self.log(f"Backfill page {page}: request error: {e}", 'warning') self.log(f"Backfill page {page}: request error: {e}", 'warning')
consecutive_errors += 1 consecutive_errors += 1
if consecutive_errors >= 3: if consecutive_errors >= 3:
self.log("Backfill: too many consecutive errors, stopping.", 'warning') self.log("Backfill: too many consecutive errors, stopping.", 'warning')
break break
_time.sleep(5) _time.sleep(_random.uniform(5.0, 10.0))
continue continue
if resp.status_code == 401: if resp.status_code == 401:
@@ -682,8 +720,8 @@ class InstagramAdapter(LoggingMixin):
break break
if resp.status_code == 429: if resp.status_code == 429:
self.log("Backfill: rate limited, waiting 60s...", 'warning') self.log("Backfill: rate limited, waiting 60-120s...", 'warning')
_time.sleep(60) _time.sleep(_random.uniform(60, 120))
continue continue
if resp.status_code != 200: if resp.status_code != 200:
@@ -692,7 +730,7 @@ class InstagramAdapter(LoggingMixin):
if consecutive_errors >= 3: if consecutive_errors >= 3:
self.log("Backfill: too many consecutive errors, stopping.", 'warning') self.log("Backfill: too many consecutive errors, stopping.", 'warning')
break break
_time.sleep(5) _time.sleep(_random.uniform(5.0, 10.0))
continue continue
consecutive_errors = 0 consecutive_errors = 0
@@ -743,7 +781,6 @@ class InstagramAdapter(LoggingMixin):
break break
max_id = next_max_id max_id = next_max_id
_time.sleep(2)
_save_cookies() _save_cookies()
self.log(f"Backfill complete: {total_fetched} fetched, {total_new} new posts for @{username}", 'info') self.log(f"Backfill complete: {total_fetched} fetched, {total_new} new posts for @{username}", 'info')

View File

@@ -2885,6 +2885,11 @@ class PaidContentScraper(LoggingMixin, DeferredDownloadsMixin):
}) })
return SyncResult(success=False, error=str(e)) return SyncResult(success=False, error=str(e))
finally: finally:
# Inter-creator cooldown: wait 8-15s before releasing the lock so
# the next Instagram creator doesn't fire immediately. This makes
# the request pattern look like a human browsing between profiles.
import random as _random
await asyncio.sleep(_random.uniform(8.0, 15.0))
ig_rate_limiter.operation_lock.release() ig_rate_limiter.operation_lock.release()
async def _fetch_stories_via_fastdl(self, username: str) -> Tuple[List, Dict]: async def _fetch_stories_via_fastdl(self, username: str) -> Tuple[List, Dict]:
@@ -8148,17 +8153,8 @@ class PaidContentScraper(LoggingMixin, DeferredDownloadsMixin):
placeholders = ','.join(['?'] * len(filter_users)) placeholders = ','.join(['?'] * len(filter_users))
with self.db.unified_db.get_connection() as conn: with self.db.unified_db.get_connection() as conn:
cursor = conn.cursor() cursor = conn.cursor()
# Count new posts that have at least one matching tagged user # Count downloaded attachments from posts matching the tagged-user filter.
cursor.execute(f""" # Use downloaded_at as the time anchor (added_at is often NULL).
SELECT COUNT(DISTINCT p.id)
FROM paid_content_posts p
JOIN paid_content_post_tagged_users tu ON tu.post_id = p.id
WHERE p.creator_id = ?
AND tu.username IN ({placeholders})
AND p.created_at >= datetime('now', '-1 hour')
""", (creator['id'], *filter_users))
filtered_new_posts = cursor.fetchone()[0]
# Count downloaded attachments from matching posts
cursor.execute(f""" cursor.execute(f"""
SELECT COUNT(DISTINCT a.id) SELECT COUNT(DISTINCT a.id)
FROM paid_content_attachments a FROM paid_content_attachments a
@@ -8167,9 +8163,21 @@ class PaidContentScraper(LoggingMixin, DeferredDownloadsMixin):
WHERE p.creator_id = ? WHERE p.creator_id = ?
AND tu.username IN ({placeholders}) AND tu.username IN ({placeholders})
AND a.status = 'downloaded' AND a.status = 'downloaded'
AND a.updated_at >= datetime('now', '-1 hour') AND a.downloaded_at >= datetime('now', '-1 hour')
""", (creator['id'], *filter_users)) """, (creator['id'], *filter_users))
filtered_downloaded = cursor.fetchone()[0] filtered_downloaded = cursor.fetchone()[0]
# Count distinct posts that had matching downloads
cursor.execute(f"""
SELECT COUNT(DISTINCT p.id)
FROM paid_content_posts p
JOIN paid_content_post_tagged_users tu ON tu.post_id = p.id
JOIN paid_content_attachments a ON a.post_id = p.id
WHERE p.creator_id = ?
AND tu.username IN ({placeholders})
AND a.status = 'downloaded'
AND a.downloaded_at >= datetime('now', '-1 hour')
""", (creator['id'], *filter_users))
filtered_new_posts = cursor.fetchone()[0]
self.log(f"Notification filter: {new_posts} posts -> {filtered_new_posts}, " self.log(f"Notification filter: {new_posts} posts -> {filtered_new_posts}, "
f"{downloaded} downloads -> {filtered_downloaded} (filter: {filter_users})", 'debug') f"{downloaded} downloads -> {filtered_downloaded} (filter: {filter_users})", 'debug')
new_posts = filtered_new_posts new_posts = filtered_new_posts