Fix Instagram detection and notification filter bugs

- Update browser fingerprint from Edge 101 to Chrome 136 with Edge/macOS headers
- Add missing headers: X-CSRFToken, X-IG-WWW-Claim, X-ASBD-ID
- Reduce page size from 33 to 12 (matches real browser behavior)
- Add randomized delays between requests and between creators (8-15s cooldown)
- Update X-IG-WWW-Claim dynamically from response headers
- Fix notification tagged-user filter that referenced nonexistent columns (p.created_at, a.updated_at); it now anchors on a.downloaded_at

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-30 13:28:21 -04:00
parent c5781197cc
commit 7c87fc1ff4
2 changed files with 78 additions and 33 deletions

View File

@@ -548,27 +548,49 @@ class InstagramAdapter(LoggingMixin):
self.log(f"Authenticated API: fetching feed for @{username}{' (full backfill)' if paginate_all else ''}...", 'info') self.log(f"Authenticated API: fetching feed for @{username}{' (full backfill)' if paginate_all else ''}...", 'info')
try: try:
import random as _random
from curl_cffi.requests import Session as CurlSession from curl_cffi.requests import Session as CurlSession
session = CurlSession(impersonate='edge101') session = CurlSession(impersonate='chrome136')
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0', # Load cookies first so we can extract csrftoken for the header
'X-IG-App-ID': '936619743392459', csrf_token = ''
'X-Requested-With': 'XMLHttpRequest',
'Referer': 'https://www.instagram.com/',
'Origin': 'https://www.instagram.com',
'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
'Sec-CH-UA-Mobile': '?0',
'Sec-CH-UA-Platform': '"Windows"',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
})
for c in cookie_list: for c in cookie_list:
name = c.get('name', '') name = c.get('name', '')
value = c.get('value', '') value = c.get('value', '')
domain = c.get('domain', '.instagram.com') domain = c.get('domain', '.instagram.com')
if name and value: if name and value:
session.cookies.set(name, value, domain=domain) session.cookies.set(name, value, domain=domain)
if name == 'csrftoken':
csrf_token = value
# Extract ig_www_claim from cookies if present (set by IG in responses)
ig_claim = '0'
for c in cookie_list:
if c.get('name') == 'ig_www_claim':
ig_claim = c.get('value', '0')
break
# Override UA/platform headers to match Edge on macOS (where
# the cookies were created). The chrome136 TLS fingerprint is
# identical to Edge 136 since both use Chromium's BoringSSL.
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
'Sec-CH-UA': '"Microsoft Edge";v="136", "Chromium";v="136", "Not_A Brand";v="24"',
'Sec-CH-UA-Mobile': '?0',
'Sec-CH-UA-Platform': '"macOS"',
'X-CSRFToken': csrf_token,
'X-IG-App-ID': '936619743392459',
'X-IG-WWW-Claim': ig_claim,
'X-ASBD-ID': '129477',
'X-Requested-With': 'XMLHttpRequest',
'Referer': f'https://www.instagram.com/{username}/',
'Origin': 'https://www.instagram.com',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
})
def _save_cookies(): def _save_cookies():
try: try:
@@ -590,13 +612,22 @@ class InstagramAdapter(LoggingMixin):
except Exception as e: except Exception as e:
self.log(f"Authenticated API: failed to save cookies: {e}", 'debug') self.log(f"Authenticated API: failed to save cookies: {e}", 'debug')
def _update_claim_header(resp):
"""Update X-IG-WWW-Claim from response headers (IG rotates this)."""
claim = resp.headers.get('x-ig-set-www-claim')
if claim:
session.headers['X-IG-WWW-Claim'] = claim
# ── Single-page mode (normal sync) ── # ── Single-page mode (normal sync) ──
if not paginate_all: if not paginate_all:
# Randomized pre-request delay to avoid machine-like timing
_time.sleep(_random.uniform(1.0, 3.0))
resp = session.get( resp = session.get(
f'https://www.instagram.com/api/v1/feed/user/{user_id}/', f'https://www.instagram.com/api/v1/feed/user/{user_id}/',
params={'count': 33}, params={'count': 12},
timeout=15 timeout=15
) )
_update_claim_header(resp)
_save_cookies() _save_cookies()
if resp.status_code == 401: if resp.status_code == 401:
@@ -656,23 +687,30 @@ class InstagramAdapter(LoggingMixin):
while True: while True:
page += 1 page += 1
params = {'count': 33} params = {'count': 12}
if max_id: if max_id:
params['max_id'] = max_id params['max_id'] = max_id
# Human-like delay between pages (randomized)
if page > 1:
_time.sleep(_random.uniform(2.0, 5.0))
else:
_time.sleep(_random.uniform(1.0, 3.0))
try: try:
resp = session.get( resp = session.get(
f'https://www.instagram.com/api/v1/feed/user/{user_id}/', f'https://www.instagram.com/api/v1/feed/user/{user_id}/',
params=params, params=params,
timeout=15 timeout=15
) )
_update_claim_header(resp)
except Exception as e: except Exception as e:
self.log(f"Backfill page {page}: request error: {e}", 'warning') self.log(f"Backfill page {page}: request error: {e}", 'warning')
consecutive_errors += 1 consecutive_errors += 1
if consecutive_errors >= 3: if consecutive_errors >= 3:
self.log("Backfill: too many consecutive errors, stopping.", 'warning') self.log("Backfill: too many consecutive errors, stopping.", 'warning')
break break
_time.sleep(5) _time.sleep(_random.uniform(5.0, 10.0))
continue continue
if resp.status_code == 401: if resp.status_code == 401:
@@ -682,8 +720,8 @@ class InstagramAdapter(LoggingMixin):
break break
if resp.status_code == 429: if resp.status_code == 429:
self.log("Backfill: rate limited, waiting 60s...", 'warning') self.log("Backfill: rate limited, waiting 60-120s...", 'warning')
_time.sleep(60) _time.sleep(_random.uniform(60, 120))
continue continue
if resp.status_code != 200: if resp.status_code != 200:
@@ -692,7 +730,7 @@ class InstagramAdapter(LoggingMixin):
if consecutive_errors >= 3: if consecutive_errors >= 3:
self.log("Backfill: too many consecutive errors, stopping.", 'warning') self.log("Backfill: too many consecutive errors, stopping.", 'warning')
break break
_time.sleep(5) _time.sleep(_random.uniform(5.0, 10.0))
continue continue
consecutive_errors = 0 consecutive_errors = 0
@@ -743,7 +781,6 @@ class InstagramAdapter(LoggingMixin):
break break
max_id = next_max_id max_id = next_max_id
_time.sleep(2)
_save_cookies() _save_cookies()
self.log(f"Backfill complete: {total_fetched} fetched, {total_new} new posts for @{username}", 'info') self.log(f"Backfill complete: {total_fetched} fetched, {total_new} new posts for @{username}", 'info')

View File

@@ -2885,6 +2885,11 @@ class PaidContentScraper(LoggingMixin, DeferredDownloadsMixin):
}) })
return SyncResult(success=False, error=str(e)) return SyncResult(success=False, error=str(e))
finally: finally:
# Inter-creator cooldown: wait 8-15s before releasing the lock so
# the next Instagram creator doesn't fire immediately. This makes
# the request pattern look like a human browsing between profiles.
import random as _random
await asyncio.sleep(_random.uniform(8.0, 15.0))
ig_rate_limiter.operation_lock.release() ig_rate_limiter.operation_lock.release()
async def _fetch_stories_via_fastdl(self, username: str) -> Tuple[List, Dict]: async def _fetch_stories_via_fastdl(self, username: str) -> Tuple[List, Dict]:
@@ -8148,17 +8153,8 @@ class PaidContentScraper(LoggingMixin, DeferredDownloadsMixin):
placeholders = ','.join(['?'] * len(filter_users)) placeholders = ','.join(['?'] * len(filter_users))
with self.db.unified_db.get_connection() as conn: with self.db.unified_db.get_connection() as conn:
cursor = conn.cursor() cursor = conn.cursor()
# Count new posts that have at least one matching tagged user # Count downloaded attachments from posts matching the tagged-user filter.
cursor.execute(f""" # Use downloaded_at as the time anchor (added_at is often NULL).
SELECT COUNT(DISTINCT p.id)
FROM paid_content_posts p
JOIN paid_content_post_tagged_users tu ON tu.post_id = p.id
WHERE p.creator_id = ?
AND tu.username IN ({placeholders})
AND p.created_at >= datetime('now', '-1 hour')
""", (creator['id'], *filter_users))
filtered_new_posts = cursor.fetchone()[0]
# Count downloaded attachments from matching posts
cursor.execute(f""" cursor.execute(f"""
SELECT COUNT(DISTINCT a.id) SELECT COUNT(DISTINCT a.id)
FROM paid_content_attachments a FROM paid_content_attachments a
@@ -8167,9 +8163,21 @@ class PaidContentScraper(LoggingMixin, DeferredDownloadsMixin):
WHERE p.creator_id = ? WHERE p.creator_id = ?
AND tu.username IN ({placeholders}) AND tu.username IN ({placeholders})
AND a.status = 'downloaded' AND a.status = 'downloaded'
AND a.updated_at >= datetime('now', '-1 hour') AND a.downloaded_at >= datetime('now', '-1 hour')
""", (creator['id'], *filter_users)) """, (creator['id'], *filter_users))
filtered_downloaded = cursor.fetchone()[0] filtered_downloaded = cursor.fetchone()[0]
# Count distinct posts that had matching downloads
cursor.execute(f"""
SELECT COUNT(DISTINCT p.id)
FROM paid_content_posts p
JOIN paid_content_post_tagged_users tu ON tu.post_id = p.id
JOIN paid_content_attachments a ON a.post_id = p.id
WHERE p.creator_id = ?
AND tu.username IN ({placeholders})
AND a.status = 'downloaded'
AND a.downloaded_at >= datetime('now', '-1 hour')
""", (creator['id'], *filter_users))
filtered_new_posts = cursor.fetchone()[0]
self.log(f"Notification filter: {new_posts} posts -> {filtered_new_posts}, " self.log(f"Notification filter: {new_posts} posts -> {filtered_new_posts}, "
f"{downloaded} downloads -> {filtered_downloaded} (filter: {filter_users})", 'debug') f"{downloaded} downloads -> {filtered_downloaded} (filter: {filter_users})", 'debug')
new_posts = filtered_new_posts new_posts = filtered_new_posts