#!/usr/bin/env python3
"""Backfill missing kyliejenner posts using authenticated browser cookies.

Paginates through the full timeline via /api/v1/feed/user/ and inserts any
posts missing from paid_content_posts. Uses Edge browser fingerprint and the
cookies stored in the instagram_browser scraper entry.

Usage:
    cd /opt/media-downloader
    ./venv/bin/python3 -u scripts/backfill_kylie_posts.py
"""
import json
import sys
import time
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Bootstrap DB (pgadapter) — module-level import activates monkey-patching,
# so it must be imported before sqlite3 is used below.
import modules.db_bootstrap  # noqa: F401
import sqlite3  # routed to PostgreSQL via pgadapter

from curl_cffi.requests import Session as CurlSession
from datetime import datetime

CREATOR_ID = 101
USERNAME = 'kyliejenner'
IG_USER_ID = '12281817'
PER_PAGE = 33
SLEEP_BETWEEN = 2.0  # seconds between API calls


def load_cookies(conn):
    """Return the cookie list stored for the instagram_browser scraper.

    Exits the process when no cookies are stored or no non-empty
    ``sessionid`` cookie is present, since the feed endpoint requires an
    authenticated session.
    """
    cursor = conn.cursor()
    cursor.execute("SELECT cookies_json FROM scrapers WHERE id = 'instagram_browser'")
    row = cursor.fetchone()
    if not row or not row[0]:
        print("ERROR: No cookies found in instagram_browser scraper")
        sys.exit(1)
    cookies = json.loads(row[0])
    has_session = any(c.get('name') == 'sessionid' and c.get('value') for c in cookies)
    if not has_session:
        print("ERROR: No sessionid in cookies")
        sys.exit(1)
    return cookies


def save_cookies(conn, session):
    """Persist the session's (possibly rotated) cookies back to the DB.

    No-op when the session holds no cookies, so a failed run cannot wipe
    the stored cookie jar.
    """
    updated = [
        {
            'name': c.name,
            'value': c.value,
            'domain': c.domain or '.instagram.com',
        }
        for c in session.cookies
    ]
    if updated:
        cursor = conn.cursor()
        cursor.execute(
            "UPDATE scrapers SET cookies_json = ?, cookies_updated_at = ? "
            "WHERE id = 'instagram_browser'",
            (json.dumps(updated), datetime.now().isoformat())
        )
        conn.commit()


def load_known_post_ids(conn):
    """Return the set of post shortcodes already stored for this creator."""
    cursor = conn.cursor()
    cursor.execute("SELECT post_id FROM paid_content_posts WHERE creator_id = ?", (CREATOR_ID,))
    return {row[0] for row in cursor.fetchall()}


def best_media_url(node):
    """Return the highest-resolution media URL for a single IG media node.

    Videos (media_type == 2) prefer the largest video version by pixel
    area; everything else falls back to the largest image candidate.
    Returns None when the node carries no usable media.
    """
    media_type = node.get('media_type', 1)
    if media_type == 2 and node.get('video_versions'):
        best = max(node['video_versions'],
                   key=lambda v: v.get('width', 0) * v.get('height', 0))
        return best.get('url', '')
    candidates = node.get('image_versions2', {}).get('candidates', [])
    if candidates:
        best = max(candidates,
                   key=lambda c: c.get('width', 0) * c.get('height', 0))
        return best.get('url', '')
    return None


def node_to_post_row(node):
    """Convert an IG API node to DB row data.

    Returns None when the node has no shortcode or no downloadable media.
    """
    code = node.get('code', '')
    if not code:
        return None

    taken_at = node.get('taken_at', 0)
    # fromtimestamp() yields local time; kept to match existing rows.
    published_at = datetime.fromtimestamp(taken_at).isoformat() if taken_at else None

    caption_obj = node.get('caption')
    caption = caption_obj.get('text', '') if isinstance(caption_obj, dict) else ''

    srcs = []
    media_type = node.get('media_type', 1)
    if media_type == 8 and node.get('carousel_media'):
        # media_type 8 = carousel/album: collect every child's best URL
        for child in node['carousel_media']:
            url = best_media_url(child)
            if url:
                srcs.append(url)
    else:
        url = best_media_url(node)
        if url:
            srcs.append(url)
    if not srcs:
        return None

    # Tagged users: post-level tags first, then per-carousel-item tags,
    # de-duplicated while preserving first-seen order.
    tagged_users = []
    for tag in (node.get('usertags') or {}).get('in', []):
        uname = (tag.get('user') or {}).get('username')
        if uname and uname not in tagged_users:
            tagged_users.append(uname)
    for cm in node.get('carousel_media') or []:
        for tag in (cm.get('usertags') or {}).get('in', []):
            uname = (tag.get('user') or {}).get('username')
            if uname and uname not in tagged_users:
                tagged_users.append(uname)

    is_pinned = 1 if node.get('timeline_pinned_user_ids') else 0

    return {
        'post_id': code,
        'published_at': published_at,
        'content': caption,
        'srcs': srcs,
        'attachment_count': len(srcs),
        'is_pinned': is_pinned,
        'tagged_users': tagged_users,
    }


def insert_post(conn, post_data):
    """Insert a post + attachments into the DB."""
    cursor = conn.cursor()
    now = datetime.now().isoformat()
    cursor.execute(
        """INSERT INTO paid_content_posts
        (creator_id, post_id, title, content, published_at, added_at,
        has_attachments, attachment_count, downloaded, is_pinned)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (CREATOR_ID, post_data['post_id'], None, post_data['content'],
         post_data['published_at'], now,
         1 if post_data['attachment_count'] > 0 else 0,
         post_data['attachment_count'], False, post_data['is_pinned'])
    )
    # Get the inserted post's ID
    cursor.execute(
        "SELECT id FROM paid_content_posts WHERE creator_id = ? AND post_id = ?",
        (CREATOR_ID, post_data['post_id'])
    )
    row = cursor.fetchone()
    if not row:
        return
    db_post_id = row[0]

    # Insert attachments: one row per media URL; extension inferred from
    # the URL path (query string stripped first).
    for idx, src_url in enumerate(post_data['srcs']):
        ext = '.mp4' if '.mp4' in src_url.split('?')[0] else '.jpg'
        file_type = 'video' if ext == '.mp4' else 'image'
        name = f"{post_data['post_id']}_{idx}{ext}"
        cursor.execute(
            """INSERT INTO paid_content_attachments
            (post_id, attachment_index, name, file_type, extension,
            server_path, download_url, status)
            VALUES (?, ?, ?, ?, ?, ?, ?, 'pending')""",
            (db_post_id, idx, name, file_type, ext,
             f"https://www.instagram.com/p/{post_data['post_id']}/?img_index={idx + 1}",
             src_url)
        )
    conn.commit()


def main():
    """Paginate the user feed and insert any posts missing from the DB."""
    conn = sqlite3.connect('media_downloader')
    cookie_list = load_cookies(conn)
    known = load_known_post_ids(conn)
    print(f"Known posts in DB: {len(known)}")

    session = CurlSession(impersonate='edge101')
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
        'X-IG-App-ID': '936619743392459',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.instagram.com/',
        'Origin': 'https://www.instagram.com',
        'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'Sec-CH-UA-Mobile': '?0',
        'Sec-CH-UA-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    })
    for c in cookie_list:
        name = c.get('name', '')
        value = c.get('value', '')
        domain = c.get('domain', '.instagram.com')
        if name and value:
            session.cookies.set(name, value, domain=domain)

    max_id = None
    total_fetched = 0
    total_new = 0
    page = 0
    consecutive_errors = 0

    while True:
        page += 1
        params = {'count': PER_PAGE}
        if max_id:
            params['max_id'] = max_id

        try:
            resp = session.get(
                f'https://www.instagram.com/api/v1/feed/user/{IG_USER_ID}/',
                params=params, timeout=15
            )
        except Exception as e:
            print(f" Page {page}: request error: {e}")
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue

        if resp.status_code != 200:
            print(f" Page {page}: HTTP {resp.status_code}")
            if resp.status_code == 401:
                print("Session expired! Stopping.")
                break
            if resp.status_code == 429:
                print("Rate limited. Waiting 60s...")
                time.sleep(60)
                continue
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue

        # Fix: a 200 response with a non-JSON body (e.g. a login-wall HTML
        # page) previously crashed the whole backfill at resp.json(); treat
        # it as a transient error under the same 3-strike rule. The error
        # counter is reset only AFTER a successful parse — resetting before
        # the parse would let a persistently bad endpoint loop forever.
        try:
            data = resp.json()
        except Exception as e:
            print(f" Page {page}: invalid JSON response: {e}")
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue

        consecutive_errors = 0
        items = data.get('items', [])
        more = data.get('more_available', False)
        next_max_id = data.get('next_max_id')

        if not items:
            print(f" Page {page}: no items returned, done.")
            break

        total_fetched += len(items)
        page_new = 0
        for node in items:
            code = node.get('code', '')
            if not code:
                continue
            if code in known:
                continue
            post_data = node_to_post_row(node)
            if not post_data:
                continue
            insert_post(conn, post_data)
            known.add(code)
            page_new += 1
            total_new += 1

        print(f" Page {page}: {len(items)} items, {page_new} new (total: {total_fetched} fetched, {total_new} new)")

        if not more or not next_max_id:
            print("No more pages available.")
            break
        max_id = next_max_id
        time.sleep(SLEEP_BETWEEN)

    # Save updated cookies
    save_cookies(conn, session)
    conn.close()
    print(f"\nDone! Fetched {total_fetched} posts total, inserted {total_new} new posts.")


if __name__ == '__main__':
    main()