Files
media-downloader/scripts/backfill_kylie_posts.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

297 lines
9.3 KiB
Python

#!/usr/bin/env python3
"""Backfill missing kyliejenner posts using authenticated browser cookies.

Paginates through the full timeline via /api/v1/feed/user/ and inserts
any posts missing from paid_content_posts. Uses Edge browser fingerprint
and the cookies stored in the instagram_browser scraper entry.

Usage:
    cd /opt/media-downloader
    ./venv/bin/python3 -u scripts/backfill_kylie_posts.py
"""
import json
import sys
import time
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Bootstrap DB (pgadapter) — module-level import activates monkey-patching
import modules.db_bootstrap # noqa: F401
import sqlite3 # routed to PostgreSQL via pgadapter
from curl_cffi.requests import Session as CurlSession
from datetime import datetime
CREATOR_ID = 101
USERNAME = 'kyliejenner'
IG_USER_ID = '12281817'
PER_PAGE = 33
SLEEP_BETWEEN = 2.0 # seconds between API calls
def load_cookies(conn):
    """Load the stored browser cookie list for the instagram_browser scraper.

    Exits the process (status 1) when no cookies are stored or when no
    non-empty ``sessionid`` cookie is present, since the feed API requires
    an authenticated session.
    """
    cur = conn.cursor()
    cur.execute("SELECT cookies_json FROM scrapers WHERE id = 'instagram_browser'")
    row = cur.fetchone()
    if row is None or not row[0]:
        print("ERROR: No cookies found in instagram_browser scraper")
        sys.exit(1)
    cookie_list = json.loads(row[0])
    session_ok = False
    for cookie in cookie_list:
        if cookie.get('name') == 'sessionid' and cookie.get('value'):
            session_ok = True
            break
    if not session_ok:
        print("ERROR: No sessionid in cookies")
        sys.exit(1)
    return cookie_list
def save_cookies(conn, session):
    """Persist the session's current cookie jar back to the scrapers table.

    Each cookie is stored as {name, value, domain}; a jar entry with no
    domain defaults to ``.instagram.com``. Does nothing when the jar is
    empty, so a stored cookie set is never clobbered with nothing.
    """
    snapshot = [
        {'name': c.name, 'value': c.value, 'domain': c.domain or '.instagram.com'}
        for c in session.cookies
    ]
    if not snapshot:
        return
    cur = conn.cursor()
    cur.execute(
        "UPDATE scrapers SET cookies_json = ?, cookies_updated_at = ? WHERE id = 'instagram_browser'",
        (json.dumps(snapshot), datetime.now().isoformat())
    )
    conn.commit()
def load_known_post_ids(conn, creator_id=None):
    """Return the set of post_id shortcodes already stored for a creator.

    Args:
        conn: DB connection.
        creator_id: creator row id to query; defaults to the module-level
            CREATOR_ID for backward compatibility with existing callers.

    Returns:
        set of post_id strings present in paid_content_posts.
    """
    if creator_id is None:
        creator_id = CREATOR_ID
    cursor = conn.cursor()
    cursor.execute("SELECT post_id FROM paid_content_posts WHERE creator_id = ?", (creator_id,))
    # Set comprehension instead of set(generator) — same result, one less call.
    return {row[0] for row in cursor.fetchall()}
def best_media_url(node):
    """Pick the highest-resolution media URL from an IG API media node.

    Videos (media_type 2) are served from video_versions; anything else —
    including a video node with no video_versions — falls back to the image
    candidates. "Best" is the entry with the largest width*height area.
    Returns None when the node carries no usable media at all.
    """
    def _area(entry):
        return entry.get('width', 0) * entry.get('height', 0)

    if node.get('media_type', 1) == 2:
        versions = node.get('video_versions')
        if versions:
            return max(versions, key=_area).get('url', '')
    images = node.get('image_versions2', {}).get('candidates', [])
    if images:
        return max(images, key=_area).get('url', '')
    return None
def node_to_post_row(node):
    """Convert an IG API node to DB row data.

    Returns a dict with post_id / published_at / content / srcs /
    attachment_count / is_pinned / tagged_users, or None when the node has
    no shortcode or no downloadable media.
    """
    code = node.get('code', '')
    if not code:
        return None

    taken_at = node.get('taken_at', 0)
    # NOTE(review): naive local-time conversion — assumes the host timezone
    # matches what downstream consumers expect; confirm before comparing
    # against timestamps written elsewhere.
    published_at = datetime.fromtimestamp(taken_at).isoformat() if taken_at else None

    caption_obj = node.get('caption')
    caption = caption_obj.get('text', '') if isinstance(caption_obj, dict) else ''

    # Media URLs: a carousel (media_type 8) expands to one URL per child;
    # anything else yields at most one URL from the node itself.
    if node.get('media_type', 1) == 8 and node.get('carousel_media'):
        media_nodes = node['carousel_media']
    else:
        media_nodes = [node]
    srcs = [url for url in (best_media_url(m) for m in media_nodes) if url]
    if not srcs:
        return None

    # Tagged users, de-duplicated in first-seen order: post-level tags first,
    # then tags on each carousel item.
    tagged_users = []
    for item in [node] + list(node.get('carousel_media') or []):
        for tag in (item.get('usertags') or {}).get('in', []):
            uname = (tag.get('user') or {}).get('username')
            if uname and uname not in tagged_users:
                tagged_users.append(uname)

    return {
        'post_id': code,
        'published_at': published_at,
        'content': caption,
        'srcs': srcs,
        'attachment_count': len(srcs),
        'is_pinned': 1 if node.get('timeline_pinned_user_ids') else 0,
        'tagged_users': tagged_users,
    }
def insert_post(conn, post_data, creator_id=None):
    """Insert a post + attachments into the DB.

    Args:
        conn: DB connection.
        post_data: dict produced by node_to_post_row() (post_id, content,
            published_at, srcs, attachment_count, is_pinned).
        creator_id: owner creator row id; defaults to the module-level
            CREATOR_ID for backward compatibility with existing callers.
    """
    if creator_id is None:
        creator_id = CREATOR_ID
    cursor = conn.cursor()
    now = datetime.now().isoformat()
    cursor.execute(
        """INSERT INTO paid_content_posts
        (creator_id, post_id, title, content, published_at, added_at,
        has_attachments, attachment_count, downloaded, is_pinned)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (creator_id, post_data['post_id'], None, post_data['content'],
         post_data['published_at'], now,
         1 if post_data['attachment_count'] > 0 else 0,
         post_data['attachment_count'], False, post_data['is_pinned'])
    )
    # Re-select the inserted row's id rather than relying on lastrowid — this
    # connection is routed through pgadapter to PostgreSQL (see module header).
    cursor.execute(
        "SELECT id FROM paid_content_posts WHERE creator_id = ? AND post_id = ?",
        (creator_id, post_data['post_id'])
    )
    row = cursor.fetchone()
    if not row:
        return
    db_post_id = row[0]
    # One attachment row per media URL; classify video/image by the extension
    # in the URL path (query string stripped first).
    for idx, src_url in enumerate(post_data['srcs']):
        ext = '.mp4' if '.mp4' in src_url.split('?')[0] else '.jpg'
        file_type = 'video' if ext == '.mp4' else 'image'
        name = f"{post_data['post_id']}_{idx}{ext}"
        cursor.execute(
            """INSERT INTO paid_content_attachments
            (post_id, attachment_index, name, file_type, extension, server_path, download_url, status)
            VALUES (?, ?, ?, ?, ?, ?, ?, 'pending')""",
            (db_post_id, idx, name, file_type, ext,
             f"https://www.instagram.com/p/{post_data['post_id']}/?img_index={idx + 1}",
             src_url)
        )
    conn.commit()
def main():
    # 'media_downloader' is a PostgreSQL database; sqlite3 here is
    # monkey-patched by modules.db_bootstrap to route through pgadapter.
    conn = sqlite3.connect('media_downloader')
    cookie_list = load_cookies(conn)
    known = load_known_post_ids(conn)
    print(f"Known posts in DB: {len(known)}")
    # Impersonate an Edge browser at the TLS level, with matching headers.
    session = CurlSession(impersonate='edge101')
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
        'X-IG-App-ID': '936619743392459',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.instagram.com/',
        'Origin': 'https://www.instagram.com',
        'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'Sec-CH-UA-Mobile': '?0',
        'Sec-CH-UA-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    })
    # Seed the session's cookie jar from the stored browser cookies.
    for c in cookie_list:
        name = c.get('name', '')
        value = c.get('value', '')
        domain = c.get('domain', '.instagram.com')
        if name and value:
            session.cookies.set(name, value, domain=domain)
    max_id = None  # pagination cursor (next_max_id from the previous page)
    total_fetched = 0
    total_new = 0
    page = 0
    consecutive_errors = 0  # transient failures in a row; abort at 3
    while True:
        page += 1
        params = {'count': PER_PAGE}
        if max_id:
            params['max_id'] = max_id
        try:
            resp = session.get(
                f'https://www.instagram.com/api/v1/feed/user/{IG_USER_ID}/',
                params=params,
                timeout=15
            )
        except Exception as e:
            # Network-level failure: pause and retry the same cursor.
            print(f" Page {page}: request error: {e}")
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        if resp.status_code != 200:
            print(f" Page {page}: HTTP {resp.status_code}")
            if resp.status_code == 401:
                # Authentication is gone; retrying cannot help.
                print("Session expired! Stopping.")
                break
            if resp.status_code == 429:
                # Rate limited: back off, then retry the same cursor.
                print("Rate limited. Waiting 60s...")
                time.sleep(60)
                continue
            # Any other non-200: treat as transient, up to 3 in a row.
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        consecutive_errors = 0
        data = resp.json()
        items = data.get('items', [])
        more = data.get('more_available', False)
        next_max_id = data.get('next_max_id')
        if not items:
            print(f" Page {page}: no items returned, done.")
            break
        total_fetched += len(items)
        page_new = 0
        for node in items:
            code = node.get('code', '')
            if not code:
                continue
            if code in known:
                # Already in the DB; nothing to insert.
                continue
            post_data = node_to_post_row(node)
            if not post_data:
                continue
            insert_post(conn, post_data)
            known.add(code)
            page_new += 1
            total_new += 1
        print(f" Page {page}: {len(items)} items, {page_new} new (total: {total_fetched} fetched, {total_new} new)")
        if not more or not next_max_id:
            print("No more pages available.")
            break
        max_id = next_max_id
        time.sleep(SLEEP_BETWEEN)
    # Save updated cookies
    save_cookies(conn, session)
    conn.close()
    print(f"\nDone! Fetched {total_fetched} posts total, inserted {total_new} new posts.")
if __name__ == '__main__':
    main()