Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,351 @@
#!/usr/bin/env python3
"""Backfill missing Instagram posts using authenticated browser cookies.
Paginates through the full timeline via /api/v1/feed/user/ and inserts
any posts missing from paid_content_posts. Uses Edge browser fingerprint
and the cookies stored in the instagram_browser scraper entry.
Usage:
cd /opt/media-downloader
./venv/bin/python3 -u scripts/backfill_ig_posts.py --creator-id 101
./venv/bin/python3 -u scripts/backfill_ig_posts.py --creator-id 110
"""
import argparse
import json
import sys
import time
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Bootstrap DB (pgadapter) — module-level import activates monkey-patching
import modules.db_bootstrap # noqa: F401
import sqlite3 # routed to PostgreSQL via pgadapter
from curl_cffi.requests import Session as CurlSession
from datetime import datetime
PER_PAGE = 33
SLEEP_BETWEEN = 2.0 # seconds between API calls
def load_cookies(conn):
    """Fetch the stored browser cookie list for the instagram_browser scraper.

    Exits the process with status 1 when no cookies are stored, or when
    the stored set lacks a non-empty sessionid (i.e. not authenticated).
    Returns the decoded list of cookie dicts ({'name', 'value', ...}).
    """
    cur = conn.cursor()
    cur.execute("SELECT cookies_json FROM scrapers WHERE id = 'instagram_browser'")
    record = cur.fetchone()
    if not record or not record[0]:
        print("ERROR: No cookies found in instagram_browser scraper")
        sys.exit(1)
    cookie_list = json.loads(record[0])
    authenticated = any(
        ck.get('name') == 'sessionid' and ck.get('value') for ck in cookie_list
    )
    if not authenticated:
        print("ERROR: No sessionid in cookies")
        sys.exit(1)
    return cookie_list
def save_cookies(conn, session):
    """Persist the session's (possibly server-refreshed) cookie jar to the DB.

    Serializes every cookie currently in the jar back into the
    instagram_browser scraper row, stamping cookies_updated_at.
    Does nothing when the jar is empty.
    """
    snapshot = [
        {
            'name': ck.name,
            'value': ck.value,
            # Cookies set without an explicit domain fall back to IG's.
            'domain': ck.domain or '.instagram.com',
        }
        for ck in session.cookies.jar
    ]
    if not snapshot:
        return
    cur = conn.cursor()
    cur.execute(
        "UPDATE scrapers SET cookies_json = ?, cookies_updated_at = ? WHERE id = 'instagram_browser'",
        (json.dumps(snapshot), datetime.now().isoformat())
    )
    conn.commit()
def load_known_post_ids(conn, creator_id):
    """Return the set of post_id shortcodes already stored for this creator."""
    cur = conn.cursor()
    cur.execute("SELECT post_id FROM paid_content_posts WHERE creator_id = ?", (creator_id,))
    return {record[0] for record in cur.fetchall()}
def lookup_ig_user_id(session, username):
    """Look up Instagram user ID from username using authenticated session.

    Calls the web_profile_info endpoint and returns the numeric user ID
    (as IG provides it — a string). Exits the process with status 1 on a
    non-200 response or when the payload is not the expected JSON shape
    (a soft-blocked session can return 200 with an HTML/err body, which
    previously crashed with a raw traceback).
    """
    resp = session.get(
        f'https://www.instagram.com/api/v1/users/web_profile_info/?username={username}',
        timeout=10
    )
    if resp.status_code != 200:
        print(f"ERROR: Failed to look up user ID for @{username}: HTTP {resp.status_code}")
        sys.exit(1)
    # Guard both the JSON decode and the nested key access so a malformed
    # body produces a clean error exit, consistent with the HTTP branch.
    try:
        user = resp.json()['data']['user']
        ig_user_id = user['id']
        ig_post_count = user['edge_owner_to_timeline_media']['count']
    except (ValueError, KeyError, TypeError):
        print(f"ERROR: Unexpected profile payload for @{username}")
        sys.exit(1)
    print(f"Instagram user ID for @{username}: {ig_user_id} ({ig_post_count} posts)")
    return ig_user_id
def best_media_url(node):
    """Return the highest-resolution media URL for a feed node, or None.

    Videos (media_type 2) are served from video_versions; anything else
    (and videos missing that field) falls back to image_versions2
    candidates. "Best" means the largest width*height product.
    """
    def _area(variant):
        return variant.get('width', 0) * variant.get('height', 0)

    if node.get('media_type', 1) == 2:
        videos = node.get('video_versions')
        if videos:
            return max(videos, key=_area).get('url', '')
    images = node.get('image_versions2', {}).get('candidates', [])
    if images:
        return max(images, key=_area).get('url', '')
    return None
def node_to_post_row(node):
    """Convert an IG API node to DB row data.

    Returns a dict with post_id/published_at/content/srcs/attachment_count/
    is_pinned/tagged_users, or None when the node has no shortcode or no
    usable media URLs.
    """
    shortcode = node.get('code', '')
    if not shortcode:
        return None

    # NOTE(review): fromtimestamp uses local time, not UTC — presumably
    # intentional for this deployment; confirm before changing.
    ts = node.get('taken_at', 0)
    published = datetime.fromtimestamp(ts).isoformat() if ts else None

    caption_obj = node.get('caption')
    caption_text = caption_obj.get('text', '') if isinstance(caption_obj, dict) else ''

    # Carousels (media_type 8) contribute one URL per child; everything
    # else contributes at most one URL.
    kind = node.get('media_type', 1)
    if kind == 8 and node.get('carousel_media'):
        candidates = (best_media_url(child) for child in node['carousel_media'])
        sources = [u for u in candidates if u]
    else:
        single = best_media_url(node)
        sources = [single] if single else []
    if not sources:
        return None

    # Collect tagged usernames from the node itself and from every
    # carousel child, deduplicated while preserving first-seen order.
    tagged = []
    for holder in [node] + list(node.get('carousel_media') or []):
        for tag in (holder.get('usertags') or {}).get('in', []):
            uname = (tag.get('user') or {}).get('username')
            if uname and uname not in tagged:
                tagged.append(uname)

    return {
        'post_id': shortcode,
        'published_at': published,
        'content': caption_text,
        'srcs': sources,
        'attachment_count': len(sources),
        'is_pinned': 1 if node.get('timeline_pinned_user_ids') else 0,
        'tagged_users': tagged,
    }
def insert_post(conn, creator_id, post_data):
    """Insert a post + attachments into the DB.

    post_data is the dict produced by node_to_post_row(). Inserts one
    paid_content_posts row, then one paid_content_attachments row per
    media URL (status 'pending' so the regular downloader picks them up),
    then any tagged usernames. Commits once at the end.
    """
    cursor = conn.cursor()
    now = datetime.now().isoformat()
    cursor.execute(
        """INSERT INTO paid_content_posts
        (creator_id, post_id, title, content, published_at, added_at,
        has_attachments, attachment_count, downloaded, is_pinned)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (creator_id, post_data['post_id'], None, post_data['content'],
         post_data['published_at'], now,
         1 if post_data['attachment_count'] > 0 else 0,
         post_data['attachment_count'], False, post_data['is_pinned'])
    )
    # Get the inserted post's ID by re-selecting rather than relying on
    # lastrowid — presumably because the pgadapter layer doesn't surface
    # it; TODO confirm.
    cursor.execute(
        "SELECT id FROM paid_content_posts WHERE creator_id = ? AND post_id = ?",
        (creator_id, post_data['post_id'])
    )
    row = cursor.fetchone()
    if not row:
        # Insert silently failed (or was deduped elsewhere); skip children.
        return
    db_post_id = row[0]
    # Insert attachments. The extension is guessed from the URL path
    # (query string stripped first); server_path holds the human-viewable
    # IG permalink while download_url holds the actual CDN media URL.
    for idx, src_url in enumerate(post_data['srcs']):
        ext = '.mp4' if '.mp4' in src_url.split('?')[0] else '.jpg'
        file_type = 'video' if ext == '.mp4' else 'image'
        name = f"{post_data['post_id']}_{idx}{ext}"
        cursor.execute(
            """INSERT INTO paid_content_attachments
            (post_id, attachment_index, name, file_type, extension, server_path, download_url, status)
            VALUES (?, ?, ?, ?, ?, ?, ?, 'pending')""",
            (db_post_id, idx, name, file_type, ext,
             f"https://www.instagram.com/p/{post_data['post_id']}/?img_index={idx + 1}",
             src_url)
        )
    # Insert tagged users; ON CONFLICT makes this idempotent per (post, user).
    for uname in post_data.get('tagged_users', []):
        cursor.execute(
            """INSERT INTO paid_content_post_tagged_users (post_id, username, created_at)
            VALUES (?, ?, ?)
            ON CONFLICT (post_id, username) DO NOTHING""",
            (db_post_id, uname, now)
        )
    conn.commit()
def create_session(cookie_list):
    """Build a curl_cffi session with an Edge fingerprint and the given cookies.

    cookie_list is the list of {'name', 'value', 'domain'} dicts loaded by
    load_cookies(); entries missing a name or value are skipped.
    """
    sess = CurlSession(impersonate='edge101')
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
        'X-IG-App-ID': '936619743392459',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.instagram.com/',
        'Origin': 'https://www.instagram.com',
        'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'Sec-CH-UA-Mobile': '?0',
        'Sec-CH-UA-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    }
    sess.headers.update(browser_headers)
    for ck in cookie_list:
        if ck.get('name', '') and ck.get('value', ''):
            sess.cookies.set(
                ck['name'], ck['value'],
                domain=ck.get('domain', '.instagram.com'),
            )
    return sess
def main():
    """Backfill missing IG posts for one creator via the authenticated feed API.

    Resolves the creator's username from the DB, pages through
    /api/v1/feed/user/<id>/ with max_id cursors, and inserts any posts not
    already present in paid_content_posts. Exits with status 1 on fatal
    setup errors (unknown creator, missing cookies, failed user lookup).
    """
    parser = argparse.ArgumentParser(description='Backfill missing Instagram posts')
    parser.add_argument('--creator-id', type=int, required=True, help='Paid content creator ID')
    args = parser.parse_args()
    # sqlite3 is routed to PostgreSQL via the pgadapter bootstrap, so the
    # "database name" here is the pg database, not a file path.
    conn = sqlite3.connect('media_downloader')
    # Look up creator
    cursor = conn.cursor()
    cursor.execute(
        "SELECT username FROM paid_content_creators WHERE id = ? AND platform = 'instagram'",
        (args.creator_id,)
    )
    row = cursor.fetchone()
    if not row:
        print(f"ERROR: Creator ID {args.creator_id} not found")
        sys.exit(1)
    username = row[0]
    print(f"Backfilling @{username} (creator_id={args.creator_id})")
    cookie_list = load_cookies(conn)
    session = create_session(cookie_list)
    # Look up Instagram user ID
    ig_user_id = lookup_ig_user_id(session, username)
    time.sleep(1)
    known = load_known_post_ids(conn, args.creator_id)
    print(f"Known posts in DB: {len(known)}")
    max_id = None           # pagination cursor from next_max_id
    total_fetched = 0
    total_new = 0
    page = 0
    consecutive_errors = 0  # transient failures abort after 3 in a row
    while True:
        page += 1
        params = {'count': PER_PAGE}
        if max_id:
            params['max_id'] = max_id
        try:
            resp = session.get(
                f'https://www.instagram.com/api/v1/feed/user/{ig_user_id}/',
                params=params,
                timeout=15
            )
        except Exception as e:
            # Network-level failure: retry the same page after a pause.
            print(f" Page {page}: request error: {e}")
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        if resp.status_code != 200:
            print(f" Page {page}: HTTP {resp.status_code}")
            if resp.status_code == 401:
                # Auth is gone; retrying won't help.
                print("Session expired! Stopping.")
                break
            if resp.status_code == 429:
                # Rate limit: wait and retry without counting as an error.
                print("Rate limited. Waiting 60s...")
                time.sleep(60)
                continue
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        consecutive_errors = 0
        data = resp.json()
        items = data.get('items', [])
        more = data.get('more_available', False)
        next_max_id = data.get('next_max_id')
        if not items:
            print(f" Page {page}: no items returned, done.")
            break
        total_fetched += len(items)
        page_new = 0
        for node in items:
            code = node.get('code', '')
            if not code:
                continue
            if code in known:
                # Already in the DB — keep paging; older gaps may remain.
                continue
            post_data = node_to_post_row(node)
            if not post_data:
                continue
            insert_post(conn, args.creator_id, post_data)
            known.add(code)
            page_new += 1
            total_new += 1
        print(f" Page {page}: {len(items)} items, {page_new} new (total: {total_fetched} fetched, {total_new} new)")
        if not more or not next_max_id:
            print("No more pages available.")
            break
        max_id = next_max_id
        time.sleep(SLEEP_BETWEEN)
    # Save updated cookies (best-effort: failure here shouldn't undo the run)
    try:
        save_cookies(conn, session)
    except Exception as e:
        print(f"Warning: failed to save cookies: {e}")
    conn.close()
    print(f"\nDone! Fetched {total_fetched} posts total, inserted {total_new} new posts for @{username}.")
# Entry point guard: run the backfill only when executed as a script.
if __name__ == '__main__':
    main()