Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,351 @@
#!/usr/bin/env python3
"""Backfill missing Instagram posts using authenticated browser cookies.
Paginates through the full timeline via /api/v1/feed/user/ and inserts
any posts missing from paid_content_posts. Uses Edge browser fingerprint
and the cookies stored in the instagram_browser scraper entry.
Usage:
cd /opt/media-downloader
./venv/bin/python3 -u scripts/backfill_ig_posts.py --creator-id 101
./venv/bin/python3 -u scripts/backfill_ig_posts.py --creator-id 110
"""
import argparse
import json
import sys
import time
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Bootstrap DB (pgadapter) — module-level import activates monkey-patching
import modules.db_bootstrap # noqa: F401
import sqlite3 # routed to PostgreSQL via pgadapter
from curl_cffi.requests import Session as CurlSession
from datetime import datetime
PER_PAGE = 33
SLEEP_BETWEEN = 2.0 # seconds between API calls
def load_cookies(conn):
    """Fetch the stored browser cookie list for the instagram_browser scraper.

    Exits the process with status 1 when no cookies are stored, or when
    the stored set lacks a non-empty sessionid (i.e. not authenticated).
    Returns the decoded list of cookie dicts ({'name', 'value', ...}).
    """
    cur = conn.cursor()
    cur.execute("SELECT cookies_json FROM scrapers WHERE id = 'instagram_browser'")
    record = cur.fetchone()
    if not record or not record[0]:
        print("ERROR: No cookies found in instagram_browser scraper")
        sys.exit(1)
    cookie_list = json.loads(record[0])
    authenticated = any(
        ck.get('name') == 'sessionid' and ck.get('value') for ck in cookie_list
    )
    if not authenticated:
        print("ERROR: No sessionid in cookies")
        sys.exit(1)
    return cookie_list
def save_cookies(conn, session):
    """Persist the session's (possibly server-refreshed) cookie jar to the DB.

    Serializes every cookie currently in the jar back into the
    instagram_browser scraper row, stamping cookies_updated_at.
    Does nothing when the jar is empty.
    """
    snapshot = [
        {
            'name': ck.name,
            'value': ck.value,
            # Cookies set without an explicit domain fall back to IG's.
            'domain': ck.domain or '.instagram.com',
        }
        for ck in session.cookies.jar
    ]
    if not snapshot:
        return
    cur = conn.cursor()
    cur.execute(
        "UPDATE scrapers SET cookies_json = ?, cookies_updated_at = ? WHERE id = 'instagram_browser'",
        (json.dumps(snapshot), datetime.now().isoformat())
    )
    conn.commit()
def load_known_post_ids(conn, creator_id):
    """Return the set of post_id shortcodes already stored for this creator."""
    cur = conn.cursor()
    cur.execute("SELECT post_id FROM paid_content_posts WHERE creator_id = ?", (creator_id,))
    return {record[0] for record in cur.fetchall()}
def lookup_ig_user_id(session, username):
    """Look up Instagram user ID from username using authenticated session.

    Calls the web_profile_info endpoint and returns the numeric user ID
    (as IG provides it — a string). Exits the process with status 1 on a
    non-200 response or when the payload is not the expected JSON shape
    (a soft-blocked session can return 200 with an HTML/err body, which
    previously crashed with a raw traceback).
    """
    resp = session.get(
        f'https://www.instagram.com/api/v1/users/web_profile_info/?username={username}',
        timeout=10
    )
    if resp.status_code != 200:
        print(f"ERROR: Failed to look up user ID for @{username}: HTTP {resp.status_code}")
        sys.exit(1)
    # Guard both the JSON decode and the nested key access so a malformed
    # body produces a clean error exit, consistent with the HTTP branch.
    try:
        user = resp.json()['data']['user']
        ig_user_id = user['id']
        ig_post_count = user['edge_owner_to_timeline_media']['count']
    except (ValueError, KeyError, TypeError):
        print(f"ERROR: Unexpected profile payload for @{username}")
        sys.exit(1)
    print(f"Instagram user ID for @{username}: {ig_user_id} ({ig_post_count} posts)")
    return ig_user_id
def best_media_url(node):
    """Return the highest-resolution media URL for a feed node, or None.

    Videos (media_type 2) are served from video_versions; anything else
    (and videos missing that field) falls back to image_versions2
    candidates. "Best" means the largest width*height product.
    """
    def _area(variant):
        return variant.get('width', 0) * variant.get('height', 0)

    if node.get('media_type', 1) == 2:
        videos = node.get('video_versions')
        if videos:
            return max(videos, key=_area).get('url', '')
    images = node.get('image_versions2', {}).get('candidates', [])
    if images:
        return max(images, key=_area).get('url', '')
    return None
def node_to_post_row(node):
    """Convert an IG API node to DB row data.

    Returns a dict with post_id/published_at/content/srcs/attachment_count/
    is_pinned/tagged_users, or None when the node has no shortcode or no
    usable media URLs.
    """
    shortcode = node.get('code', '')
    if not shortcode:
        return None

    # NOTE(review): fromtimestamp uses local time, not UTC — presumably
    # intentional for this deployment; confirm before changing.
    ts = node.get('taken_at', 0)
    published = datetime.fromtimestamp(ts).isoformat() if ts else None

    caption_obj = node.get('caption')
    caption_text = caption_obj.get('text', '') if isinstance(caption_obj, dict) else ''

    # Carousels (media_type 8) contribute one URL per child; everything
    # else contributes at most one URL.
    kind = node.get('media_type', 1)
    if kind == 8 and node.get('carousel_media'):
        candidates = (best_media_url(child) for child in node['carousel_media'])
        sources = [u for u in candidates if u]
    else:
        single = best_media_url(node)
        sources = [single] if single else []
    if not sources:
        return None

    # Collect tagged usernames from the node itself and from every
    # carousel child, deduplicated while preserving first-seen order.
    tagged = []
    for holder in [node] + list(node.get('carousel_media') or []):
        for tag in (holder.get('usertags') or {}).get('in', []):
            uname = (tag.get('user') or {}).get('username')
            if uname and uname not in tagged:
                tagged.append(uname)

    return {
        'post_id': shortcode,
        'published_at': published,
        'content': caption_text,
        'srcs': sources,
        'attachment_count': len(sources),
        'is_pinned': 1 if node.get('timeline_pinned_user_ids') else 0,
        'tagged_users': tagged,
    }
def insert_post(conn, creator_id, post_data):
    """Insert a post + attachments into the DB.

    post_data is the dict produced by node_to_post_row(). Inserts one
    paid_content_posts row, then one paid_content_attachments row per
    media URL (status 'pending' so the regular downloader picks them up),
    then any tagged usernames. Commits once at the end.
    """
    cursor = conn.cursor()
    now = datetime.now().isoformat()
    cursor.execute(
        """INSERT INTO paid_content_posts
        (creator_id, post_id, title, content, published_at, added_at,
        has_attachments, attachment_count, downloaded, is_pinned)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (creator_id, post_data['post_id'], None, post_data['content'],
         post_data['published_at'], now,
         1 if post_data['attachment_count'] > 0 else 0,
         post_data['attachment_count'], False, post_data['is_pinned'])
    )
    # Get the inserted post's ID by re-selecting rather than relying on
    # lastrowid — presumably because the pgadapter layer doesn't surface
    # it; TODO confirm.
    cursor.execute(
        "SELECT id FROM paid_content_posts WHERE creator_id = ? AND post_id = ?",
        (creator_id, post_data['post_id'])
    )
    row = cursor.fetchone()
    if not row:
        # Insert silently failed (or was deduped elsewhere); skip children.
        return
    db_post_id = row[0]
    # Insert attachments. The extension is guessed from the URL path
    # (query string stripped first); server_path holds the human-viewable
    # IG permalink while download_url holds the actual CDN media URL.
    for idx, src_url in enumerate(post_data['srcs']):
        ext = '.mp4' if '.mp4' in src_url.split('?')[0] else '.jpg'
        file_type = 'video' if ext == '.mp4' else 'image'
        name = f"{post_data['post_id']}_{idx}{ext}"
        cursor.execute(
            """INSERT INTO paid_content_attachments
            (post_id, attachment_index, name, file_type, extension, server_path, download_url, status)
            VALUES (?, ?, ?, ?, ?, ?, ?, 'pending')""",
            (db_post_id, idx, name, file_type, ext,
             f"https://www.instagram.com/p/{post_data['post_id']}/?img_index={idx + 1}",
             src_url)
        )
    # Insert tagged users; ON CONFLICT makes this idempotent per (post, user).
    for uname in post_data.get('tagged_users', []):
        cursor.execute(
            """INSERT INTO paid_content_post_tagged_users (post_id, username, created_at)
            VALUES (?, ?, ?)
            ON CONFLICT (post_id, username) DO NOTHING""",
            (db_post_id, uname, now)
        )
    conn.commit()
def create_session(cookie_list):
    """Build a curl_cffi session with an Edge fingerprint and the given cookies.

    cookie_list is the list of {'name', 'value', 'domain'} dicts loaded by
    load_cookies(); entries missing a name or value are skipped.
    """
    sess = CurlSession(impersonate='edge101')
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
        'X-IG-App-ID': '936619743392459',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.instagram.com/',
        'Origin': 'https://www.instagram.com',
        'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'Sec-CH-UA-Mobile': '?0',
        'Sec-CH-UA-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    }
    sess.headers.update(browser_headers)
    for ck in cookie_list:
        if ck.get('name', '') and ck.get('value', ''):
            sess.cookies.set(
                ck['name'], ck['value'],
                domain=ck.get('domain', '.instagram.com'),
            )
    return sess
def main():
    """Backfill missing IG posts for one creator via the authenticated feed API.

    Resolves the creator's username from the DB, pages through
    /api/v1/feed/user/<id>/ with max_id cursors, and inserts any posts not
    already present in paid_content_posts. Exits with status 1 on fatal
    setup errors (unknown creator, missing cookies, failed user lookup).
    """
    parser = argparse.ArgumentParser(description='Backfill missing Instagram posts')
    parser.add_argument('--creator-id', type=int, required=True, help='Paid content creator ID')
    args = parser.parse_args()
    # sqlite3 is routed to PostgreSQL via the pgadapter bootstrap, so the
    # "database name" here is the pg database, not a file path.
    conn = sqlite3.connect('media_downloader')
    # Look up creator
    cursor = conn.cursor()
    cursor.execute(
        "SELECT username FROM paid_content_creators WHERE id = ? AND platform = 'instagram'",
        (args.creator_id,)
    )
    row = cursor.fetchone()
    if not row:
        print(f"ERROR: Creator ID {args.creator_id} not found")
        sys.exit(1)
    username = row[0]
    print(f"Backfilling @{username} (creator_id={args.creator_id})")
    cookie_list = load_cookies(conn)
    session = create_session(cookie_list)
    # Look up Instagram user ID
    ig_user_id = lookup_ig_user_id(session, username)
    time.sleep(1)
    known = load_known_post_ids(conn, args.creator_id)
    print(f"Known posts in DB: {len(known)}")
    max_id = None           # pagination cursor from next_max_id
    total_fetched = 0
    total_new = 0
    page = 0
    consecutive_errors = 0  # transient failures abort after 3 in a row
    while True:
        page += 1
        params = {'count': PER_PAGE}
        if max_id:
            params['max_id'] = max_id
        try:
            resp = session.get(
                f'https://www.instagram.com/api/v1/feed/user/{ig_user_id}/',
                params=params,
                timeout=15
            )
        except Exception as e:
            # Network-level failure: retry the same page after a pause.
            print(f" Page {page}: request error: {e}")
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        if resp.status_code != 200:
            print(f" Page {page}: HTTP {resp.status_code}")
            if resp.status_code == 401:
                # Auth is gone; retrying won't help.
                print("Session expired! Stopping.")
                break
            if resp.status_code == 429:
                # Rate limit: wait and retry without counting as an error.
                print("Rate limited. Waiting 60s...")
                time.sleep(60)
                continue
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        consecutive_errors = 0
        data = resp.json()
        items = data.get('items', [])
        more = data.get('more_available', False)
        next_max_id = data.get('next_max_id')
        if not items:
            print(f" Page {page}: no items returned, done.")
            break
        total_fetched += len(items)
        page_new = 0
        for node in items:
            code = node.get('code', '')
            if not code:
                continue
            if code in known:
                # Already in the DB — keep paging; older gaps may remain.
                continue
            post_data = node_to_post_row(node)
            if not post_data:
                continue
            insert_post(conn, args.creator_id, post_data)
            known.add(code)
            page_new += 1
            total_new += 1
        print(f" Page {page}: {len(items)} items, {page_new} new (total: {total_fetched} fetched, {total_new} new)")
        if not more or not next_max_id:
            print("No more pages available.")
            break
        max_id = next_max_id
        time.sleep(SLEEP_BETWEEN)
    # Save updated cookies (best-effort: failure here shouldn't undo the run)
    try:
        save_cookies(conn, session)
    except Exception as e:
        print(f"Warning: failed to save cookies: {e}")
    conn.close()
    print(f"\nDone! Fetched {total_fetched} posts total, inserted {total_new} new posts for @{username}.")
# Entry point guard: run the backfill only when executed as a script.
if __name__ == '__main__':
    main()