Files
media-downloader/scripts/backfill_kylie_posts.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

297 lines
9.3 KiB
Python

#!/usr/bin/env python3
"""Backfill missing kyliejenner posts using authenticated browser cookies.

Paginates through the full timeline via /api/v1/feed/user/ and inserts
any posts missing from paid_content_posts. Uses Edge browser fingerprint
and the cookies stored in the instagram_browser scraper entry.

Usage:
    cd /opt/media-downloader
    ./venv/bin/python3 -u scripts/backfill_kylie_posts.py
"""
import json
import sys
import time
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Bootstrap DB (pgadapter) — module-level import activates monkey-patching
import modules.db_bootstrap # noqa: F401
import sqlite3 # routed to PostgreSQL via pgadapter
from curl_cffi.requests import Session as CurlSession
from datetime import datetime
CREATOR_ID = 101
USERNAME = 'kyliejenner'
IG_USER_ID = '12281817'
PER_PAGE = 33
SLEEP_BETWEEN = 2.0 # seconds between API calls
def load_cookies(conn):
    """Load the stored browser cookie list for the instagram_browser scraper.

    Exits the process (status 1) when no cookies are stored or when no
    non-empty ``sessionid`` cookie is present, since the feed API requires
    an authenticated session.
    """
    cur = conn.cursor()
    cur.execute("SELECT cookies_json FROM scrapers WHERE id = 'instagram_browser'")
    row = cur.fetchone()
    if row is None or not row[0]:
        print("ERROR: No cookies found in instagram_browser scraper")
        sys.exit(1)
    cookie_list = json.loads(row[0])
    session_ok = False
    for cookie in cookie_list:
        if cookie.get('name') == 'sessionid' and cookie.get('value'):
            session_ok = True
            break
    if not session_ok:
        print("ERROR: No sessionid in cookies")
        sys.exit(1)
    return cookie_list
def save_cookies(conn, session):
    """Persist the session's current cookie jar back to the scrapers table.

    Each cookie is stored as {name, value, domain}; a jar entry with no
    domain defaults to ``.instagram.com``. Does nothing when the jar is
    empty, so a stored cookie set is never clobbered with nothing.
    """
    snapshot = [
        {'name': c.name, 'value': c.value, 'domain': c.domain or '.instagram.com'}
        for c in session.cookies
    ]
    if not snapshot:
        return
    cur = conn.cursor()
    cur.execute(
        "UPDATE scrapers SET cookies_json = ?, cookies_updated_at = ? WHERE id = 'instagram_browser'",
        (json.dumps(snapshot), datetime.now().isoformat())
    )
    conn.commit()
def load_known_post_ids(conn, creator_id=None):
    """Return the set of post_id shortcodes already stored for a creator.

    Args:
        conn: DB connection.
        creator_id: creator row id to query; defaults to the module-level
            CREATOR_ID for backward compatibility with existing callers.

    Returns:
        set of post_id strings present in paid_content_posts.
    """
    if creator_id is None:
        creator_id = CREATOR_ID
    cursor = conn.cursor()
    cursor.execute("SELECT post_id FROM paid_content_posts WHERE creator_id = ?", (creator_id,))
    # Set comprehension instead of set(generator) — same result, one less call.
    return {row[0] for row in cursor.fetchall()}
def best_media_url(node):
    """Pick the highest-resolution media URL from an IG API media node.

    Videos (media_type 2) are served from video_versions; anything else —
    including a video node with no video_versions — falls back to the image
    candidates. "Best" is the entry with the largest width*height area.
    Returns None when the node carries no usable media at all.
    """
    def _area(entry):
        return entry.get('width', 0) * entry.get('height', 0)

    if node.get('media_type', 1) == 2:
        versions = node.get('video_versions')
        if versions:
            return max(versions, key=_area).get('url', '')
    images = node.get('image_versions2', {}).get('candidates', [])
    if images:
        return max(images, key=_area).get('url', '')
    return None
def node_to_post_row(node):
    """Convert an IG API node to DB row data.

    Returns a dict with post_id / published_at / content / srcs /
    attachment_count / is_pinned / tagged_users, or None when the node has
    no shortcode or no downloadable media.
    """
    code = node.get('code', '')
    if not code:
        return None

    taken_at = node.get('taken_at', 0)
    # NOTE(review): naive local-time conversion — assumes the host timezone
    # matches what downstream consumers expect; confirm before comparing
    # against timestamps written elsewhere.
    published_at = datetime.fromtimestamp(taken_at).isoformat() if taken_at else None

    caption_obj = node.get('caption')
    caption = caption_obj.get('text', '') if isinstance(caption_obj, dict) else ''

    # Media URLs: a carousel (media_type 8) expands to one URL per child;
    # anything else yields at most one URL from the node itself.
    if node.get('media_type', 1) == 8 and node.get('carousel_media'):
        media_nodes = node['carousel_media']
    else:
        media_nodes = [node]
    srcs = [url for url in (best_media_url(m) for m in media_nodes) if url]
    if not srcs:
        return None

    # Tagged users, de-duplicated in first-seen order: post-level tags first,
    # then tags on each carousel item.
    tagged_users = []
    for item in [node] + list(node.get('carousel_media') or []):
        for tag in (item.get('usertags') or {}).get('in', []):
            uname = (tag.get('user') or {}).get('username')
            if uname and uname not in tagged_users:
                tagged_users.append(uname)

    return {
        'post_id': code,
        'published_at': published_at,
        'content': caption,
        'srcs': srcs,
        'attachment_count': len(srcs),
        'is_pinned': 1 if node.get('timeline_pinned_user_ids') else 0,
        'tagged_users': tagged_users,
    }
def insert_post(conn, post_data, creator_id=None):
    """Insert a post + attachments into the DB.

    Args:
        conn: DB connection.
        post_data: dict produced by node_to_post_row() (post_id, content,
            published_at, srcs, attachment_count, is_pinned).
        creator_id: owner creator row id; defaults to the module-level
            CREATOR_ID for backward compatibility with existing callers.
    """
    if creator_id is None:
        creator_id = CREATOR_ID
    cursor = conn.cursor()
    now = datetime.now().isoformat()
    cursor.execute(
        """INSERT INTO paid_content_posts
        (creator_id, post_id, title, content, published_at, added_at,
        has_attachments, attachment_count, downloaded, is_pinned)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (creator_id, post_data['post_id'], None, post_data['content'],
         post_data['published_at'], now,
         1 if post_data['attachment_count'] > 0 else 0,
         post_data['attachment_count'], False, post_data['is_pinned'])
    )
    # Re-select the inserted row's id rather than relying on lastrowid — this
    # connection is routed through pgadapter to PostgreSQL (see module header).
    cursor.execute(
        "SELECT id FROM paid_content_posts WHERE creator_id = ? AND post_id = ?",
        (creator_id, post_data['post_id'])
    )
    row = cursor.fetchone()
    if not row:
        return
    db_post_id = row[0]
    # One attachment row per media URL; classify video/image by the extension
    # in the URL path (query string stripped first).
    for idx, src_url in enumerate(post_data['srcs']):
        ext = '.mp4' if '.mp4' in src_url.split('?')[0] else '.jpg'
        file_type = 'video' if ext == '.mp4' else 'image'
        name = f"{post_data['post_id']}_{idx}{ext}"
        cursor.execute(
            """INSERT INTO paid_content_attachments
            (post_id, attachment_index, name, file_type, extension, server_path, download_url, status)
            VALUES (?, ?, ?, ?, ?, ?, ?, 'pending')""",
            (db_post_id, idx, name, file_type, ext,
             f"https://www.instagram.com/p/{post_data['post_id']}/?img_index={idx + 1}",
             src_url)
        )
    conn.commit()
def main():
    # 'media_downloader' is a PostgreSQL database; sqlite3 here is
    # monkey-patched by modules.db_bootstrap to route through pgadapter.
    conn = sqlite3.connect('media_downloader')
    cookie_list = load_cookies(conn)
    known = load_known_post_ids(conn)
    print(f"Known posts in DB: {len(known)}")
    # Impersonate an Edge browser at the TLS level, with matching headers.
    session = CurlSession(impersonate='edge101')
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
        'X-IG-App-ID': '936619743392459',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.instagram.com/',
        'Origin': 'https://www.instagram.com',
        'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'Sec-CH-UA-Mobile': '?0',
        'Sec-CH-UA-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    })
    # Seed the session's cookie jar from the stored browser cookies.
    for c in cookie_list:
        name = c.get('name', '')
        value = c.get('value', '')
        domain = c.get('domain', '.instagram.com')
        if name and value:
            session.cookies.set(name, value, domain=domain)
    max_id = None  # pagination cursor (next_max_id from the previous page)
    total_fetched = 0
    total_new = 0
    page = 0
    consecutive_errors = 0  # transient failures in a row; abort at 3
    while True:
        page += 1
        params = {'count': PER_PAGE}
        if max_id:
            params['max_id'] = max_id
        try:
            resp = session.get(
                f'https://www.instagram.com/api/v1/feed/user/{IG_USER_ID}/',
                params=params,
                timeout=15
            )
        except Exception as e:
            # Network-level failure: pause and retry the same cursor.
            print(f" Page {page}: request error: {e}")
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        if resp.status_code != 200:
            print(f" Page {page}: HTTP {resp.status_code}")
            if resp.status_code == 401:
                # Authentication is gone; retrying cannot help.
                print("Session expired! Stopping.")
                break
            if resp.status_code == 429:
                # Rate limited: back off, then retry the same cursor.
                print("Rate limited. Waiting 60s...")
                time.sleep(60)
                continue
            # Any other non-200: treat as transient, up to 3 in a row.
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue
        consecutive_errors = 0
        data = resp.json()
        items = data.get('items', [])
        more = data.get('more_available', False)
        next_max_id = data.get('next_max_id')
        if not items:
            print(f" Page {page}: no items returned, done.")
            break
        total_fetched += len(items)
        page_new = 0
        for node in items:
            code = node.get('code', '')
            if not code:
                continue
            if code in known:
                # Already in the DB; nothing to insert.
                continue
            post_data = node_to_post_row(node)
            if not post_data:
                continue
            insert_post(conn, post_data)
            known.add(code)
            page_new += 1
            total_new += 1
        print(f" Page {page}: {len(items)} items, {page_new} new (total: {total_fetched} fetched, {total_new} new)")
        if not more or not next_max_id:
            print("No more pages available.")
            break
        max_id = next_max_id
        time.sleep(SLEEP_BETWEEN)
    # Save updated cookies
    save_cookies(conn, session)
    conn.close()
    print(f"\nDone! Fetched {total_fetched} posts total, inserted {total_new} new posts.")
if __name__ == '__main__':
    main()