351
scripts/backfill_ig_posts.py
Normal file
351
scripts/backfill_ig_posts.py
Normal file
@@ -0,0 +1,351 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Backfill missing Instagram posts using authenticated browser cookies.
|
||||
|
||||
Paginates through the full timeline via /api/v1/feed/user/ and inserts
|
||||
any posts missing from paid_content_posts. Uses Edge browser fingerprint
|
||||
and the cookies stored in the instagram_browser scraper entry.
|
||||
|
||||
Usage:
|
||||
cd /opt/media-downloader
|
||||
./venv/bin/python3 -u scripts/backfill_ig_posts.py --creator-id 101
|
||||
./venv/bin/python3 -u scripts/backfill_ig_posts.py --creator-id 110
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
# Bootstrap DB (pgadapter) — module-level import activates monkey-patching
|
||||
import modules.db_bootstrap # noqa: F401
|
||||
|
||||
import sqlite3 # routed to PostgreSQL via pgadapter
|
||||
from curl_cffi.requests import Session as CurlSession
|
||||
from datetime import datetime
|
||||
|
||||
PER_PAGE = 33
|
||||
SLEEP_BETWEEN = 2.0 # seconds between API calls
|
||||
|
||||
|
||||
def load_cookies(conn):
    """Read and validate the Instagram browser cookie list from the DB.

    Exits the process (status 1) when no cookies are stored or when the
    essential ``sessionid`` cookie is absent/empty, since nothing useful
    can be done without an authenticated session.
    """
    cur = conn.cursor()
    cur.execute("SELECT cookies_json FROM scrapers WHERE id = 'instagram_browser'")
    record = cur.fetchone()
    if not record or not record[0]:
        print("ERROR: No cookies found in instagram_browser scraper")
        sys.exit(1)
    cookie_list = json.loads(record[0])
    # A usable session requires a non-empty sessionid cookie.
    session_present = any(
        c.get('name') == 'sessionid' and c.get('value') for c in cookie_list
    )
    if not session_present:
        print("ERROR: No sessionid in cookies")
        sys.exit(1)
    return cookie_list
|
||||
|
||||
|
||||
def save_cookies(conn, session):
    """Persist the session's current cookie jar back to the scrapers row.

    Instagram rotates cookie values during browsing; writing them back
    keeps the stored session fresh. A missing domain falls back to
    ``.instagram.com``. Does nothing when the jar is empty.
    """
    snapshot = [
        {
            'name': c.name,
            'value': c.value,
            'domain': c.domain or '.instagram.com',
        }
        for c in session.cookies.jar
    ]
    if not snapshot:
        return
    cur = conn.cursor()
    cur.execute(
        "UPDATE scrapers SET cookies_json = ?, cookies_updated_at = ? WHERE id = 'instagram_browser'",
        (json.dumps(snapshot), datetime.now().isoformat())
    )
    conn.commit()
|
||||
|
||||
|
||||
def load_known_post_ids(conn, creator_id):
    """Return the set of post IDs already stored for this creator."""
    cur = conn.cursor()
    cur.execute("SELECT post_id FROM paid_content_posts WHERE creator_id = ?", (creator_id,))
    return {record[0] for record in cur.fetchall()}
|
||||
|
||||
|
||||
def lookup_ig_user_id(session, username):
    """Look up Instagram user ID from username using authenticated session."""
    profile_url = f'https://www.instagram.com/api/v1/users/web_profile_info/?username={username}'
    resp = session.get(profile_url, timeout=10)
    if resp.status_code != 200:
        print(f"ERROR: Failed to look up user ID for @{username}: HTTP {resp.status_code}")
        sys.exit(1)
    profile = resp.json()['data']['user']
    ig_user_id = profile['id']
    ig_post_count = profile['edge_owner_to_timeline_media']['count']
    print(f"Instagram user ID for @{username}: {ig_user_id} ({ig_post_count} posts)")
    return ig_user_id
|
||||
|
||||
|
||||
def best_media_url(node):
    """Pick the highest-resolution media URL from an IG API media node.

    Videos (media_type 2) prefer ``video_versions``; everything else
    falls back to the ``image_versions2`` candidates. Returns None when
    the node carries no media at all.
    """
    def _area(entry):
        # Rank variants by pixel area; missing dimensions count as 0.
        return entry.get('width', 0) * entry.get('height', 0)

    if node.get('media_type', 1) == 2 and node.get('video_versions'):
        return max(node['video_versions'], key=_area).get('url', '')
    images = node.get('image_versions2', {}).get('candidates', [])
    if images:
        return max(images, key=_area).get('url', '')
    return None
|
||||
|
||||
|
||||
def node_to_post_row(node):
    """Convert an IG API node to DB row data.

    Returns None for nodes with no shortcode or no resolvable media.
    """
    code = node.get('code', '')
    if not code:
        return None

    taken_at = node.get('taken_at', 0)
    published_at = datetime.fromtimestamp(taken_at).isoformat() if taken_at else None

    caption_obj = node.get('caption')
    caption = caption_obj.get('text', '') if isinstance(caption_obj, dict) else ''

    # Carousels (media_type 8) contribute one URL per child; anything
    # else contributes the single best URL for the node itself.
    if node.get('media_type', 1) == 8 and node.get('carousel_media'):
        media_items = node['carousel_media']
    else:
        media_items = [node]
    srcs = [u for u in (best_media_url(item) for item in media_items) if u]

    if not srcs:
        return None

    # Collect tagged usernames (post-level first, then carousel children),
    # preserving first-seen order without duplicates.
    tagged_users = []
    for item in [node] + list(node.get('carousel_media') or []):
        for tag in (item.get('usertags') or {}).get('in', []):
            uname = (tag.get('user') or {}).get('username')
            if uname and uname not in tagged_users:
                tagged_users.append(uname)

    return {
        'post_id': code,
        'published_at': published_at,
        'content': caption,
        'srcs': srcs,
        'attachment_count': len(srcs),
        'is_pinned': 1 if node.get('timeline_pinned_user_ids') else 0,
        'tagged_users': tagged_users,
    }
|
||||
|
||||
|
||||
def insert_post(conn, creator_id, post_data):
    """Insert a post + attachments into the DB.

    Writes the post row, re-reads its autogenerated primary key, then
    inserts one 'pending' attachment row per media URL and the tagged
    usernames (duplicates ignored via ON CONFLICT). Commits at the end.
    """
    cur = conn.cursor()
    now = datetime.now().isoformat()

    has_attachments = 1 if post_data['attachment_count'] > 0 else 0
    cur.execute(
        """INSERT INTO paid_content_posts
           (creator_id, post_id, title, content, published_at, added_at,
            has_attachments, attachment_count, downloaded, is_pinned)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (creator_id, post_data['post_id'], None, post_data['content'],
         post_data['published_at'], now, has_attachments,
         post_data['attachment_count'], False, post_data['is_pinned'])
    )

    # Re-read the row to learn the autogenerated primary key.
    cur.execute(
        "SELECT id FROM paid_content_posts WHERE creator_id = ? AND post_id = ?",
        (creator_id, post_data['post_id'])
    )
    found = cur.fetchone()
    if not found:
        return
    db_post_id = found[0]

    # One attachment row per media URL, classified by extension in the
    # URL path (query string stripped first).
    for idx, src_url in enumerate(post_data['srcs']):
        is_video = '.mp4' in src_url.split('?')[0]
        ext = '.mp4' if is_video else '.jpg'
        file_type = 'video' if is_video else 'image'
        cur.execute(
            """INSERT INTO paid_content_attachments
               (post_id, attachment_index, name, file_type, extension, server_path, download_url, status)
               VALUES (?, ?, ?, ?, ?, ?, ?, 'pending')""",
            (db_post_id, idx, f"{post_data['post_id']}_{idx}{ext}", file_type, ext,
             f"https://www.instagram.com/p/{post_data['post_id']}/?img_index={idx + 1}",
             src_url)
        )

    for uname in post_data.get('tagged_users', []):
        cur.execute(
            """INSERT INTO paid_content_post_tagged_users (post_id, username, created_at)
               VALUES (?, ?, ?)
               ON CONFLICT (post_id, username) DO NOTHING""",
            (db_post_id, uname, now)
        )

    conn.commit()
|
||||
|
||||
|
||||
def create_session(cookie_list):
    """Build a curl_cffi session with an Edge browser fingerprint.

    Applies the Edge/131 header set expected alongside the 'edge101' TLS
    impersonation profile, then loads every cookie that has both a name
    and a value. Domain defaults to ``.instagram.com`` when absent.
    """
    session = CurlSession(impersonate='edge101')
    edge_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
        'X-IG-App-ID': '936619743392459',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.instagram.com/',
        'Origin': 'https://www.instagram.com',
        'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'Sec-CH-UA-Mobile': '?0',
        'Sec-CH-UA-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    }
    session.headers.update(edge_headers)
    for cookie in cookie_list:
        cookie_name = cookie.get('name', '')
        cookie_value = cookie.get('value', '')
        if cookie_name and cookie_value:
            session.cookies.set(
                cookie_name, cookie_value,
                domain=cookie.get('domain', '.instagram.com'),
            )
    return session
|
||||
|
||||
|
||||
def main():
    """CLI entry point: backfill missing IG posts for one creator.

    Resolves the creator's username, builds an authenticated session from
    stored browser cookies, then pages through the /api/v1/feed/user/
    endpoint inserting any posts not already in paid_content_posts.
    """
    parser = argparse.ArgumentParser(description='Backfill missing Instagram posts')
    parser.add_argument('--creator-id', type=int, required=True, help='Paid content creator ID')
    args = parser.parse_args()

    # sqlite3 is routed to PostgreSQL via pgadapter (see the module-level
    # modules.db_bootstrap import), so this "database name" is the PG one.
    conn = sqlite3.connect('media_downloader')

    # Look up creator
    cursor = conn.cursor()
    cursor.execute(
        "SELECT username FROM paid_content_creators WHERE id = ? AND platform = 'instagram'",
        (args.creator_id,)
    )
    row = cursor.fetchone()
    if not row:
        print(f"ERROR: Creator ID {args.creator_id} not found")
        sys.exit(1)
    username = row[0]

    print(f"Backfilling @{username} (creator_id={args.creator_id})")

    cookie_list = load_cookies(conn)
    session = create_session(cookie_list)

    # Look up Instagram user ID
    ig_user_id = lookup_ig_user_id(session, username)
    time.sleep(1)

    known = load_known_post_ids(conn, args.creator_id)
    print(f"Known posts in DB: {len(known)}")

    max_id = None            # pagination cursor from the previous page
    total_fetched = 0
    total_new = 0
    page = 0
    consecutive_errors = 0   # reset on every successful page; 3 in a row aborts

    while True:
        page += 1
        params = {'count': PER_PAGE}
        if max_id:
            params['max_id'] = max_id

        try:
            resp = session.get(
                f'https://www.instagram.com/api/v1/feed/user/{ig_user_id}/',
                params=params,
                timeout=15
            )
        except Exception as e:
            # Network-level failure: retry up to 3 consecutive times.
            print(f" Page {page}: request error: {e}")
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue

        if resp.status_code != 200:
            print(f" Page {page}: HTTP {resp.status_code}")
            if resp.status_code == 401:
                # Auth is gone; retrying cannot help.
                print("Session expired! Stopping.")
                break
            if resp.status_code == 429:
                # Rate limited: back off without counting it as an error.
                print("Rate limited. Waiting 60s...")
                time.sleep(60)
                continue
            consecutive_errors += 1
            if consecutive_errors >= 3:
                print("Too many consecutive errors, stopping.")
                break
            time.sleep(5)
            continue

        consecutive_errors = 0
        data = resp.json()
        items = data.get('items', [])
        more = data.get('more_available', False)
        next_max_id = data.get('next_max_id')

        if not items:
            print(f" Page {page}: no items returned, done.")
            break

        total_fetched += len(items)
        page_new = 0

        for node in items:
            code = node.get('code', '')
            if not code:
                continue
            if code in known:
                continue

            post_data = node_to_post_row(node)
            if not post_data:
                # Shortcode present but no resolvable media.
                continue

            insert_post(conn, args.creator_id, post_data)
            known.add(code)
            page_new += 1
            total_new += 1

        print(f" Page {page}: {len(items)} items, {page_new} new (total: {total_fetched} fetched, {total_new} new)")

        if not more or not next_max_id:
            print("No more pages available.")
            break

        max_id = next_max_id
        time.sleep(SLEEP_BETWEEN)

    # Save updated cookies (best-effort: a failure here must not lose the
    # backfill work already committed).
    try:
        save_cookies(conn, session)
    except Exception as e:
        print(f"Warning: failed to save cookies: {e}")

    conn.close()
    print(f"\nDone! Fetched {total_fetched} posts total, inserted {total_new} new posts for @{username}.")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user