Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,871 @@
#!/usr/bin/env python3
"""
Snapchat Client Module - Direct HTTP-based Snapchat downloader using curl_cffi.
Replaces Playwright-based scraping with direct HTTP requests. Snapchat embeds
all page data in <script id="__NEXT_DATA__"> JSON tags, so no JavaScript
execution is needed. Uses story.snapchat.com which may not require Cloudflare.
Follows the same pattern as instagram_client_module.py.
"""
import os
import json
import re
import subprocess
import time
import random
import platform
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, List, Set
from modules.base_module import LoggingMixin
from modules.snapchat_scraper import SnapMedia, SnapCollection
class SnapchatClientDownloader(LoggingMixin):
"""Snapchat downloader using direct HTTP via curl_cffi (no Playwright)"""
def __init__(self,
             show_progress: bool = True,
             use_database: bool = True,
             log_callback=None,
             unified_db=None):
    """Set up the logger, counters, optional database adapter and HTTP defaults.

    Args:
        show_progress: Stored on the instance as ``show_progress``.
        use_database: Request database-backed dedup; only honoured when
            ``unified_db`` is supplied as well.
        log_callback: Optional logging callback, forwarded to ``_init_logger``.
        unified_db: UnifiedDatabase instance, or None to run without one.
    """
    self._init_logger('SnapchatClient', log_callback, default_module='Download')

    # Plain state
    self.scraper_id = 'snapchat_client'
    self.show_progress = show_progress
    self.download_count = 0
    self.downloaded_files: Set[str] = set()
    self.pending_downloads = []

    # The HTTP session is built on first use by _get_session()
    self._session = None

    # Database access needs both the flag and an actual database handle
    if not (unified_db and use_database):
        self.use_database = False
        self.db = None
        self.unified_db = None
    else:
        from modules.unified_database import SnapchatDatabaseAdapter
        self.use_database = use_database
        self.db = SnapchatDatabaseAdapter(unified_db)
        self.unified_db = unified_db

    # Activity reporting is optional: a missing module simply disables it
    manager = None
    try:
        from modules.activity_status import get_activity_manager
        manager = get_activity_manager(unified_db)
    except ImportError:
        manager = None
    self.activity_manager = manager

    # Cookies are filled in later by _load_cookies(); the user agent is the
    # default used until a stored one is found
    self.cookies = []
    self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
def _get_session(self):
    """Return the shared curl_cffi session, building it on the first call.

    On first use a session is created with the first browser impersonation
    profile that curl_cffi accepts (falling back to a plain session), default
    request headers are installed, and stored cookies are loaded.
    """
    if self._session is not None:
        return self._session

    from curl_cffi.requests import Session

    # Not every curl_cffi release knows every profile name, so probe in order
    created = False
    for profile in ("chrome131", "chrome136", "chrome"):
        try:
            self._session = Session(impersonate=profile)
        except Exception:
            continue
        created = True
        break
    if not created:
        self._session = Session()

    default_headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
    }
    self._session.headers.update(default_headers)

    # Cookies (and possibly a stored user agent) come from the database
    self._load_cookies()
    return self._session
def _load_cookies(self):
    """Load stored cookies (and a stored user agent) into the session.

    Cookies saved under the 'snapchat_client' scraper are preferred; if that
    scraper has none, the 'snapchat' scraper is tried. The first scraper that
    yields cookies wins. Does nothing when no database is configured.
    Failures are logged at debug level and never raised.
    """
    if not self.unified_db:
        return

    for scraper_id in ('snapchat_client', 'snapchat'):
        try:
            cookies = self.unified_db.get_scraper_cookies(scraper_id)
            if not cookies:
                continue

            self.log(f"Loaded {len(cookies)} cookies from '{scraper_id}' scraper", "debug")
            self.cookies = cookies

            if self._session:
                for cookie in cookies:
                    name = cookie.get('name', '')
                    value = cookie.get('value', '')
                    # An explicit None/empty domain falls back to the site-wide one
                    domain = cookie.get('domain') or '.snapchat.com'
                    if name and value:
                        self._session.cookies.set(name, value, domain=domain)

            # A stored user agent matters because cf_clearance cookies are
            # bound to the user agent that obtained them.
            try:
                with self.unified_db.get_connection() as conn:
                    cursor = conn.cursor()
                    cursor.execute(
                        "SELECT user_agent FROM scrapers WHERE id = ?",
                        (scraper_id,)
                    )
                    row = cursor.fetchone()
                if row and row[0]:
                    self.user_agent = row[0]
                    if self._session:
                        self._session.headers['User-Agent'] = self.user_agent
            except Exception as e:
                # Best effort: keep the default user agent, but leave a trace
                self.log(f"Could not load stored user-agent for '{scraper_id}': {e}", "debug")
            return
        except Exception as e:
            self.log(f"Error loading cookies from '{scraper_id}': {e}", "debug")
def _fetch_page(self, url: str) -> Optional[str]:
    """Fetch a page over HTTP and return its HTML, or None on failure.

    The story.snapchat.com variant of the URL is tried first, then the
    www.snapchat.com variant. A response only counts as a success when it is
    HTTP 200 and contains a ``__NEXT_DATA__`` marker.

    Args:
        url: Page URL on either snapchat host.

    Returns:
        The response text of the first successful attempt, else None.
    """
    session = self._get_session()

    story_url = url.replace('www.snapchat.com', 'story.snapchat.com')
    www_url = url.replace('story.snapchat.com', 'www.snapchat.com')
    # When the URL names neither host both variants are identical; dedupe so
    # the same request is not sent twice.
    candidates = list(dict.fromkeys((story_url, www_url)))

    for attempt_url in candidates:
        # Log only the part before the username to keep messages short
        origin = attempt_url.split('/@')[0]
        try:
            resp = session.get(attempt_url, timeout=30)
            if resp.status_code == 200:
                if '__NEXT_DATA__' in resp.text:
                    return resp.text
                self.log(f"No __NEXT_DATA__ in response from {origin}", "debug")
            elif resp.status_code == 403:
                self.log(f"403 Forbidden from {origin}", "debug")
            else:
                self.log(f"HTTP {resp.status_code} from {origin}", "debug")
        except Exception as e:
            self.log(f"Error fetching {origin}: {e}", "debug")
    return None
def _extract_next_data(self, html: str) -> Optional[Dict]:
    """Pull the JSON payload out of the page's ``__NEXT_DATA__`` script tag.

    Returns the decoded JSON, or None when the tag is absent or its content
    is not valid JSON (the latter is logged as an error).
    """
    found = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if found is None:
        return None

    payload = found.group(1)
    try:
        decoded = json.loads(payload)
    except json.JSONDecodeError as e:
        self.log(f"Failed to parse __NEXT_DATA__ JSON: {e}", "error")
        return None
    return decoded
def get_profile_content(self, username: str) -> Dict[str, List]:
    """Collect spotlight URLs, highlight URLs and inline story/highlight data.

    The profile page is fetched once. Spotlight URLs are always extracted by
    regex. Stories and highlights are read from the page's __NEXT_DATA__
    JSON; only when that JSON is unavailable are highlight URLs extracted by
    regex instead.

    Args:
        username: Snapchat username (without the leading '@').

    Returns:
        A dict with these keys:
        - spotlights: list of spotlight URL strings, in page order
        - highlights: list of highlight URL strings (regex fallback only)
        - story_collection: SnapCollection built from story.snapList, or None
        - highlight_collections: list of SnapCollection built from
          curatedHighlights (only collections with at least one usable snap)
    """
    result = {'spotlights': [], 'highlights': [], 'story_collection': None, 'highlight_collections': []}
    url = f"https://story.snapchat.com/@{username}"

    self.log(f"Fetching profile for @{username}", "info")
    html = self._fetch_page(url)
    if not html:
        self.log(f"Failed to fetch profile page for @{username}", "warning")
        return result

    # Spotlight metadata needs a per-URL fetch, so only the URLs are collected.
    # dict.fromkeys dedupes while keeping page order; a set would make the
    # order (and therefore any later max_downloads slice) vary between runs.
    spotlight_pattern = rf'/@{re.escape(username)}/spotlight/([A-Za-z0-9_-]+)'
    spotlight_ids = list(dict.fromkeys(re.findall(spotlight_pattern, html)))
    result['spotlights'] = [
        f"https://story.snapchat.com/@{username}/spotlight/{sid}"
        for sid in spotlight_ids
    ]
    self.log(f"Found {len(result['spotlights'])} spotlights", "info")

    data = self._extract_next_data(html)
    if not data:
        # Regex fallback for highlights; same ID alphabet as spotlights
        highlight_pattern = rf'/@{re.escape(username)}/highlight/([A-Za-z0-9_-]+)'
        highlight_ids = list(dict.fromkeys(re.findall(highlight_pattern, html)))
        result['highlights'] = [
            f"https://story.snapchat.com/@{username}/highlight/{hid}"
            for hid in highlight_ids
        ]
        self.log(f"Found {len(result['highlights'])} highlights (regex fallback)", "info")
        return result

    props = (data.get('props') or {}).get('pageProps') or {}

    # Recent stories are only available inline (no individual URLs)
    story = props.get('story') or {}
    story_snaps = story.get('snapList') or []
    if story_snaps:
        # Scalars may arrive wrapped as {"value": ...}
        story_id = story.get('storyId') or {}
        if isinstance(story_id, dict):
            story_id = story_id.get('value', 'story')
        story_title = story.get('storyTitle', '')
        if isinstance(story_title, dict):
            story_title = story_title.get('value', '')
        story_collection = SnapCollection(
            collection_id=story_id or 'story',
            collection_type='story',
            title=story_title or 'Stories',
            username=username,
            url=url
        )
        for snap_data in story_snaps:
            snap = self._parse_snap_data(snap_data)
            if snap:
                story_collection.snaps.append(snap)
        if story_collection.snaps:
            result['story_collection'] = story_collection
            self.log(f"Found {len(story_collection.snaps)} story snaps", "info")

    # Highlights are embedded too, which avoids one HTTP request per highlight
    for highlight in props.get('curatedHighlights') or []:
        if not isinstance(highlight, dict):
            # One malformed entry must not abort the whole profile
            continue
        highlight_id = highlight.get('highlightId') or {}
        if isinstance(highlight_id, dict):
            highlight_id = highlight_id.get('value', '')
        title = highlight.get('storyTitle') or {}
        if isinstance(title, dict):
            title = title.get('value', '')
        collection = SnapCollection(
            collection_id=highlight_id,
            collection_type='highlight',
            title=title or 'Untitled Highlight',
            username=username,
            url=f"https://story.snapchat.com/@{username}/highlight/{highlight_id}"
        )
        for snap_data in highlight.get('snapList') or []:
            snap = self._parse_snap_data(snap_data)
            if snap:
                collection.snaps.append(snap)
        if collection.snaps:
            result['highlight_collections'].append(collection)

    self.log(f"Found {len(result['highlight_collections'])} highlights (inline)", "info")
    return result
def _parse_snap_data(self, snap_data: Dict) -> Optional[SnapMedia]:
    """Convert one __NEXT_DATA__ snapList entry into a SnapMedia object.

    Malformed optional fields (timestamp, coordinates, wrapped scalars) fall
    back to defaults rather than raising, so one bad snap cannot abort the
    parsing of a whole profile.

    Args:
        snap_data: A single entry of a ``snapList`` array.

    Returns:
        A SnapMedia, or None when the entry has no media URL.
    """
    if not isinstance(snap_data, dict):
        return None

    def unwrap(field, default=''):
        # Scalars usually arrive as {"value": ...}; accept bare values too
        if isinstance(field, dict):
            return field.get('value', default)
        return field if field else default

    def to_float(raw):
        # Falsy input (missing, 0, '') means "no coordinate", as before
        if not raw:
            return None
        try:
            return float(raw)
        except (TypeError, ValueError):
            return None

    snap_urls = snap_data.get('snapUrls') or {}
    if not isinstance(snap_urls, dict):
        return None
    media_url = snap_urls.get('mediaUrl', '')
    if not media_url:
        return None

    snap_id = unwrap(snap_data.get('snapId'))

    # The path segment after '/d/' (up to the first dot) is used as media ID
    media_id = ''
    if '/d/' in media_url:
        media_id = media_url.split('/d/')[1].split('.')[0]

    # Missing, zero, non-numeric or out-of-range timestamps fall back to "now"
    try:
        ts_value = int(unwrap(snap_data.get('timestampInSec'), '0'))
    except (TypeError, ValueError):
        ts_value = 0
    timestamp = None
    if ts_value > 0:
        try:
            timestamp = datetime.fromtimestamp(ts_value)
        except (OverflowError, OSError, ValueError):
            timestamp = None
    if timestamp is None:
        timestamp = datetime.now()

    return SnapMedia(
        media_id=media_id or snap_id,
        media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
        media_url=media_url,
        timestamp=timestamp,
        index=snap_data.get('snapIndex', 0),
        thumbnail_url=unwrap(snap_urls.get('mediaPreviewUrl')),
        lat=to_float(snap_data.get('lat')),
        lng=to_float(snap_data.get('lng'))
    )
def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
    """Build a SnapCollection for a spotlight URL from its __NEXT_DATA__.

    Only the first entry of ``spotlightFeed.spotlightStories`` is used. Every
    snap in that story receives the story-level video metadata (duration,
    description, view count, dimensions).

    Returns:
        The collection, or None when the page cannot be fetched, carries no
        __NEXT_DATA__, or lists no spotlight stories.
    """
    page = self._fetch_page(url)
    if not page:
        return None
    next_data = self._extract_next_data(page)
    if not next_data:
        return None

    page_props = (next_data.get('props') or {}).get('pageProps') or {}
    spotlight_feed = page_props.get('spotlightFeed') or {}
    spotlight_stories = spotlight_feed.get('spotlightStories') or []
    if not spotlight_stories:
        return None

    entry = spotlight_stories[0]
    story = entry.get('story') or {}
    video_meta = (entry.get('metadata') or {}).get('videoMetadata') or {}
    story_id = (story.get('storyId') or {}).get('value', '')
    person = (video_meta.get('creator') or {}).get('personCreator') or {}
    creator_name = person.get('username', '')

    collection = SnapCollection(
        collection_id=story_id,
        collection_type='spotlight',
        title=video_meta.get('description', ''),
        username=creator_name,
        url=url
    )

    for raw in story.get('snapList') or []:
        fallback_id = (raw.get('snapId') or {}).get('value', '')
        urls = raw.get('snapUrls') or {}
        media_url = urls.get('mediaUrl', '')

        # Media ID is the path segment after '/d/', up to the first dot
        media_id = media_url.split('/d/')[1].split('.')[0] if '/d/' in media_url else ''

        ts_str = (raw.get('timestampInSec') or {}).get('value', '0')
        if ts_str:
            timestamp = datetime.fromtimestamp(int(ts_str))
        else:
            timestamp = datetime.now()

        # Dict literal keeps the same field evaluation order as a direct call
        fields = {
            'media_id': media_id or fallback_id,
            'media_type': 'video' if raw.get('snapMediaType') == 1 else 'image',
            'media_url': media_url,
            'timestamp': timestamp,
            'index': raw.get('snapIndex', 0),
            'thumbnail_url': (urls.get('mediaPreviewUrl') or {}).get('value', ''),
            'duration_ms': int(video_meta.get('durationMs', 0)),
            'description': video_meta.get('description', ''),
            'view_count': int(video_meta.get('viewCount', 0)),
            'width': int(video_meta.get('width', 540)),
            'height': int(video_meta.get('height', 960)),
        }
        collection.snaps.append(SnapMedia(**fields))

    return collection
def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
    """Build a SnapCollection for a highlight URL from its __NEXT_DATA__.

    Snaps are parsed with ``_parse_snap_data`` so that highlights fetched by
    URL and highlights read inline from the profile page are treated the
    same way: entries without a media URL are skipped, and the snap ID is
    used when no media ID can be derived from the URL.

    Args:
        url: Highlight page URL; the username is taken from its '@name' part.

    Returns:
        The collection, or None when the page cannot be fetched, carries no
        __NEXT_DATA__, or contains no highlight object.
    """
    html = self._fetch_page(url)
    if not html:
        return None
    data = self._extract_next_data(html)
    if not data:
        return None

    props = (data.get('props') or {}).get('pageProps') or {}
    highlight = props.get('highlight') or {}
    if not highlight:
        return None

    # Scalars may arrive wrapped as {"value": ...}
    highlight_id = highlight.get('highlightId') or {}
    if isinstance(highlight_id, dict):
        highlight_id = highlight_id.get('value', '')
    title = highlight.get('storyTitle') or {}
    if isinstance(title, dict):
        title = title.get('value', '')

    # Stop at '/', '?' or '#' so query strings never end up in the username
    username_match = re.search(r'@([^/?#]+)', url)
    username = username_match.group(1) if username_match else ''

    collection = SnapCollection(
        collection_id=highlight_id,
        collection_type='highlight',
        title=title or 'Untitled Highlight',
        username=username,
        url=url
    )
    for snap_data in highlight.get('snapList') or []:
        snap = self._parse_snap_data(snap_data)
        if snap:
            collection.snaps.append(snap)
    return collection
def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
    """Download one media file to ``output_path`` and tag it with metadata.

    Args:
        snap: Snap whose ``media_url`` is fetched.
        output_path: Destination file; missing parent directories are created.

    Returns:
        True when the file was written, False on a non-200 response, an
        empty body, or any exception (which is logged, not raised).
    """
    try:
        # URLs scraped from HTML may still carry escaped ampersands
        url = snap.media_url.replace('&amp;', '&')
        session = self._get_session()
        resp = session.get(url, timeout=60)

        if resp.status_code != 200:
            self.log(f"Download failed: HTTP {resp.status_code}", "debug")
            return False
        if not resp.content:
            self.log("Download failed: empty response body", "debug")
            return False

        # dirname is '' for a bare filename, and os.makedirs('') raises
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(output_path, 'wb') as f:
            f.write(resp.content)

        self._set_metadata(output_path, snap)
        return True
    except Exception as e:
        self.log(f"Error downloading media: {e}", "error")
        return False
def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
    """Write date/description/GPS tags with exiftool and set the file mtime.

    Both steps are best effort and independent: a missing or failing
    exiftool no longer prevents the filesystem timestamp from being set.
    Problems are logged at debug level and never raised.

    Args:
        file_path: File to tag.
        snap: Source of the timestamp, description, view count and lat/lng.
        description: Overrides ``snap.description`` when given.
    """
    # Step 1: embedded metadata via exiftool
    try:
        date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
        desc = description or snap.description or ""
        if snap.view_count:
            desc += f" [Views: {snap.view_count}]"
        desc = desc.strip()

        ext = os.path.splitext(file_path)[1].lower()
        is_video = ext in ('.mp4', '.mov', '.avi', '.webm')
        is_image = ext in ('.jpg', '.jpeg', '.png', '.webp')

        # Arguments are passed as a list (no shell), so description text
        # cannot be interpreted as shell syntax.
        exif_args = [
            'exiftool', '-overwrite_original', '-ignoreMinorErrors',
            f'-FileModifyDate={date_str}',
        ]
        if is_image:
            exif_args.extend([
                f'-DateTimeOriginal={date_str}',
                f'-CreateDate={date_str}',
                f'-ModifyDate={date_str}',
                f'-MetadataDate={date_str}',
            ])
            if desc:
                exif_args.extend([
                    f'-ImageDescription={desc}',
                    f'-XPComment={desc}',
                    f'-UserComment={desc}',
                ])
            if snap.lat and snap.lng:
                # EXIF stores magnitude and hemisphere separately
                lat_ref = 'N' if snap.lat >= 0 else 'S'
                lng_ref = 'E' if snap.lng >= 0 else 'W'
                exif_args.extend([
                    f'-GPSLatitude={abs(snap.lat)}',
                    f'-GPSLatitudeRef={lat_ref}',
                    f'-GPSLongitude={abs(snap.lng)}',
                    f'-GPSLongitudeRef={lng_ref}',
                ])
        elif is_video:
            exif_args.extend([
                f'-CreateDate={date_str}',
                f'-ModifyDate={date_str}',
                f'-MediaCreateDate={date_str}',
                f'-MediaModifyDate={date_str}',
                f'-TrackCreateDate={date_str}',
                f'-TrackModifyDate={date_str}',
            ])
            if desc:
                exif_args.extend([
                    f'-Description={desc}',
                    f'-Comment={desc}',
                ])
        exif_args.append(file_path)

        completed = subprocess.run(exif_args, capture_output=True, timeout=30)
        if completed.returncode != 0:
            self.log(f"Warning: exiftool exited with code {completed.returncode} for {file_path}", "debug")
    except Exception as e:
        self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")

    # Step 2: filesystem modification time (runs even if exiftool is absent)
    try:
        ts = snap.timestamp.timestamp()
        os.utime(file_path, (ts, ts))
    except Exception as e:
        self.log(f"Warning: Could not set file time for {file_path}: {e}", "debug")
def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
    """Build ``<username>_<YYYYmmdd>_<HHMMSS>_<media_id>.<ext>`` for a snap."""
    stamp = snap.timestamp.strftime('%Y%m%d_%H%M%S')
    return '{}_{}_{}.{}'.format(username, stamp, snap.media_id, ext)
def _get_processed_posts(self, username: str) -> Set[str]:
    """Return the media IDs already recorded for ``username``.

    IDs are gathered from two places in the ``downloads`` rows whose platform
    is 'snapchat' and whose source is the username: the filename (format
    ``<username>_<date>_<time>_<media_id>.<ext>``) and the ``media_id`` key
    of the JSON metadata column.

    Returns:
        A set of media IDs; empty when no database is configured or the
        query fails (failures are logged at debug level).
    """
    processed: Set[str] = set()
    if not self.db:
        return processed

    prefix = f"{username}_"
    try:
        with self.db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT filename, metadata FROM downloads
                WHERE platform = 'snapchat'
                AND source = ?
            ''', (username,))
            for filename, metadata_str in cursor.fetchall():
                if filename:
                    media_id = ''
                    if filename.startswith(prefix):
                        # Strip the known username first: usernames may
                        # themselves contain underscores, which breaks a
                        # purely positional split of the filename.
                        parts = filename[len(prefix):].split('_', 2)
                        if len(parts) == 3:
                            media_id = parts[2].split('.')[0]
                    else:
                        # Positional fallback for other filename prefixes
                        parts = filename.split('_')
                        if len(parts) >= 4:
                            media_id = '_'.join(parts[3:]).split('.')[0]
                    if media_id:
                        processed.add(media_id)

                if metadata_str:
                    try:
                        metadata = json.loads(metadata_str)
                        if isinstance(metadata, dict) and 'media_id' in metadata:
                            processed.add(metadata['media_id'])
                    except (json.JSONDecodeError, TypeError, KeyError):
                        pass  # metadata is optional and free-form
    except Exception as e:
        self.log(f"Error loading processed posts: {e}", "debug")
    return processed
def _record_download(self, username: str, url: str, filename: str,
                     post_date=None, metadata: dict = None, file_path: str = None,
                     deferred: bool = False):
    """Record a download, either immediately or in the pending queue.

    Args:
        username: Account the file belongs to.
        url: Source page URL of the download.
        filename: Name of the saved file.
        post_date: Post time; converted with ``isoformat()`` when queued and
            the value supports it.
        metadata: Extra metadata stored with the record.
        file_path: Full path of the saved file.
        deferred: When True, append to ``pending_downloads`` instead of
            writing to the database.

    Returns:
        True when the record was queued or written; False when no database
        is configured or the write failed (failure is logged, not raised).
    """
    if deferred:
        self.pending_downloads.append({
            'username': username,
            'url': url,
            'filename': filename,
            'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
            'file_path': file_path,
            'metadata': metadata
        })
        return True

    if not self.db:
        return False

    try:
        self.db.mark_downloaded(
            username=username,
            url=url,
            filename=filename,
            post_date=post_date,
            metadata=metadata,
            file_path=file_path
        )
    except Exception as e:
        self.log(f"Failed to record download: {e}", "debug")
        return False
    return True
def get_pending_downloads(self) -> list:
    """Return the queue of downloads awaiting deferred database recording.

    The live list object is returned, not a copy.
    """
    pending = self.pending_downloads
    return pending
def clear_pending_downloads(self):
    """Reset the deferred-recording queue to a fresh, empty list.

    The previous list object is replaced rather than emptied in place.
    """
    self.pending_downloads = list()
def download(self, username: str, content_type: str = "all", days_back: int = 14,
             max_downloads: int = 50, output_dir: str = None,
             spotlight_dir: str = None, stories_dir: str = None,
             stitch_highlights: bool = True, defer_database: bool = False,
             phrase_config: dict = None) -> int:
    """Download content from a user - compatible with media-downloader interface.

    Spotlights are fetched one page per URL. Stories and highlights are taken
    from the collections that ``get_profile_content`` already parsed inline
    from the profile page; highlight URLs (only present when inline parsing
    was not possible) are fetched individually.

    Args:
        username: Snapchat username
        content_type: "spotlight", "stories", "highlights", or "all".
            "stories" covers recent stories and highlights; "highlights"
            covers highlights only.
        days_back: How many days back to download (filters by post date)
        max_downloads: Maximum items to process per content type
        output_dir: Default output directory (used if specific dirs not set)
        spotlight_dir: Output directory for spotlights
        stories_dir: Output directory for stories/highlights
        stitch_highlights: Ignored (kept for backwards compatibility)
        defer_database: If True, queue records in ``pending_downloads``
            instead of writing them to the database
        phrase_config: Not used (for interface compatibility)

    Returns:
        Number of files downloaded
    """
    self.defer_database = defer_database
    self.downloaded_files.clear()

    # Specific directory > shared output_dir > built-in default
    spotlight_output = Path(
        spotlight_dir or output_dir
        or f"/opt/media-downloader/downloads/snapchat_client/spotlight/{username}"
    )
    stories_output = Path(
        stories_dir or output_dir
        or f"/opt/media-downloader/downloads/snapchat_client/stories/{username}"
    )
    spotlight_output.mkdir(parents=True, exist_ok=True)
    stories_output.mkdir(parents=True, exist_ok=True)

    if self.activity_manager:
        self.activity_manager.update_status("Checking Snapchat")

    # Records use platform='snapchat', so IDs from other writers of that
    # platform are seen here as well
    processed = self._get_processed_posts(username)
    self.log(f"Loaded {len(processed)} processed posts from database", "debug")

    cutoff_date = datetime.now() - timedelta(days=days_back)
    downloaded_count = 0

    # Crash recovery checkpoint
    from modules.task_checkpoint import TaskCheckpoint
    checkpoint = TaskCheckpoint(f'snapchat_client:{username}', 'scraping')

    try:
        content = self.get_profile_content(username)

        spotlight_urls = []
        if content_type in ('spotlight', 'all'):
            spotlight_urls = list(content.get('spotlights') or [])[:max_downloads]

        # Each job is (checkpoint key, collection or None); None means the
        # collection still has to be fetched from its highlight URL.
        collection_jobs = []
        if content_type in ('stories', 'highlights', 'all'):
            inline = list(content.get('highlight_collections') or [])[:max_downloads]
            story_collection = content.get('story_collection')
            if story_collection is not None and content_type != 'highlights':
                inline.insert(0, story_collection)
            inline_urls = {collection.url for collection in inline}
            collection_jobs = [(collection.url, collection) for collection in inline]
            collection_jobs += [
                (highlight_url, None)
                for highlight_url in list(content.get('highlights') or [])[:max_downloads]
                if highlight_url not in inline_urls
            ]

        checkpoint.start(total_items=len(spotlight_urls) + len(collection_jobs))
        if checkpoint.is_recovering():
            self.log(f"Snapchat Client @{username}: recovering — skipping already-processed URLs", "info")

        # --- Spotlights ---------------------------------------------------
        if spotlight_urls:
            self.log(f"Processing {len(spotlight_urls)} spotlights...", "info")
            self._report_activity("Downloading spotlights", 0, len(spotlight_urls))
            for spot_idx, url in enumerate(spotlight_urls):
                self._report_activity("Downloading spotlights", spot_idx + 1, len(spotlight_urls))
                if checkpoint.is_completed(url):
                    continue
                checkpoint.set_current(url)
                try:
                    # Rate limit between page fetches
                    if spot_idx > 0:
                        time.sleep(random.uniform(1.5, 2.5))
                    downloaded_count += self._download_spotlight_url(
                        username, url, spotlight_output, cutoff_date,
                        processed, days_back, defer_database
                    )
                except Exception as e:
                    self.log(f"Error processing spotlight: {e}", "error")
                checkpoint.mark_completed(url)

        # Rate limit between content types
        if spotlight_urls and collection_jobs:
            time.sleep(random.uniform(2, 3))

        # --- Stories and highlights ---------------------------------------
        if collection_jobs:
            self.log(f"Processing {len(collection_jobs)} highlights...", "info")
            self._report_activity("Downloading highlights", 0, len(collection_jobs))
            pages_fetched = 0
            for job_idx, (url, collection) in enumerate(collection_jobs):
                self._report_activity("Downloading highlights", job_idx + 1, len(collection_jobs))
                if checkpoint.is_completed(url):
                    continue
                checkpoint.set_current(url)
                try:
                    if collection is None:
                        # Only real page fetches need the page-level delay
                        if pages_fetched > 0:
                            time.sleep(random.uniform(1.5, 2.5))
                        pages_fetched += 1
                        collection = self.get_highlight_metadata(url)
                    if collection and collection.snaps:
                        downloaded_count += self._download_collection_snaps(
                            username, collection, stories_output, cutoff_date,
                            processed, days_back, defer_database
                        )
                except Exception as e:
                    self.log(f"Error processing highlight: {e}", "error")
                checkpoint.mark_completed(url)
    except Exception as e:
        self.log(f"Error during download: {e}", "error")

    checkpoint.finish()
    self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
    return downloaded_count


def _report_activity(self, message: str, current: int, total: int):
    """Push a progress update to the activity manager, if one is configured."""
    if self.activity_manager:
        self.activity_manager.update_status(
            message,
            progress_current=current,
            progress_total=total
        )


def _download_spotlight_url(self, username, url, output_dir, cutoff_date,
                            processed, days_back, defer_database) -> int:
    """Fetch one spotlight page and download its first snap; return 0 or 1."""
    spotlight = self.get_spotlight_metadata(url)
    if not spotlight or not spotlight.snaps:
        return 0
    snap = spotlight.snaps[0]

    if snap.timestamp < cutoff_date:
        self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
        return 0
    if snap.media_id in processed or snap.media_id in self.downloaded_files:
        self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
        return 0

    ext = 'mp4' if snap.media_type == 'video' else 'jpg'
    filename = self._generate_filename(username, snap, ext)
    output_path = str(output_dir / filename)

    # Rate limit between CDN downloads
    time.sleep(random.uniform(0.3, 0.5))
    if not self._download_media_file(snap, output_path):
        return 0

    self.downloaded_files.add(snap.media_id)
    self.log(f"Downloaded spotlight: {filename}", "info")
    self._record_download(
        username=username,
        url=url,
        filename=filename,
        post_date=snap.timestamp,
        metadata={
            'media_id': snap.media_id,
            'description': snap.description,
            'view_count': snap.view_count,
            'content_type': 'spotlight'
        },
        file_path=output_path,
        deferred=defer_database
    )
    return 1


def _download_collection_snaps(self, username, collection, output_dir, cutoff_date,
                               processed, days_back, defer_database) -> int:
    """Download the new, recent snaps of one story/highlight collection.

    Returns the number of files written.
    """
    kind = getattr(collection, 'collection_type', None) or 'highlight'

    # Skip the whole collection when even its newest snap is too old
    newest_snap = max(collection.snaps, key=lambda s: s.timestamp)
    if newest_snap.timestamp < cutoff_date:
        self.log(f"{kind.capitalize()} {collection.collection_id} is older than {days_back} days, skipping", "debug")
        return 0

    # Collection-level dedup applies to highlights only; a story ID is not a
    # per-download identifier, so stories are deduplicated per snap.
    if kind == 'highlight' and (collection.collection_id in processed
                                or collection.collection_id in self.downloaded_files):
        self.log(f"Highlight {collection.collection_id} already processed, skipping", "debug")
        return 0

    # Images first, then videos
    ordered = [s for s in collection.snaps if s.media_type == 'image']
    ordered += [s for s in collection.snaps if s.media_type == 'video']

    count = 0
    for snap in ordered:
        if snap.timestamp < cutoff_date:
            continue
        if snap.media_id in processed or snap.media_id in self.downloaded_files:
            continue

        # Rate limit between CDN downloads
        time.sleep(random.uniform(0.3, 0.5))
        is_video = snap.media_type == 'video'
        filename = self._generate_filename(username, snap, 'mp4' if is_video else 'jpg')
        output_path = str(output_dir / filename)

        # _download_media_file already tags the file, so no second
        # _set_metadata call is needed here.
        if not self._download_media_file(snap, output_path):
            continue

        self.downloaded_files.add(snap.media_id)
        count += 1
        self.log(f"Downloaded {'video' if is_video else 'image'}: {filename}", "info")
        # NOTE(review): story snaps reuse the 'highlight_id' key and get
        # content_type 'story_image'/'story_video' — confirm against consumers.
        self._record_download(
            username=username,
            url=collection.url,
            filename=filename,
            post_date=snap.timestamp,
            metadata={
                'media_id': snap.media_id,
                'highlight_id': collection.collection_id,
                'content_type': f"{kind}_{snap.media_type}"
            },
            file_path=output_path,
            deferred=defer_database
        )
    return count