Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

985
modules/snapchat_scraper.py Normal file
View File

@@ -0,0 +1,985 @@
#!/usr/bin/env python3
"""
Snapchat Direct Scraper Module - Scrapes directly from Snapchat.com
Uses Playwright to scrape profiles and extract:
- Spotlight videos (540x960)
- Stories/Highlights (480x852, stitched into single videos)
Full metadata extraction including timestamps, media IDs, descriptions.
Follows the same interface as the original snapchat_module.py
"""
import os
import json
import re
import tempfile
import subprocess
import shutil
import platform
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, List, Any, Set
from dataclasses import dataclass, field
# Set environment for Playwright
os.environ.setdefault('PLAYWRIGHT_BROWSERS_PATH', '/root/.cache/ms-playwright')
from modules.base_module import LoggingMixin
from modules.cloudflare_handler import (
get_playwright_context_options,
get_playwright_stealth_scripts,
get_flaresolverr_user_agent
)
@dataclass
class SnapMedia:
    """A single snap media item (one video or image).

    Populated from Snapchat's __NEXT_DATA__ JSON; optional fields default
    to empty/zero when the source payload omits them.
    """
    media_id: str                   # CDN media id (spotlights fall back to snap id)
    media_type: str                 # 'video' or 'image'
    media_url: str                  # direct CDN download URL
    timestamp: datetime             # post time (naive, from datetime.fromtimestamp)
    index: int = 0                  # position within the parent collection
    thumbnail_url: str = ""         # preview image URL, if any
    duration_ms: int = 0            # video duration in milliseconds
    description: str = ""           # caption / description text
    view_count: int = 0             # view counter reported by Snapchat
    width: int = 0                  # pixel width (0 when unknown)
    height: int = 0                 # pixel height (0 when unknown)
    lat: Optional[float] = None     # GPS latitude, highlights only
    lng: Optional[float] = None     # GPS longitude, highlights only
@dataclass
class SnapCollection:
    """A spotlight or highlight collection of snaps."""
    collection_id: str                              # story/highlight id
    collection_type: str                            # 'spotlight' or 'highlight'
    title: str = ""                                 # description or story title
    username: str = ""                              # owning account name
    snaps: List[SnapMedia] = field(default_factory=list)  # member media items
    url: str = ""                                   # source page URL
class SnapchatDirectScraper(LoggingMixin):
"""
Scrapes Snapchat profiles directly for media content.
Follows the same interface as SnapchatDownloader for compatibility
with the media-downloader system.
"""
    def __init__(self,
                 headless: bool = True,
                 show_progress: bool = True,
                 use_database: bool = True,
                 log_callback=None,
                 unified_db=None):
        """Initialize scraper compatible with the media-downloader system.

        Args:
            headless: Run Chromium without a visible window.
            show_progress: Stored for interface compatibility.
            use_database: Enable download bookkeeping (requires unified_db;
                forced to False when no unified_db is supplied).
            log_callback: Optional callable forwarded to the logging mixin.
            unified_db: Unified database handle; source of cookies, proxy
                configuration and the download-record adapter.
        """
        self.headless = headless
        self.show_progress = show_progress
        self.use_database = use_database
        self.unified_db = unified_db
        self.scraper_id = 'snapchat_direct'
        self.download_count = 0
        self.downloaded_files: Set[str] = set()
        self.pending_downloads = []
        # Initialize logging via mixin
        self._init_logger('SnapchatDirect', log_callback, default_module='Download')
        # User-Agent to match FlareSolverr (dynamically fetched for consistency)
        self.user_agent = get_flaresolverr_user_agent()
        # Browser state (created lazily in _start_browser)
        self._playwright = None
        self.browser = None
        self.context = None
        # Database adapter; without one the scraper runs stateless
        if unified_db and use_database:
            from modules.unified_database import SnapchatDatabaseAdapter
            self.db = SnapchatDatabaseAdapter(unified_db)
        else:
            self.db = None
            self.use_database = False
        # Activity status manager (optional dependency; absence is fine)
        try:
            from modules.activity_status import get_activity_manager
            self.activity_manager = get_activity_manager(unified_db)
        except ImportError:
            self.activity_manager = None
        # Load cookies from database
        self.cookies = self._load_cookies_from_db()
        # Load proxy configuration from database.
        # NOTE(review): reads the 'snapchat' scraper row, not 'snapchat_direct'
        # -- presumably proxy config is shared with the original scraper; confirm.
        self.proxy_url = None
        if unified_db:
            try:
                scraper_config = unified_db.get_scraper('snapchat')
                if scraper_config and scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")
            except Exception as e:
                self.log(f"Could not load proxy config: {e}", "debug")
def _load_cookies_from_db(self) -> List[Dict]:
"""Load cookies from database"""
if not self.unified_db:
return self._get_default_cookies()
try:
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
if cookies:
self.log(f"Loaded {len(cookies)} cookies from database", "debug")
return cookies
except Exception as e:
self.log(f"Error loading cookies from database: {e}", "warning")
# Try loading from original snapchat scraper
try:
cookies = self.unified_db.get_scraper_cookies('snapchat')
if cookies:
self.log(f"Using cookies from 'snapchat' scraper", "debug")
return cookies
except Exception as e:
self.log(f"Error loading cookies from snapchat scraper: {e}", "debug")
return self._get_default_cookies()
def _get_default_cookies(self) -> List[Dict]:
"""Get default cookies for Snapchat"""
return [
{"name": "sc-cookies-accepted", "value": "true", "domain": "www.snapchat.com", "path": "/"},
]
def _save_cookies_to_db(self, cookies: List[Dict], user_agent: str = None):
"""Save cookies to database
Args:
cookies: List of cookie dictionaries
user_agent: User agent to associate with cookies (important for cf_clearance).
If not provided, uses self.user_agent as fallback.
"""
if not self.unified_db:
return
try:
# Use provided user_agent or fall back to self.user_agent
ua = user_agent or self.user_agent
self.unified_db.save_scraper_cookies(
self.scraper_id,
cookies,
user_agent=ua,
merge=True
)
self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug")
except Exception as e:
self.log(f"Error saving cookies to database: {e}", "warning")
def _parse_proxy_url(self, proxy_url: str) -> Optional[Dict]:
"""
Parse proxy URL into Playwright proxy config.
Supports: protocol://user:pass@host:port or protocol://host:port
"""
import re
try:
# Match: protocol://[user:pass@]host:port
match = re.match(
r'^(https?|socks[45]?)://(?:([^:]+):([^@]+)@)?([^:]+):(\d+)$',
proxy_url
)
if match:
protocol, username, password, host, port = match.groups()
config = {'server': f'{protocol}://{host}:{port}'}
if username and password:
config['username'] = username
config['password'] = password
return config
except Exception as e:
self.log(f"Failed to parse proxy URL: {e}", "warning")
return None
    def __enter__(self):
        """Context manager entry; returns the scraper itself."""
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: shut the browser down.

        Returns False so any in-flight exception propagates to the caller.
        """
        self._close_browser()
        return False
    def _start_browser(self):
        """Start Playwright Chromium and build a stealth context (idempotent).

        No-op when a browser is already running. Applies FlareSolverr-derived
        fingerprinting, an optional proxy, anti-detection init scripts, and
        the stored cookies.
        """
        if self.browser is not None:
            return
        # NOTE(review): DISPLAY is set unconditionally, even in headless mode
        # -- presumably for an Xvfb fallback; harmless when headless.
        os.environ['DISPLAY'] = ':100'
        from playwright.sync_api import sync_playwright
        self._playwright = sync_playwright().start()
        self.browser = self._playwright.chromium.launch(
            headless=self.headless,
            args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
        )
        # Build context options - use dynamic fingerprinting from FlareSolverr
        context_options = get_playwright_context_options()
        # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent
        # Cloudflare cf_clearance cookies are fingerprinted to the browser that solved the challenge
        try:
            if self.unified_db:
                stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
                if stored_user_agent:
                    self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug", module="Browser")
                    context_options['user_agent'] = stored_user_agent
                else:
                    self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
            else:
                self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
        except Exception as e:
            self.log(f"Error getting stored user_agent, using default: {e}", "debug", module="Browser")
        # Add proxy if configured
        if self.proxy_url:
            proxy_config = self._parse_proxy_url(self.proxy_url)
            if proxy_config:
                context_options['proxy'] = proxy_config
                self.log(f"Browser using proxy: {proxy_config.get('server')}", "info", module="Browser")
        self.context = self.browser.new_context(**context_options)
        # Add anti-detection scripts to all pages in this context
        self.context.add_init_script(get_playwright_stealth_scripts())
        # Add cookies
        if self.cookies:
            # Clean cookies for Playwright and convert expiry->expires
            cleaned = []
            for c in self.cookies:
                clean = {k: v for k, v in c.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
                # FlareSolverr uses 'expiry' but Playwright uses 'expires'
                if 'expiry' in clean and 'expires' not in clean:
                    clean['expires'] = clean.pop('expiry')
                cleaned.append(clean)
            # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
            try:
                self.context.clear_cookies()
            except Exception:
                pass
            self.context.add_cookies(cleaned)
        self.log("Browser started", "info", module="Browser")
def _close_browser(self):
"""Close browser and cleanup"""
if self.context:
try:
self.context.close()
except Exception as e:
self.log(f"Error closing browser context: {e}", "debug")
self.context = None
if self.browser:
try:
self.browser.close()
except Exception as e:
self.log(f"Error closing browser: {e}", "debug")
self.browser = None
if self._playwright:
try:
self._playwright.stop()
except Exception as e:
self.log(f"Error stopping playwright: {e}", "debug")
self._playwright = None
def _get_next_data(self, page) -> Optional[Dict]:
"""Extract __NEXT_DATA__ JSON from page"""
try:
next_data_elem = page.locator('script#__NEXT_DATA__').first
if next_data_elem.count() > 0:
return json.loads(next_data_elem.inner_text())
except Exception as e:
self.log(f"Error extracting __NEXT_DATA__: {e}", "debug")
return None
def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
"""Set EXIF metadata and file timestamp"""
try:
date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
desc = description or snap.description or ""
if snap.view_count:
desc += f" [Views: {snap.view_count}]"
desc = desc.strip()
ext = os.path.splitext(file_path)[1].lower()
is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']
exif_args = [
'exiftool', '-overwrite_original', '-ignoreMinorErrors',
f'-FileModifyDate={date_str}',
]
if is_image:
exif_args.extend([
f'-DateTimeOriginal={date_str}',
f'-CreateDate={date_str}',
f'-ModifyDate={date_str}',
f'-MetadataDate={date_str}',
])
if desc:
exif_args.extend([
f'-ImageDescription={desc}',
f'-XPComment={desc}',
f'-UserComment={desc}',
])
if snap.lat and snap.lng:
lat_ref = 'N' if snap.lat >= 0 else 'S'
lng_ref = 'E' if snap.lng >= 0 else 'W'
exif_args.extend([
f'-GPSLatitude={abs(snap.lat)}',
f'-GPSLatitudeRef={lat_ref}',
f'-GPSLongitude={abs(snap.lng)}',
f'-GPSLongitudeRef={lng_ref}',
])
elif is_video:
exif_args.extend([
f'-CreateDate={date_str}',
f'-ModifyDate={date_str}',
f'-MediaCreateDate={date_str}',
f'-MediaModifyDate={date_str}',
f'-TrackCreateDate={date_str}',
f'-TrackModifyDate={date_str}',
])
if desc:
exif_args.extend([
f'-Description={desc}',
f'-Comment={desc}',
])
exif_args.append(file_path)
subprocess.run(exif_args, capture_output=True, timeout=30)
# Set filesystem modification time
ts = snap.timestamp.timestamp()
os.utime(file_path, (ts, ts))
except Exception as e:
self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")
    def get_profile_content(self, username: str) -> Dict[str, List[str]]:
        """Collect all spotlight and highlight URLs from a profile page.

        Navigates to the profile, scrapes spotlight links from the initial
        HTML, then clicks the "Stories" tab to surface highlight links.

        Args:
            username: Snapchat handle without the leading '@'.

        Returns:
            {'spotlights': [url, ...], 'highlights': [url, ...]} -- both
            lists may be empty on error or when nothing is found.
        """
        import time
        if not self.browser:
            self._start_browser()
        page = self.context.new_page()
        result = {'spotlights': [], 'highlights': []}
        try:
            url = f"https://www.snapchat.com/@{username}"
            self.log(f"Navigating to profile @{username}", "info")
            page.goto(url, wait_until='networkidle', timeout=30000)
            time.sleep(2)
            content = page.content()
            # Extract spotlight URLs (dedupe via set; order is not preserved)
            spotlight_pattern = rf'/@{username}/spotlight/([A-Za-z0-9_-]+)'
            spotlight_ids = list(set(re.findall(spotlight_pattern, content)))
            result['spotlights'] = [
                f"https://www.snapchat.com/@{username}/spotlight/{sid}"
                for sid in spotlight_ids
            ]
            self.log(f"Found {len(result['spotlights'])} spotlights", "info")
            # Click Stories tab to get highlights
            stories_tab = page.locator('[role="tab"]:has-text("Stories")').first
            if stories_tab.count() > 0:
                stories_tab.click()
                time.sleep(2)
                content = page.content()
                highlight_pattern = rf'/@{username}/highlight/([A-Za-z0-9-]+)'
                highlight_ids = list(set(re.findall(highlight_pattern, content)))
                result['highlights'] = [
                    f"https://www.snapchat.com/@{username}/highlight/{hid}"
                    for hid in highlight_ids
                ]
                self.log(f"Found {len(result['highlights'])} highlights", "info")
        except Exception as e:
            self.log(f"Error getting profile content: {e}", "error")
        finally:
            page.close()
        return result
    def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
        """Extract full metadata for one spotlight from its page's JSON.

        Parses __NEXT_DATA__ -> props.pageProps.spotlightFeed and builds a
        SnapCollection from the first spotlight story found.

        Args:
            url: Full spotlight URL (https://www.snapchat.com/@user/spotlight/<id>).

        Returns:
            A SnapCollection, or None when the page yields no usable data.
        """
        import time
        if not self.browser:
            self._start_browser()
        page = self.context.new_page()
        try:
            page.goto(url, wait_until='domcontentloaded', timeout=60000)
            time.sleep(2)
            data = self._get_next_data(page)
            if not data:
                return None
            # Defensive traversal: every level may be missing or None.
            props = (data.get('props') or {}).get('pageProps') or {}
            feed = props.get('spotlightFeed') or {}
            stories = feed.get('spotlightStories') or []
            if not stories:
                return None
            story_data = stories[0]
            story = story_data.get('story') or {}
            metadata = (story_data.get('metadata') or {}).get('videoMetadata') or {}
            story_id = (story.get('storyId') or {}).get('value', '')
            creator = (metadata.get('creator') or {}).get('personCreator') or {}
            username = creator.get('username', '')
            collection = SnapCollection(
                collection_id=story_id,
                collection_type='spotlight',
                title=metadata.get('description', ''),
                username=username,
                url=url
            )
            for snap_data in story.get('snapList') or []:
                snap_id = (snap_data.get('snapId') or {}).get('value', '')
                snap_urls = snap_data.get('snapUrls') or {}
                media_url = snap_urls.get('mediaUrl', '')
                # Derive a stable media id from the CDN path when possible.
                media_id = ''
                if '/d/' in media_url:
                    media_id = media_url.split('/d/')[1].split('.')[0]
                ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
                timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()
                # NOTE(review): snapMediaType == 1 is treated as video here --
                # confirm against Snapchat's payload enum.
                snap = SnapMedia(
                    media_id=media_id or snap_id,
                    media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                    media_url=media_url,
                    timestamp=timestamp,
                    index=snap_data.get('snapIndex', 0),
                    thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                    duration_ms=int(metadata.get('durationMs', 0)),
                    description=metadata.get('description', ''),
                    view_count=int(metadata.get('viewCount', 0)),
                    width=int(metadata.get('width', 540)),
                    height=int(metadata.get('height', 960))
                )
                collection.snaps.append(snap)
            return collection
        except Exception as e:
            self.log(f"Error getting spotlight metadata: {e}", "error")
            return None
        finally:
            page.close()
    def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
        """Extract full metadata for one highlight from its page's JSON.

        Parses __NEXT_DATA__ -> props.pageProps.highlight. Unlike spotlights,
        highlight snaps may carry GPS coordinates but no duration/description,
        and media_id stays '' when the CDN URL has no '/d/' segment.

        Args:
            url: Full highlight URL (https://www.snapchat.com/@user/highlight/<id>).

        Returns:
            A SnapCollection, or None when the page yields no usable data.
        """
        import time
        if not self.browser:
            self._start_browser()
        page = self.context.new_page()
        try:
            page.goto(url, wait_until='domcontentloaded', timeout=60000)
            time.sleep(2)
            data = self._get_next_data(page)
            if not data:
                return None
            props = (data.get('props') or {}).get('pageProps') or {}
            highlight = props.get('highlight') or {}
            if not highlight:
                return None
            # Ids/titles arrive either as plain strings or {'value': ...} wrappers.
            highlight_id = highlight.get('highlightId') or {}
            if isinstance(highlight_id, dict):
                highlight_id = highlight_id.get('value', '')
            # Username isn't in the payload; recover it from the URL.
            username_match = re.search(r'@([^/]+)', url)
            username = username_match.group(1) if username_match else ''
            title = highlight.get('storyTitle') or {}
            if isinstance(title, dict):
                title = title.get('value', '')
            collection = SnapCollection(
                collection_id=highlight_id,
                collection_type='highlight',
                title=title or 'Untitled Highlight',
                username=username,
                url=url
            )
            for snap_data in highlight.get('snapList') or []:
                snap_urls = snap_data.get('snapUrls') or {}
                media_url = snap_urls.get('mediaUrl', '')
                media_id = ''
                if '/d/' in media_url:
                    media_id = media_url.split('/d/')[1].split('.')[0]
                ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
                timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()
                lat = snap_data.get('lat')
                lng = snap_data.get('lng')
                snap = SnapMedia(
                    media_id=media_id,
                    media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                    media_url=media_url,
                    timestamp=timestamp,
                    index=snap_data.get('snapIndex', 0),
                    thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                    lat=float(lat) if lat else None,
                    lng=float(lng) if lng else None
                )
                collection.snaps.append(snap)
            return collection
        except Exception as e:
            self.log(f"Error getting highlight metadata: {e}", "error")
            return None
        finally:
            page.close()
def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
"""Download a single media file"""
try:
url = snap.media_url.replace('&amp;', '&')
result = subprocess.run([
'curl', '-sL', '-o', output_path,
'-H', 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
url
], capture_output=True, timeout=60)
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
self._set_metadata(output_path, snap)
return True
return False
except Exception as e:
self.log(f"Error downloading media: {e}", "error")
return False
def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
"""Generate filename with timestamp and media ID (FastDL format)"""
date_str = snap.timestamp.strftime('%Y%m%d_%H%M%S')
return f"{username}_{date_str}_{snap.media_id}.{ext}"
    def _record_download(self, username: str, url: str, filename: str,
                         post_date=None, metadata: dict = None, file_path: str = None,
                         deferred: bool = False):
        """Record a completed download, immediately or deferred.

        Args:
            username: Account the media belongs to.
            url: Source page URL.
            filename: Generated local filename.
            post_date: Post datetime (serialized via isoformat when deferred).
            metadata: Extra fields stored alongside the record.
            file_path: Absolute path of the downloaded file.
            deferred: Queue into pending_downloads instead of writing to db.

        Returns:
            True when deferred; otherwise None (callers ignore the result).
        """
        if deferred:
            self.pending_downloads.append({
                'username': username,
                'url': url,
                'filename': filename,
                # Tolerate both datetime objects and pre-formatted strings
                'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
                'file_path': file_path,
                'metadata': metadata
            })
            return True
        if not self.db:
            return
        try:
            self.db.mark_downloaded(
                username=username,
                url=url,
                filename=filename,
                post_date=post_date,
                metadata=metadata,
                file_path=file_path
            )
        except Exception as e:
            self.log(f"Failed to record download: {e}", "debug")
    def get_pending_downloads(self):
        """Return a shallow copy of downloads queued via deferred recording."""
        return self.pending_downloads.copy()
    def clear_pending_downloads(self):
        """Drop all deferred download records (rebinds a fresh list)."""
        self.pending_downloads = []
    def _get_processed_posts(self, username: str) -> Set[str]:
        """Return the media IDs already downloaded for a user.

        IDs are recovered two ways: parsed out of stored filenames
        (heuristic) and read from the stored metadata's 'media_id' field.

        Args:
            username: Account whose download history to load.

        Returns:
            Set of media-id strings; empty when no db or on error.
        """
        processed = set()
        if not self.db:
            return processed
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT filename, metadata FROM downloads
                    WHERE platform = 'snapchat'
                    AND source = ?
                ''', (username,))
                for row in cursor.fetchall():
                    filename, metadata_str = row
                    if filename:
                        # Filenames follow <user>_<YYYYMMDD>_<HHMMSS>_<mediaId>.<ext>;
                        # NOTE(review): this misparses when the username itself
                        # contains underscores -- the metadata path below covers it.
                        parts = filename.split('_')
                        if len(parts) >= 4:
                            media_id = '_'.join(parts[3:]).split('.')[0]
                            processed.add(media_id)
                    if metadata_str:
                        try:
                            metadata = json.loads(metadata_str)
                            if 'media_id' in metadata:
                                processed.add(metadata['media_id'])
                        except (json.JSONDecodeError, TypeError, KeyError):
                            pass  # Invalid metadata, skip
        except Exception as e:
            self.log(f"Error loading processed posts: {e}", "debug")
        return processed
    def download(self, username: str, content_type: str = "all", days_back: int = 14,
                 max_downloads: int = 50, output_dir: str = None,
                 spotlight_dir: str = None, stories_dir: str = None,
                 stitch_highlights: bool = True, defer_database: bool = False,
                 phrase_config: dict = None):
        """
        Download content from a user - compatible with media-downloader interface
        Args:
            username: Snapchat username
            content_type: "spotlight", "stories", "highlights", or "all"
            days_back: How many days back to download (filters by post date)
            max_downloads: Maximum items to download per content type
            output_dir: Default output directory (used if specific dirs not set)
            spotlight_dir: Output directory for spotlights
            stories_dir: Output directory for stories/highlights
            stitch_highlights: Ignored (kept for backwards compatibility)
            defer_database: If True, defer database recording
            phrase_config: Not used (for interface compatibility)
        Returns:
            Number of files downloaded
        """
        self.defer_database = defer_database
        self.downloaded_files.clear()
        # Set output directories
        # If specific dirs provided, use them directly
        # If only output_dir provided, use it directly (caller handles structure)
        # If nothing provided, use default with subdirectories
        if spotlight_dir:
            spotlight_output = Path(spotlight_dir)
        elif output_dir:
            spotlight_output = Path(output_dir)
        else:
            spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat/spotlight/{username}")
        if stories_dir:
            stories_output = Path(stories_dir)
        elif output_dir:
            stories_output = Path(output_dir)
        else:
            stories_output = Path(f"/opt/media-downloader/downloads/snapchat/stories/{username}")
        spotlight_output.mkdir(parents=True, exist_ok=True)
        stories_output.mkdir(parents=True, exist_ok=True)
        # Update activity status
        if self.activity_manager:
            self.activity_manager.update_status("Checking Snapchat")
        # Get processed posts
        processed = self._get_processed_posts(username)
        self.log(f"Loaded {len(processed)} processed posts from database", "debug")
        cutoff_date = datetime.now() - timedelta(days=days_back)
        downloaded_count = 0
        # Crash recovery checkpoint
        from modules.task_checkpoint import TaskCheckpoint
        checkpoint = TaskCheckpoint(f'snapchat:{username}', 'scraping')
        try:
            # Start browser
            self._start_browser()
            # Get profile content
            content = self.get_profile_content(username)
            # Count total items for checkpoint
            total_items = 0
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                total_items += min(len(content['spotlights']), max_downloads)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                total_items += min(len(content['highlights']), max_downloads)
            checkpoint.start(total_items=total_items)
            if checkpoint.is_recovering():
                self.log(f"Snapchat @{username}: recovering — skipping already-processed URLs", "info")
            # Download spotlights
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                spotlight_items = content['spotlights'][:max_downloads]
                self.log(f"Processing {len(spotlight_items)} spotlights...", "info")
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=0,
                        progress_total=len(spotlight_items)
                    )
                for spot_idx, url in enumerate(spotlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading spotlights",
                            progress_current=spot_idx + 1,
                            progress_total=len(spotlight_items)
                        )
                    if checkpoint.is_completed(url):
                        continue
                    checkpoint.set_current(url)
                    try:
                        spotlight = self.get_spotlight_metadata(url)
                        if not spotlight or not spotlight.snaps:
                            continue
                        # A spotlight page carries a single snap of interest
                        snap = spotlight.snaps[0]
                        # Check date filter
                        if snap.timestamp < cutoff_date:
                            self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                            continue
                        # Check if already processed
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                            continue
                        # Download
                        ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                        filename = self._generate_filename(username, snap, ext)
                        output_path = str(spotlight_output / filename)
                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            self.log(f"Downloaded spotlight: (unknown)", "info")
                            self._record_download(
                                username=username,
                                url=url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'description': snap.description,
                                    'view_count': snap.view_count,
                                    'content_type': 'spotlight'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )
                    except Exception as e:
                        self.log(f"Error processing spotlight: {e}", "error")
                    # Marked completed even on error/skip so recovery does not
                    # retry a poisoned URL (date/dup skips above bypass this).
                    checkpoint.mark_completed(url)
            # Download highlights (stories)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                highlight_items = content['highlights'][:max_downloads]
                self.log(f"Processing {len(highlight_items)} highlights...", "info")
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=0,
                        progress_total=len(highlight_items)
                    )
                for hi_idx, url in enumerate(highlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading highlights",
                            progress_current=hi_idx + 1,
                            progress_total=len(highlight_items)
                        )
                    if checkpoint.is_completed(url):
                        continue
                    checkpoint.set_current(url)
                    try:
                        highlight = self.get_highlight_metadata(url)
                        if not highlight or not highlight.snaps:
                            continue
                        # Check if any snap is within date range
                        newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                        if newest_snap.timestamp < cutoff_date:
                            self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                            continue
                        # Check if already processed
                        if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                            self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                            continue
                        # Separate videos and images
                        videos = [s for s in highlight.snaps if s.media_type == 'video']
                        images = [s for s in highlight.snaps if s.media_type == 'image']
                        # Download images individually
                        for snap in images:
                            if snap.timestamp < cutoff_date:
                                continue
                            if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                continue
                            filename = self._generate_filename(username, snap, 'jpg')
                            output_path = str(stories_output / filename)
                            if self._download_media_file(snap, output_path):
                                self.downloaded_files.add(snap.media_id)
                                downloaded_count += 1
                                self.log(f"Downloaded image: (unknown)", "info")
                                self._record_download(
                                    username=username,
                                    url=highlight.url,
                                    filename=filename,
                                    post_date=snap.timestamp,
                                    metadata={
                                        'media_id': snap.media_id,
                                        'highlight_id': highlight.collection_id,
                                        'content_type': 'highlight_image'
                                    },
                                    file_path=output_path,
                                    deferred=defer_database
                                )
                        # Handle videos - download each clip individually
                        if videos:
                            for snap in videos:
                                if snap.timestamp < cutoff_date:
                                    continue
                                if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                    continue
                                filename = self._generate_filename(username, snap, 'mp4')
                                output_path = str(stories_output / filename)
                                if self._download_media_file(snap, output_path):
                                    # NOTE(review): _download_media_file already
                                    # stamps metadata; this second call is
                                    # redundant but harmless.
                                    self._set_metadata(output_path, snap)
                                    self.downloaded_files.add(snap.media_id)
                                    downloaded_count += 1
                                    self.log(f"Downloaded video: (unknown)", "info")
                                    self._record_download(
                                        username=username,
                                        url=highlight.url,
                                        filename=filename,
                                        post_date=snap.timestamp,
                                        metadata={
                                            'media_id': snap.media_id,
                                            'highlight_id': highlight.collection_id,
                                            'content_type': 'highlight_video'
                                        },
                                        file_path=output_path,
                                        deferred=defer_database
                                    )
                    except Exception as e:
                        self.log(f"Error processing highlight: {e}", "error")
                    checkpoint.mark_completed(url)
        except Exception as e:
            self.log(f"Error during download: {e}", "error")
        checkpoint.finish()
        self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
        return downloaded_count
def test_scraper():
    """Smoke-test the scraper end to end against a public profile."""
    import os
    banner = "=" * 60
    print(banner)
    print("SNAPCHAT DIRECT SCRAPER TEST")
    print(banner)
    with SnapchatDirectScraper(headless=True) as scraper:
        # Pull a small recent sample of every content type.
        count = scraper.download(
            username="evalongoria",
            content_type="all",
            days_back=30,
            max_downloads=5,
            spotlight_dir="/tmp/snap_test/spotlight",
            stories_dir="/tmp/snap_test/stories",
            stitch_highlights=True,
        )
        print(f"\nDownloaded {count} files")
        # List whatever landed on disk, with sizes in KiB.
        for root, _dirs, files in os.walk("/tmp/snap_test"):
            for name in files:
                path = os.path.join(root, name)
                size = os.path.getsize(path) / 1024
                print(f" {path}: {size:.1f}KB")
    print(banner)
    print("TEST COMPLETE")
    print(banner)
if __name__ == "__main__":
test_scraper()