#!/usr/bin/env python3
"""
Snapchat Direct Scraper Module - Scrapes directly from Snapchat.com
Uses Playwright to scrape profiles and extract:
- Spotlight videos (540x960)
- Stories/Highlights (480x852, stitched into single videos)
Full metadata extraction including timestamps, media IDs, descriptions.
Follows the same interface as the original snapchat_module.py
"""

import os
import json
import re
import tempfile
import subprocess
import shutil
import platform
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Dict, List, Any, Set
from dataclasses import dataclass, field

# Set environment for Playwright
os.environ.setdefault('PLAYWRIGHT_BROWSERS_PATH', '/root/.cache/ms-playwright')

from modules.base_module import LoggingMixin
from modules.cloudflare_handler import (
    get_playwright_context_options,
    get_playwright_stealth_scripts,
    get_flaresolverr_user_agent
)


@dataclass
class SnapMedia:
    """Represents a single snap media item"""
    media_id: str
    media_type: str  # 'video' or 'image'
    media_url: str
    timestamp: datetime
    index: int = 0
    thumbnail_url: str = ""
    duration_ms: int = 0
    description: str = ""
    view_count: int = 0
    width: int = 0
    height: int = 0
    lat: Optional[float] = None
    lng: Optional[float] = None


@dataclass
class SnapCollection:
    """Represents a spotlight or highlight collection"""
    collection_id: str
    collection_type: str  # 'spotlight' or 'highlight'
    title: str = ""
    username: str = ""
    snaps: List[SnapMedia] = field(default_factory=list)
    url: str = ""


class SnapchatDirectScraper(LoggingMixin):
    """
    Scrapes Snapchat profiles directly for media content.
    Follows the same interface as SnapchatDownloader for compatibility
    with the media-downloader system.
    """

    def __init__(self, headless: bool = True, show_progress: bool = True,
                 use_database: bool = True, log_callback=None, unified_db=None):
        """Initialize scraper compatible with media-downloader system"""
        self.headless = headless
        self.show_progress = show_progress
        self.use_database = use_database
        self.unified_db = unified_db
        self.scraper_id = 'snapchat_direct'
        self.download_count = 0
        self.downloaded_files: Set[str] = set()
        self.pending_downloads = []

        # Initialize logging via mixin
        self._init_logger('SnapchatDirect', log_callback, default_module='Download')

        # User-Agent to match FlareSolverr (dynamically fetched for consistency)
        self.user_agent = get_flaresolverr_user_agent()

        # Browser state (lazily started by _start_browser)
        self._playwright = None
        self.browser = None
        self.context = None

        # Database adapter; disable database use entirely if no unified_db given
        if unified_db and use_database:
            from modules.unified_database import SnapchatDatabaseAdapter
            self.db = SnapchatDatabaseAdapter(unified_db)
        else:
            self.db = None
            self.use_database = False

        # Activity status manager (optional dependency)
        try:
            from modules.activity_status import get_activity_manager
            self.activity_manager = get_activity_manager(unified_db)
        except ImportError:
            self.activity_manager = None

        # Load cookies from database
        self.cookies = self._load_cookies_from_db()

        # Load proxy configuration from database (best-effort)
        self.proxy_url = None
        if unified_db:
            try:
                scraper_config = unified_db.get_scraper('snapchat')
                if scraper_config and scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")
            except Exception as e:
                self.log(f"Could not load proxy config: {e}", "debug")

    def _load_cookies_from_db(self) -> List[Dict]:
        """Load cookies from database.

        Falls back, in order: this scraper's cookies -> the original
        'snapchat' scraper's cookies -> hard-coded defaults.
        """
        if not self.unified_db:
            return self._get_default_cookies()
        try:
            cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
            if cookies:
                self.log(f"Loaded {len(cookies)} cookies from database", "debug")
                return cookies
        except Exception as e:
            self.log(f"Error loading cookies from database: {e}", "warning")
        # Try loading from original snapchat scraper
        try:
            cookies = self.unified_db.get_scraper_cookies('snapchat')
            if cookies:
                self.log(f"Using cookies from 'snapchat' scraper", "debug")
                return cookies
        except Exception as e:
            self.log(f"Error loading cookies from snapchat scraper: {e}", "debug")
        return self._get_default_cookies()

    def _get_default_cookies(self) -> List[Dict]:
        """Get default cookies for Snapchat"""
        return [
            {"name": "sc-cookies-accepted", "value": "true",
             "domain": "www.snapchat.com", "path": "/"},
        ]

    def _save_cookies_to_db(self, cookies: List[Dict], user_agent: str = None):
        """Save cookies to database

        Args:
            cookies: List of cookie dictionaries
            user_agent: User agent to associate with cookies (important for
                cf_clearance). If not provided, uses self.user_agent as fallback.
        """
        if not self.unified_db:
            return
        try:
            # Use provided user_agent or fall back to self.user_agent
            ua = user_agent or self.user_agent
            self.unified_db.save_scraper_cookies(
                self.scraper_id, cookies, user_agent=ua, merge=True
            )
            self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug")
        except Exception as e:
            self.log(f"Error saving cookies to database: {e}", "warning")

    def _parse_proxy_url(self, proxy_url: str) -> Optional[Dict]:
        """
        Parse proxy URL into Playwright proxy config.

        Supports: protocol://user:pass@host:port or protocol://host:port

        Returns:
            Dict with 'server' (and 'username'/'password' when present),
            or None if the URL does not match the expected format.
        """
        # NOTE: uses the module-level `re` import (previous local re-import removed)
        try:
            # Match: protocol://[user:pass@]host:port
            match = re.match(
                r'^(https?|socks[45]?)://(?:([^:]+):([^@]+)@)?([^:]+):(\d+)$',
                proxy_url
            )
            if match:
                protocol, username, password, host, port = match.groups()
                config = {'server': f'{protocol}://{host}:{port}'}
                if username and password:
                    config['username'] = username
                    config['password'] = password
                return config
        except Exception as e:
            self.log(f"Failed to parse proxy URL: {e}", "warning")
        return None

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self._close_browser()
        return False

    def _start_browser(self):
        """Start Playwright browser.

        Idempotent: returns immediately if a browser is already running.
        Applies fingerprinting, stored cookie user-agent, proxy, stealth
        scripts and saved cookies to the new context.
        """
        if self.browser is not None:
            return

        os.environ['DISPLAY'] = ':100'

        from playwright.sync_api import sync_playwright
        self._playwright = sync_playwright().start()
        self.browser = self._playwright.chromium.launch(
            headless=self.headless,
            args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
        )

        # Build context options - use dynamic fingerprinting from FlareSolverr
        context_options = get_playwright_context_options()

        # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent.
        # Cloudflare cf_clearance cookies are fingerprinted to the browser
        # that solved the challenge.
        try:
            if self.unified_db:
                stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
                if stored_user_agent:
                    self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug", module="Browser")
                    context_options['user_agent'] = stored_user_agent
                else:
                    self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
            else:
                self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
        except Exception as e:
            self.log(f"Error getting stored user_agent, using default: {e}", "debug", module="Browser")

        # Add proxy if configured
        if self.proxy_url:
            proxy_config = self._parse_proxy_url(self.proxy_url)
            if proxy_config:
                context_options['proxy'] = proxy_config
                self.log(f"Browser using proxy: {proxy_config.get('server')}", "info", module="Browser")

        self.context = self.browser.new_context(**context_options)

        # Add anti-detection scripts to all pages in this context
        self.context.add_init_script(get_playwright_stealth_scripts())

        # Add cookies
        if self.cookies:
            # Clean cookies for Playwright and convert expiry->expires
            cleaned = []
            for c in self.cookies:
                clean = {k: v for k, v in c.items()
                         if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
                # FlareSolverr uses 'expiry' but Playwright uses 'expires'
                if 'expiry' in clean and 'expires' not in clean:
                    clean['expires'] = clean.pop('expiry')
                cleaned.append(clean)
            # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
            try:
                self.context.clear_cookies()
            except Exception:
                pass
            self.context.add_cookies(cleaned)

        self.log("Browser started", "info", module="Browser")

    def _close_browser(self):
        """Close browser and cleanup (context, browser, then playwright driver)."""
        if self.context:
            try:
                self.context.close()
            except Exception as e:
                self.log(f"Error closing browser context: {e}", "debug")
            self.context = None
        if self.browser:
            try:
                self.browser.close()
            except Exception as e:
                self.log(f"Error closing browser: {e}", "debug")
            self.browser = None
        if self._playwright:
            try:
                self._playwright.stop()
            except Exception as e:
                self.log(f"Error stopping playwright: {e}", "debug")
            self._playwright = None

    def _get_next_data(self, page) -> Optional[Dict]:
        """Extract __NEXT_DATA__ JSON from page (None if absent or unparsable)."""
        try:
            next_data_elem = page.locator('script#__NEXT_DATA__').first
            if next_data_elem.count() > 0:
                return json.loads(next_data_elem.inner_text())
        except Exception as e:
            self.log(f"Error extracting __NEXT_DATA__: {e}", "debug")
        return None

    def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
        """Set EXIF metadata and file timestamp.

        Writes date, description and (for images) GPS tags via exiftool,
        then syncs the filesystem mtime to the snap timestamp. Best-effort:
        failures are logged at debug level only.
        """
        try:
            date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
            desc = description or snap.description or ""
            if snap.view_count:
                desc += f" [Views: {snap.view_count}]"
            desc = desc.strip()

            ext = os.path.splitext(file_path)[1].lower()
            is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
            is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']

            exif_args = [
                'exiftool', '-overwrite_original', '-ignoreMinorErrors',
                f'-FileModifyDate={date_str}',
            ]
            if is_image:
                exif_args.extend([
                    f'-DateTimeOriginal={date_str}',
                    f'-CreateDate={date_str}',
                    f'-ModifyDate={date_str}',
                    f'-MetadataDate={date_str}',
                ])
                if desc:
                    exif_args.extend([
                        f'-ImageDescription={desc}',
                        f'-XPComment={desc}',
                        f'-UserComment={desc}',
                    ])
                if snap.lat and snap.lng:
                    lat_ref = 'N' if snap.lat >= 0 else 'S'
                    lng_ref = 'E' if snap.lng >= 0 else 'W'
                    exif_args.extend([
                        f'-GPSLatitude={abs(snap.lat)}',
                        f'-GPSLatitudeRef={lat_ref}',
                        f'-GPSLongitude={abs(snap.lng)}',
                        f'-GPSLongitudeRef={lng_ref}',
                    ])
            elif is_video:
                exif_args.extend([
                    f'-CreateDate={date_str}',
                    f'-ModifyDate={date_str}',
                    f'-MediaCreateDate={date_str}',
                    f'-MediaModifyDate={date_str}',
                    f'-TrackCreateDate={date_str}',
                    f'-TrackModifyDate={date_str}',
                ])
                if desc:
                    exif_args.extend([
                        f'-Description={desc}',
                        f'-Comment={desc}',
                    ])
            exif_args.append(file_path)

            subprocess.run(exif_args, capture_output=True, timeout=30)

            # Set filesystem modification time
            ts = snap.timestamp.timestamp()
            os.utime(file_path, (ts, ts))
        except Exception as e:
            self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")

    def get_profile_content(self, username: str) -> Dict[str, List[str]]:
        """Get all spotlight and highlight URLs from a profile"""
        import time
        if not self.browser:
            self._start_browser()
        page = self.context.new_page()
        result = {'spotlights': [], 'highlights': []}
        try:
            url = f"https://www.snapchat.com/@{username}"
            self.log(f"Navigating to profile @{username}", "info")
            page.goto(url, wait_until='networkidle', timeout=30000)
            time.sleep(2)

            content = page.content()

            # Extract spotlight URLs
            spotlight_pattern = rf'/@{username}/spotlight/([A-Za-z0-9_-]+)'
            spotlight_ids = list(set(re.findall(spotlight_pattern, content)))
            result['spotlights'] = [
                f"https://www.snapchat.com/@{username}/spotlight/{sid}"
                for sid in spotlight_ids
            ]
            self.log(f"Found {len(result['spotlights'])} spotlights", "info")

            # Click Stories tab to get highlights
            stories_tab = page.locator('[role="tab"]:has-text("Stories")').first
            if stories_tab.count() > 0:
                stories_tab.click()
                time.sleep(2)
                content = page.content()
                highlight_pattern = rf'/@{username}/highlight/([A-Za-z0-9-]+)'
                highlight_ids = list(set(re.findall(highlight_pattern, content)))
                result['highlights'] = [
                    f"https://www.snapchat.com/@{username}/highlight/{hid}"
                    for hid in highlight_ids
                ]
            self.log(f"Found {len(result['highlights'])} highlights", "info")
        except Exception as e:
            self.log(f"Error getting profile content: {e}", "error")
        finally:
            page.close()
        return result

    def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
        """Extract full metadata from a spotlight URL"""
        import time
        if not self.browser:
            self._start_browser()
        page = self.context.new_page()
        try:
            page.goto(url, wait_until='domcontentloaded', timeout=60000)
            time.sleep(2)
            data = self._get_next_data(page)
            if not data:
                return None
            props = (data.get('props') or {}).get('pageProps') or {}
            feed = props.get('spotlightFeed') or {}
            stories = feed.get('spotlightStories') or []
            if not stories:
                return None
            story_data = stories[0]
            story = story_data.get('story') or {}
            metadata = (story_data.get('metadata') or {}).get('videoMetadata') or {}
            story_id = (story.get('storyId') or {}).get('value', '')
            creator = (metadata.get('creator') or {}).get('personCreator') or {}
            username = creator.get('username', '')
            collection = SnapCollection(
                collection_id=story_id,
                collection_type='spotlight',
                title=metadata.get('description', ''),
                username=username,
                url=url
            )
            for snap_data in story.get('snapList') or []:
                snap_id = (snap_data.get('snapId') or {}).get('value', '')
                snap_urls = snap_data.get('snapUrls') or {}
                media_url = snap_urls.get('mediaUrl', '')
                # Media ID is the path segment after '/d/' (without extension)
                media_id = ''
                if '/d/' in media_url:
                    media_id = media_url.split('/d/')[1].split('.')[0]
                ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
                # NOTE(review): default '0' is truthy, so a missing timestamp
                # yields epoch (1970) rather than now() — confirm intended.
                timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()
                snap = SnapMedia(
                    media_id=media_id or snap_id,
                    media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                    media_url=media_url,
                    timestamp=timestamp,
                    index=snap_data.get('snapIndex', 0),
                    thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                    duration_ms=int(metadata.get('durationMs', 0)),
                    description=metadata.get('description', ''),
                    view_count=int(metadata.get('viewCount', 0)),
                    width=int(metadata.get('width', 540)),
                    height=int(metadata.get('height', 960))
                )
                collection.snaps.append(snap)
            return collection
        except Exception as e:
            self.log(f"Error getting spotlight metadata: {e}", "error")
            return None
        finally:
            page.close()

    def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
        """Extract full metadata from a highlight URL"""
        import time
        if not self.browser:
            self._start_browser()
        page = self.context.new_page()
        try:
            page.goto(url, wait_until='domcontentloaded', timeout=60000)
            time.sleep(2)
            data = self._get_next_data(page)
            if not data:
                return None
            props = (data.get('props') or {}).get('pageProps') or {}
            highlight = props.get('highlight') or {}
            if not highlight:
                return None
            # highlightId / storyTitle may be plain values or {'value': ...} wrappers
            highlight_id = highlight.get('highlightId') or {}
            if isinstance(highlight_id, dict):
                highlight_id = highlight_id.get('value', '')
            username_match = re.search(r'@([^/]+)', url)
            username = username_match.group(1) if username_match else ''
            title = highlight.get('storyTitle') or {}
            if isinstance(title, dict):
                title = title.get('value', '')
            collection = SnapCollection(
                collection_id=highlight_id,
                collection_type='highlight',
                title=title or 'Untitled Highlight',
                username=username,
                url=url
            )
            for snap_data in highlight.get('snapList') or []:
                snap_urls = snap_data.get('snapUrls') or {}
                media_url = snap_urls.get('mediaUrl', '')
                media_id = ''
                if '/d/' in media_url:
                    media_id = media_url.split('/d/')[1].split('.')[0]
                ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
                timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()
                lat = snap_data.get('lat')
                lng = snap_data.get('lng')
                snap = SnapMedia(
                    media_id=media_id,
                    media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                    media_url=media_url,
                    timestamp=timestamp,
                    index=snap_data.get('snapIndex', 0),
                    thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                    lat=float(lat) if lat else None,
                    lng=float(lng) if lng else None
                )
                collection.snaps.append(snap)
            return collection
        except Exception as e:
            self.log(f"Error getting highlight metadata: {e}", "error")
            return None
        finally:
            page.close()

    def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
        """Download a single media file via curl and apply metadata.

        Returns True only when curl succeeded and a non-empty file exists.
        """
        try:
            # FIX: decode HTML-escaped ampersands in the media URL
            # (was a no-op `.replace('&', '&')`)
            url = snap.media_url.replace('&amp;', '&')
            result = subprocess.run([
                'curl', '-sL', '-o', output_path,
                '-H', 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                url
            ], capture_output=True, timeout=60)
            # FIX: also require curl to have exited cleanly, not just a non-empty file
            if result.returncode == 0 and os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                self._set_metadata(output_path, snap)
                return True
            return False
        except Exception as e:
            self.log(f"Error downloading media: {e}", "error")
            return False

    def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
        """Generate filename with timestamp and media ID (FastDL format)"""
        date_str = snap.timestamp.strftime('%Y%m%d_%H%M%S')
        return f"{username}_{date_str}_{snap.media_id}.{ext}"

    def _record_download(self, username: str, url: str, filename: str,
                         post_date=None, metadata: dict = None,
                         file_path: str = None, deferred: bool = False):
        """Record a download in the database.

        When deferred, the record is queued in self.pending_downloads
        instead of being written immediately.
        """
        if deferred:
            self.pending_downloads.append({
                'username': username,
                'url': url,
                'filename': filename,
                'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
                'file_path': file_path,
                'metadata': metadata
            })
            return True
        if not self.db:
            return
        try:
            self.db.mark_downloaded(
                username=username,
                url=url,
                filename=filename,
                post_date=post_date,
                metadata=metadata,
                file_path=file_path
            )
        except Exception as e:
            self.log(f"Failed to record download: {e}", "debug")

    def get_pending_downloads(self):
        """Get list of downloads that were deferred"""
        return self.pending_downloads.copy()

    def clear_pending_downloads(self):
        """Clear the pending downloads list"""
        self.pending_downloads = []

    def _get_processed_posts(self, username: str) -> Set[str]:
        """Get set of media IDs that have been processed.

        IDs are recovered both from the filename convention
        (username_date_time_mediaid.ext) and from stored metadata JSON.
        """
        processed = set()
        if not self.db:
            return processed
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT filename, metadata FROM downloads
                    WHERE platform = 'snapchat' AND source = ?
                ''', (username,))
                for row in cursor.fetchall():
                    filename, metadata_str = row
                    if filename:
                        parts = filename.split('_')
                        if len(parts) >= 4:
                            media_id = '_'.join(parts[3:]).split('.')[0]
                            processed.add(media_id)
                    if metadata_str:
                        try:
                            metadata = json.loads(metadata_str)
                            if 'media_id' in metadata:
                                processed.add(metadata['media_id'])
                        except (json.JSONDecodeError, TypeError, KeyError):
                            pass  # Invalid metadata, skip
        except Exception as e:
            self.log(f"Error loading processed posts: {e}", "debug")
        return processed

    def download(self, username: str, content_type: str = "all",
                 days_back: int = 14, max_downloads: int = 50,
                 output_dir: str = None, spotlight_dir: str = None,
                 stories_dir: str = None, stitch_highlights: bool = True,
                 defer_database: bool = False, phrase_config: dict = None):
        """
        Download content from a user - compatible with media-downloader interface

        Args:
            username: Snapchat username
            content_type: "spotlight", "stories", "highlights", or "all"
            days_back: How many days back to download (filters by post date)
            max_downloads: Maximum items to download per content type
            output_dir: Default output directory (used if specific dirs not set)
            spotlight_dir: Output directory for spotlights
            stories_dir: Output directory for stories/highlights
            stitch_highlights: Ignored (kept for backwards compatibility)
            defer_database: If True, defer database recording
            phrase_config: Not used (for interface compatibility)

        Returns:
            Number of files downloaded
        """
        self.defer_database = defer_database
        self.downloaded_files.clear()

        # Set output directories
        # If specific dirs provided, use them directly
        # If only output_dir provided, use it directly (caller handles structure)
        # If nothing provided, use default with subdirectories
        if spotlight_dir:
            spotlight_output = Path(spotlight_dir)
        elif output_dir:
            spotlight_output = Path(output_dir)
        else:
            spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat/spotlight/{username}")
        if stories_dir:
            stories_output = Path(stories_dir)
        elif output_dir:
            stories_output = Path(output_dir)
        else:
            stories_output = Path(f"/opt/media-downloader/downloads/snapchat/stories/{username}")
        spotlight_output.mkdir(parents=True, exist_ok=True)
        stories_output.mkdir(parents=True, exist_ok=True)

        # Update activity status
        if self.activity_manager:
            self.activity_manager.update_status("Checking Snapchat")

        # Get processed posts
        processed = self._get_processed_posts(username)
        self.log(f"Loaded {len(processed)} processed posts from database", "debug")

        cutoff_date = datetime.now() - timedelta(days=days_back)
        downloaded_count = 0

        # Crash recovery checkpoint
        from modules.task_checkpoint import TaskCheckpoint
        checkpoint = TaskCheckpoint(f'snapchat:{username}', 'scraping')

        try:
            # Start browser
            self._start_browser()

            # Get profile content
            content = self.get_profile_content(username)

            # Count total items for checkpoint
            total_items = 0
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                total_items += min(len(content['spotlights']), max_downloads)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                total_items += min(len(content['highlights']), max_downloads)
            checkpoint.start(total_items=total_items)
            if checkpoint.is_recovering():
                self.log(f"Snapchat @{username}: recovering — skipping already-processed URLs", "info")

            # Download spotlights
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                spotlight_items = content['spotlights'][:max_downloads]
                self.log(f"Processing {len(spotlight_items)} spotlights...", "info")
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=0,
                        progress_total=len(spotlight_items)
                    )
                for spot_idx, url in enumerate(spotlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading spotlights",
                            progress_current=spot_idx + 1,
                            progress_total=len(spotlight_items)
                        )
                    if checkpoint.is_completed(url):
                        continue
                    checkpoint.set_current(url)
                    try:
                        spotlight = self.get_spotlight_metadata(url)
                        if not spotlight or not spotlight.snaps:
                            continue
                        snap = spotlight.snaps[0]
                        # Check date filter
                        if snap.timestamp < cutoff_date:
                            self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                            continue
                        # Check if already processed
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                            continue
                        # Download
                        ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                        filename = self._generate_filename(username, snap, ext)
                        output_path = str(spotlight_output / filename)
                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            # NOTE(review): message contains no filename placeholder —
                            # looks like a lost f-string field; confirm intended text
                            self.log(f"Downloaded spotlight: (unknown)", "info")
                            self._record_download(
                                username=username,
                                url=url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'description': snap.description,
                                    'view_count': snap.view_count,
                                    'content_type': 'spotlight'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )
                    except Exception as e:
                        self.log(f"Error processing spotlight: {e}", "error")
                    checkpoint.mark_completed(url)

            # Download highlights (stories)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                highlight_items = content['highlights'][:max_downloads]
                self.log(f"Processing {len(highlight_items)} highlights...", "info")
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=0,
                        progress_total=len(highlight_items)
                    )
                for hi_idx, url in enumerate(highlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading highlights",
                            progress_current=hi_idx + 1,
                            progress_total=len(highlight_items)
                        )
                    if checkpoint.is_completed(url):
                        continue
                    checkpoint.set_current(url)
                    try:
                        highlight = self.get_highlight_metadata(url)
                        if not highlight or not highlight.snaps:
                            continue
                        # Check if any snap is within date range
                        newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                        if newest_snap.timestamp < cutoff_date:
                            self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                            continue
                        # Check if already processed
                        if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                            self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                            continue
                        # Separate videos and images
                        videos = [s for s in highlight.snaps if s.media_type == 'video']
                        images = [s for s in highlight.snaps if s.media_type == 'image']
                        # Download images individually
                        for snap in images:
                            if snap.timestamp < cutoff_date:
                                continue
                            if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                continue
                            filename = self._generate_filename(username, snap, 'jpg')
                            output_path = str(stories_output / filename)
                            if self._download_media_file(snap, output_path):
                                self.downloaded_files.add(snap.media_id)
                                downloaded_count += 1
                                self.log(f"Downloaded image: (unknown)", "info")
                                self._record_download(
                                    username=username,
                                    url=highlight.url,
                                    filename=filename,
                                    post_date=snap.timestamp,
                                    metadata={
                                        'media_id': snap.media_id,
                                        'highlight_id': highlight.collection_id,
                                        'content_type': 'highlight_image'
                                    },
                                    file_path=output_path,
                                    deferred=defer_database
                                )
                        # Handle videos - download each clip individually
                        if videos:
                            for snap in videos:
                                if snap.timestamp < cutoff_date:
                                    continue
                                if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                    continue
                                filename = self._generate_filename(username, snap, 'mp4')
                                output_path = str(stories_output / filename)
                                if self._download_media_file(snap, output_path):
                                    # FIX: removed redundant second _set_metadata call —
                                    # _download_media_file already applies it on success
                                    self.downloaded_files.add(snap.media_id)
                                    downloaded_count += 1
                                    self.log(f"Downloaded video: (unknown)", "info")
                                    self._record_download(
                                        username=username,
                                        url=highlight.url,
                                        filename=filename,
                                        post_date=snap.timestamp,
                                        metadata={
                                            'media_id': snap.media_id,
                                            'highlight_id': highlight.collection_id,
                                            'content_type': 'highlight_video'
                                        },
                                        file_path=output_path,
                                        deferred=defer_database
                                    )
                    except Exception as e:
                        self.log(f"Error processing highlight: {e}", "error")
                    checkpoint.mark_completed(url)
        except Exception as e:
            self.log(f"Error during download: {e}", "error")

        checkpoint.finish()
        self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
        return downloaded_count


def test_scraper():
    """Test the scraper"""
    print("=" * 60)
    print("SNAPCHAT DIRECT SCRAPER TEST")
    print("=" * 60)
    with SnapchatDirectScraper(headless=True) as scraper:
        username = "evalongoria"
        # Test download
        count = scraper.download(
            username=username,
            content_type="all",
            days_back=30,
            max_downloads=5,
            spotlight_dir="/tmp/snap_test/spotlight",
            stories_dir="/tmp/snap_test/stories",
            stitch_highlights=True
        )
        print(f"\nDownloaded {count} files")
        # Show files (uses module-level os import; redundant local import removed)
        for root, dirs, files in os.walk("/tmp/snap_test"):
            for f in files:
                path = os.path.join(root, f)
                size = os.path.getsize(path) / 1024
                print(f"  {path}: {size:.1f}KB")
    print("=" * 60)
    print("TEST COMPLETE")
    print("=" * 60)


if __name__ == "__main__":
    test_scraper()