985
modules/snapchat_scraper.py
Normal file
985
modules/snapchat_scraper.py
Normal file
@@ -0,0 +1,985 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Snapchat Direct Scraper Module - Scrapes directly from Snapchat.com
|
||||
|
||||
Uses Playwright to scrape profiles and extract:
|
||||
- Spotlight videos (540x960)
|
||||
- Stories/Highlights (480x852, stitched into single videos)
|
||||
|
||||
Full metadata extraction including timestamps, media IDs, descriptions.
|
||||
Follows the same interface as the original snapchat_module.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
import tempfile
|
||||
import subprocess
|
||||
import shutil
|
||||
import platform
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List, Any, Set
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
# Set environment for Playwright
|
||||
os.environ.setdefault('PLAYWRIGHT_BROWSERS_PATH', '/root/.cache/ms-playwright')
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from modules.cloudflare_handler import (
|
||||
get_playwright_context_options,
|
||||
get_playwright_stealth_scripts,
|
||||
get_flaresolverr_user_agent
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class SnapMedia:
    """Represents a single snap media item scraped from Snapchat.com."""
    media_id: str                  # ID parsed from the CDN URL (falls back to snap ID)
    media_type: str                # 'video' or 'image'
    media_url: str                 # direct CDN download URL
    timestamp: datetime            # post time (parsed from timestampInSec)
    index: int = 0                 # position within its collection (snapIndex)
    thumbnail_url: str = ""        # preview image URL, may be empty
    duration_ms: int = 0           # video duration; 0 when unknown or for images
    description: str = ""          # creator-supplied description/caption
    view_count: int = 0            # 0 when the source metadata has no count
    width: int = 0                 # pixel width; 0 when unknown
    height: int = 0                # pixel height; 0 when unknown
    lat: Optional[float] = None    # GPS latitude when the snap carries one
    lng: Optional[float] = None    # GPS longitude when the snap carries one
|
||||
|
||||
|
||||
@dataclass
class SnapCollection:
    """Represents a spotlight or highlight collection of snaps."""
    collection_id: str             # story/highlight ID from __NEXT_DATA__
    collection_type: str           # 'spotlight' or 'highlight'
    title: str = ""                # collection title or description
    username: str = ""             # owning profile's username
    snaps: List[SnapMedia] = field(default_factory=list)  # member snaps, in snapList order
    url: str = ""                  # page URL the collection was scraped from
|
||||
|
||||
|
||||
class SnapchatDirectScraper(LoggingMixin):
    """
    Scrapes Snapchat profiles directly for media content.

    Follows the same interface as SnapchatDownloader for compatibility
    with the media-downloader system.
    """

    def __init__(self,
                 headless: bool = True,
                 show_progress: bool = True,
                 use_database: bool = True,
                 log_callback=None,
                 unified_db=None):
        """Initialize scraper compatible with media-downloader system.

        Args:
            headless: Run the Playwright browser without a visible window.
            show_progress: Stored for interface compatibility.
            use_database: Record downloads in the unified database; forced
                off when no ``unified_db`` handle is supplied.
            log_callback: Optional callable forwarded to the logging mixin.
            unified_db: Unified database handle used for cookies, proxy
                config and download bookkeeping.
        """
        self.headless = headless
        self.show_progress = show_progress
        self.use_database = use_database
        self.unified_db = unified_db
        self.scraper_id = 'snapchat_direct'
        self.download_count = 0
        self.downloaded_files: Set[str] = set()  # media IDs downloaded this run
        self.pending_downloads = []  # deferred DB records (see _record_download)

        # Initialize logging via mixin
        self._init_logger('SnapchatDirect', log_callback, default_module='Download')

        # User-Agent to match FlareSolverr (dynamically fetched for consistency)
        self.user_agent = get_flaresolverr_user_agent()

        # Browser state — created lazily by _start_browser()
        self._playwright = None
        self.browser = None
        self.context = None

        # Database adapter
        if unified_db and use_database:
            from modules.unified_database import SnapchatDatabaseAdapter
            self.db = SnapchatDatabaseAdapter(unified_db)
        else:
            self.db = None
            # No DB handle means bookkeeping cannot work, regardless of flag
            self.use_database = False

        # Activity status manager (optional dependency; absence is tolerated)
        try:
            from modules.activity_status import get_activity_manager
            self.activity_manager = get_activity_manager(unified_db)
        except ImportError:
            self.activity_manager = None

        # Load cookies from database (falls back to defaults)
        self.cookies = self._load_cookies_from_db()

        # Load proxy configuration from database (best-effort)
        self.proxy_url = None
        if unified_db:
            try:
                scraper_config = unified_db.get_scraper('snapchat')
                if scraper_config and scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")
            except Exception as e:
                self.log(f"Could not load proxy config: {e}", "debug")
|
||||
|
||||
def _load_cookies_from_db(self) -> List[Dict]:
|
||||
"""Load cookies from database"""
|
||||
if not self.unified_db:
|
||||
return self._get_default_cookies()
|
||||
|
||||
try:
|
||||
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
||||
if cookies:
|
||||
self.log(f"Loaded {len(cookies)} cookies from database", "debug")
|
||||
return cookies
|
||||
except Exception as e:
|
||||
self.log(f"Error loading cookies from database: {e}", "warning")
|
||||
|
||||
# Try loading from original snapchat scraper
|
||||
try:
|
||||
cookies = self.unified_db.get_scraper_cookies('snapchat')
|
||||
if cookies:
|
||||
self.log(f"Using cookies from 'snapchat' scraper", "debug")
|
||||
return cookies
|
||||
except Exception as e:
|
||||
self.log(f"Error loading cookies from snapchat scraper: {e}", "debug")
|
||||
|
||||
return self._get_default_cookies()
|
||||
|
||||
def _get_default_cookies(self) -> List[Dict]:
|
||||
"""Get default cookies for Snapchat"""
|
||||
return [
|
||||
{"name": "sc-cookies-accepted", "value": "true", "domain": "www.snapchat.com", "path": "/"},
|
||||
]
|
||||
|
||||
def _save_cookies_to_db(self, cookies: List[Dict], user_agent: str = None):
|
||||
"""Save cookies to database
|
||||
|
||||
Args:
|
||||
cookies: List of cookie dictionaries
|
||||
user_agent: User agent to associate with cookies (important for cf_clearance).
|
||||
If not provided, uses self.user_agent as fallback.
|
||||
"""
|
||||
if not self.unified_db:
|
||||
return
|
||||
|
||||
try:
|
||||
# Use provided user_agent or fall back to self.user_agent
|
||||
ua = user_agent or self.user_agent
|
||||
self.unified_db.save_scraper_cookies(
|
||||
self.scraper_id,
|
||||
cookies,
|
||||
user_agent=ua,
|
||||
merge=True
|
||||
)
|
||||
self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug")
|
||||
except Exception as e:
|
||||
self.log(f"Error saving cookies to database: {e}", "warning")
|
||||
|
||||
def _parse_proxy_url(self, proxy_url: str) -> Optional[Dict]:
|
||||
"""
|
||||
Parse proxy URL into Playwright proxy config.
|
||||
Supports: protocol://user:pass@host:port or protocol://host:port
|
||||
"""
|
||||
import re
|
||||
try:
|
||||
# Match: protocol://[user:pass@]host:port
|
||||
match = re.match(
|
||||
r'^(https?|socks[45]?)://(?:([^:]+):([^@]+)@)?([^:]+):(\d+)$',
|
||||
proxy_url
|
||||
)
|
||||
if match:
|
||||
protocol, username, password, host, port = match.groups()
|
||||
config = {'server': f'{protocol}://{host}:{port}'}
|
||||
if username and password:
|
||||
config['username'] = username
|
||||
config['password'] = password
|
||||
return config
|
||||
except Exception as e:
|
||||
self.log(f"Failed to parse proxy URL: {e}", "warning")
|
||||
return None
|
||||
|
||||
def __enter__(self):
|
||||
"""Context manager entry"""
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Context manager exit"""
|
||||
self._close_browser()
|
||||
return False
|
||||
|
||||
    def _start_browser(self):
        """Start Playwright, launch Chromium and build a context with cookies.

        Idempotent: returns immediately when a browser is already running.
        The context's user agent, proxy and cookies are wired up BEFORE any
        page is opened, because Cloudflare's cf_clearance cookie is only
        honored for the exact browser fingerprint that solved the challenge.
        """
        if self.browser is not None:
            return

        # NOTE(review): DISPLAY is set unconditionally, even in headless
        # mode — presumably for an Xvfb server at :100; confirm it is needed.
        os.environ['DISPLAY'] = ':100'

        # Local import keeps Playwright optional until a browser is needed.
        from playwright.sync_api import sync_playwright
        self._playwright = sync_playwright().start()
        self.browser = self._playwright.chromium.launch(
            headless=self.headless,
            args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
        )

        # Build context options - use dynamic fingerprinting from FlareSolverr
        context_options = get_playwright_context_options()

        # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent.
        # Cloudflare cf_clearance cookies are fingerprinted to the browser
        # that solved the challenge.
        try:
            if self.unified_db:
                stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
                if stored_user_agent:
                    self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug", module="Browser")
                    context_options['user_agent'] = stored_user_agent
                else:
                    self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
            else:
                self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
        except Exception as e:
            self.log(f"Error getting stored user_agent, using default: {e}", "debug", module="Browser")

        # Add proxy if configured
        if self.proxy_url:
            proxy_config = self._parse_proxy_url(self.proxy_url)
            if proxy_config:
                context_options['proxy'] = proxy_config
                self.log(f"Browser using proxy: {proxy_config.get('server')}", "info", module="Browser")

        self.context = self.browser.new_context(**context_options)

        # Add anti-detection scripts to all pages in this context
        self.context.add_init_script(get_playwright_stealth_scripts())

        # Add cookies
        if self.cookies:
            # Clean cookies for Playwright and convert expiry->expires
            cleaned = []
            for c in self.cookies:
                # Strip FlareSolverr-specific keys Playwright rejects.
                clean = {k: v for k, v in c.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
                # FlareSolverr uses 'expiry' but Playwright uses 'expires'
                if 'expiry' in clean and 'expires' not in clean:
                    clean['expires'] = clean.pop('expiry')
                cleaned.append(clean)

            # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
            try:
                self.context.clear_cookies()
            except Exception:
                pass

            self.context.add_cookies(cleaned)

        self.log("Browser started", "info", module="Browser")
|
||||
|
||||
def _close_browser(self):
|
||||
"""Close browser and cleanup"""
|
||||
if self.context:
|
||||
try:
|
||||
self.context.close()
|
||||
except Exception as e:
|
||||
self.log(f"Error closing browser context: {e}", "debug")
|
||||
self.context = None
|
||||
|
||||
if self.browser:
|
||||
try:
|
||||
self.browser.close()
|
||||
except Exception as e:
|
||||
self.log(f"Error closing browser: {e}", "debug")
|
||||
self.browser = None
|
||||
|
||||
if self._playwright:
|
||||
try:
|
||||
self._playwright.stop()
|
||||
except Exception as e:
|
||||
self.log(f"Error stopping playwright: {e}", "debug")
|
||||
self._playwright = None
|
||||
|
||||
def _get_next_data(self, page) -> Optional[Dict]:
|
||||
"""Extract __NEXT_DATA__ JSON from page"""
|
||||
try:
|
||||
next_data_elem = page.locator('script#__NEXT_DATA__').first
|
||||
if next_data_elem.count() > 0:
|
||||
return json.loads(next_data_elem.inner_text())
|
||||
except Exception as e:
|
||||
self.log(f"Error extracting __NEXT_DATA__: {e}", "debug")
|
||||
return None
|
||||
|
||||
def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
|
||||
"""Set EXIF metadata and file timestamp"""
|
||||
try:
|
||||
date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
|
||||
desc = description or snap.description or ""
|
||||
if snap.view_count:
|
||||
desc += f" [Views: {snap.view_count}]"
|
||||
desc = desc.strip()
|
||||
|
||||
ext = os.path.splitext(file_path)[1].lower()
|
||||
is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
|
||||
is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']
|
||||
|
||||
exif_args = [
|
||||
'exiftool', '-overwrite_original', '-ignoreMinorErrors',
|
||||
f'-FileModifyDate={date_str}',
|
||||
]
|
||||
|
||||
if is_image:
|
||||
exif_args.extend([
|
||||
f'-DateTimeOriginal={date_str}',
|
||||
f'-CreateDate={date_str}',
|
||||
f'-ModifyDate={date_str}',
|
||||
f'-MetadataDate={date_str}',
|
||||
])
|
||||
if desc:
|
||||
exif_args.extend([
|
||||
f'-ImageDescription={desc}',
|
||||
f'-XPComment={desc}',
|
||||
f'-UserComment={desc}',
|
||||
])
|
||||
if snap.lat and snap.lng:
|
||||
lat_ref = 'N' if snap.lat >= 0 else 'S'
|
||||
lng_ref = 'E' if snap.lng >= 0 else 'W'
|
||||
exif_args.extend([
|
||||
f'-GPSLatitude={abs(snap.lat)}',
|
||||
f'-GPSLatitudeRef={lat_ref}',
|
||||
f'-GPSLongitude={abs(snap.lng)}',
|
||||
f'-GPSLongitudeRef={lng_ref}',
|
||||
])
|
||||
|
||||
elif is_video:
|
||||
exif_args.extend([
|
||||
f'-CreateDate={date_str}',
|
||||
f'-ModifyDate={date_str}',
|
||||
f'-MediaCreateDate={date_str}',
|
||||
f'-MediaModifyDate={date_str}',
|
||||
f'-TrackCreateDate={date_str}',
|
||||
f'-TrackModifyDate={date_str}',
|
||||
])
|
||||
if desc:
|
||||
exif_args.extend([
|
||||
f'-Description={desc}',
|
||||
f'-Comment={desc}',
|
||||
])
|
||||
|
||||
exif_args.append(file_path)
|
||||
subprocess.run(exif_args, capture_output=True, timeout=30)
|
||||
|
||||
# Set filesystem modification time
|
||||
ts = snap.timestamp.timestamp()
|
||||
os.utime(file_path, (ts, ts))
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")
|
||||
|
||||
    def get_profile_content(self, username: str) -> Dict[str, List[str]]:
        """Get all spotlight and highlight URLs from a profile page.

        Navigates to https://www.snapchat.com/@<username>, scrapes spotlight
        links from the initial HTML, then clicks the "Stories" tab to reveal
        highlight links.

        Args:
            username: Snapchat username (without the leading '@').

        Returns:
            Dict with 'spotlights' and 'highlights' keys, each a list of
            absolute URLs (empty lists on any failure).
        """
        import time

        if not self.browser:
            self._start_browser()

        page = self.context.new_page()
        result = {'spotlights': [], 'highlights': []}

        try:
            url = f"https://www.snapchat.com/@{username}"
            self.log(f"Navigating to profile @{username}", "info")
            page.goto(url, wait_until='networkidle', timeout=30000)
            # Small grace period for late client-side rendering
            time.sleep(2)

            content = page.content()

            # Extract spotlight URLs (deduplicated via set)
            spotlight_pattern = rf'/@{username}/spotlight/([A-Za-z0-9_-]+)'
            spotlight_ids = list(set(re.findall(spotlight_pattern, content)))
            result['spotlights'] = [
                f"https://www.snapchat.com/@{username}/spotlight/{sid}"
                for sid in spotlight_ids
            ]
            self.log(f"Found {len(result['spotlights'])} spotlights", "info")

            # Click Stories tab to get highlights (tab may be absent)
            stories_tab = page.locator('[role="tab"]:has-text("Stories")').first
            if stories_tab.count() > 0:
                stories_tab.click()
                time.sleep(2)

                content = page.content()
                highlight_pattern = rf'/@{username}/highlight/([A-Za-z0-9-]+)'
                highlight_ids = list(set(re.findall(highlight_pattern, content)))
                result['highlights'] = [
                    f"https://www.snapchat.com/@{username}/highlight/{hid}"
                    for hid in highlight_ids
                ]
                self.log(f"Found {len(result['highlights'])} highlights", "info")

        except Exception as e:
            self.log(f"Error getting profile content: {e}", "error")
        finally:
            page.close()

        return result
|
||||
|
||||
    def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
        """Extract full metadata from a spotlight URL.

        Parses the page's __NEXT_DATA__ blob and builds a SnapCollection
        from the first spotlight story found.

        Args:
            url: Absolute spotlight URL.

        Returns:
            SnapCollection with one SnapMedia per snapList entry, or None
            when the page has no parseable spotlight data.
        """
        import time

        if not self.browser:
            self._start_browser()

        page = self.context.new_page()

        try:
            page.goto(url, wait_until='domcontentloaded', timeout=60000)
            time.sleep(2)

            data = self._get_next_data(page)
            if not data:
                return None

            # Defensive `or {}` chains: any level of the payload may be null
            props = (data.get('props') or {}).get('pageProps') or {}
            feed = props.get('spotlightFeed') or {}
            stories = feed.get('spotlightStories') or []

            if not stories:
                return None

            # Only the first story corresponds to the requested URL
            story_data = stories[0]
            story = story_data.get('story') or {}
            metadata = (story_data.get('metadata') or {}).get('videoMetadata') or {}

            story_id = (story.get('storyId') or {}).get('value', '')
            creator = (metadata.get('creator') or {}).get('personCreator') or {}
            username = creator.get('username', '')

            collection = SnapCollection(
                collection_id=story_id,
                collection_type='spotlight',
                title=metadata.get('description', ''),
                username=username,
                url=url
            )

            for snap_data in story.get('snapList') or []:
                snap_id = (snap_data.get('snapId') or {}).get('value', '')
                snap_urls = snap_data.get('snapUrls') or {}
                media_url = snap_urls.get('mediaUrl', '')

                # Media ID is the path segment after '/d/', without extension
                media_id = ''
                if '/d/' in media_url:
                    media_id = media_url.split('/d/')[1].split('.')[0]

                ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
                timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()

                # NOTE(review): duration/description/views come from the
                # story-level videoMetadata and are applied to every snap —
                # confirm per-snap values are not available.
                snap = SnapMedia(
                    media_id=media_id or snap_id,
                    media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                    media_url=media_url,
                    timestamp=timestamp,
                    index=snap_data.get('snapIndex', 0),
                    thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                    duration_ms=int(metadata.get('durationMs', 0)),
                    description=metadata.get('description', ''),
                    view_count=int(metadata.get('viewCount', 0)),
                    width=int(metadata.get('width', 540)),
                    height=int(metadata.get('height', 960))
                )
                collection.snaps.append(snap)

            return collection

        except Exception as e:
            self.log(f"Error getting spotlight metadata: {e}", "error")
            return None
        finally:
            page.close()
|
||||
|
||||
def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
|
||||
"""Extract full metadata from a highlight URL"""
|
||||
import time
|
||||
|
||||
if not self.browser:
|
||||
self._start_browser()
|
||||
|
||||
page = self.context.new_page()
|
||||
|
||||
try:
|
||||
page.goto(url, wait_until='domcontentloaded', timeout=60000)
|
||||
time.sleep(2)
|
||||
|
||||
data = self._get_next_data(page)
|
||||
if not data:
|
||||
return None
|
||||
|
||||
props = (data.get('props') or {}).get('pageProps') or {}
|
||||
highlight = props.get('highlight') or {}
|
||||
|
||||
if not highlight:
|
||||
return None
|
||||
|
||||
highlight_id = highlight.get('highlightId') or {}
|
||||
if isinstance(highlight_id, dict):
|
||||
highlight_id = highlight_id.get('value', '')
|
||||
|
||||
username_match = re.search(r'@([^/]+)', url)
|
||||
username = username_match.group(1) if username_match else ''
|
||||
|
||||
title = highlight.get('storyTitle') or {}
|
||||
if isinstance(title, dict):
|
||||
title = title.get('value', '')
|
||||
|
||||
collection = SnapCollection(
|
||||
collection_id=highlight_id,
|
||||
collection_type='highlight',
|
||||
title=title or 'Untitled Highlight',
|
||||
username=username,
|
||||
url=url
|
||||
)
|
||||
|
||||
for snap_data in highlight.get('snapList') or []:
|
||||
snap_urls = snap_data.get('snapUrls') or {}
|
||||
media_url = snap_urls.get('mediaUrl', '')
|
||||
|
||||
media_id = ''
|
||||
if '/d/' in media_url:
|
||||
media_id = media_url.split('/d/')[1].split('.')[0]
|
||||
|
||||
ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
|
||||
timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()
|
||||
|
||||
lat = snap_data.get('lat')
|
||||
lng = snap_data.get('lng')
|
||||
|
||||
snap = SnapMedia(
|
||||
media_id=media_id,
|
||||
media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
|
||||
media_url=media_url,
|
||||
timestamp=timestamp,
|
||||
index=snap_data.get('snapIndex', 0),
|
||||
thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
|
||||
lat=float(lat) if lat else None,
|
||||
lng=float(lng) if lng else None
|
||||
)
|
||||
collection.snaps.append(snap)
|
||||
|
||||
return collection
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error getting highlight metadata: {e}", "error")
|
||||
return None
|
||||
finally:
|
||||
page.close()
|
||||
|
||||
def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
|
||||
"""Download a single media file"""
|
||||
try:
|
||||
url = snap.media_url.replace('&', '&')
|
||||
|
||||
result = subprocess.run([
|
||||
'curl', '-sL', '-o', output_path,
|
||||
'-H', 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
||||
url
|
||||
], capture_output=True, timeout=60)
|
||||
|
||||
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
|
||||
self._set_metadata(output_path, snap)
|
||||
return True
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error downloading media: {e}", "error")
|
||||
return False
|
||||
|
||||
def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
|
||||
"""Generate filename with timestamp and media ID (FastDL format)"""
|
||||
date_str = snap.timestamp.strftime('%Y%m%d_%H%M%S')
|
||||
return f"{username}_{date_str}_{snap.media_id}.{ext}"
|
||||
|
||||
def _record_download(self, username: str, url: str, filename: str,
|
||||
post_date=None, metadata: dict = None, file_path: str = None,
|
||||
deferred: bool = False):
|
||||
"""Record a download in the database"""
|
||||
if deferred:
|
||||
self.pending_downloads.append({
|
||||
'username': username,
|
||||
'url': url,
|
||||
'filename': filename,
|
||||
'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
|
||||
'file_path': file_path,
|
||||
'metadata': metadata
|
||||
})
|
||||
return True
|
||||
|
||||
if not self.db:
|
||||
return
|
||||
|
||||
try:
|
||||
self.db.mark_downloaded(
|
||||
username=username,
|
||||
url=url,
|
||||
filename=filename,
|
||||
post_date=post_date,
|
||||
metadata=metadata,
|
||||
file_path=file_path
|
||||
)
|
||||
except Exception as e:
|
||||
self.log(f"Failed to record download: {e}", "debug")
|
||||
|
||||
def get_pending_downloads(self):
|
||||
"""Get list of downloads that were deferred"""
|
||||
return self.pending_downloads.copy()
|
||||
|
||||
def clear_pending_downloads(self):
|
||||
"""Clear the pending downloads list"""
|
||||
self.pending_downloads = []
|
||||
|
||||
    def _get_processed_posts(self, username: str) -> Set[str]:
        """Get the set of media IDs already recorded for this user.

        IDs are recovered two ways: parsed out of stored filenames
        (``<username>_<date>_<time>_<media_id>.<ext>``) and read from the
        ``media_id`` key of stored metadata JSON.

        Args:
            username: Profile whose download history is queried.

        Returns:
            Set of media-ID strings; empty when no database is configured
            or the query fails.
        """
        processed = set()
        if not self.db:
            return processed

        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT filename, metadata FROM downloads
                    WHERE platform = 'snapchat'
                    AND source = ?
                ''', (username,))

                for row in cursor.fetchall():
                    filename, metadata_str = row
                    if filename:
                        # parts[0]=username, [1]=date, [2]=time, [3:]=media_id
                        # NOTE(review): assumes the username contains no
                        # underscore — an underscored username shifts these
                        # indices; confirm against _generate_filename usage.
                        parts = filename.split('_')
                        if len(parts) >= 4:
                            media_id = '_'.join(parts[3:]).split('.')[0]
                            processed.add(media_id)

                    if metadata_str:
                        try:
                            metadata = json.loads(metadata_str)
                            if 'media_id' in metadata:
                                processed.add(metadata['media_id'])
                        except (json.JSONDecodeError, TypeError, KeyError):
                            pass  # Invalid metadata, skip

        except Exception as e:
            self.log(f"Error loading processed posts: {e}", "debug")

        return processed
|
||||
|
||||
    def download(self, username: str, content_type: str = "all", days_back: int = 14,
                 max_downloads: int = 50, output_dir: str = None,
                 spotlight_dir: str = None, stories_dir: str = None,
                 stitch_highlights: bool = True, defer_database: bool = False,
                 phrase_config: dict = None):
        """
        Download content from a user - compatible with media-downloader interface

        Args:
            username: Snapchat username
            content_type: "spotlight", "stories", "highlights", or "all"
            days_back: How many days back to download (filters by post date)
            max_downloads: Maximum items to download per content type
            output_dir: Default output directory (used if specific dirs not set)
            spotlight_dir: Output directory for spotlights
            stories_dir: Output directory for stories/highlights
            stitch_highlights: Ignored (kept for backwards compatibility)
            defer_database: If True, defer database recording
            phrase_config: Not used (for interface compatibility)

        Returns:
            Number of files downloaded
        """
        self.defer_database = defer_database
        self.downloaded_files.clear()

        # Set output directories
        # If specific dirs provided, use them directly
        # If only output_dir provided, use it directly (caller handles structure)
        # If nothing provided, use default with subdirectories
        if spotlight_dir:
            spotlight_output = Path(spotlight_dir)
        elif output_dir:
            spotlight_output = Path(output_dir)
        else:
            spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat/spotlight/{username}")

        if stories_dir:
            stories_output = Path(stories_dir)
        elif output_dir:
            stories_output = Path(output_dir)
        else:
            stories_output = Path(f"/opt/media-downloader/downloads/snapchat/stories/{username}")

        spotlight_output.mkdir(parents=True, exist_ok=True)
        stories_output.mkdir(parents=True, exist_ok=True)

        # Update activity status
        if self.activity_manager:
            self.activity_manager.update_status("Checking Snapchat")

        # Get processed posts (dedupe against previous runs)
        processed = self._get_processed_posts(username)
        self.log(f"Loaded {len(processed)} processed posts from database", "debug")

        cutoff_date = datetime.now() - timedelta(days=days_back)
        downloaded_count = 0

        # Crash recovery checkpoint
        from modules.task_checkpoint import TaskCheckpoint
        checkpoint = TaskCheckpoint(f'snapchat:{username}', 'scraping')

        try:
            # Start browser
            self._start_browser()

            # Get profile content
            content = self.get_profile_content(username)

            # Count total items for checkpoint
            total_items = 0
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                total_items += min(len(content['spotlights']), max_downloads)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                total_items += min(len(content['highlights']), max_downloads)
            checkpoint.start(total_items=total_items)
            if checkpoint.is_recovering():
                self.log(f"Snapchat @{username}: recovering — skipping already-processed URLs", "info")

            # Download spotlights
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                spotlight_items = content['spotlights'][:max_downloads]
                self.log(f"Processing {len(spotlight_items)} spotlights...", "info")

                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=0,
                        progress_total=len(spotlight_items)
                    )

                for spot_idx, url in enumerate(spotlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading spotlights",
                            progress_current=spot_idx + 1,
                            progress_total=len(spotlight_items)
                        )

                    if checkpoint.is_completed(url):
                        continue

                    checkpoint.set_current(url)

                    try:
                        spotlight = self.get_spotlight_metadata(url)
                        if not spotlight or not spotlight.snaps:
                            continue

                        # A spotlight page carries one primary snap
                        snap = spotlight.snaps[0]

                        # Check date filter
                        if snap.timestamp < cutoff_date:
                            self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                            continue

                        # Check if already processed
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                            continue

                        # Download
                        ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                        filename = self._generate_filename(username, snap, ext)
                        output_path = str(spotlight_output / filename)

                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            # NOTE(review): "(unknown)" looks like a scrubbed
                            # placeholder — presumably the filename; confirm.
                            self.log(f"Downloaded spotlight: (unknown)", "info")

                            self._record_download(
                                username=username,
                                url=url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'description': snap.description,
                                    'view_count': snap.view_count,
                                    'content_type': 'spotlight'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )

                    except Exception as e:
                        self.log(f"Error processing spotlight: {e}", "error")

                    # NOTE: `continue` paths above skip this, so date-filtered /
                    # already-processed URLs are re-examined on recovery.
                    checkpoint.mark_completed(url)

            # Download highlights (stories)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                highlight_items = content['highlights'][:max_downloads]
                self.log(f"Processing {len(highlight_items)} highlights...", "info")

                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=0,
                        progress_total=len(highlight_items)
                    )

                for hi_idx, url in enumerate(highlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading highlights",
                            progress_current=hi_idx + 1,
                            progress_total=len(highlight_items)
                        )

                    if checkpoint.is_completed(url):
                        continue

                    checkpoint.set_current(url)

                    try:
                        highlight = self.get_highlight_metadata(url)
                        if not highlight or not highlight.snaps:
                            continue

                        # Check if any snap is within date range
                        newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                        if newest_snap.timestamp < cutoff_date:
                            self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                            continue

                        # Check if already processed
                        if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                            self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                            continue

                        # Separate videos and images
                        videos = [s for s in highlight.snaps if s.media_type == 'video']
                        images = [s for s in highlight.snaps if s.media_type == 'image']

                        # Download images individually
                        for snap in images:
                            if snap.timestamp < cutoff_date:
                                continue
                            if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                continue

                            filename = self._generate_filename(username, snap, 'jpg')
                            output_path = str(stories_output / filename)

                            if self._download_media_file(snap, output_path):
                                self.downloaded_files.add(snap.media_id)
                                downloaded_count += 1
                                # NOTE(review): "(unknown)" — see spotlight note above.
                                self.log(f"Downloaded image: (unknown)", "info")

                                self._record_download(
                                    username=username,
                                    url=highlight.url,
                                    filename=filename,
                                    post_date=snap.timestamp,
                                    metadata={
                                        'media_id': snap.media_id,
                                        'highlight_id': highlight.collection_id,
                                        'content_type': 'highlight_image'
                                    },
                                    file_path=output_path,
                                    deferred=defer_database
                                )

                        # Handle videos - download each clip individually
                        if videos:
                            for snap in videos:
                                if snap.timestamp < cutoff_date:
                                    continue
                                if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                    continue

                                filename = self._generate_filename(username, snap, 'mp4')
                                output_path = str(stories_output / filename)

                                if self._download_media_file(snap, output_path):
                                    # NOTE(review): _download_media_file already calls
                                    # _set_metadata on success — this second call looks
                                    # redundant; confirm before removing.
                                    self._set_metadata(output_path, snap)
                                    self.downloaded_files.add(snap.media_id)
                                    downloaded_count += 1
                                    # NOTE(review): "(unknown)" — see spotlight note above.
                                    self.log(f"Downloaded video: (unknown)", "info")

                                    self._record_download(
                                        username=username,
                                        url=highlight.url,
                                        filename=filename,
                                        post_date=snap.timestamp,
                                        metadata={
                                            'media_id': snap.media_id,
                                            'highlight_id': highlight.collection_id,
                                            'content_type': 'highlight_video'
                                        },
                                        file_path=output_path,
                                        deferred=defer_database
                                    )

                    except Exception as e:
                        self.log(f"Error processing highlight: {e}", "error")

                    checkpoint.mark_completed(url)

        except Exception as e:
            self.log(f"Error during download: {e}", "error")

        checkpoint.finish()
        self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
        return downloaded_count
|
||||
|
||||
|
||||
def test_scraper():
    """Manual smoke test: scrape a public profile into /tmp (network required)."""
    banner = "=" * 60
    print(banner)
    print("SNAPCHAT DIRECT SCRAPER TEST")
    print(banner)

    with SnapchatDirectScraper(headless=True) as scraper:
        # Test download against a well-known public profile
        count = scraper.download(
            username="evalongoria",
            content_type="all",
            days_back=30,
            max_downloads=5,
            spotlight_dir="/tmp/snap_test/spotlight",
            stories_dir="/tmp/snap_test/stories",
            stitch_highlights=True,
        )

        print(f"\nDownloaded {count} files")

        # Show what was written to disk
        import os
        for root, dirs, files in os.walk("/tmp/snap_test"):
            for entry in files:
                path = os.path.join(root, entry)
                size = os.path.getsize(path) / 1024
                print(f" {path}: {size:.1f}KB")

    print(banner)
    print("TEST COMPLETE")
    print(banner)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test; requires network access and installed Playwright browsers.
    test_scraper()
|
||||
Reference in New Issue
Block a user