986 lines
38 KiB
Python
986 lines
38 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Snapchat Direct Scraper Module - Scrapes directly from Snapchat.com
|
|
|
|
Uses Playwright to scrape profiles and extract:
|
|
- Spotlight videos (540x960)
|
|
- Stories/Highlights (480x852, stitched into single videos)
|
|
|
|
Full metadata extraction including timestamps, media IDs, descriptions.
|
|
Follows the same interface as the original snapchat_module.py
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import re
|
|
import tempfile
|
|
import subprocess
|
|
import shutil
|
|
import platform
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, List, Any, Set
|
|
from dataclasses import dataclass, field
|
|
|
|
# Set environment for Playwright
|
|
os.environ.setdefault('PLAYWRIGHT_BROWSERS_PATH', '/root/.cache/ms-playwright')
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from modules.cloudflare_handler import (
|
|
get_playwright_context_options,
|
|
get_playwright_stealth_scripts,
|
|
get_flaresolverr_user_agent
|
|
)
|
|
|
|
|
|
@dataclass
class SnapMedia:
    """Represents a single snap media item (one video or one image)."""
    media_id: str             # unique identifier (CDN filename segment or snap ID)
    media_type: str           # 'video' or 'image'
    media_url: str            # direct download URL for the media
    timestamp: datetime       # time the snap was posted
    index: int = 0            # position within its parent collection
    thumbnail_url: str = ""   # preview image URL, when available
    duration_ms: int = 0      # playback length in milliseconds (videos only)
    description: str = ""     # caption/description text
    view_count: int = 0       # view counter as reported by Snapchat
    width: int = 0            # pixel width of the media
    height: int = 0           # pixel height of the media
    lat: Optional[float] = None  # geotag latitude; None when absent (0.0 is valid)
    lng: Optional[float] = None  # geotag longitude; None when absent (0.0 is valid)
|
|
|
|
|
@dataclass
class SnapCollection:
    """Represents a spotlight or highlight collection of snaps."""
    collection_id: str        # story/highlight identifier from Snapchat
    collection_type: str      # 'spotlight' or 'highlight'
    title: str = ""           # display title (highlight name or spotlight description)
    username: str = ""        # owning account's username
    snaps: List[SnapMedia] = field(default_factory=list)  # member media items
    url: str = ""             # source page URL this collection was scraped from
|
|
|
|
|
class SnapchatDirectScraper(LoggingMixin):
    """
    Scrapes Snapchat profiles directly for media content.

    Follows the same interface as SnapchatDownloader for compatibility
    with the media-downloader system.
    """

    def __init__(self,
                 headless: bool = True,
                 show_progress: bool = True,
                 use_database: bool = True,
                 log_callback=None,
                 unified_db=None):
        """Initialize scraper compatible with media-downloader system.

        Args:
            headless: Run the Playwright browser without a visible window.
            show_progress: Whether to report progress (stored; used by callers).
            use_database: Enable download-tracking via the unified database.
            log_callback: Optional callback passed to the logging mixin.
            unified_db: Unified database handle; cookies, proxy config and the
                download adapter are all sourced from it when provided.
        """
        self.headless = headless
        self.show_progress = show_progress
        self.use_database = use_database
        self.unified_db = unified_db
        self.scraper_id = 'snapchat_direct'  # key used for cookie storage in the DB
        self.download_count = 0
        self.downloaded_files: Set[str] = set()  # media IDs downloaded this run
        self.pending_downloads = []  # deferred DB records (see _record_download)

        # Initialize logging via mixin
        self._init_logger('SnapchatDirect', log_callback, default_module='Download')

        # User-Agent to match FlareSolverr (dynamically fetched for consistency)
        self.user_agent = get_flaresolverr_user_agent()

        # Browser state — created lazily by _start_browser()
        self._playwright = None
        self.browser = None
        self.context = None

        # Database adapter; if no DB is available, force use_database off so
        # later code has a single flag to check.
        if unified_db and use_database:
            from modules.unified_database import SnapchatDatabaseAdapter
            self.db = SnapchatDatabaseAdapter(unified_db)
        else:
            self.db = None
            self.use_database = False

        # Activity status manager (optional dependency — absence is tolerated)
        try:
            from modules.activity_status import get_activity_manager
            self.activity_manager = get_activity_manager(unified_db)
        except ImportError:
            self.activity_manager = None

        # Load cookies from database (falls back to defaults)
        self.cookies = self._load_cookies_from_db()

        # Load proxy configuration from database; NOTE: proxy config is read
        # from the 'snapchat' scraper entry, not 'snapchat_direct'
        self.proxy_url = None
        if unified_db:
            try:
                scraper_config = unified_db.get_scraper('snapchat')
                if scraper_config and scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")
            except Exception as e:
                self.log(f"Could not load proxy config: {e}", "debug")
|
|
|
|
def _load_cookies_from_db(self) -> List[Dict]:
|
|
"""Load cookies from database"""
|
|
if not self.unified_db:
|
|
return self._get_default_cookies()
|
|
|
|
try:
|
|
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
|
if cookies:
|
|
self.log(f"Loaded {len(cookies)} cookies from database", "debug")
|
|
return cookies
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies from database: {e}", "warning")
|
|
|
|
# Try loading from original snapchat scraper
|
|
try:
|
|
cookies = self.unified_db.get_scraper_cookies('snapchat')
|
|
if cookies:
|
|
self.log(f"Using cookies from 'snapchat' scraper", "debug")
|
|
return cookies
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies from snapchat scraper: {e}", "debug")
|
|
|
|
return self._get_default_cookies()
|
|
|
|
def _get_default_cookies(self) -> List[Dict]:
|
|
"""Get default cookies for Snapchat"""
|
|
return [
|
|
{"name": "sc-cookies-accepted", "value": "true", "domain": "www.snapchat.com", "path": "/"},
|
|
]
|
|
|
|
def _save_cookies_to_db(self, cookies: List[Dict], user_agent: str = None):
|
|
"""Save cookies to database
|
|
|
|
Args:
|
|
cookies: List of cookie dictionaries
|
|
user_agent: User agent to associate with cookies (important for cf_clearance).
|
|
If not provided, uses self.user_agent as fallback.
|
|
"""
|
|
if not self.unified_db:
|
|
return
|
|
|
|
try:
|
|
# Use provided user_agent or fall back to self.user_agent
|
|
ua = user_agent or self.user_agent
|
|
self.unified_db.save_scraper_cookies(
|
|
self.scraper_id,
|
|
cookies,
|
|
user_agent=ua,
|
|
merge=True
|
|
)
|
|
self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error saving cookies to database: {e}", "warning")
|
|
|
|
def _parse_proxy_url(self, proxy_url: str) -> Optional[Dict]:
|
|
"""
|
|
Parse proxy URL into Playwright proxy config.
|
|
Supports: protocol://user:pass@host:port or protocol://host:port
|
|
"""
|
|
import re
|
|
try:
|
|
# Match: protocol://[user:pass@]host:port
|
|
match = re.match(
|
|
r'^(https?|socks[45]?)://(?:([^:]+):([^@]+)@)?([^:]+):(\d+)$',
|
|
proxy_url
|
|
)
|
|
if match:
|
|
protocol, username, password, host, port = match.groups()
|
|
config = {'server': f'{protocol}://{host}:{port}'}
|
|
if username and password:
|
|
config['username'] = username
|
|
config['password'] = password
|
|
return config
|
|
except Exception as e:
|
|
self.log(f"Failed to parse proxy URL: {e}", "warning")
|
|
return None
|
|
|
|
def __enter__(self):
|
|
"""Context manager entry"""
|
|
return self
|
|
|
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
"""Context manager exit"""
|
|
self._close_browser()
|
|
return False
|
|
|
|
    def _start_browser(self):
        """Start the Playwright Chromium browser and create a context with
        stealth scripts, stored cookies, and (optionally) a proxy.

        Idempotent: returns immediately if a browser is already running.
        """
        if self.browser is not None:
            return

        # NOTE(review): forces an X display even in headless mode — presumably
        # required by the deployment environment; confirm before changing.
        os.environ['DISPLAY'] = ':100'

        from playwright.sync_api import sync_playwright
        self._playwright = sync_playwright().start()
        self.browser = self._playwright.chromium.launch(
            headless=self.headless,
            args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
        )

        # Build context options - use dynamic fingerprinting from FlareSolverr
        context_options = get_playwright_context_options()

        # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent
        # Cloudflare cf_clearance cookies are fingerprinted to the browser that solved the challenge
        try:
            if self.unified_db:
                stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
                if stored_user_agent:
                    self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug", module="Browser")
                    context_options['user_agent'] = stored_user_agent
                else:
                    self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
            else:
                self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
        except Exception as e:
            self.log(f"Error getting stored user_agent, using default: {e}", "debug", module="Browser")

        # Add proxy if configured
        if self.proxy_url:
            proxy_config = self._parse_proxy_url(self.proxy_url)
            if proxy_config:
                context_options['proxy'] = proxy_config
                self.log(f"Browser using proxy: {proxy_config.get('server')}", "info", module="Browser")

        self.context = self.browser.new_context(**context_options)

        # Add anti-detection scripts to all pages in this context
        self.context.add_init_script(get_playwright_stealth_scripts())

        # Add cookies
        if self.cookies:
            # Clean cookies for Playwright and convert expiry->expires
            cleaned = []
            for c in self.cookies:
                # Drop Firefox/FlareSolverr-specific keys Playwright rejects
                clean = {k: v for k, v in c.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
                # FlareSolverr uses 'expiry' but Playwright uses 'expires'
                if 'expiry' in clean and 'expires' not in clean:
                    clean['expires'] = clean.pop('expiry')
                cleaned.append(clean)

            # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
            try:
                self.context.clear_cookies()
            except Exception:
                pass

            self.context.add_cookies(cleaned)

        self.log("Browser started", "info", module="Browser")
|
|
|
|
def _close_browser(self):
|
|
"""Close browser and cleanup"""
|
|
if self.context:
|
|
try:
|
|
self.context.close()
|
|
except Exception as e:
|
|
self.log(f"Error closing browser context: {e}", "debug")
|
|
self.context = None
|
|
|
|
if self.browser:
|
|
try:
|
|
self.browser.close()
|
|
except Exception as e:
|
|
self.log(f"Error closing browser: {e}", "debug")
|
|
self.browser = None
|
|
|
|
if self._playwright:
|
|
try:
|
|
self._playwright.stop()
|
|
except Exception as e:
|
|
self.log(f"Error stopping playwright: {e}", "debug")
|
|
self._playwright = None
|
|
|
|
def _get_next_data(self, page) -> Optional[Dict]:
|
|
"""Extract __NEXT_DATA__ JSON from page"""
|
|
try:
|
|
next_data_elem = page.locator('script#__NEXT_DATA__').first
|
|
if next_data_elem.count() > 0:
|
|
return json.loads(next_data_elem.inner_text())
|
|
except Exception as e:
|
|
self.log(f"Error extracting __NEXT_DATA__: {e}", "debug")
|
|
return None
|
|
|
|
def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
|
|
"""Set EXIF metadata and file timestamp"""
|
|
try:
|
|
date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
|
|
desc = description or snap.description or ""
|
|
if snap.view_count:
|
|
desc += f" [Views: {snap.view_count}]"
|
|
desc = desc.strip()
|
|
|
|
ext = os.path.splitext(file_path)[1].lower()
|
|
is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
|
|
is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']
|
|
|
|
exif_args = [
|
|
'exiftool', '-overwrite_original', '-ignoreMinorErrors',
|
|
f'-FileModifyDate={date_str}',
|
|
]
|
|
|
|
if is_image:
|
|
exif_args.extend([
|
|
f'-DateTimeOriginal={date_str}',
|
|
f'-CreateDate={date_str}',
|
|
f'-ModifyDate={date_str}',
|
|
f'-MetadataDate={date_str}',
|
|
])
|
|
if desc:
|
|
exif_args.extend([
|
|
f'-ImageDescription={desc}',
|
|
f'-XPComment={desc}',
|
|
f'-UserComment={desc}',
|
|
])
|
|
if snap.lat and snap.lng:
|
|
lat_ref = 'N' if snap.lat >= 0 else 'S'
|
|
lng_ref = 'E' if snap.lng >= 0 else 'W'
|
|
exif_args.extend([
|
|
f'-GPSLatitude={abs(snap.lat)}',
|
|
f'-GPSLatitudeRef={lat_ref}',
|
|
f'-GPSLongitude={abs(snap.lng)}',
|
|
f'-GPSLongitudeRef={lng_ref}',
|
|
])
|
|
|
|
elif is_video:
|
|
exif_args.extend([
|
|
f'-CreateDate={date_str}',
|
|
f'-ModifyDate={date_str}',
|
|
f'-MediaCreateDate={date_str}',
|
|
f'-MediaModifyDate={date_str}',
|
|
f'-TrackCreateDate={date_str}',
|
|
f'-TrackModifyDate={date_str}',
|
|
])
|
|
if desc:
|
|
exif_args.extend([
|
|
f'-Description={desc}',
|
|
f'-Comment={desc}',
|
|
])
|
|
|
|
exif_args.append(file_path)
|
|
subprocess.run(exif_args, capture_output=True, timeout=30)
|
|
|
|
# Set filesystem modification time
|
|
ts = snap.timestamp.timestamp()
|
|
os.utime(file_path, (ts, ts))
|
|
|
|
except Exception as e:
|
|
self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")
|
|
|
|
def get_profile_content(self, username: str) -> Dict[str, List[str]]:
|
|
"""Get all spotlight and highlight URLs from a profile"""
|
|
import time
|
|
|
|
if not self.browser:
|
|
self._start_browser()
|
|
|
|
page = self.context.new_page()
|
|
result = {'spotlights': [], 'highlights': []}
|
|
|
|
try:
|
|
url = f"https://www.snapchat.com/@{username}"
|
|
self.log(f"Navigating to profile @{username}", "info")
|
|
page.goto(url, wait_until='networkidle', timeout=30000)
|
|
time.sleep(2)
|
|
|
|
content = page.content()
|
|
|
|
# Extract spotlight URLs
|
|
spotlight_pattern = rf'/@{username}/spotlight/([A-Za-z0-9_-]+)'
|
|
spotlight_ids = list(set(re.findall(spotlight_pattern, content)))
|
|
result['spotlights'] = [
|
|
f"https://www.snapchat.com/@{username}/spotlight/{sid}"
|
|
for sid in spotlight_ids
|
|
]
|
|
self.log(f"Found {len(result['spotlights'])} spotlights", "info")
|
|
|
|
# Click Stories tab to get highlights
|
|
stories_tab = page.locator('[role="tab"]:has-text("Stories")').first
|
|
if stories_tab.count() > 0:
|
|
stories_tab.click()
|
|
time.sleep(2)
|
|
|
|
content = page.content()
|
|
highlight_pattern = rf'/@{username}/highlight/([A-Za-z0-9-]+)'
|
|
highlight_ids = list(set(re.findall(highlight_pattern, content)))
|
|
result['highlights'] = [
|
|
f"https://www.snapchat.com/@{username}/highlight/{hid}"
|
|
for hid in highlight_ids
|
|
]
|
|
self.log(f"Found {len(result['highlights'])} highlights", "info")
|
|
|
|
except Exception as e:
|
|
self.log(f"Error getting profile content: {e}", "error")
|
|
finally:
|
|
page.close()
|
|
|
|
return result
|
|
|
|
    def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
        """Extract full metadata from a spotlight URL.

        Loads the spotlight page and parses the embedded __NEXT_DATA__ JSON
        into a SnapCollection. Returns None when the page has no usable data
        or on any error (which is logged).
        """
        import time

        if not self.browser:
            self._start_browser()

        page = self.context.new_page()

        try:
            page.goto(url, wait_until='domcontentloaded', timeout=60000)
            time.sleep(2)  # give client-side rendering a moment to settle

            data = self._get_next_data(page)
            if not data:
                return None

            # Defensive `or {}` chaining: any level of the payload may be
            # missing or explicitly null.
            props = (data.get('props') or {}).get('pageProps') or {}
            feed = props.get('spotlightFeed') or {}
            stories = feed.get('spotlightStories') or []

            if not stories:
                return None

            # Only the first story is used — presumably it corresponds to the
            # requested spotlight URL (the rest are related/feed items).
            story_data = stories[0]
            story = story_data.get('story') or {}
            metadata = (story_data.get('metadata') or {}).get('videoMetadata') or {}

            story_id = (story.get('storyId') or {}).get('value', '')
            creator = (metadata.get('creator') or {}).get('personCreator') or {}
            username = creator.get('username', '')

            collection = SnapCollection(
                collection_id=story_id,
                collection_type='spotlight',
                title=metadata.get('description', ''),
                username=username,
                url=url
            )

            for snap_data in story.get('snapList') or []:
                snap_id = (snap_data.get('snapId') or {}).get('value', '')
                snap_urls = snap_data.get('snapUrls') or {}
                media_url = snap_urls.get('mediaUrl', '')

                # The media ID is the filename segment after '/d/' in the CDN URL
                media_id = ''
                if '/d/' in media_url:
                    media_id = media_url.split('/d/')[1].split('.')[0]

                ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
                timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()

                snap = SnapMedia(
                    media_id=media_id or snap_id,  # fall back to the snap ID
                    media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                    media_url=media_url,
                    timestamp=timestamp,
                    index=snap_data.get('snapIndex', 0),
                    thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                    duration_ms=int(metadata.get('durationMs', 0)),
                    description=metadata.get('description', ''),
                    view_count=int(metadata.get('viewCount', 0)),
                    # 540x960 is the standard spotlight resolution (see module docstring)
                    width=int(metadata.get('width', 540)),
                    height=int(metadata.get('height', 960))
                )
                collection.snaps.append(snap)

            return collection

        except Exception as e:
            self.log(f"Error getting spotlight metadata: {e}", "error")
            return None
        finally:
            page.close()
|
|
|
|
def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
|
|
"""Extract full metadata from a highlight URL"""
|
|
import time
|
|
|
|
if not self.browser:
|
|
self._start_browser()
|
|
|
|
page = self.context.new_page()
|
|
|
|
try:
|
|
page.goto(url, wait_until='domcontentloaded', timeout=60000)
|
|
time.sleep(2)
|
|
|
|
data = self._get_next_data(page)
|
|
if not data:
|
|
return None
|
|
|
|
props = (data.get('props') or {}).get('pageProps') or {}
|
|
highlight = props.get('highlight') or {}
|
|
|
|
if not highlight:
|
|
return None
|
|
|
|
highlight_id = highlight.get('highlightId') or {}
|
|
if isinstance(highlight_id, dict):
|
|
highlight_id = highlight_id.get('value', '')
|
|
|
|
username_match = re.search(r'@([^/]+)', url)
|
|
username = username_match.group(1) if username_match else ''
|
|
|
|
title = highlight.get('storyTitle') or {}
|
|
if isinstance(title, dict):
|
|
title = title.get('value', '')
|
|
|
|
collection = SnapCollection(
|
|
collection_id=highlight_id,
|
|
collection_type='highlight',
|
|
title=title or 'Untitled Highlight',
|
|
username=username,
|
|
url=url
|
|
)
|
|
|
|
for snap_data in highlight.get('snapList') or []:
|
|
snap_urls = snap_data.get('snapUrls') or {}
|
|
media_url = snap_urls.get('mediaUrl', '')
|
|
|
|
media_id = ''
|
|
if '/d/' in media_url:
|
|
media_id = media_url.split('/d/')[1].split('.')[0]
|
|
|
|
ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
|
|
timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()
|
|
|
|
lat = snap_data.get('lat')
|
|
lng = snap_data.get('lng')
|
|
|
|
snap = SnapMedia(
|
|
media_id=media_id,
|
|
media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
|
|
media_url=media_url,
|
|
timestamp=timestamp,
|
|
index=snap_data.get('snapIndex', 0),
|
|
thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
|
|
lat=float(lat) if lat else None,
|
|
lng=float(lng) if lng else None
|
|
)
|
|
collection.snaps.append(snap)
|
|
|
|
return collection
|
|
|
|
except Exception as e:
|
|
self.log(f"Error getting highlight metadata: {e}", "error")
|
|
return None
|
|
finally:
|
|
page.close()
|
|
|
|
def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
|
|
"""Download a single media file"""
|
|
try:
|
|
url = snap.media_url.replace('&', '&')
|
|
|
|
result = subprocess.run([
|
|
'curl', '-sL', '-o', output_path,
|
|
'-H', 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
|
url
|
|
], capture_output=True, timeout=60)
|
|
|
|
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
|
|
self._set_metadata(output_path, snap)
|
|
return True
|
|
return False
|
|
|
|
except Exception as e:
|
|
self.log(f"Error downloading media: {e}", "error")
|
|
return False
|
|
|
|
def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
|
|
"""Generate filename with timestamp and media ID (FastDL format)"""
|
|
date_str = snap.timestamp.strftime('%Y%m%d_%H%M%S')
|
|
return f"{username}_{date_str}_{snap.media_id}.{ext}"
|
|
|
|
def _record_download(self, username: str, url: str, filename: str,
|
|
post_date=None, metadata: dict = None, file_path: str = None,
|
|
deferred: bool = False):
|
|
"""Record a download in the database"""
|
|
if deferred:
|
|
self.pending_downloads.append({
|
|
'username': username,
|
|
'url': url,
|
|
'filename': filename,
|
|
'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
|
|
'file_path': file_path,
|
|
'metadata': metadata
|
|
})
|
|
return True
|
|
|
|
if not self.db:
|
|
return
|
|
|
|
try:
|
|
self.db.mark_downloaded(
|
|
username=username,
|
|
url=url,
|
|
filename=filename,
|
|
post_date=post_date,
|
|
metadata=metadata,
|
|
file_path=file_path
|
|
)
|
|
except Exception as e:
|
|
self.log(f"Failed to record download: {e}", "debug")
|
|
|
|
def get_pending_downloads(self):
|
|
"""Get list of downloads that were deferred"""
|
|
return self.pending_downloads.copy()
|
|
|
|
def clear_pending_downloads(self):
|
|
"""Clear the pending downloads list"""
|
|
self.pending_downloads = []
|
|
|
|
    def _get_processed_posts(self, username: str) -> Set[str]:
        """Get the set of media IDs already downloaded for this user.

        IDs are recovered two ways: parsed out of stored filenames, and read
        from the 'media_id' key of the stored metadata JSON. Errors degrade
        to an empty/partial set — this is a best-effort dedupe aid.
        """
        processed = set()
        if not self.db:
            return processed

        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT filename, metadata FROM downloads
                    WHERE platform = 'snapchat'
                    AND source = ?
                ''', (username,))

                for row in cursor.fetchall():
                    filename, metadata_str = row
                    if filename:
                        # Filename format is <user>_<YYYYMMDD>_<HHMMSS>_<media_id>.<ext>
                        # (see _generate_filename), so the media_id is
                        # everything from the 4th '_'-part onward.
                        # NOTE(review): this assumes the username itself has no
                        # underscores — an underscore in the handle shifts the
                        # split and yields a wrong ID. Verify against real data.
                        parts = filename.split('_')
                        if len(parts) >= 4:
                            media_id = '_'.join(parts[3:]).split('.')[0]
                            processed.add(media_id)

                    if metadata_str:
                        try:
                            metadata = json.loads(metadata_str)
                            if 'media_id' in metadata:
                                processed.add(metadata['media_id'])
                        except (json.JSONDecodeError, TypeError, KeyError):
                            pass  # Invalid metadata, skip

        except Exception as e:
            self.log(f"Error loading processed posts: {e}", "debug")

        return processed
|
|
|
|
    def download(self, username: str, content_type: str = "all", days_back: int = 14,
                 max_downloads: int = 50, output_dir: str = None,
                 spotlight_dir: str = None, stories_dir: str = None,
                 stitch_highlights: bool = True, defer_database: bool = False,
                 phrase_config: dict = None):
        """
        Download content from a user - compatible with media-downloader interface

        Args:
            username: Snapchat username
            content_type: "spotlight", "stories", "highlights", or "all"
            days_back: How many days back to download (filters by post date)
            max_downloads: Maximum items to download per content type
            output_dir: Default output directory (used if specific dirs not set)
            spotlight_dir: Output directory for spotlights
            stories_dir: Output directory for stories/highlights
            stitch_highlights: Ignored (kept for backwards compatibility)
            defer_database: If True, defer database recording
            phrase_config: Not used (for interface compatibility)

        Returns:
            Number of files downloaded
        """
        self.defer_database = defer_database
        self.downloaded_files.clear()

        # Set output directories
        # If specific dirs provided, use them directly
        # If only output_dir provided, use it directly (caller handles structure)
        # If nothing provided, use default with subdirectories
        if spotlight_dir:
            spotlight_output = Path(spotlight_dir)
        elif output_dir:
            spotlight_output = Path(output_dir)
        else:
            spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat/spotlight/{username}")

        if stories_dir:
            stories_output = Path(stories_dir)
        elif output_dir:
            stories_output = Path(output_dir)
        else:
            stories_output = Path(f"/opt/media-downloader/downloads/snapchat/stories/{username}")

        spotlight_output.mkdir(parents=True, exist_ok=True)
        stories_output.mkdir(parents=True, exist_ok=True)

        # Update activity status
        if self.activity_manager:
            self.activity_manager.update_status("Checking Snapchat")

        # Get processed posts (media IDs previously downloaded for this user)
        processed = self._get_processed_posts(username)
        self.log(f"Loaded {len(processed)} processed posts from database", "debug")

        cutoff_date = datetime.now() - timedelta(days=days_back)
        downloaded_count = 0

        # Crash recovery checkpoint — lets a restarted task skip URLs it
        # already fully handled
        from modules.task_checkpoint import TaskCheckpoint
        checkpoint = TaskCheckpoint(f'snapchat:{username}', 'scraping')

        try:
            # Start browser
            self._start_browser()

            # Get profile content
            content = self.get_profile_content(username)

            # Count total items for checkpoint
            total_items = 0
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                total_items += min(len(content['spotlights']), max_downloads)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                total_items += min(len(content['highlights']), max_downloads)
            checkpoint.start(total_items=total_items)
            if checkpoint.is_recovering():
                self.log(f"Snapchat @{username}: recovering — skipping already-processed URLs", "info")

            # Download spotlights
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                spotlight_items = content['spotlights'][:max_downloads]
                self.log(f"Processing {len(spotlight_items)} spotlights...", "info")

                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=0,
                        progress_total=len(spotlight_items)
                    )

                for spot_idx, url in enumerate(spotlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading spotlights",
                            progress_current=spot_idx + 1,
                            progress_total=len(spotlight_items)
                        )

                    if checkpoint.is_completed(url):
                        continue

                    checkpoint.set_current(url)

                    try:
                        spotlight = self.get_spotlight_metadata(url)
                        if not spotlight or not spotlight.snaps:
                            continue

                        # A spotlight collection carries exactly one featured snap
                        snap = spotlight.snaps[0]

                        # Check date filter
                        # NOTE(review): `continue` here also skips
                        # checkpoint.mark_completed below, so skipped URLs are
                        # re-examined on recovery — appears intentional.
                        if snap.timestamp < cutoff_date:
                            self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                            continue

                        # Check if already processed
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                            continue

                        # Download
                        ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                        filename = self._generate_filename(username, snap, ext)
                        output_path = str(spotlight_output / filename)

                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            self.log(f"Downloaded spotlight: (unknown)", "info")

                            self._record_download(
                                username=username,
                                url=url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'description': snap.description,
                                    'view_count': snap.view_count,
                                    'content_type': 'spotlight'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )

                    except Exception as e:
                        self.log(f"Error processing spotlight: {e}", "error")

                    checkpoint.mark_completed(url)

            # Download highlights (stories)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                highlight_items = content['highlights'][:max_downloads]
                self.log(f"Processing {len(highlight_items)} highlights...", "info")

                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=0,
                        progress_total=len(highlight_items)
                    )

                for hi_idx, url in enumerate(highlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading highlights",
                            progress_current=hi_idx + 1,
                            progress_total=len(highlight_items)
                        )

                    if checkpoint.is_completed(url):
                        continue

                    checkpoint.set_current(url)

                    try:
                        highlight = self.get_highlight_metadata(url)
                        if not highlight or not highlight.snaps:
                            continue

                        # Check if any snap is within date range
                        newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                        if newest_snap.timestamp < cutoff_date:
                            self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                            continue

                        # Check if already processed
                        if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                            self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                            continue

                        # Separate videos and images
                        videos = [s for s in highlight.snaps if s.media_type == 'video']
                        images = [s for s in highlight.snaps if s.media_type == 'image']

                        # Download images individually
                        for snap in images:
                            if snap.timestamp < cutoff_date:
                                continue
                            if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                continue

                            filename = self._generate_filename(username, snap, 'jpg')
                            output_path = str(stories_output / filename)

                            if self._download_media_file(snap, output_path):
                                self.downloaded_files.add(snap.media_id)
                                downloaded_count += 1
                                self.log(f"Downloaded image: (unknown)", "info")

                                self._record_download(
                                    username=username,
                                    url=highlight.url,
                                    filename=filename,
                                    post_date=snap.timestamp,
                                    metadata={
                                        'media_id': snap.media_id,
                                        'highlight_id': highlight.collection_id,
                                        'content_type': 'highlight_image'
                                    },
                                    file_path=output_path,
                                    deferred=defer_database
                                )

                        # Handle videos - download each clip individually
                        if videos:
                            for snap in videos:
                                if snap.timestamp < cutoff_date:
                                    continue
                                if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                    continue

                                filename = self._generate_filename(username, snap, 'mp4')
                                output_path = str(stories_output / filename)

                                if self._download_media_file(snap, output_path):
                                    # NOTE(review): _download_media_file already calls
                                    # _set_metadata; this second call looks redundant
                                    # but is harmless (idempotent tag write)
                                    self._set_metadata(output_path, snap)
                                    self.downloaded_files.add(snap.media_id)
                                    downloaded_count += 1
                                    self.log(f"Downloaded video: (unknown)", "info")

                                    self._record_download(
                                        username=username,
                                        url=highlight.url,
                                        filename=filename,
                                        post_date=snap.timestamp,
                                        metadata={
                                            'media_id': snap.media_id,
                                            'highlight_id': highlight.collection_id,
                                            'content_type': 'highlight_video'
                                        },
                                        file_path=output_path,
                                        deferred=defer_database
                                    )

                    except Exception as e:
                        self.log(f"Error processing highlight: {e}", "error")

                    checkpoint.mark_completed(url)

        except Exception as e:
            self.log(f"Error during download: {e}", "error")

        checkpoint.finish()
        self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
        return downloaded_count
|
|
|
|
|
|
def test_scraper():
    """Run a small end-to-end smoke test against a public profile."""
    banner = "=" * 60
    print(banner)
    print("SNAPCHAT DIRECT SCRAPER TEST")
    print(banner)

    with SnapchatDirectScraper(headless=True) as scraper:
        # Test download
        count = scraper.download(
            username="evalongoria",
            content_type="all",
            days_back=30,
            max_downloads=5,
            spotlight_dir="/tmp/snap_test/spotlight",
            stories_dir="/tmp/snap_test/stories",
            stitch_highlights=True,
        )

        print(f"\nDownloaded {count} files")

        # Show files
        import os
        for root, dirs, files in os.walk("/tmp/snap_test"):
            for entry in files:
                path = os.path.join(root, entry)
                size = os.path.getsize(path) / 1024
                print(f" {path}: {size:.1f}KB")

    print(banner)
    print("TEST COMPLETE")
    print(banner)
|
|
|
|
|
|
# Allow running this module directly as a smoke test.
if __name__ == "__main__":
    test_scraper()
|