media-downloader/docs/archive/snapchat_module_storyclon.py

#!/usr/bin/env python3
"""
Snapchat downloader module using StoryClone proxy (s.storyclone.com)
Based on ImgInn module structure with FastDL-compatible file naming
Format: {profile}_{YYYYMMDD_HHMMSS}_{media_id}{ext}
"""

# Allow nested event loops for compatibility with asyncio contexts
try:
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    pass

import os
import json
import time
import re
import subprocess
import platform
import requests
from pathlib import Path
from datetime import datetime, timedelta
from modules.base_module import LoggingMixin
from modules.universal_logger import get_logger
from modules.cloudflare_handler import CloudflareHandler, SiteStatus, get_flaresolverr_user_agent

from playwright.sync_api import sync_playwright

class SnapchatDownloader(LoggingMixin):
    """Snapchat downloader using StoryClone with FastDL-compatible naming"""

    def __init__(self,
                 headless: bool = True,
                 cookie_file: str = "/opt/media-downloader/cookies/snapchat_cookies.json",
                 show_progress: bool = True,
                 use_database: bool = True,
                 log_callback=None,
                 unified_db=None,
                 proxy_domain: str = "sn.storyclone.com"):
        """Initialize downloader compatible with media-downloader system.

        Args:
            headless: Stored and later passed to the Chromium launch.
            cookie_file: Path of the JSON cookie file. Only used when no
                ``unified_db`` is supplied; otherwise cookies live in the database.
            show_progress: Stored on the instance as ``show_progress``.
            use_database: Download tracking is enabled only when this is true
                AND ``unified_db`` is provided.
            log_callback: Forwarded to ``_init_logger``.
            unified_db: Optional database object. When given it supplies the
                scraper configuration (proxy, base URL) and the stored cookies.
            proxy_domain: Default host to scrape; replaced by the scraper
                config's ``base_url`` (scheme and trailing slash stripped) when set.
        """
        self.headless = headless
        self.downloaded_files = set()  # Track downloaded media IDs
        self.file_dates = {}  # Map media_id -> datetime from existing filenames
        self.show_progress = show_progress
        self.use_database = use_database
        self.download_count = 0
        self.unified_db = unified_db  # Store for scraper config access
        self.scraper_id = 'snapchat'  # Scraper ID in database

        # Initialize logging via mixin
        self._init_logger('Snapchat', log_callback, default_module='Download')

        # Browser reuse across profiles (populated lazily by _start_browser)
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None

        # Use unified database if provided
        if unified_db and use_database:
            from modules.unified_database import SnapchatDatabaseAdapter
            self.db = SnapchatDatabaseAdapter(unified_db)
        else:
            # No adapter available: force database tracking off so the flag
            # and self.db never disagree.
            self.db = None
            self.use_database = False

        # Initialize activity status manager for real-time updates
        from modules.activity_status import get_activity_manager
        self.activity_manager = get_activity_manager(unified_db)

        # Load scraper configuration from database if available
        self.proxy_url = None
        self.cookie_file = None  # Default to None (use database)
        self.proxy_domain = proxy_domain  # Default proxy domain

        if unified_db:
            scraper_config = unified_db.get_scraper(self.scraper_id)
            if scraper_config:
                # Get proxy configuration (both the flag and the URL must be set)
                if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")
                # Get base URL (proxy domain) from database, reduced to a bare host
                if scraper_config.get('base_url'):
                    self.proxy_domain = scraper_config['base_url'].replace('https://', '').replace('http://', '').rstrip('/')

        # Fall back to cookie file if no database
        if not unified_db:
            self.cookie_file = Path(cookie_file)
            self.cookie_file.parent.mkdir(parents=True, exist_ok=True)

        # User-Agent to match FlareSolverr (dynamically fetched for consistency)
        self.user_agent = get_flaresolverr_user_agent()

        # Initialize universal Cloudflare handler
        # Pass proxy_url if configured, and cookie_file=None for database storage
        self.cf_handler = CloudflareHandler(
            module_name="Snapchat",
            cookie_file=str(self.cookie_file) if self.cookie_file else None,
            user_agent=self.user_agent,
            logger=self.logger,
            aggressive_expiry=True,
            proxy_url=self.proxy_url  # Pass proxy to FlareSolverr
        )

        # Keep for backwards compatibility
        self.flaresolverr_url = self.cf_handler.flaresolverr_url
        self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

        self.pending_downloads = []  # Track downloads for deferred database recording

        # Load cookies from database if available
        # (must run after cf_handler exists: the loader writes into it)
        self._load_cookies_from_db()

        # Check if we need to get initial cookies
        if not self._has_valid_cookies():
            self.log("No cookies found, will load cookies on first use", "info")

    def _load_cookies_from_db(self):
        """Load cookies from database if available"""
        if not self.unified_db:
            return

        try:
            cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
            if cookies:
                # Load into CloudflareHandler
                self.cf_handler._cookies = cookies
                self.log(f"Loaded {len(cookies)} cookies from database", "debug")
        except Exception as e:
            self.log(f"Error loading cookies from database: {e}", "warning")

    def _save_cookies_to_db(self, cookies: list):
        """Save cookies to database"""
        if not self.unified_db:
            return

        try:
            self.unified_db.save_scraper_cookies(
                self.scraper_id,
                cookies,
                user_agent=self.user_agent,
                merge=True
            )
            self.log(f"Saved {len(cookies)} cookies to database", "debug")
        except Exception as e:
            self.log(f"Error saving cookies to database: {e}", "warning")

    def _has_valid_cookies(self):
        """Check if we have valid cookies (either in file or database)"""
        if self.unified_db:
            cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
            return cookies and len(cookies) > 0
        elif self.cookie_file:
            return self.cookie_file.exists()
        return False

    def _cookies_expired(self):
        """Check if cookies are expired - delegates to CloudflareHandler"""
        return self.cf_handler.cookies_expired()

    def _get_cookies_for_requests(self):
        """Get cookies in format for requests library - delegates to CloudflareHandler"""
        return self.cf_handler.get_cookies_dict()

    def _get_cookies_via_flaresolverr(self, url=None, max_retries=2):
        """Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler

        Args:
            url: URL to fetch (defaults to proxy_domain)
            max_retries: Maximum number of retry attempts (default: 2)

        Returns:
            True if cookies obtained successfully, False otherwise
        """
        if url is None:
            url = f"https://{self.proxy_domain}/"
        success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)

        # Save cookies to database if successful
        if success and self.unified_db:
            cookies_list = self.cf_handler.get_cookies_list()
            if cookies_list:
                self._save_cookies_to_db(cookies_list)

        return success

    def _start_browser(self):
        """Start browser if not already running (reusable across profiles)"""
        # Try to get fresh cookies via FlareSolverr if we don't have them or they're old
        # Do this BEFORE the browser reuse check so cookies are always checked
        if not self._has_valid_cookies() or self._cookies_expired():
            self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info", module="Cloudflare")
            if self._get_cookies_via_flaresolverr():
                self.log("Successfully got fresh cookies from FlareSolverr", "info", module="Cloudflare")
            else:
                self.log("FlareSolverr unavailable, will try with Playwright", "warning", module="Cloudflare")

        if self.browser is not None:
            self.log("Browser already running, reusing...", "debug", module="Browser")
            return

        import os
        # Use environment variable if set, otherwise use standard location
        if 'PLAYWRIGHT_BROWSERS_PATH' not in os.environ:
            os.environ['PLAYWRIGHT_BROWSERS_PATH'] = '/root/.cache/ms-playwright'
        os.environ['DISPLAY'] = ':100'  # Use Xvfb virtual display

        self.log("Starting browser (Chromium)...", "info", module="Browser")
        self.playwright = sync_playwright().start()

        self.browser = self.playwright.chromium.launch(
            headless=self.headless,
            args=[
                '--disable-blink-features=AutomationControlled',
                '--disable-dev-shm-usage',
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-gpu',
                '--disable-software-rasterizer',
                '--disable-accelerated-2d-canvas',
                '--disable-accelerated-video-decode'
            ]
        )

        # CRITICAL: User-Agent must match FlareSolverr for cookies to work
        self.context = self.browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent=self.user_agent
        )

        # Load cookies
        self.load_cookies(self.context)

        self.page = self.context.new_page()

        # Add basic anti-detection
        self.page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
        """)

        self.log("Browser started and ready", "info", module="Browser")

    def _stop_browser(self):
        """Stop the browser safely with proper error handling"""
        # Close context first
        if self.context:
            try:
                self.context.close()
                self.log("Browser context closed", "debug", module="Browser")
            except Exception as e:
                self.log(f"Error closing browser context: {e}", "warning")
            finally:
                self.context = None

        # Close browser
        if self.browser:
            try:
                self.browser.close()
                self.log("Browser closed", "debug", module="Browser")
            except Exception as e:
                self.log(f"Error closing browser: {e}", "warning")
            finally:
                self.browser = None

        # Stop playwright
        if self.playwright:
            try:
                self.playwright.stop()
            except Exception as e:
                self.log(f"Error stopping playwright: {e}", "warning")
            finally:
                self.playwright = None

        self.page = None

    def __del__(self):
        """Cleanup browser when instance is destroyed"""
        self._stop_browser()

    def __enter__(self):
        """Context manager entry - allows using 'with' statement"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - ensures browser cleanup"""
        self._stop_browser()
        return False  # Don't suppress exceptions

    def _extract_media_id_from_url(self, url: str) -> str:
        """Extract media ID from URL"""
        # URL format: various formats on storyclone.com
        # Try to extract meaningful ID from URL
        match = re.search(r'/([^/]+)/?$', url)
        if match:
            return match.group(1)
        return None

    def _update_file_timestamps(self, filepath: Path, post_date: datetime):
        """Update all timestamps for a file to match the post date"""
        try:
            # Convert datetime to timestamp
            timestamp = post_date.timestamp()

            # 1. Update file system timestamps (access time and modification time)
            os.utime(filepath, (timestamp, timestamp))
            self.log(f"Updated file timestamps to {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")

            # 2. Try to update creation time (platform-specific)
            if platform.system() == 'Darwin':  # macOS
                # Use SetFile command on macOS
                date_str = post_date.strftime('%m/%d/%Y %H:%M:%S')
                try:
                    subprocess.run(
                        ['SetFile', '-d', date_str, str(filepath)],
                        capture_output=True,
                        text=True
                    )
                except (subprocess.SubprocessError, FileNotFoundError, OSError):
                    pass  # SetFile not available on this system
            elif platform.system() == 'Windows':
                # On Windows, use PowerShell with proper escaping to prevent injection
                filepath_escaped = str(filepath).replace("'", "''")
                date_escaped = post_date.isoformat().replace("'", "''")
                ps_command = f"(Get-Item -LiteralPath '{filepath_escaped}').CreationTime = Get-Date '{date_escaped}'"
                try:
                    subprocess.run(
                        ['powershell', '-Command', ps_command],
                        capture_output=True,
                        text=True
                    )
                except (subprocess.SubprocessError, FileNotFoundError, OSError):
                    pass  # PowerShell command failed
            # Linux doesn't support changing creation time

            # 3. Update EXIF data for images
            if str(filepath).lower().endswith(('.jpg', '.jpeg', '.png')):
                self._update_exif_timestamps(filepath, post_date)

        except Exception as e:
            self.log(f"Error updating timestamps: {e}", "warning")

    def _update_exif_timestamps(self, filepath: Path, post_date: datetime):
        """Update EXIF timestamps in image files"""
        try:
            # Check if exiftool is available
            result = subprocess.run(['which', 'exiftool'], capture_output=True, text=True)
            if result.returncode == 0:
                # Format date for EXIF
                exif_date = post_date.strftime('%Y:%m:%d %H:%M:%S')

                # Update all date fields in EXIF including MetadataDate for Immich
                cmd = [
                    'exiftool', '-overwrite_original', '-quiet',
                    f'-AllDates={exif_date}',
                    f'-MetadataDate={exif_date}',
                    '-HistoryWhen=',
                    f'-FileModifyDate={exif_date}',
                    str(filepath)
                ]

                subprocess.run(cmd, capture_output=True, text=True)
                self.log(f"Updated EXIF timestamps", "debug")
        except (subprocess.SubprocessError, OSError, FileNotFoundError):
            # Silently skip if exiftool not available
            pass

    def _extract_post_date(self, page) -> datetime:
        """Try to extract post date from page"""
        try:
            # Wait a moment for dynamic content to load
            page.wait_for_timeout(500)

            # Look for date elements on StoryClon e
            date_selectors = [
                'time[datetime]',
                'time',
                '.date',
                '[datetime]',
                'span.date',
                'div.date',
                '.story-date',
                '.post-date'
            ]

            for selector in date_selectors:
                elem = page.locator(selector).first
                if elem.count() > 0:
                    # Try datetime attribute first
                    datetime_str = elem.get_attribute('datetime')
                    if datetime_str:
                        # Parse ISO format
                        for fmt in ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d']:
                            try:
                                return datetime.strptime(datetime_str.split('.')[0].replace('Z', ''), fmt)
                            except ValueError:
                                continue

                    # Try text content
                    text = elem.text_content()
                    if text:
                        # Parse various date formats
                        if "ago" in text.lower():
                            # Handle relative dates
                            if "hour" in text:
                                hours = int(re.search(r'(\d+)', text).group(1))
                                return datetime.now() - timedelta(hours=hours)
                            elif "day" in text:
                                days = int(re.search(r'(\d+)', text).group(1))
                                return datetime.now() - timedelta(days=days)
                            elif "week" in text:
                                weeks = int(re.search(r'(\d+)', text).group(1))
                                return datetime.now() - timedelta(weeks=weeks)
                        else:
                            # Try parsing absolute date
                            for fmt in ['%B %d, %Y', '%b %d, %Y', '%Y-%m-%d']:
                                try:
                                    return datetime.strptime(text, fmt)
                                except ValueError:
                                    continue
        except Exception as e:
            self.log(f"Error extracting date: {e}", "debug")

        return None

    def _parse_storyclone_filename(self, filename: str, profile_name: str) -> datetime:
        """
        Parse date from StoryClon e filename format and adjust for timezone
        Format: evalongoria-2025-10-23T17-42-56.jpg
        StoryClon e uses UTC, so subtract 4 hours to get local time

        Args:
            filename: StoryClon e filename
            profile_name: Username to strip from beginning

        Returns:
            datetime object adjusted to local time, or None if parsing failed
        """
        try:
            # Remove extension
            filename_no_ext = Path(filename).stem

            # Check if it starts with profile name
            if filename_no_ext.startswith(f"{profile_name}-"):
                # Extract date part: 2025-10-23T17-42-56
                date_part = filename_no_ext[len(f"{profile_name}-"):]

                # Parse ISO-like format with hyphens instead of colons
                # 2025-10-23T17-42-56 -> 2025-10-23 17:42:56
                date_part_clean = date_part.replace('T', ' ')

                # Replace only the time part hyphens with colons
                parts_dt = date_part_clean.split(' ')
                if len(parts_dt) == 2:
                    date_portion = parts_dt[0]  # 2025-10-23
                    time_portion = parts_dt[1].replace('-', ':')  # 17-42-56 -> 17:42:56
                    datetime_str = f"{date_portion} {time_portion}"

                    # Parse the datetime (this is in UTC)
                    parsed_date = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')

                    # Subtract 4 hours to convert from UTC to local time
                    local_date = parsed_date - timedelta(hours=4)

                    return local_date
        except Exception as e:
            self.log(f"Error parsing StoryClon e filename '{filename}': {e}", "debug")

        return None

    def _parse_story_date_text(self, date_text: str) -> datetime:
        """
        Parse StoryClon e date text format
        Examples: "Posted on today at 1:42 PM"
                  "Posted on today at 1:44 PM"

        Returns:
            datetime object or None if parsing failed
        """
        try:
            # StoryClon e format: "Posted on today at 1:42 PM"
            if "Posted on today at" in date_text:
                # Extract time part (e.g., "1:42 PM")
                time_match = re.search(r'(\d{1,2}):(\d{2})\s*(AM|PM)', date_text, re.IGNORECASE)
                if time_match:
                    hour = int(time_match.group(1))
                    minute = int(time_match.group(2))
                    am_pm = time_match.group(3).upper()

                    # Convert to 24-hour format
                    if am_pm == 'PM' and hour != 12:
                        hour += 12
                    elif am_pm == 'AM' and hour == 12:
                        hour = 0

                    # Use today's date with the extracted time
                    now = datetime.now()
                    story_datetime = now.replace(hour=hour, minute=minute, second=0, microsecond=0)

                    return story_datetime

            # Could add more date formats here if needed

        except Exception as e:
            self.log(f"Error parsing date text '{date_text}': {e}", "debug")

        return None

    def _record_download(self, username: str, url: str, filename: str,
                        post_date=None, metadata: dict = None, file_path: str = None,
                        deferred: bool = False):
        """Record a download in the database

        Args:
            deferred: If True, don't record to database now - add to pending_downloads list
                     for later recording after file move is complete
        """
        # If deferred, store for later recording instead of recording now
        if deferred:
            self.pending_downloads.append({
                'username': username,
                'url': url,
                'filename': filename,
                'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
                'file_path': file_path,
                'metadata': metadata
            })
            self.log(f"Deferred recording for {filename}", "debug")
            return True

        if not self.db:
            return

        try:
            self.db.mark_downloaded(
                username=username,
                url=url,
                filename=filename,
                post_date=post_date,
                metadata=metadata,
                file_path=file_path
            )
        except Exception as e:
            self.log(f"Failed to record download: {e}", "debug")

    def get_pending_downloads(self):
        """Get list of downloads that were deferred for later recording"""
        return self.pending_downloads.copy()

    def clear_pending_downloads(self):
        """Clear the pending downloads list after they've been recorded"""
        self.pending_downloads = []

    def _scan_existing_files(self, output_dir: Path, profile_name: str):
        """Scan directory for existing files and extract media IDs and dates"""
        self.downloaded_files.clear()
        self.file_dates = {}  # Map media_id -> datetime

        # Patterns: Both my format and StoryClon e format
        for pattern in ["*.jpg", "*.jpeg", "*.png", "*.heic", "*.mp4", "*.mov"]:
            for filepath in output_dir.glob(pattern):
                # Skip corrupted/incomplete files (less than 20KB)
                if filepath.stat().st_size < 20000:
                    self.log(f"Skipping corrupted file (size < 20KB): {filepath.name}", "debug")
                    continue

                filename = filepath.stem
                media_id = None
                file_date = None

                # Try my FastDL format: profile_YYYYMMDD_HHMMSS_mediaid.ext
                parts = filename.split('_', 3)
                if len(parts) >= 4 and parts[0] == profile_name:
                    media_id = parts[3]  # Everything after date/time
                    # Parse date from filename
                    try:
                        date_str = f"{parts[1]}_{parts[2]}"  # YYYYMMDD_HHMMSS
                        file_date = datetime.strptime(date_str, '%Y%m%d_%H%M%S')
                    except (ValueError, IndexError):
                        pass

                # Try StoryClon e format: profile-YYYY-MM-DDTHH-MM-SS.ext
                elif filename.startswith(f"{profile_name}-"):
                    # Example: evalongoria-2025-10-23T17-42-56
                    # Extract: 2025-10-23T17-42-56
                    date_part = filename[len(f"{profile_name}-"):]
                    try:
                        # Parse ISO-like format with hyphens instead of colons
                        # 2025-10-23T17-42-56 -> 2025-10-23 17:42:56
                        date_part_clean = date_part.replace('T', ' ')
                        # Replace only the time part hyphens with colons
                        # Split on space to separate date and time
                        parts_dt = date_part_clean.split(' ')
                        if len(parts_dt) == 2:
                            date_portion = parts_dt[0]  # 2025-10-23
                            time_portion = parts_dt[1].replace('-', ':')  # 17-42-56 -> 17:42:56
                            datetime_str = f"{date_portion} {time_portion}"
                            # Parse the datetime
                            parsed_date = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
                            # Subtract 4 hours to convert from UTC to local time
                            file_date = parsed_date - timedelta(hours=4)
                            # Use the date part as media_id
                            media_id = filename[len(f"{profile_name}-"):]
                    except Exception as e:
                        self.log(f"Could not parse StoryClon e date from {filename}: {e}", "debug")
                        # Still use as media_id for duplicate detection
                        media_id = filename[len(f"{profile_name}-"):]

                if media_id:
                    self.downloaded_files.add(media_id)
                    if file_date:
                        self.file_dates[media_id] = file_date

        if self.downloaded_files:
            self.log(f"Found {len(self.downloaded_files)} valid existing files for {profile_name} ({len(self.file_dates)} with dates)", "debug")

    def _get_processed_posts(self, username: str) -> set:
        """Get set of story IDs that have been processed from database"""
        processed = set()
        if not self.db:
            return processed

        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                # Get all stories for this user from downloads table
                cursor.execute('''
                    SELECT url, filename, metadata FROM downloads
                    WHERE platform = 'snapchat'
                    AND source = ?
                ''', (username,))

                for row in cursor.fetchall():
                    url, filename, metadata_str = row

                    # Extract media_id from filename
                    if filename:
                        # Format: username_date_MEDIAID.ext or username_date_MEDIAID_N.ext
                        parts = filename.split('_')
                        if len(parts) >= 4:
                            # Get everything after date/time as media_id
                            media_id = '_'.join(parts[3:]).split('.')[0]
                            processed.add(media_id)

                    # Also check metadata for media_id
                    if metadata_str:
                        try:
                            metadata = json.loads(metadata_str)
                            if 'media_id' in metadata:
                                processed.add(metadata['media_id'])
                        except (json.JSONDecodeError, KeyError, TypeError):
                            pass

            if processed:
                self.log(f"Found {len(processed)} processed stories in database for {username}", "debug")
        except Exception as e:
            self.log(f"Error loading processed stories from database: {e}", "debug")

        return processed

    def save_cookies(self, context):
        """Save cookies to database or file"""
        cookies = context.cookies()

        # Save to database if available
        if self.unified_db:
            try:
                self.unified_db.save_scraper_cookies(self.scraper_id, cookies)
                self.log(f"Saved {len(cookies)} cookies to database", "debug")
                return
            except Exception as e:
                self.log(f"Error saving cookies to database: {e}", "warning")

        # Fallback to file-based storage
        if self.cookie_file:
            storage_data = {
                'cookies': cookies,
                'timestamp': datetime.now().isoformat()
            }
            with open(self.cookie_file, 'w') as f:
                json.dump(storage_data, f, indent=2)
            self.log(f"Saved {len(cookies)} cookies to file", "debug")

    def load_cookies(self, context):
        """Load saved cookies from database or file"""
        # Try loading from database first
        if self.unified_db:
            try:
                cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
                if cookies:
                    # Clean cookies - remove unsupported properties
                    cleaned_cookies = []
                    for cookie in cookies:
                        cleaned = {k: v for k, v in cookie.items()
                                  if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
                        cleaned_cookies.append(cleaned)

                    context.add_cookies(cleaned_cookies)
                    self.log(f"Loaded {len(cleaned_cookies)} cookies from database", "info")
                    return True
            except Exception as e:
                self.log(f"Error loading cookies from database: {e}", "warning")

        # Fallback to file-based cookies
        if not self.cookie_file or not self.cookie_file.exists():
            return False

        try:
            with open(self.cookie_file, 'r') as f:
                data = json.load(f)

            # Check age (24 hours)
            saved_time = datetime.fromisoformat(data['timestamp'])
            if datetime.now() - saved_time > timedelta(hours=24):
                self.log("Cookies expired", "debug")
                return False

            # Clean cookies - remove unsupported properties
            cleaned_cookies = []
            for cookie in data['cookies']:
                # Remove Chrome-specific properties that Playwright doesn't support
                cleaned = {k: v for k, v in cookie.items()
                          if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
                cleaned_cookies.append(cleaned)

            context.add_cookies(cleaned_cookies)
            self.log(f"Loaded {len(cleaned_cookies)} cookies from file", "info")
            return True
        except Exception as e:
            self.log(f"Failed to load cookies: {e}", "warning")
            return False

    def wait_for_cloudflare(self, page):
        """Poll the page once per second until it is usable, broken, or timed out.

        On every poll the page URL and its lower-cased HTML are inspected:

        * a server-error marker (500/502/503 text) ends the wait with ``False``;
        * a Cloudflare challenge marker triggers one FlareSolverr attempt
          (cookies are reloaded into ``self.context`` and the page is reloaded);
          later polls that still see the challenge just keep waiting;
        * a ``storyclone.com`` URL whose HTML mentions a story-related keyword
          counts as loaded and ends the wait with ``True``.

        Returns ``False`` when 120 polls pass without success. Errors raised
        while reading the page are re-raised unless their message mentions
        "navigating".
        """
        self.log("Waiting for page to load...", "debug")

        challenge_markers = ('checking your browser', 'just a moment', 'verify you are human', 'enable javascript')
        server_error_markers = ('internal server error', 'error code 500', 'error code 502', 'error code 503')
        content_keywords = ('story', 'username', 'download', 'stories')
        # Poll index -> (message, level) for the periodic progress notes
        progress_notes = {
            10: ("Still waiting (10s)... Cloudflare is checking", "debug"),
            20: ("Still waiting (20s)... Cloudflare challenge ongoing", "info"),
            30: ("Still waiting (30s)... This is normal for Cloudflare", "info"),
        }

        timeout_seconds = 120  # Extended wait to match ImgInn
        bypass_tried = False

        for second in range(timeout_seconds):
            time.sleep(1)

            # Snapshot the current URL and page HTML
            try:
                url_now = page.url
                html = page.content().lower()
            except Exception as read_error:
                if "navigating" not in str(read_error).lower():
                    raise
                self.log("Page still navigating, waiting...", "debug")
                continue

            # A server error means waiting longer will not help
            if any(marker in html for marker in server_error_markers):
                self.log("Server error detected (500/502/503) - site is likely down", "error")
                return False

            if any(marker in html for marker in challenge_markers):
                if bypass_tried:
                    continue

                self.log("Cloudflare challenge detected, attempting FlareSolverr bypass...", "info", module="Cloudflare")
                # Ask FlareSolverr for fresh cookies
                got_cookies = self._get_cookies_via_flaresolverr(page.url)
                if not got_cookies:
                    self.log("FlareSolverr failed, waiting for challenge to resolve...", "warning", module="Cloudflare")
                else:
                    self.log("Got fresh cookies from FlareSolverr, reloading page...", "info", module="Cloudflare")
                    try:
                        # Push the new cookies into the browser context, then reload
                        self.load_cookies(self.context)
                        page.reload(wait_until='domcontentloaded', timeout=10000)
                        time.sleep(2)  # Give page time to load with new cookies
                    except Exception as reload_error:
                        self.log(f"Error reloading page with new cookies: {reload_error}", "debug")
                bypass_tried = True
                continue

            # On the proxy site with story-like content: the page is ready
            on_proxy_site = 'storyclone.com' in url_now.lower()
            if on_proxy_site and any(keyword in html for keyword in content_keywords):
                self.log(f"Page loaded after {second+1} seconds", "info")
                return True

            # Periodic progress notes
            note = progress_notes.get(second)
            if note is not None:
                self.log(note[0], note[1])

        # Ran out of polls
        self.log(f"Page load timeout. URL: {page.url}", "error")
        return False

    def download(self, username: str, content_type: str = "stories", days_back: int = 14,
                 max_downloads: int = 50, output_dir: str = None, phrase_config: dict = None,
                 defer_database: bool = False):
        """Entry point matching the media-downloader interface; returns a file count.

        Checks the proxy site's status first and returns 0 without downloading
        when the Cloudflare handler says the site should be skipped.

        Args:
            username: Snapchat username
            content_type: Type of content; only "stories" is handled, anything
                          else logs a warning and returns 0
            days_back: Forwarded to download_stories
            max_downloads: Maximum stories to download
            output_dir: Base output directory; the username is appended to it
            phrase_config: Not used by this method
            defer_database: Stored on the instance as ``self.defer_database``
                            for the download methods to read

        Returns:
            Number of files reported by download_stories, or 0 when skipped.
        """
        # Remember the deferral choice for the download methods
        self.defer_database = defer_database
        # Reset the per-account media ID cache so it does not grow across accounts
        self.downloaded_files.clear()

        # Probe the site before doing any browser work
        self.log(f"Checking {self.proxy_domain} site status...", "debug")
        site_url = f"https://{self.proxy_domain}/"
        site_status, error_msg = self.cf_handler.check_site_status(site_url, timeout=10)

        if self.cf_handler.should_skip_download(site_status):
            self.log(f"Skipping download - {self.proxy_domain} is unavailable: {error_msg}", "warning")
            return 0
        if site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
            self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

        # Resolve the per-user output directory
        output_path = (Path(output_dir) / username if output_dir
                       else Path(f"/opt/media-downloader/downloads/{username}"))

        # Only stories are supported
        if content_type != "stories":
            self.log(f"Snapchat downloader does not support content type: {content_type}", "warning")
            return 0

        story_files = self.download_stories(
            username=username,
            days_back=days_back,
            max_stories=max_downloads,
            output_dir=output_path,
        )
        return len(story_files)

    def download_stories(self, username: str, days_back: int = 14, max_stories: int = 50, output_dir: Path = None):
        """Download the stories currently listed for a Snapchat user, with FastDL naming.

        Flow: open ``https://{proxy_domain}/u/{username}/``, wait for the page
        via ``wait_for_cloudflare``, expand the "Stories" section by clicking
        "Load More" while that button sits above the "Spotlight Highlights"
        heading, collect one video or CDN image per ``.item`` container above
        that heading, then download every media element whose ID is not already
        in ``self.downloaded_files`` or the processed-posts list.

        Files are saved as ``{profile}_{YYYYMMDD_HHMMSS}_{media_id}{ext}``. The
        date is parsed from text near the story on the page; when none is found
        the current time is used.

        Args:
            username: Snapchat username (lower-cased for file names and lookups)
            days_back: Accepted for interface compatibility; this method does
                       not currently filter stories by age
            max_stories: Maximum number of media elements to process
            output_dir: Output directory (str or Path); defaults to
                        /opt/media-downloader/downloads/{profile}

        Returns:
            List of file path strings saved by this call. Files removed again
            as hash duplicates are not included. Errors are logged rather than
            raised, so the list may be empty or partial.
        """
        # Function-scope imports, hoisted out of the per-story loop
        import traceback
        from urllib.parse import urlparse, unquote

        profile_name = username.lower()
        if output_dir is None:
            output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}")
        else:
            # Accept plain string paths as well as Path objects
            output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Scan existing files
        self._scan_existing_files(output_dir, profile_name)

        # Get processed stories from database
        processed_stories = self._get_processed_posts(profile_name)
        self.log(f"Loaded {len(processed_stories)} processed stories for {profile_name} from database", "info")

        downloaded_files = []

        # Update activity status
        self.activity_manager.update_status(f"Checking stories from @{profile_name}")

        # Start or reuse browser
        self._start_browser()
        page = self.page

        # CDN URLs embed a size segment, e.g. https://.../{id}.1034.IRZXSOY?...
        size_pattern = re.compile(r'\.(\d+)\.IRZXSOY')

        try:
            # Navigate to user's stories page on StoryClone
            self.log(f"Navigating to @{username} on {self.proxy_domain}", "info")
            page.goto(f"https://{self.proxy_domain}/u/{username}/", wait_until='domcontentloaded')

            # Wait for page to load
            if not self.wait_for_cloudflare(page):
                self.log("Page didn't load properly", "error")
                return []

            # Save cookies
            self.save_cookies(self.context)

            # Wait for page to load
            self.log("Waiting for page to load...", "info")
            time.sleep(3)  # Give page time to load content

            # Check if "Stories" section exists - if not, there are no stories to scrape
            stories_section = page.locator('div.font-semibold.ml-6:has-text("Stories")').first
            if stories_section.count() == 0:
                self.log("No 'Stories' section found - user has no stories available", "info")
                return []

            self.log("Found 'Stories' section - proceeding to load all stories...", "info")

            # Scroll down and load all stories by clicking "Load More" button
            self.log("Scrolling to load all stories...", "info")

            load_more_clicks = 0
            max_attempts = 20

            for _ in range(max_attempts):
                # Step 1: Scroll until we see "Spotlight Highlights"
                self.log("Scrolling until 'Spotlight Highlights' is visible...", "debug")
                scroll_attempts = 0
                max_scrolls = 10

                while scroll_attempts < max_scrolls:
                    spotlight_highlights = page.locator('text=Spotlight Highlights').first
                    if spotlight_highlights.count() > 0:
                        self.log("Found 'Spotlight Highlights' in view", "debug")
                        break

                    page.evaluate("window.scrollBy(0, 400)")
                    time.sleep(1)
                    scroll_attempts += 1

                # Step 2: Check if there's a "Load More" button ABOVE "Spotlight Highlights" (positionally before)
                load_more_btn = page.locator('button:has-text("Load More"), button.load-more-button').first
                spotlight_highlights = page.locator('text=Spotlight Highlights').first

                load_more_visible = load_more_btn.count() > 0 and load_more_btn.is_visible()
                spotlight_visible = spotlight_highlights.count() > 0

                if load_more_visible and spotlight_visible:
                    # Both are present - check Y positions to see which comes first
                    load_more_box = load_more_btn.bounding_box()
                    spotlight_box = spotlight_highlights.bounding_box()

                    # If either box is unavailable, fall through and retry on the next attempt
                    if load_more_box and spotlight_box:
                        load_more_y = load_more_box['y']
                        spotlight_y = spotlight_box['y']

                        if load_more_y < spotlight_y:
                            # "Load More" is ABOVE "Spotlight Highlights" -> click it
                            load_more_clicks += 1
                            self.log(f"Found 'Load More' ABOVE 'Spotlight Highlights' (Y:{load_more_y:.0f} < {spotlight_y:.0f}) - clicking (click #{load_more_clicks})...", "info")
                            load_more_btn.click()
                            time.sleep(2.5)  # Wait for more posts to load

                            items_count = page.locator('.item').count()
                            self.log(f"Items after click: {items_count}", "debug")

                            # Go back and scroll to "Spotlight Highlights" again (it will be pushed down)
                            continue

                        # "Load More" is BELOW "Spotlight Highlights" -> we're done
                        items_final = page.locator('.item').count()
                        self.log(f"'Load More' is BELOW 'Spotlight Highlights' (Y:{load_more_y:.0f} > {spotlight_y:.0f}) - done! Found {items_final} stories (clicked Load More {load_more_clicks} times)", "info")
                        break
                elif spotlight_visible:
                    # Only "Spotlight Highlights" present, no "Load More" -> we're done
                    items_final = page.locator('.item').count()
                    self.log(f"No 'Load More' button found - done! Found {items_final} stories (clicked Load More {load_more_clicks} times)", "info")
                    break
                else:
                    # "Spotlight Highlights" not found yet, keep trying
                    self.log("Neither 'Load More' nor 'Spotlight Highlights' found, continuing...", "debug")

            # Find story/media elements by processing each .item container
            # This ensures lazy-loaded content is properly triggered
            self.log("Extracting media from story items...", "info")

            # Get Y position of "Spotlight Highlights" to filter out items after it
            spotlight_highlights = page.locator('text=Spotlight Highlights').first
            spotlight_y = None
            if spotlight_highlights.count() > 0:
                spotlight_box = spotlight_highlights.bounding_box()
                if spotlight_box:
                    spotlight_y = spotlight_box['y']
                    self.log(f"'Spotlight Highlights' Y position: {spotlight_y:.0f}", "debug")

            # Get all .item elements
            all_items = page.locator('.item').all()

            # Filter to only items BEFORE "Spotlight Highlights".
            # Compare against None explicitly: a Y of 0.0 is a valid position.
            story_items = []
            for item in all_items:
                if spotlight_y is None:
                    # No Spotlight Highlights found, include all items
                    story_items.append(item)
                    continue
                item_box = item.bounding_box()
                if item_box and item_box['y'] < spotlight_y:
                    story_items.append(item)

            self.log(f"Filtered to {len(story_items)} story items (before Spotlight Highlights) from {len(all_items)} total items", "info")

            media_elements = []

            for idx, item in enumerate(story_items):
                try:
                    # Scroll item into view to trigger lazy loading
                    item.scroll_into_view_if_needed()
                    time.sleep(0.3)  # Give it a moment to load

                    # Look for video first
                    video = item.locator('video[src]').first
                    if video.count() > 0:
                        media_elements.append(video)
                        self.log(f"Item {idx+1}: Found video", "debug")
                        continue

                    # If no video, look for image from Snapchat CDN
                    img = item.locator('img[src*="sc-cdn.net"]').first
                    if img.count() > 0:
                        src = img.get_attribute('src')
                        # Skip apple icons, favicons, and poster images
                        if src and 'apple-icon' not in src and 'favicon' not in src and '/d/' in src:
                            media_elements.append(img)
                            self.log(f"Item {idx+1}: Found image", "debug")
                            continue

                    self.log(f"Item {idx+1}: No media found (may be lazy-loading)", "debug")

                except Exception as e:
                    self.log(f"Item {idx+1}: Error processing - {e}", "debug")

            self.log(f"Extracted {len(media_elements)} media elements from {len(story_items)} items", "info")

            if not media_elements:
                self.log("No stories found for this user", "warning")
                return []

            self.log(f"Found {len(media_elements)} potential story items", "info")

            # Download each story; story_index follows the element's position so
            # log numbering stays aligned on every skip/error path
            for story_index, media_elem in enumerate(media_elements[:max_stories], start=1):
                try:
                    # Try to get src attribute
                    media_url = media_elem.get_attribute('src')

                    # If no src, try href (for download links)
                    if not media_url or media_url == '#':
                        media_url = media_elem.get_attribute('href')

                    if not media_url or media_url == '#' or media_url.startswith('data:'):
                        self.log(f"Story {story_index}: Invalid media URL", "warning")
                        continue

                    self.log(f"Story {story_index}: {media_url[:80]}...", "debug")

                    # Try to get higher quality version by replacing size parameter
                    # Try larger sizes: 2048, 1920, 1440 (largest first)
                    hq_url = None
                    size_match = size_pattern.search(media_url)
                    if size_match:
                        original_size = size_match.group(1)
                        for test_size in ['2048', '1920', '1440']:
                            if int(test_size) <= int(original_size):
                                continue
                            test_url = media_url.replace(f'.{original_size}.IRZXSOY', f'.{test_size}.IRZXSOY')
                            # Test if this URL is accessible
                            try:
                                response = requests.head(test_url, timeout=5, allow_redirects=True)
                            except requests.RequestException:
                                continue
                            if response.status_code == 200:
                                hq_url = test_url
                                self.log(f"Story {story_index}: Found higher quality version (size {test_size})", "info")
                                break

                    # Use HQ URL if found, otherwise use original
                    if hq_url:
                        media_url = hq_url

                    # Extract media ID from URL (last path segment; the query string
                    # is already excluded by urlparse)
                    url_path = urlparse(media_url).path
                    original_name = unquote(url_path.split('/')[-1])

                    # Determine file type from the element
                    # Snapchat CDN uses weird extensions like .IRZXSOY, so the tag decides
                    if media_elem.evaluate("element => element.tagName").lower() == 'video':
                        ext = '.mp4'  # Videos are MP4
                    else:
                        ext = '.jpg'  # Images are JPG

                    # Media ID is the part of the filename before the first dot
                    media_id = original_name.split('.')[0]

                    # Check if already downloaded
                    if media_id in self.downloaded_files or media_id in processed_stories:
                        self.log(f"Story {story_index}: Already downloaded ({media_id}), skipping", "debug")
                        continue

                    # Extract post date from the story item on the page
                    story_date = None
                    try:
                        # Walk up 1..3 ancestors and use the FIRST .text-sm in each;
                        # stay shallow so a neighbouring story's date is not picked up
                        for depth in (1, 2, 3):
                            parent_xpath = 'xpath=' + '/'.join(['..'] * depth)
                            parent = media_elem.locator(parent_xpath).first
                            if parent.count() == 0:
                                continue
                            date_elem = parent.locator('.text-sm').first
                            if date_elem.count() == 0:
                                continue
                            date_text = date_elem.text_content()
                            if date_text and ("Posted on" in date_text or "at" in date_text):
                                self.log(f"Story {story_index}: Found date at depth {depth}: '{date_text}'", "debug")
                                story_date = self._parse_story_date_text(date_text)
                                if story_date:
                                    self.log(f"Story {story_index}: Extracted date from page: {story_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
                                    break

                        if not story_date:
                            self.log(f"Story {story_index}: Could not find date text for this story", "debug")
                    except Exception as e:
                        self.log(f"Story {story_index}: Could not extract date - {e}", "debug")
                        self.log(f"Story {story_index}: Traceback: {traceback.format_exc()}", "debug")

                    # Fallback to current time if extraction failed
                    if not story_date:
                        story_date = datetime.now()
                        self.log(f"Story {story_index}: Using current time as fallback", "debug")

                    date_str = story_date.strftime('%Y%m%d_%H%M%S')

                    # Build filename: {profile}_{date}_{media_id}{ext}
                    filename = f"{profile_name}_{date_str}_{media_id}{ext}"
                    filepath = output_dir / filename

                    # Download the story
                    try:
                        # Ensure full URL
                        if not media_url.startswith('http'):
                            media_url = f"https:{media_url}" if media_url.startswith('//') else f"https://{self.proxy_domain}{media_url}"

                        response = requests.get(media_url, timeout=30, headers={
                            'User-Agent': self.user_agent,
                            'Referer': f'https://{self.proxy_domain}/'
                        }, cookies=self._get_cookies_for_requests())
                        response.raise_for_status()

                        # Save file
                        with open(filepath, 'wb') as f:
                            f.write(response.content)

                        self.log(f"Downloaded story: {filename} ({len(response.content)} bytes)", "info")

                        # Check for duplicate hash before recording
                        if self.db:
                            # Check for duplicate hash (hash blacklist persists even if original deleted)
                            file_hash = self.db.get_file_hash(str(filepath))
                            if file_hash:
                                existing = self.db.get_download_by_file_hash(file_hash)
                                if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
                                    # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
                                    self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
                                    # Delete the duplicate regardless of whether original file still exists
                                    try:
                                        filepath.unlink()
                                    except Exception as e:
                                        # Deletion failed: the file is still on disk, so keep it
                                        self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
                                    else:
                                        self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
                                        continue

                        # Only report files that are kept; a deleted duplicate must
                        # not appear in the returned list
                        downloaded_files.append(str(filepath))

                        # Update timestamps
                        self._update_file_timestamps(filepath, story_date)

                        # Add to tracking
                        self.downloaded_files.add(media_id)

                        # Record in database with media_id in metadata
                        self._record_download(
                            username=profile_name,
                            url=media_url,
                            filename=filename,
                            post_date=story_date,
                            metadata={'media_id': media_id},
                            file_path=str(filepath),
                            deferred=getattr(self, 'defer_database', False)
                        )

                    except Exception as e:
                        self.log(f"Failed to download story {story_index}: {e}", "error")
                        continue

                except Exception as e:
                    self.log(f"Error processing story {story_index}: {e}", "error")
                    continue

            self.log(f"Downloaded {len(downloaded_files)} story files", "info")

        except Exception as e:
            self.log(f"Error downloading stories: {e}", "error")
            self.log(f"Traceback: {traceback.format_exc()}", "debug")

        # Don't close browser here - reuse it for next profile
        return downloaded_files


def main():
    """Manual test harness: download one user's stories and print a summary.

    The username comes from ``sys.argv[1]`` (default ``"testuser"``). The
    browser is started non-headless, so run under xvfb on a display-less host.
    """
    import re
    import sys

    print("=" * 60)
    print("Snapchat Downloader (StoryClon e) - FastDL Compatible Naming")
    print("=" * 60)
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    # SnapchatDownloader.__init__ takes no api_key parameter; passing one
    # raises TypeError before anything runs
    downloader = SnapchatDownloader(
        headless=False  # Use with xvfb
    )

    # Test username (replace with actual Snapchat username)
    test_username = sys.argv[1] if len(sys.argv) > 1 else "testuser"

    # Download stories
    files = downloader.download_stories(
        username=test_username,
        days_back=7,
        max_stories=50
    )

    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)

    # {profile}_{YYYYMMDD}_{HHMMSS}_{media_id}{ext}; the profile itself may
    # contain underscores, so anchor on the fixed-width date/time fields
    name_pattern = re.compile(r'^(?P<profile>.+)_(?P<date>\d{8}_\d{6})_(?P<rest>.+)$')

    if files:
        print(f"Successfully downloaded {len(files)} files!")
        print("\nDownloaded files (FastDL naming format):")
        for f in files:
            path = Path(f)
            print(f"  - {path.name}")
            parsed = name_pattern.match(path.name)
            if parsed:
                print(f"    Profile: {parsed.group('profile')}")
                print(f"    Date: {parsed.group('date')}")
                print(f"    Media ID: {parsed.group('rest').split('.')[0]}")
            # A reported path may no longer exist; don't crash the summary on stat()
            if path.exists():
                print(f"    Size: {path.stat().st_size / 1024:.1f} KB")
            else:
                print("    Size: (file no longer on disk)")
    else:
        print("No files downloaded")

    # Check total in folder; download_stories lower-cases the username for its
    # default directory, so do the same here
    download_dir = Path(f"/opt/media-downloader/downloads/{test_username.lower()}")
    if download_dir.exists():
        all_files = [entry for entry in download_dir.glob("*") if entry.is_file()]
        total_size = sum(entry.stat().st_size for entry in all_files) / 1024
        print(f"\nTotal in folder: {len(all_files)} files ({total_size:.1f} KB)")


# Run the manual test harness only when executed as a script, not on import.
if __name__ == "__main__":
    main()