#!/usr/bin/env python3 """ Snapchat downloader module using StoryClon e proxy (s.storyclone.com) Based on ImgInn module structure with FastDL-compatible file naming Format: {profile}_{YYYYMMDD_HHMMSS}_{media_id}{ext} """ # Allow nested event loops for compatibility with asyncio contexts try: import nest_asyncio nest_asyncio.apply() except ImportError: pass import os import json import time import re import subprocess import platform import requests from pathlib import Path from datetime import datetime, timedelta from modules.base_module import LoggingMixin from modules.universal_logger import get_logger from modules.cloudflare_handler import CloudflareHandler, SiteStatus, get_flaresolverr_user_agent from playwright.sync_api import sync_playwright class SnapchatDownloader(LoggingMixin): """Snapchat downloader using StoryClon e with FastDL-compatible naming""" def __init__(self, headless: bool = True, cookie_file: str = "/opt/media-downloader/cookies/snapchat_cookies.json", show_progress: bool = True, use_database: bool = True, log_callback=None, unified_db=None, proxy_domain: str = "sn.storyclone.com"): """Initialize downloader compatible with media-downloader system""" self.headless = headless self.downloaded_files = set() # Track downloaded media IDs self.file_dates = {} # Map media_id -> datetime from existing filenames self.show_progress = show_progress self.use_database = use_database self.download_count = 0 self.unified_db = unified_db # Store for scraper config access self.scraper_id = 'snapchat' # Scraper ID in database # Initialize logging via mixin self._init_logger('Snapchat', log_callback, default_module='Download') # Browser reuse across profiles self.playwright = None self.browser = None self.context = None self.page = None # Use unified database if provided if unified_db and use_database: from modules.unified_database import SnapchatDatabaseAdapter self.db = SnapchatDatabaseAdapter(unified_db) else: self.db = None self.use_database = False # Initialize activity status manager for real-time updates from modules.activity_status import get_activity_manager self.activity_manager = get_activity_manager(unified_db) # Load scraper configuration from database if available self.proxy_url = None self.cookie_file = None # Default to None (use database) self.proxy_domain = proxy_domain # Default proxy domain if unified_db: scraper_config = unified_db.get_scraper(self.scraper_id) if scraper_config: # Get proxy configuration if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'): self.proxy_url = scraper_config['proxy_url'] self.log(f"Using proxy: {self.proxy_url}", "info") # Get base URL (proxy domain) from database if scraper_config.get('base_url'): self.proxy_domain = scraper_config['base_url'].replace('https://', '').replace('http://', '').rstrip('/') # Fall back to cookie file if no database if not unified_db: self.cookie_file = Path(cookie_file) self.cookie_file.parent.mkdir(parents=True, exist_ok=True) # User-Agent to match FlareSolverr (dynamically fetched for consistency) self.user_agent = get_flaresolverr_user_agent() # Initialize universal Cloudflare handler # Pass proxy_url if configured, and cookie_file=None for database storage self.cf_handler = CloudflareHandler( module_name="Snapchat", cookie_file=str(self.cookie_file) if self.cookie_file else None, user_agent=self.user_agent, logger=self.logger, aggressive_expiry=True, proxy_url=self.proxy_url # Pass proxy to FlareSolverr ) # Keep for backwards compatibility self.flaresolverr_url = self.cf_handler.flaresolverr_url self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled self.pending_downloads = [] # Track downloads for deferred database recording # Load cookies from database if available self._load_cookies_from_db() # Check if we need to get initial cookies if not self._has_valid_cookies(): self.log("No cookies found, will load cookies on first use", "info") def _load_cookies_from_db(self): """Load cookies from database if available""" if not self.unified_db: return try: cookies = self.unified_db.get_scraper_cookies(self.scraper_id) if cookies: # Load into CloudflareHandler self.cf_handler._cookies = cookies self.log(f"Loaded {len(cookies)} cookies from database", "debug") except Exception as e: self.log(f"Error loading cookies from database: {e}", "warning") def _save_cookies_to_db(self, cookies: list): """Save cookies to database""" if not self.unified_db: return try: self.unified_db.save_scraper_cookies( self.scraper_id, cookies, user_agent=self.user_agent, merge=True ) self.log(f"Saved {len(cookies)} cookies to database", "debug") except Exception as e: self.log(f"Error saving cookies to database: {e}", "warning") def _has_valid_cookies(self): """Check if we have valid cookies (either in file or database)""" if self.unified_db: cookies = self.unified_db.get_scraper_cookies(self.scraper_id) return cookies and len(cookies) > 0 elif self.cookie_file: return self.cookie_file.exists() return False def _cookies_expired(self): """Check if cookies are expired - delegates to CloudflareHandler""" return self.cf_handler.cookies_expired() def _get_cookies_for_requests(self): """Get cookies in format for requests library - delegates to CloudflareHandler""" return self.cf_handler.get_cookies_dict() def _get_cookies_via_flaresolverr(self, url=None, max_retries=2): """Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler Args: url: URL to fetch (defaults to proxy_domain) max_retries: Maximum number of retry attempts (default: 2) Returns: True if cookies obtained successfully, False otherwise """ if url is None: url = f"https://{self.proxy_domain}/" success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries) # Save cookies to database if successful if success and self.unified_db: cookies_list = self.cf_handler.get_cookies_list() if cookies_list: self._save_cookies_to_db(cookies_list) return success def _start_browser(self): """Start browser if not already running (reusable across profiles)""" # Try to get fresh cookies via FlareSolverr if we don't have them or they're old # Do this BEFORE the browser reuse check so cookies are always checked if not self._has_valid_cookies() or self._cookies_expired(): self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info", module="Cloudflare") if self._get_cookies_via_flaresolverr(): self.log("Successfully got fresh cookies from FlareSolverr", "info", module="Cloudflare") else: self.log("FlareSolverr unavailable, will try with Playwright", "warning", module="Cloudflare") if self.browser is not None: self.log("Browser already running, reusing...", "debug", module="Browser") return import os # Use environment variable if set, otherwise use standard location if 'PLAYWRIGHT_BROWSERS_PATH' not in os.environ: os.environ['PLAYWRIGHT_BROWSERS_PATH'] = '/root/.cache/ms-playwright' os.environ['DISPLAY'] = ':100' # Use Xvfb virtual display self.log("Starting browser (Chromium)...", "info", module="Browser") self.playwright = sync_playwright().start() self.browser = self.playwright.chromium.launch( headless=self.headless, args=[ '--disable-blink-features=AutomationControlled', '--disable-dev-shm-usage', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu', '--disable-software-rasterizer', '--disable-accelerated-2d-canvas', '--disable-accelerated-video-decode' ] ) # CRITICAL: User-Agent must match FlareSolverr for cookies to work self.context = self.browser.new_context( viewport={'width': 1920, 'height': 1080}, user_agent=self.user_agent ) # Load cookies self.load_cookies(self.context) self.page = self.context.new_page() # Add basic anti-detection self.page.add_init_script(""" Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); """) self.log("Browser started and ready", "info", module="Browser") def _stop_browser(self): """Stop the browser safely with proper error handling""" # Close context first if self.context: try: self.context.close() self.log("Browser context closed", "debug", module="Browser") except Exception as e: self.log(f"Error closing browser context: {e}", "warning") finally: self.context = None # Close browser if self.browser: try: self.browser.close() self.log("Browser closed", "debug", module="Browser") except Exception as e: self.log(f"Error closing browser: {e}", "warning") finally: self.browser = None # Stop playwright if self.playwright: try: self.playwright.stop() except Exception as e: self.log(f"Error stopping playwright: {e}", "warning") finally: self.playwright = None self.page = None def __del__(self): """Cleanup browser when instance is destroyed""" self._stop_browser() def __enter__(self): """Context manager entry - allows using 'with' statement""" return self def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit - ensures browser cleanup""" self._stop_browser() return False # Don't suppress exceptions def _extract_media_id_from_url(self, url: str) -> str: """Extract media ID from URL""" # URL format: various formats on storyclone.com # Try to extract meaningful ID from URL match = re.search(r'/([^/]+)/?$', url) if match: return match.group(1) return None def _update_file_timestamps(self, filepath: Path, post_date: datetime): """Update all timestamps for a file to match the post date""" try: # Convert datetime to timestamp timestamp = post_date.timestamp() # 1. Update file system timestamps (access time and modification time) os.utime(filepath, (timestamp, timestamp)) self.log(f"Updated file timestamps to {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug") # 2. Try to update creation time (platform-specific) if platform.system() == 'Darwin': # macOS # Use SetFile command on macOS date_str = post_date.strftime('%m/%d/%Y %H:%M:%S') try: subprocess.run( ['SetFile', '-d', date_str, str(filepath)], capture_output=True, text=True ) except (subprocess.SubprocessError, FileNotFoundError, OSError): pass # SetFile not available on this system elif platform.system() == 'Windows': # On Windows, use PowerShell with proper escaping to prevent injection filepath_escaped = str(filepath).replace("'", "''") date_escaped = post_date.isoformat().replace("'", "''") ps_command = f"(Get-Item -LiteralPath '{filepath_escaped}').CreationTime = Get-Date '{date_escaped}'" try: subprocess.run( ['powershell', '-Command', ps_command], capture_output=True, text=True ) except (subprocess.SubprocessError, FileNotFoundError, OSError): pass # PowerShell command failed # Linux doesn't support changing creation time # 3. Update EXIF data for images if str(filepath).lower().endswith(('.jpg', '.jpeg', '.png')): self._update_exif_timestamps(filepath, post_date) except Exception as e: self.log(f"Error updating timestamps: {e}", "warning") def _update_exif_timestamps(self, filepath: Path, post_date: datetime): """Update EXIF timestamps in image files""" try: # Check if exiftool is available result = subprocess.run(['which', 'exiftool'], capture_output=True, text=True) if result.returncode == 0: # Format date for EXIF exif_date = post_date.strftime('%Y:%m:%d %H:%M:%S') # Update all date fields in EXIF including MetadataDate for Immich cmd = [ 'exiftool', '-overwrite_original', '-quiet', f'-AllDates={exif_date}', f'-MetadataDate={exif_date}', '-HistoryWhen=', f'-FileModifyDate={exif_date}', str(filepath) ] subprocess.run(cmd, capture_output=True, text=True) self.log(f"Updated EXIF timestamps", "debug") except (subprocess.SubprocessError, OSError, FileNotFoundError): # Silently skip if exiftool not available pass def _extract_post_date(self, page) -> datetime: """Try to extract post date from page""" try: # Wait a moment for dynamic content to load page.wait_for_timeout(500) # Look for date elements on StoryClon e date_selectors = [ 'time[datetime]', 'time', '.date', '[datetime]', 'span.date', 'div.date', '.story-date', '.post-date' ] for selector in date_selectors: elem = page.locator(selector).first if elem.count() > 0: # Try datetime attribute first datetime_str = elem.get_attribute('datetime') if datetime_str: # Parse ISO format for fmt in ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d']: try: return datetime.strptime(datetime_str.split('.')[0].replace('Z', ''), fmt) except ValueError: continue # Try text content text = elem.text_content() if text: # Parse various date formats if "ago" in text.lower(): # Handle relative dates if "hour" in text: hours = int(re.search(r'(\d+)', text).group(1)) return datetime.now() - timedelta(hours=hours) elif "day" in text: days = int(re.search(r'(\d+)', text).group(1)) return datetime.now() - timedelta(days=days) elif "week" in text: weeks = int(re.search(r'(\d+)', text).group(1)) return datetime.now() - timedelta(weeks=weeks) else: # Try parsing absolute date for fmt in ['%B %d, %Y', '%b %d, %Y', '%Y-%m-%d']: try: return datetime.strptime(text, fmt) except ValueError: continue except Exception as e: self.log(f"Error extracting date: {e}", "debug") return None def _parse_storyclone_filename(self, filename: str, profile_name: str) -> datetime: """ Parse date from StoryClon e filename format and adjust for timezone Format: evalongoria-2025-10-23T17-42-56.jpg StoryClon e uses UTC, so subtract 4 hours to get local time Args: filename: StoryClon e filename profile_name: Username to strip from beginning Returns: datetime object adjusted to local time, or None if parsing failed """ try: # Remove extension filename_no_ext = Path(filename).stem # Check if it starts with profile name if filename_no_ext.startswith(f"{profile_name}-"): # Extract date part: 2025-10-23T17-42-56 date_part = filename_no_ext[len(f"{profile_name}-"):] # Parse ISO-like format with hyphens instead of colons # 2025-10-23T17-42-56 -> 2025-10-23 17:42:56 date_part_clean = date_part.replace('T', ' ') # Replace only the time part hyphens with colons parts_dt = date_part_clean.split(' ') if len(parts_dt) == 2: date_portion = parts_dt[0] # 2025-10-23 time_portion = parts_dt[1].replace('-', ':') # 17-42-56 -> 17:42:56 datetime_str = f"{date_portion} {time_portion}" # Parse the datetime (this is in UTC) parsed_date = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S') # Subtract 4 hours to convert from UTC to local time local_date = parsed_date - timedelta(hours=4) return local_date except Exception as e: self.log(f"Error parsing StoryClon e filename '{filename}': {e}", "debug") return None def _parse_story_date_text(self, date_text: str) -> datetime: """ Parse StoryClon e date text format Examples: "Posted on today at 1:42 PM" "Posted on today at 1:44 PM" Returns: datetime object or None if parsing failed """ try: # StoryClon e format: "Posted on today at 1:42 PM" if "Posted on today at" in date_text: # Extract time part (e.g., "1:42 PM") time_match = re.search(r'(\d{1,2}):(\d{2})\s*(AM|PM)', date_text, re.IGNORECASE) if time_match: hour = int(time_match.group(1)) minute = int(time_match.group(2)) am_pm = time_match.group(3).upper() # Convert to 24-hour format if am_pm == 'PM' and hour != 12: hour += 12 elif am_pm == 'AM' and hour == 12: hour = 0 # Use today's date with the extracted time now = datetime.now() story_datetime = now.replace(hour=hour, minute=minute, second=0, microsecond=0) return story_datetime # Could add more date formats here if needed except Exception as e: self.log(f"Error parsing date text '{date_text}': {e}", "debug") return None def _record_download(self, username: str, url: str, filename: str, post_date=None, metadata: dict = None, file_path: str = None, deferred: bool = False): """Record a download in the database Args: deferred: If True, don't record to database now - add to pending_downloads list for later recording after file move is complete """ # If deferred, store for later recording instead of recording now if deferred: self.pending_downloads.append({ 'username': username, 'url': url, 'filename': filename, 'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date, 'file_path': file_path, 'metadata': metadata }) self.log(f"Deferred recording for {filename}", "debug") return True if not self.db: return try: self.db.mark_downloaded( username=username, url=url, filename=filename, post_date=post_date, metadata=metadata, file_path=file_path ) except Exception as e: self.log(f"Failed to record download: {e}", "debug") def get_pending_downloads(self): """Get list of downloads that were deferred for later recording""" return self.pending_downloads.copy() def clear_pending_downloads(self): """Clear the pending downloads list after they've been recorded""" self.pending_downloads = [] def _scan_existing_files(self, output_dir: Path, profile_name: str): """Scan directory for existing files and extract media IDs and dates""" self.downloaded_files.clear() self.file_dates = {} # Map media_id -> datetime # Patterns: Both my format and StoryClon e format for pattern in ["*.jpg", "*.jpeg", "*.png", "*.heic", "*.mp4", "*.mov"]: for filepath in output_dir.glob(pattern): # Skip corrupted/incomplete files (less than 20KB) if filepath.stat().st_size < 20000: self.log(f"Skipping corrupted file (size < 20KB): {filepath.name}", "debug") continue filename = filepath.stem media_id = None file_date = None # Try my FastDL format: profile_YYYYMMDD_HHMMSS_mediaid.ext parts = filename.split('_', 3) if len(parts) >= 4 and parts[0] == profile_name: media_id = parts[3] # Everything after date/time # Parse date from filename try: date_str = f"{parts[1]}_{parts[2]}" # YYYYMMDD_HHMMSS file_date = datetime.strptime(date_str, '%Y%m%d_%H%M%S') except (ValueError, IndexError): pass # Try StoryClon e format: profile-YYYY-MM-DDTHH-MM-SS.ext elif filename.startswith(f"{profile_name}-"): # Example: evalongoria-2025-10-23T17-42-56 # Extract: 2025-10-23T17-42-56 date_part = filename[len(f"{profile_name}-"):] try: # Parse ISO-like format with hyphens instead of colons # 2025-10-23T17-42-56 -> 2025-10-23 17:42:56 date_part_clean = date_part.replace('T', ' ') # Replace only the time part hyphens with colons # Split on space to separate date and time parts_dt = date_part_clean.split(' ') if len(parts_dt) == 2: date_portion = parts_dt[0] # 2025-10-23 time_portion = parts_dt[1].replace('-', ':') # 17-42-56 -> 17:42:56 datetime_str = f"{date_portion} {time_portion}" # Parse the datetime parsed_date = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S') # Subtract 4 hours to convert from UTC to local time file_date = parsed_date - timedelta(hours=4) # Use the date part as media_id media_id = filename[len(f"{profile_name}-"):] except Exception as e: self.log(f"Could not parse StoryClon e date from {filename}: {e}", "debug") # Still use as media_id for duplicate detection media_id = filename[len(f"{profile_name}-"):] if media_id: self.downloaded_files.add(media_id) if file_date: self.file_dates[media_id] = file_date if self.downloaded_files: self.log(f"Found {len(self.downloaded_files)} valid existing files for {profile_name} ({len(self.file_dates)} with dates)", "debug") def _get_processed_posts(self, username: str) -> set: """Get set of story IDs that have been processed from database""" processed = set() if not self.db: return processed try: with self.db.get_connection() as conn: cursor = conn.cursor() # Get all stories for this user from downloads table cursor.execute(''' SELECT url, filename, metadata FROM downloads WHERE platform = 'snapchat' AND source = ? ''', (username,)) for row in cursor.fetchall(): url, filename, metadata_str = row # Extract media_id from filename if filename: # Format: username_date_MEDIAID.ext or username_date_MEDIAID_N.ext parts = filename.split('_') if len(parts) >= 4: # Get everything after date/time as media_id media_id = '_'.join(parts[3:]).split('.')[0] processed.add(media_id) # Also check metadata for media_id if metadata_str: try: metadata = json.loads(metadata_str) if 'media_id' in metadata: processed.add(metadata['media_id']) except (json.JSONDecodeError, KeyError, TypeError): pass if processed: self.log(f"Found {len(processed)} processed stories in database for {username}", "debug") except Exception as e: self.log(f"Error loading processed stories from database: {e}", "debug") return processed def save_cookies(self, context): """Save cookies to database or file""" cookies = context.cookies() # Save to database if available if self.unified_db: try: self.unified_db.save_scraper_cookies(self.scraper_id, cookies) self.log(f"Saved {len(cookies)} cookies to database", "debug") return except Exception as e: self.log(f"Error saving cookies to database: {e}", "warning") # Fallback to file-based storage if self.cookie_file: storage_data = { 'cookies': cookies, 'timestamp': datetime.now().isoformat() } with open(self.cookie_file, 'w') as f: json.dump(storage_data, f, indent=2) self.log(f"Saved {len(cookies)} cookies to file", "debug") def load_cookies(self, context): """Load saved cookies from database or file""" # Try loading from database first if self.unified_db: try: cookies = self.unified_db.get_scraper_cookies(self.scraper_id) if cookies: # Clean cookies - remove unsupported properties cleaned_cookies = [] for cookie in cookies: cleaned = {k: v for k, v in cookie.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']} cleaned_cookies.append(cleaned) context.add_cookies(cleaned_cookies) self.log(f"Loaded {len(cleaned_cookies)} cookies from database", "info") return True except Exception as e: self.log(f"Error loading cookies from database: {e}", "warning") # Fallback to file-based cookies if not self.cookie_file or not self.cookie_file.exists(): return False try: with open(self.cookie_file, 'r') as f: data = json.load(f) # Check age (24 hours) saved_time = datetime.fromisoformat(data['timestamp']) if datetime.now() - saved_time > timedelta(hours=24): self.log("Cookies expired", "debug") return False # Clean cookies - remove unsupported properties cleaned_cookies = [] for cookie in data['cookies']: # Remove Chrome-specific properties that Playwright doesn't support cleaned = {k: v for k, v in cookie.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']} cleaned_cookies.append(cleaned) context.add_cookies(cleaned_cookies) self.log(f"Loaded {len(cleaned_cookies)} cookies from file", "info") return True except Exception as e: self.log(f"Failed to load cookies: {e}", "warning") return False def wait_for_cloudflare(self, page): """Wait for Cloudflare to auto-solve or page to load - uses FlareSolverr when needed""" self.log("Waiting for page to load...", "debug") max_wait = 120 # Extended wait to match ImgInn flaresolverr_attempted = False for i in range(max_wait): time.sleep(1) # Check current URL and content try: current_url = page.url content = page.content().lower() except Exception as e: if "navigating" in str(e).lower(): self.log("Page still navigating, waiting...", "debug") continue else: raise # Check for actual Cloudflare challenge or server error challenge_indicators = ['checking your browser', 'just a moment', 'verify you are human', 'enable javascript'] error_indicators = ['internal server error', 'error code 500', 'error code 502', 'error code 503'] has_challenge = any(indicator in content for indicator in challenge_indicators) has_error = any(indicator in content for indicator in error_indicators) if has_error: self.log("Server error detected (500/502/503) - site is likely down", "error") return False if has_challenge: if not flaresolverr_attempted: self.log("Cloudflare challenge detected, attempting FlareSolverr bypass...", "info", module="Cloudflare") # Try to get fresh cookies via FlareSolverr if self._get_cookies_via_flaresolverr(page.url): self.log("Got fresh cookies from FlareSolverr, reloading page...", "info", module="Cloudflare") # Reload cookies in browser context try: self.load_cookies(self.context) # Reload the page with new cookies page.reload(wait_until='domcontentloaded', timeout=10000) time.sleep(2) # Give page time to load with new cookies except Exception as e: self.log(f"Error reloading page with new cookies: {e}", "debug") else: self.log("FlareSolverr failed, waiting for challenge to resolve...", "warning", module="Cloudflare") flaresolverr_attempted = True continue # Check if we're on the correct page with content if 'storyclone.com' in current_url.lower(): # Look for story content indicators if 'story' in content or 'username' in content or 'download' in content or 'stories' in content: self.log(f"Page loaded after {i+1} seconds", "info") return True # Status updates if i == 10: self.log("Still waiting (10s)... Cloudflare is checking", "debug") elif i == 20: self.log("Still waiting (20s)... Cloudflare challenge ongoing", "info") elif i == 30: self.log("Still waiting (30s)... This is normal for Cloudflare", "info") # Timeout reached self.log(f"Page load timeout. URL: {page.url}", "error") return False def download(self, username: str, content_type: str = "stories", days_back: int = 14, max_downloads: int = 50, output_dir: str = None, phrase_config: dict = None, defer_database: bool = False): """Download content from a user - compatible with media-downloader interface Args: username: Snapchat username content_type: Type of content ("stories" only for Snapchat) days_back: How many days back to search max_downloads: Maximum stories to download output_dir: Output directory phrase_config: Not used for Snapchat (stories don't have captions usually) defer_database: If True, don't record to database immediately - store in pending_downloads for later recording after file move is complete """ self.defer_database = defer_database # Store for use in download methods # Clear downloaded_files cache between accounts to prevent memory growth self.downloaded_files.clear() # Check site status before doing anything else self.log(f"Checking {self.proxy_domain} site status...", "debug") site_status, error_msg = self.cf_handler.check_site_status(f"https://{self.proxy_domain}/", timeout=10) if self.cf_handler.should_skip_download(site_status): self.log(f"Skipping download - {self.proxy_domain} is unavailable: {error_msg}", "warning") return 0 elif site_status == SiteStatus.CLOUDFLARE_CHALLENGE: self.log("Cloudflare challenge detected, will attempt bypass during download", "info") # Set output directory if output_dir: output_path = Path(output_dir) / username else: output_path = Path(f"/opt/media-downloader/downloads/{username}") # Route to download method if content_type == "stories": files = self.download_stories( username=username, days_back=days_back, max_stories=max_downloads, output_dir=output_path ) else: self.log(f"Snapchat downloader does not support content type: {content_type}", "warning") return 0 return len(files) def download_stories(self, username: str, days_back: int = 14, max_stories: int = 50, output_dir: Path = None): """Download stories from a Snapchat user with FastDL naming Args: username: Snapchat username days_back: How many days back to search max_stories: Maximum stories to download output_dir: Output directory """ profile_name = username.lower() if output_dir is None: output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}") output_dir.mkdir(parents=True, exist_ok=True) # Scan existing files self._scan_existing_files(output_dir, profile_name) # Get processed stories from database processed_stories = self._get_processed_posts(profile_name) self.log(f"Loaded {len(processed_stories)} processed stories for {profile_name} from database", "info") downloaded_files = [] cutoff_date = datetime.now() - timedelta(days=days_back) # Update activity status self.activity_manager.update_status(f"Checking stories from @{profile_name}") # Start or reuse browser self._start_browser() page = self.page try: # Navigate to user's stories page on StoryClon e self.log(f"Navigating to @{username} on {self.proxy_domain}", "info") page.goto(f"https://{self.proxy_domain}/u/{username}/", wait_until='domcontentloaded') # Wait for page to load if not self.wait_for_cloudflare(page): self.log("Page didn't load properly", "error") return [] # Save cookies self.save_cookies(self.context) # Wait for page to load self.log("Waiting for page to load...", "info") time.sleep(3) # Give page time to load content # Check if "Stories" section exists - if not, there are no stories to scrape stories_section = page.locator('div.font-semibold.ml-6:has-text("Stories")').first if stories_section.count() == 0: self.log("No 'Stories' section found - user has no stories available", "info") return [] self.log("Found 'Stories' section - proceeding to load all stories...", "info") # Scroll down and load all stories by clicking "Load More" button self.log("Scrolling to load all stories...", "info") load_more_clicks = 0 max_attempts = 20 for attempt in range(max_attempts): # Step 1: Scroll until we see "Spotlight Highlights" self.log("Scrolling until 'Spotlight Highlights' is visible...", "debug") scroll_attempts = 0 max_scrolls = 10 while scroll_attempts < max_scrolls: spotlight_highlights = page.locator('text=Spotlight Highlights').first if spotlight_highlights.count() > 0: self.log("Found 'Spotlight Highlights' in view", "debug") break page.evaluate("window.scrollBy(0, 400)") time.sleep(1) scroll_attempts += 1 # Step 2: Check if there's a "Load More" button ABOVE "Spotlight Highlights" (positionally before) load_more_btn = page.locator('button:has-text("Load More"), button.load-more-button').first spotlight_highlights = page.locator('text=Spotlight Highlights').first load_more_visible = load_more_btn.count() > 0 and load_more_btn.is_visible() spotlight_visible = spotlight_highlights.count() > 0 if load_more_visible and spotlight_visible: # Both are visible - check Y positions to see which comes first load_more_box = load_more_btn.bounding_box() spotlight_box = spotlight_highlights.bounding_box() if load_more_box and spotlight_box: load_more_y = load_more_box['y'] spotlight_y = spotlight_box['y'] if load_more_y < spotlight_y: # "Load More" is ABOVE "Spotlight Highlights" → Click it load_more_clicks += 1 self.log(f"Found 'Load More' ABOVE 'Spotlight Highlights' (Y:{load_more_y:.0f} < {spotlight_y:.0f}) - clicking (click #{load_more_clicks})...", "info") load_more_btn.click() time.sleep(2.5) # Wait for more posts to load items_count = len(page.locator('.item').all()) self.log(f"Items after click: {items_count}", "debug") # Go back and scroll to "Spotlight Highlights" again (it will be pushed down) continue else: # "Load More" is BELOW "Spotlight Highlights" → We're done items_final = page.locator('.item').all() self.log(f"'Load More' is BELOW 'Spotlight Highlights' (Y:{load_more_y:.0f} > {spotlight_y:.0f}) - done! Found {len(items_final)} stories (clicked Load More {load_more_clicks} times)", "info") break elif spotlight_visible: # Only "Spotlight Highlights" visible, no "Load More" → We're done items_final = page.locator('.item').all() self.log(f"No 'Load More' button found - done! Found {len(items_final)} stories (clicked Load More {load_more_clicks} times)", "info") break else: # Neither visible, keep trying self.log("Neither 'Load More' nor 'Spotlight Highlights' found, continuing...", "debug") continue # Find story/media elements by processing each .item container # This ensures lazy-loaded content is properly triggered self.log("Extracting media from story items...", "info") # Get Y position of "Spotlight Highlights" to filter out items after it spotlight_highlights = page.locator('text=Spotlight Highlights').first spotlight_y = None if spotlight_highlights.count() > 0: spotlight_box = spotlight_highlights.bounding_box() if spotlight_box: spotlight_y = spotlight_box['y'] self.log(f"'Spotlight Highlights' Y position: {spotlight_y:.0f}", "debug") # Get all .item elements all_items = page.locator('.item').all() # Filter to only items BEFORE "Spotlight Highlights" story_items = [] for item in all_items: item_box = item.bounding_box() if item_box and spotlight_y: item_y = item_box['y'] if item_y < spotlight_y: story_items.append(item) elif not spotlight_y: # No Spotlight Highlights found, include all items story_items.append(item) self.log(f"Filtered to {len(story_items)} story items (before Spotlight Highlights) from {len(all_items)} total items", "info") media_elements = [] for idx, item in enumerate(story_items): try: # Scroll item into view to trigger lazy loading item.scroll_into_view_if_needed() time.sleep(0.3) # Give it a moment to load # Look for video first video = item.locator('video[src]').first if video.count() > 0: media_elements.append(video) self.log(f"Item {idx+1}: Found video", "debug") continue # If no video, look for image from Snapchat CDN img = item.locator('img[src*="sc-cdn.net"]').first if img.count() > 0: src = img.get_attribute('src') # Skip apple icons, favicons, and poster images if src and 'apple-icon' not in src and 'favicon' not in src and '/d/' in src: media_elements.append(img) self.log(f"Item {idx+1}: Found image", "debug") continue self.log(f"Item {idx+1}: No media found (may be lazy-loading)", "debug") except Exception as e: self.log(f"Item {idx+1}: Error processing - {e}", "debug") self.log(f"Extracted {len(media_elements)} media elements from {len(story_items)} items", "info") if not media_elements: self.log("No stories found for this user", "warning") return [] self.log(f"Found {len(media_elements)} potential story items", "info") # Download each story story_index = 1 for i, media_elem in enumerate(media_elements[:max_stories]): try: # Get media URL media_url = None # Try to get src attribute media_url = media_elem.get_attribute('src') # If no src, try href (for download links) if not media_url or media_url == '#': media_url = media_elem.get_attribute('href') if not media_url or media_url == '#' or media_url.startswith('data:'): self.log(f"Story {story_index}: Invalid media URL", "warning") continue self.log(f"Story {story_index}: {media_url[:80]}...", "debug") # Try to get higher quality version by replacing size parameter # URLs look like: https://.../{id}.1034.IRZXSOY?... # Try larger sizes: 2048, 1920, 1440, 1034 (original) import re hq_url = None original_url = media_url # Check if URL has a size parameter pattern size_match = re.search(r'\.(\d+)\.IRZXSOY', media_url) if size_match: original_size = size_match.group(1) # Try larger sizes (in descending order) for test_size in ['2048', '1920', '1440']: if int(test_size) > int(original_size): test_url = media_url.replace(f'.{original_size}.IRZXSOY', f'.{test_size}.IRZXSOY') # Test if this URL is accessible try: import requests response = requests.head(test_url, timeout=5, allow_redirects=True) if response.status_code == 200: hq_url = test_url self.log(f"Story {story_index}: Found higher quality version (size {test_size})", "info") break except requests.RequestException: continue # Use HQ URL if found, otherwise use original if hq_url: media_url = hq_url # Extract media ID from URL and determine correct extension from urllib.parse import urlparse, unquote url_path = urlparse(media_url).path original_name = unquote(url_path.split('/')[-1].split('?')[0]) # Determine file type from element or URL # Snapchat CDN uses weird extensions like .IRZXSOY, so we need to detect the actual type if media_elem.evaluate("element => element.tagName").lower() == 'video': ext = '.mp4' # Videos are MP4 else: ext = '.jpg' # Images are JPG # Use the full filename as media_id (without fake extension) media_id = original_name.split('.')[0] # Take first part before any dots # Check if already downloaded if media_id in self.downloaded_files or media_id in processed_stories: self.log(f"Story {story_index}: Already downloaded ({media_id}), skipping", "debug") story_index += 1 continue # Extract post date from the story item on the page story_date = None try: # Try multiple strategies to find the date associated with THIS specific story # Strategy 1: Look in the immediate parent of the media element immediate_parent = media_elem.locator('xpath=..').first if immediate_parent.count() > 0: date_elem = immediate_parent.locator('.text-sm').first if date_elem.count() > 0: date_text = date_elem.text_content() if date_text and ("Posted on" in date_text or "at" in date_text): self.log(f"Story {story_index}: Found date in immediate parent: '{date_text}'", "debug") story_date = self._parse_story_date_text(date_text) if story_date: self.log(f"Story {story_index}: Extracted date from page: {story_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug") # Strategy 2: If not found, try the closest ancestor with a limited depth if not story_date: # Look for a closer parent (not going all the way up) for depth in [1, 2, 3]: parent_xpath = 'xpath=' + '/'.join(['..'] * depth) parent = media_elem.locator(parent_xpath).first if parent.count() > 0: # Get only the FIRST .text-sm in this parent date_elem = parent.locator('.text-sm').first if date_elem.count() > 0: date_text = date_elem.text_content() if date_text and ("Posted on" in date_text or "at" in date_text): self.log(f"Story {story_index}: Found date at depth {depth}: '{date_text}'", "debug") story_date = self._parse_story_date_text(date_text) if story_date: self.log(f"Story {story_index}: Extracted date from page: {story_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug") break if not story_date: self.log(f"Story {story_index}: Could not find date text for this story", "debug") except Exception as e: self.log(f"Story {story_index}: Could not extract date - {e}", "debug") import traceback self.log(f"Story {story_index}: Traceback: {traceback.format_exc()}", "debug") # Fallback to current time if extraction failed if not story_date: story_date = datetime.now() self.log(f"Story {story_index}: Using current time as fallback", "debug") date_str = story_date.strftime('%Y%m%d_%H%M%S') # Build filename: {profile}_{date}_{media_id}{ext} filename = f"{profile_name}_{date_str}_{media_id}{ext}" filepath = output_dir / filename # Download the story try: import requests # Ensure full URL if not media_url.startswith('http'): media_url = f"https:{media_url}" if media_url.startswith('//') else f"https://{self.proxy_domain}{media_url}" response = requests.get(media_url, timeout=30, headers={ 'User-Agent': self.user_agent, 'Referer': f'https://{self.proxy_domain}/' }, cookies=self._get_cookies_for_requests()) response.raise_for_status() # Save file with open(filepath, 'wb') as f: f.write(response.content) self.log(f"Downloaded story: {filename} ({len(response.content)} bytes)", "info") downloaded_files.append(str(filepath)) # Check for duplicate hash before recording if self.db: from pathlib import Path as PathLib # Check for duplicate hash (hash blacklist persists even if original deleted) file_hash = self.db.get_file_hash(str(filepath)) if file_hash: existing = self.db.get_download_by_file_hash(file_hash) if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'): # Duplicate hash found - content was already downloaded (prevents redownload of deleted content) self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning") # Delete the duplicate regardless of whether original file still exists try: filepath.unlink() self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug") continue except Exception as e: self.log(f"Failed to delete duplicate {filename}: {e}", "warning") # Update timestamps self._update_file_timestamps(filepath, story_date) # Add to tracking self.downloaded_files.add(media_id) # Record in database with media_id in metadata self._record_download( username=profile_name, url=media_url, filename=filename, post_date=story_date, metadata={'media_id': media_id}, file_path=str(filepath), deferred=getattr(self, 'defer_database', False) ) story_index += 1 except Exception as e: self.log(f"Failed to download story {story_index}: {e}", "error") story_index += 1 continue except Exception as e: self.log(f"Error processing story {story_index}: {e}", "error") story_index += 1 continue self.log(f"Downloaded {len(downloaded_files)} story files", "info") except Exception as e: self.log(f"Error downloading stories: {e}", "error") import traceback self.log(f"Traceback: {traceback.format_exc()}", "debug") # Don't close browser here - reuse it for next profile return downloaded_files def main(): """Test the downloader with FastDL naming""" import sys print("=" * 60) print("Snapchat Downloader (StoryClon e) - FastDL Compatible Naming") print("=" * 60) print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") print("=" * 60) downloader = SnapchatDownloader( api_key=None, # Add your 2captcha key if needed headless=False # Use with xvfb ) # Test username (replace with actual Snapchat username) test_username = sys.argv[1] if len(sys.argv) > 1 else "testuser" # Download stories files = downloader.download_stories( username=test_username, days_back=7, max_stories=50 ) print("\n" + "=" * 60) print("RESULTS") print("=" * 60) if files: print(f"Successfully downloaded {len(files)} files!") print("\nDownloaded files (FastDL naming format):") for f in files: name = Path(f).name size = Path(f).stat().st_size / 1024 parts = name.split('_', 3) if len(parts) >= 4: print(f" - {name}") print(f" Profile: {parts[0]}") print(f" Date: {parts[1]}_{parts[2]}") print(f" Media ID: {parts[3].split('.')[0]}") print(f" Size: {size:.1f} KB") else: print("No files downloaded") # Check total in folder download_dir = Path(f"/opt/media-downloader/downloads/{test_username}") if download_dir.exists(): all_files = list(download_dir.glob("*")) total_size = sum(f.stat().st_size for f in all_files) / 1024 print(f"\nTotal in folder: {len(all_files)} files ({total_size:.1f} KB)") if __name__ == "__main__": main()