Files
media-downloader/docs/archive/snapchat_module_storyclon.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

1302 lines
58 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Snapchat downloader module using StoryClone proxy (s.storyclone.com)
Based on ImgInn module structure with FastDL-compatible file naming
Format: {profile}_{YYYYMMDD_HHMMSS}_{media_id}{ext}
"""
# Allow nested event loops for compatibility with asyncio contexts
try:
import nest_asyncio
nest_asyncio.apply()
except ImportError:
pass
import os
import json
import time
import re
import subprocess
import platform
import requests
from pathlib import Path
from datetime import datetime, timedelta
from modules.base_module import LoggingMixin
from modules.universal_logger import get_logger
from modules.cloudflare_handler import CloudflareHandler, SiteStatus, get_flaresolverr_user_agent
from playwright.sync_api import sync_playwright
class SnapchatDownloader(LoggingMixin):
"""Snapchat downloader using StoryClon e with FastDL-compatible naming"""
def __init__(self,
             headless: bool = True,
             cookie_file: str = "/opt/media-downloader/cookies/snapchat_cookies.json",
             show_progress: bool = True,
             use_database: bool = True,
             log_callback=None,
             unified_db=None,
             proxy_domain: str = "sn.storyclone.com"):
    """Initialize downloader compatible with media-downloader system

    Args:
        headless: Passed to the Chromium launch call when the browser starts
        cookie_file: Cookie file path; only used when no unified_db is given
        show_progress: Stored on the instance as-is
        use_database: Enables download tracking when unified_db is also given
        log_callback: Forwarded to the logging mixin initializer
        unified_db: Unified database object; when given it supplies the
            scraper configuration and cookie storage
        proxy_domain: Default proxy domain, replaced by the scraper's
            'base_url' setting when one is configured
    """
    # Browser handles (reused across profiles). Assigned before anything that
    # can raise so that _stop_browser()/__del__ always find these attributes.
    self.playwright = None
    self.browser = None
    self.context = None
    self.page = None

    self.headless = headless
    self.downloaded_files = set()  # Track downloaded media IDs
    self.file_dates = {}  # Map media_id -> datetime from existing filenames
    self.show_progress = show_progress
    self.use_database = use_database
    self.download_count = 0
    self.unified_db = unified_db  # Store for scraper config access
    self.scraper_id = 'snapchat'  # Scraper ID in database
    # download() overwrites this per call; the default keeps the attribute
    # defined when download_stories() is invoked directly
    self.defer_database = False

    # Initialize logging via mixin
    self._init_logger('Snapchat', log_callback, default_module='Download')

    # Use unified database if provided
    if unified_db and use_database:
        from modules.unified_database import SnapchatDatabaseAdapter
        self.db = SnapchatDatabaseAdapter(unified_db)
    else:
        self.db = None
        self.use_database = False

    # Initialize activity status manager for real-time updates
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)

    # Load scraper configuration from database if available
    self.proxy_url = None
    self.cookie_file = None  # Default to None (use database)
    self.proxy_domain = proxy_domain  # Default proxy domain
    if unified_db:
        scraper_config = unified_db.get_scraper(self.scraper_id)
        if scraper_config:
            # Get proxy configuration
            if scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                self.proxy_url = scraper_config['proxy_url']
                self.log(f"Using proxy: {self.proxy_url}", "info")
            # Get base URL (proxy domain) from database, reduced to a bare host
            if scraper_config.get('base_url'):
                self.proxy_domain = scraper_config['base_url'].replace('https://', '').replace('http://', '').rstrip('/')

    # Fall back to cookie file if no database
    if not unified_db:
        self.cookie_file = Path(cookie_file)
        self.cookie_file.parent.mkdir(parents=True, exist_ok=True)

    # User-Agent to match FlareSolverr (dynamically fetched for consistency)
    self.user_agent = get_flaresolverr_user_agent()

    # Initialize universal Cloudflare handler
    # Pass proxy_url if configured, and cookie_file=None for database storage
    self.cf_handler = CloudflareHandler(
        module_name="Snapchat",
        cookie_file=str(self.cookie_file) if self.cookie_file else None,
        user_agent=self.user_agent,
        logger=self.logger,
        aggressive_expiry=True,
        proxy_url=self.proxy_url  # Pass proxy to FlareSolverr
    )

    # Keep for backwards compatibility
    self.flaresolverr_url = self.cf_handler.flaresolverr_url
    self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

    self.pending_downloads = []  # Track downloads for deferred database recording

    # Load cookies from database if available
    self._load_cookies_from_db()

    # Check if we need to get initial cookies
    if not self._has_valid_cookies():
        self.log("No cookies found, will load cookies on first use", "info")
def _load_cookies_from_db(self):
"""Load cookies from database if available"""
if not self.unified_db:
return
try:
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
if cookies:
# Load into CloudflareHandler
self.cf_handler._cookies = cookies
self.log(f"Loaded {len(cookies)} cookies from database", "debug")
except Exception as e:
self.log(f"Error loading cookies from database: {e}", "warning")
def _save_cookies_to_db(self, cookies: list):
"""Save cookies to database"""
if not self.unified_db:
return
try:
self.unified_db.save_scraper_cookies(
self.scraper_id,
cookies,
user_agent=self.user_agent,
merge=True
)
self.log(f"Saved {len(cookies)} cookies to database", "debug")
except Exception as e:
self.log(f"Error saving cookies to database: {e}", "warning")
def _has_valid_cookies(self):
"""Check if we have valid cookies (either in file or database)"""
if self.unified_db:
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
return cookies and len(cookies) > 0
elif self.cookie_file:
return self.cookie_file.exists()
return False
def _cookies_expired(self):
"""Check if cookies are expired - delegates to CloudflareHandler"""
return self.cf_handler.cookies_expired()
def _get_cookies_for_requests(self):
"""Get cookies in format for requests library - delegates to CloudflareHandler"""
return self.cf_handler.get_cookies_dict()
def _get_cookies_via_flaresolverr(self, url=None, max_retries=2):
"""Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler
Args:
url: URL to fetch (defaults to proxy_domain)
max_retries: Maximum number of retry attempts (default: 2)
Returns:
True if cookies obtained successfully, False otherwise
"""
if url is None:
url = f"https://{self.proxy_domain}/"
success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)
# Save cookies to database if successful
if success and self.unified_db:
cookies_list = self.cf_handler.get_cookies_list()
if cookies_list:
self._save_cookies_to_db(cookies_list)
return success
def _start_browser(self):
"""Start browser if not already running (reusable across profiles)"""
# Try to get fresh cookies via FlareSolverr if we don't have them or they're old
# Do this BEFORE the browser reuse check so cookies are always checked
if not self._has_valid_cookies() or self._cookies_expired():
self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info", module="Cloudflare")
if self._get_cookies_via_flaresolverr():
self.log("Successfully got fresh cookies from FlareSolverr", "info", module="Cloudflare")
else:
self.log("FlareSolverr unavailable, will try with Playwright", "warning", module="Cloudflare")
if self.browser is not None:
self.log("Browser already running, reusing...", "debug", module="Browser")
return
import os
# Use environment variable if set, otherwise use standard location
if 'PLAYWRIGHT_BROWSERS_PATH' not in os.environ:
os.environ['PLAYWRIGHT_BROWSERS_PATH'] = '/root/.cache/ms-playwright'
os.environ['DISPLAY'] = ':100' # Use Xvfb virtual display
self.log("Starting browser (Chromium)...", "info", module="Browser")
self.playwright = sync_playwright().start()
self.browser = self.playwright.chromium.launch(
headless=self.headless,
args=[
'--disable-blink-features=AutomationControlled',
'--disable-dev-shm-usage',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-gpu',
'--disable-software-rasterizer',
'--disable-accelerated-2d-canvas',
'--disable-accelerated-video-decode'
]
)
# CRITICAL: User-Agent must match FlareSolverr for cookies to work
self.context = self.browser.new_context(
viewport={'width': 1920, 'height': 1080},
user_agent=self.user_agent
)
# Load cookies
self.load_cookies(self.context)
self.page = self.context.new_page()
# Add basic anti-detection
self.page.add_init_script("""
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
""")
self.log("Browser started and ready", "info", module="Browser")
def _stop_browser(self):
"""Stop the browser safely with proper error handling"""
# Close context first
if self.context:
try:
self.context.close()
self.log("Browser context closed", "debug", module="Browser")
except Exception as e:
self.log(f"Error closing browser context: {e}", "warning")
finally:
self.context = None
# Close browser
if self.browser:
try:
self.browser.close()
self.log("Browser closed", "debug", module="Browser")
except Exception as e:
self.log(f"Error closing browser: {e}", "warning")
finally:
self.browser = None
# Stop playwright
if self.playwright:
try:
self.playwright.stop()
except Exception as e:
self.log(f"Error stopping playwright: {e}", "warning")
finally:
self.playwright = None
self.page = None
def __del__(self):
"""Cleanup browser when instance is destroyed"""
self._stop_browser()
def __enter__(self):
"""Context manager entry - allows using 'with' statement"""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit - ensures browser cleanup"""
self._stop_browser()
return False # Don't suppress exceptions
def _extract_media_id_from_url(self, url: str) -> str:
"""Extract media ID from URL"""
# URL format: various formats on storyclone.com
# Try to extract meaningful ID from URL
match = re.search(r'/([^/]+)/?$', url)
if match:
return match.group(1)
return None
def _update_file_timestamps(self, filepath: Path, post_date: datetime):
"""Update all timestamps for a file to match the post date"""
try:
# Convert datetime to timestamp
timestamp = post_date.timestamp()
# 1. Update file system timestamps (access time and modification time)
os.utime(filepath, (timestamp, timestamp))
self.log(f"Updated file timestamps to {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
# 2. Try to update creation time (platform-specific)
if platform.system() == 'Darwin': # macOS
# Use SetFile command on macOS
date_str = post_date.strftime('%m/%d/%Y %H:%M:%S')
try:
subprocess.run(
['SetFile', '-d', date_str, str(filepath)],
capture_output=True,
text=True
)
except (subprocess.SubprocessError, FileNotFoundError, OSError):
pass # SetFile not available on this system
elif platform.system() == 'Windows':
# On Windows, use PowerShell with proper escaping to prevent injection
filepath_escaped = str(filepath).replace("'", "''")
date_escaped = post_date.isoformat().replace("'", "''")
ps_command = f"(Get-Item -LiteralPath '{filepath_escaped}').CreationTime = Get-Date '{date_escaped}'"
try:
subprocess.run(
['powershell', '-Command', ps_command],
capture_output=True,
text=True
)
except (subprocess.SubprocessError, FileNotFoundError, OSError):
pass # PowerShell command failed
# Linux doesn't support changing creation time
# 3. Update EXIF data for images
if str(filepath).lower().endswith(('.jpg', '.jpeg', '.png')):
self._update_exif_timestamps(filepath, post_date)
except Exception as e:
self.log(f"Error updating timestamps: {e}", "warning")
def _update_exif_timestamps(self, filepath: Path, post_date: datetime):
"""Update EXIF timestamps in image files"""
try:
# Check if exiftool is available
result = subprocess.run(['which', 'exiftool'], capture_output=True, text=True)
if result.returncode == 0:
# Format date for EXIF
exif_date = post_date.strftime('%Y:%m:%d %H:%M:%S')
# Update all date fields in EXIF including MetadataDate for Immich
cmd = [
'exiftool', '-overwrite_original', '-quiet',
f'-AllDates={exif_date}',
f'-MetadataDate={exif_date}',
'-HistoryWhen=',
f'-FileModifyDate={exif_date}',
str(filepath)
]
subprocess.run(cmd, capture_output=True, text=True)
self.log(f"Updated EXIF timestamps", "debug")
except (subprocess.SubprocessError, OSError, FileNotFoundError):
# Silently skip if exiftool not available
pass
def _extract_post_date(self, page) -> datetime:
"""Try to extract post date from page"""
try:
# Wait a moment for dynamic content to load
page.wait_for_timeout(500)
# Look for date elements on StoryClon e
date_selectors = [
'time[datetime]',
'time',
'.date',
'[datetime]',
'span.date',
'div.date',
'.story-date',
'.post-date'
]
for selector in date_selectors:
elem = page.locator(selector).first
if elem.count() > 0:
# Try datetime attribute first
datetime_str = elem.get_attribute('datetime')
if datetime_str:
# Parse ISO format
for fmt in ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d']:
try:
return datetime.strptime(datetime_str.split('.')[0].replace('Z', ''), fmt)
except ValueError:
continue
# Try text content
text = elem.text_content()
if text:
# Parse various date formats
if "ago" in text.lower():
# Handle relative dates
if "hour" in text:
hours = int(re.search(r'(\d+)', text).group(1))
return datetime.now() - timedelta(hours=hours)
elif "day" in text:
days = int(re.search(r'(\d+)', text).group(1))
return datetime.now() - timedelta(days=days)
elif "week" in text:
weeks = int(re.search(r'(\d+)', text).group(1))
return datetime.now() - timedelta(weeks=weeks)
else:
# Try parsing absolute date
for fmt in ['%B %d, %Y', '%b %d, %Y', '%Y-%m-%d']:
try:
return datetime.strptime(text, fmt)
except ValueError:
continue
except Exception as e:
self.log(f"Error extracting date: {e}", "debug")
return None
def _parse_storyclone_filename(self, filename: str, profile_name: str) -> datetime:
"""
Parse date from StoryClon e filename format and adjust for timezone
Format: evalongoria-2025-10-23T17-42-56.jpg
StoryClon e uses UTC, so subtract 4 hours to get local time
Args:
filename: StoryClon e filename
profile_name: Username to strip from beginning
Returns:
datetime object adjusted to local time, or None if parsing failed
"""
try:
# Remove extension
filename_no_ext = Path(filename).stem
# Check if it starts with profile name
if filename_no_ext.startswith(f"{profile_name}-"):
# Extract date part: 2025-10-23T17-42-56
date_part = filename_no_ext[len(f"{profile_name}-"):]
# Parse ISO-like format with hyphens instead of colons
# 2025-10-23T17-42-56 -> 2025-10-23 17:42:56
date_part_clean = date_part.replace('T', ' ')
# Replace only the time part hyphens with colons
parts_dt = date_part_clean.split(' ')
if len(parts_dt) == 2:
date_portion = parts_dt[0] # 2025-10-23
time_portion = parts_dt[1].replace('-', ':') # 17-42-56 -> 17:42:56
datetime_str = f"{date_portion} {time_portion}"
# Parse the datetime (this is in UTC)
parsed_date = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
# Subtract 4 hours to convert from UTC to local time
local_date = parsed_date - timedelta(hours=4)
return local_date
except Exception as e:
self.log(f"Error parsing StoryClon e filename '{filename}': {e}", "debug")
return None
def _parse_story_date_text(self, date_text: str) -> datetime:
"""
Parse StoryClon e date text format
Examples: "Posted on today at 1:42 PM"
"Posted on today at 1:44 PM"
Returns:
datetime object or None if parsing failed
"""
try:
# StoryClon e format: "Posted on today at 1:42 PM"
if "Posted on today at" in date_text:
# Extract time part (e.g., "1:42 PM")
time_match = re.search(r'(\d{1,2}):(\d{2})\s*(AM|PM)', date_text, re.IGNORECASE)
if time_match:
hour = int(time_match.group(1))
minute = int(time_match.group(2))
am_pm = time_match.group(3).upper()
# Convert to 24-hour format
if am_pm == 'PM' and hour != 12:
hour += 12
elif am_pm == 'AM' and hour == 12:
hour = 0
# Use today's date with the extracted time
now = datetime.now()
story_datetime = now.replace(hour=hour, minute=minute, second=0, microsecond=0)
return story_datetime
# Could add more date formats here if needed
except Exception as e:
self.log(f"Error parsing date text '{date_text}': {e}", "debug")
return None
def _record_download(self, username: str, url: str, filename: str,
post_date=None, metadata: dict = None, file_path: str = None,
deferred: bool = False):
"""Record a download in the database
Args:
deferred: If True, don't record to database now - add to pending_downloads list
for later recording after file move is complete
"""
# If deferred, store for later recording instead of recording now
if deferred:
self.pending_downloads.append({
'username': username,
'url': url,
'filename': filename,
'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
'file_path': file_path,
'metadata': metadata
})
self.log(f"Deferred recording for {filename}", "debug")
return True
if not self.db:
return
try:
self.db.mark_downloaded(
username=username,
url=url,
filename=filename,
post_date=post_date,
metadata=metadata,
file_path=file_path
)
except Exception as e:
self.log(f"Failed to record download: {e}", "debug")
def get_pending_downloads(self):
    """Return a shallow copy of the downloads deferred for later recording"""
    snapshot = list(self.pending_downloads)
    return snapshot
def clear_pending_downloads(self):
    """Start a fresh pending list once the deferred downloads are recorded.

    The attribute is rebound to a new list; the previous list object is left
    untouched.
    """
    self.pending_downloads = list()
def _scan_existing_files(self, output_dir: Path, profile_name: str):
"""Scan directory for existing files and extract media IDs and dates"""
self.downloaded_files.clear()
self.file_dates = {} # Map media_id -> datetime
# Patterns: Both my format and StoryClon e format
for pattern in ["*.jpg", "*.jpeg", "*.png", "*.heic", "*.mp4", "*.mov"]:
for filepath in output_dir.glob(pattern):
# Skip corrupted/incomplete files (less than 20KB)
if filepath.stat().st_size < 20000:
self.log(f"Skipping corrupted file (size < 20KB): {filepath.name}", "debug")
continue
filename = filepath.stem
media_id = None
file_date = None
# Try my FastDL format: profile_YYYYMMDD_HHMMSS_mediaid.ext
parts = filename.split('_', 3)
if len(parts) >= 4 and parts[0] == profile_name:
media_id = parts[3] # Everything after date/time
# Parse date from filename
try:
date_str = f"{parts[1]}_{parts[2]}" # YYYYMMDD_HHMMSS
file_date = datetime.strptime(date_str, '%Y%m%d_%H%M%S')
except (ValueError, IndexError):
pass
# Try StoryClon e format: profile-YYYY-MM-DDTHH-MM-SS.ext
elif filename.startswith(f"{profile_name}-"):
# Example: evalongoria-2025-10-23T17-42-56
# Extract: 2025-10-23T17-42-56
date_part = filename[len(f"{profile_name}-"):]
try:
# Parse ISO-like format with hyphens instead of colons
# 2025-10-23T17-42-56 -> 2025-10-23 17:42:56
date_part_clean = date_part.replace('T', ' ')
# Replace only the time part hyphens with colons
# Split on space to separate date and time
parts_dt = date_part_clean.split(' ')
if len(parts_dt) == 2:
date_portion = parts_dt[0] # 2025-10-23
time_portion = parts_dt[1].replace('-', ':') # 17-42-56 -> 17:42:56
datetime_str = f"{date_portion} {time_portion}"
# Parse the datetime
parsed_date = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
# Subtract 4 hours to convert from UTC to local time
file_date = parsed_date - timedelta(hours=4)
# Use the date part as media_id
media_id = filename[len(f"{profile_name}-"):]
except Exception as e:
self.log(f"Could not parse StoryClon e date from {filename}: {e}", "debug")
# Still use as media_id for duplicate detection
media_id = filename[len(f"{profile_name}-"):]
if media_id:
self.downloaded_files.add(media_id)
if file_date:
self.file_dates[media_id] = file_date
if self.downloaded_files:
self.log(f"Found {len(self.downloaded_files)} valid existing files for {profile_name} ({len(self.file_dates)} with dates)", "debug")
def _get_processed_posts(self, username: str) -> set:
"""Get set of story IDs that have been processed from database"""
processed = set()
if not self.db:
return processed
try:
with self.db.get_connection() as conn:
cursor = conn.cursor()
# Get all stories for this user from downloads table
cursor.execute('''
SELECT url, filename, metadata FROM downloads
WHERE platform = 'snapchat'
AND source = ?
''', (username,))
for row in cursor.fetchall():
url, filename, metadata_str = row
# Extract media_id from filename
if filename:
# Format: username_date_MEDIAID.ext or username_date_MEDIAID_N.ext
parts = filename.split('_')
if len(parts) >= 4:
# Get everything after date/time as media_id
media_id = '_'.join(parts[3:]).split('.')[0]
processed.add(media_id)
# Also check metadata for media_id
if metadata_str:
try:
metadata = json.loads(metadata_str)
if 'media_id' in metadata:
processed.add(metadata['media_id'])
except (json.JSONDecodeError, KeyError, TypeError):
pass
if processed:
self.log(f"Found {len(processed)} processed stories in database for {username}", "debug")
except Exception as e:
self.log(f"Error loading processed stories from database: {e}", "debug")
return processed
def save_cookies(self, context):
    """Store the browser context's cookies in the database, else in the file.

    With a unified database the cookies go there; if that write fails the
    error is logged and the file is tried instead. The file (when one is
    configured) receives the cookies plus the current timestamp as JSON.
    """
    jar = context.cookies()

    database = self.unified_db
    if database:
        try:
            database.save_scraper_cookies(self.scraper_id, jar)
            self.log(f"Saved {len(jar)} cookies to database", "debug")
            return
        except Exception as e:
            self.log(f"Error saving cookies to database: {e}", "warning")

    # File-based storage is the fallback
    if not self.cookie_file:
        return
    payload = {'cookies': jar, 'timestamp': datetime.now().isoformat()}
    with open(self.cookie_file, 'w') as f:
        json.dump(payload, f, indent=2)
    self.log(f"Saved {len(jar)} cookies to file", "debug")
def load_cookies(self, context):
    """Feed saved cookies into a browser context, database first, file second.

    Cookie properties the context does not accept ('partitionKey',
    '_crHasCrossSiteAncestor') are removed before loading. File cookies older
    than 24 hours are treated as expired.

    Returns:
        True when cookies were added to the context, False otherwise
    """
    unsupported_keys = ['partitionKey', '_crHasCrossSiteAncestor']

    def _without_unsupported(raw_cookies):
        # Chrome-specific properties that Playwright doesn't support are dropped
        return [
            {key: value for key, value in cookie.items() if key not in unsupported_keys}
            for cookie in raw_cookies
        ]

    # Database comes first
    if self.unified_db:
        try:
            stored = self.unified_db.get_scraper_cookies(self.scraper_id)
            if stored:
                usable = _without_unsupported(stored)
                context.add_cookies(usable)
                self.log(f"Loaded {len(usable)} cookies from database", "info")
                return True
        except Exception as e:
            self.log(f"Error loading cookies from database: {e}", "warning")

    # File-based cookies are the fallback
    if not self.cookie_file or not self.cookie_file.exists():
        return False

    try:
        with open(self.cookie_file, 'r') as f:
            data = json.load(f)

        # Check age (24 hours)
        age = datetime.now() - datetime.fromisoformat(data['timestamp'])
        if age > timedelta(hours=24):
            self.log("Cookies expired", "debug")
            return False

        usable = _without_unsupported(data['cookies'])
        context.add_cookies(usable)
        self.log(f"Loaded {len(usable)} cookies from file", "info")
        return True
    except Exception as e:
        self.log(f"Failed to load cookies: {e}", "warning")
        return False
def wait_for_cloudflare(self, page):
    """Wait for Cloudflare to auto-solve or page to load - uses FlareSolverr when needed

    Polls the page once per second for up to 120 seconds. The page counts as
    loaded when its URL is on the proxy site (the configured proxy_domain or
    storyclone.com) and its HTML mentions story content. On the first detected
    Cloudflare challenge a single FlareSolverr bypass is attempted.

    Args:
        page: Playwright page that has already been navigated

    Returns:
        True once the page is loaded; False on a server error page or timeout

    Raises:
        Any exception from reading the page other than "still navigating"
    """
    self.log("Waiting for page to load...", "debug")

    max_wait = 120  # Extended wait to match ImgInn
    flaresolverr_attempted = False

    # Loop-invariant lookup data
    challenge_indicators = ['checking your browser', 'just a moment', 'verify you are human', 'enable javascript']
    error_indicators = ['internal server error', 'error code 500', 'error code 502', 'error code 503']
    content_markers = ('story', 'username', 'download', 'stories')
    # The proxy domain is configurable, so accept it next to the default site
    expected_domains = ['storyclone.com']
    configured_domain = (getattr(self, 'proxy_domain', None) or '').lower()
    if configured_domain and configured_domain not in expected_domains:
        expected_domains.append(configured_domain)

    for i in range(max_wait):
        time.sleep(1)

        # Check current URL and content
        try:
            current_url = page.url
            content = page.content().lower()
        except Exception as e:
            if "navigating" in str(e).lower():
                self.log("Page still navigating, waiting...", "debug")
                continue
            raise

        # Check for actual Cloudflare challenge or server error
        has_challenge = any(indicator in content for indicator in challenge_indicators)
        has_error = any(indicator in content for indicator in error_indicators)

        if has_error:
            self.log("Server error detected (500/502/503) - site is likely down", "error")
            return False

        if has_challenge:
            if not flaresolverr_attempted:
                self.log("Cloudflare challenge detected, attempting FlareSolverr bypass...", "info", module="Cloudflare")
                # Try to get fresh cookies via FlareSolverr
                if self._get_cookies_via_flaresolverr(page.url):
                    self.log("Got fresh cookies from FlareSolverr, reloading page...", "info", module="Cloudflare")
                    # Reload cookies in browser context
                    try:
                        self.load_cookies(self.context)
                        # Reload the page with new cookies
                        page.reload(wait_until='domcontentloaded', timeout=10000)
                        time.sleep(2)  # Give page time to load with new cookies
                    except Exception as e:
                        self.log(f"Error reloading page with new cookies: {e}", "debug")
                else:
                    self.log("FlareSolverr failed, waiting for challenge to resolve...", "warning", module="Cloudflare")
                flaresolverr_attempted = True
            # Challenge still showing - keep polling
            continue

        # Check if we're on the correct page with content
        url_lower = current_url.lower()
        if any(domain in url_lower for domain in expected_domains):
            # Look for story content indicators
            if any(marker in content for marker in content_markers):
                self.log(f"Page loaded after {i+1} seconds", "info")
                return True

        # Status updates
        if i == 10:
            self.log("Still waiting (10s)... Cloudflare is checking", "debug")
        elif i == 20:
            self.log("Still waiting (20s)... Cloudflare challenge ongoing", "info")
        elif i == 30:
            self.log("Still waiting (30s)... This is normal for Cloudflare", "info")

    # Timeout reached
    self.log(f"Page load timeout. URL: {page.url}", "error")
    return False
def download(self, username: str, content_type: str = "stories", days_back: int = 14,
             max_downloads: int = 50, output_dir: str = None, phrase_config: dict = None,
             defer_database: bool = False):
    """Entry point matching the media-downloader interface

    Args:
        username: Snapchat username
        content_type: Type of content ("stories" only for Snapchat)
        days_back: How many days back to search
        max_downloads: Maximum stories to download
        output_dir: Base output directory; a per-user subfolder is appended
        phrase_config: Not used for Snapchat (stories don't have captions usually)
        defer_database: If True, don't record to database immediately - store in
            pending_downloads for later recording after file move is complete

    Returns:
        Number of downloaded files; 0 when the site is unavailable or the
        content type is not supported
    """
    self.defer_database = defer_database  # Read later by the download methods

    # The per-account ID cache is emptied so it doesn't grow across accounts
    self.downloaded_files.clear()

    # Site status comes first - nothing else happens when the site is down
    self.log(f"Checking {self.proxy_domain} site status...", "debug")
    status_result = self.cf_handler.check_site_status(f"https://{self.proxy_domain}/", timeout=10)
    site_status, error_msg = status_result

    if self.cf_handler.should_skip_download(site_status):
        self.log(f"Skipping download - {self.proxy_domain} is unavailable: {error_msg}", "warning")
        return 0
    if site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
        self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

    # Output directory: caller-supplied base or the default downloads folder
    output_path = (
        Path(output_dir) / username
        if output_dir
        else Path(f"/opt/media-downloader/downloads/{username}")
    )

    # Only stories are supported
    if content_type != "stories":
        self.log(f"Snapchat downloader does not support content type: {content_type}", "warning")
        return 0

    files = self.download_stories(
        username=username,
        days_back=days_back,
        max_stories=max_downloads,
        output_dir=output_path
    )
    return len(files)
def download_stories(self, username: str, days_back: int = 14, max_stories: int = 50, output_dir: Path = None):
"""Download stories from a Snapchat user with FastDL naming
Args:
username: Snapchat username
days_back: How many days back to search
max_stories: Maximum stories to download
output_dir: Output directory
"""
profile_name = username.lower()
if output_dir is None:
output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}")
output_dir.mkdir(parents=True, exist_ok=True)
# Scan existing files
self._scan_existing_files(output_dir, profile_name)
# Get processed stories from database
processed_stories = self._get_processed_posts(profile_name)
self.log(f"Loaded {len(processed_stories)} processed stories for {profile_name} from database", "info")
downloaded_files = []
cutoff_date = datetime.now() - timedelta(days=days_back)
# Update activity status
self.activity_manager.update_status(f"Checking stories from @{profile_name}")
# Start or reuse browser
self._start_browser()
page = self.page
try:
# Navigate to user's stories page on StoryClon e
self.log(f"Navigating to @{username} on {self.proxy_domain}", "info")
page.goto(f"https://{self.proxy_domain}/u/{username}/", wait_until='domcontentloaded')
# Wait for page to load
if not self.wait_for_cloudflare(page):
self.log("Page didn't load properly", "error")
return []
# Save cookies
self.save_cookies(self.context)
# Wait for page to load
self.log("Waiting for page to load...", "info")
time.sleep(3) # Give page time to load content
# Check if "Stories" section exists - if not, there are no stories to scrape
stories_section = page.locator('div.font-semibold.ml-6:has-text("Stories")').first
if stories_section.count() == 0:
self.log("No 'Stories' section found - user has no stories available", "info")
return []
self.log("Found 'Stories' section - proceeding to load all stories...", "info")
# Scroll down and load all stories by clicking "Load More" button
self.log("Scrolling to load all stories...", "info")
load_more_clicks = 0
max_attempts = 20
for attempt in range(max_attempts):
# Step 1: Scroll until we see "Spotlight Highlights"
self.log("Scrolling until 'Spotlight Highlights' is visible...", "debug")
scroll_attempts = 0
max_scrolls = 10
while scroll_attempts < max_scrolls:
spotlight_highlights = page.locator('text=Spotlight Highlights').first
if spotlight_highlights.count() > 0:
self.log("Found 'Spotlight Highlights' in view", "debug")
break
page.evaluate("window.scrollBy(0, 400)")
time.sleep(1)
scroll_attempts += 1
# Step 2: Check if there's a "Load More" button ABOVE "Spotlight Highlights" (positionally before)
load_more_btn = page.locator('button:has-text("Load More"), button.load-more-button').first
spotlight_highlights = page.locator('text=Spotlight Highlights').first
load_more_visible = load_more_btn.count() > 0 and load_more_btn.is_visible()
spotlight_visible = spotlight_highlights.count() > 0
if load_more_visible and spotlight_visible:
# Both are visible - check Y positions to see which comes first
load_more_box = load_more_btn.bounding_box()
spotlight_box = spotlight_highlights.bounding_box()
if load_more_box and spotlight_box:
load_more_y = load_more_box['y']
spotlight_y = spotlight_box['y']
if load_more_y < spotlight_y:
# "Load More" is ABOVE "Spotlight Highlights" → Click it
load_more_clicks += 1
self.log(f"Found 'Load More' ABOVE 'Spotlight Highlights' (Y:{load_more_y:.0f} < {spotlight_y:.0f}) - clicking (click #{load_more_clicks})...", "info")
load_more_btn.click()
time.sleep(2.5) # Wait for more posts to load
items_count = len(page.locator('.item').all())
self.log(f"Items after click: {items_count}", "debug")
# Go back and scroll to "Spotlight Highlights" again (it will be pushed down)
continue
else:
# "Load More" is BELOW "Spotlight Highlights" → We're done
items_final = page.locator('.item').all()
self.log(f"'Load More' is BELOW 'Spotlight Highlights' (Y:{load_more_y:.0f} > {spotlight_y:.0f}) - done! Found {len(items_final)} stories (clicked Load More {load_more_clicks} times)", "info")
break
elif spotlight_visible:
# Only "Spotlight Highlights" visible, no "Load More" → We're done
items_final = page.locator('.item').all()
self.log(f"No 'Load More' button found - done! Found {len(items_final)} stories (clicked Load More {load_more_clicks} times)", "info")
break
else:
# Neither visible, keep trying
self.log("Neither 'Load More' nor 'Spotlight Highlights' found, continuing...", "debug")
continue
# Find story/media elements by processing each .item container
# This ensures lazy-loaded content is properly triggered
self.log("Extracting media from story items...", "info")
# Get Y position of "Spotlight Highlights" to filter out items after it
spotlight_highlights = page.locator('text=Spotlight Highlights').first
spotlight_y = None
if spotlight_highlights.count() > 0:
spotlight_box = spotlight_highlights.bounding_box()
if spotlight_box:
spotlight_y = spotlight_box['y']
self.log(f"'Spotlight Highlights' Y position: {spotlight_y:.0f}", "debug")
# Get all .item elements
all_items = page.locator('.item').all()
# Filter to only items BEFORE "Spotlight Highlights"
story_items = []
for item in all_items:
item_box = item.bounding_box()
if item_box and spotlight_y:
item_y = item_box['y']
if item_y < spotlight_y:
story_items.append(item)
elif not spotlight_y:
# No Spotlight Highlights found, include all items
story_items.append(item)
self.log(f"Filtered to {len(story_items)} story items (before Spotlight Highlights) from {len(all_items)} total items", "info")
media_elements = []
for idx, item in enumerate(story_items):
try:
# Scroll item into view to trigger lazy loading
item.scroll_into_view_if_needed()
time.sleep(0.3) # Give it a moment to load
# Look for video first
video = item.locator('video[src]').first
if video.count() > 0:
media_elements.append(video)
self.log(f"Item {idx+1}: Found video", "debug")
continue
# If no video, look for image from Snapchat CDN
img = item.locator('img[src*="sc-cdn.net"]').first
if img.count() > 0:
src = img.get_attribute('src')
# Skip apple icons, favicons, and poster images
if src and 'apple-icon' not in src and 'favicon' not in src and '/d/' in src:
media_elements.append(img)
self.log(f"Item {idx+1}: Found image", "debug")
continue
self.log(f"Item {idx+1}: No media found (may be lazy-loading)", "debug")
except Exception as e:
self.log(f"Item {idx+1}: Error processing - {e}", "debug")
self.log(f"Extracted {len(media_elements)} media elements from {len(story_items)} items", "info")
if not media_elements:
self.log("No stories found for this user", "warning")
return []
self.log(f"Found {len(media_elements)} potential story items", "info")
# Download each story
story_index = 1
for i, media_elem in enumerate(media_elements[:max_stories]):
try:
# Get media URL
media_url = None
# Try to get src attribute
media_url = media_elem.get_attribute('src')
# If no src, try href (for download links)
if not media_url or media_url == '#':
media_url = media_elem.get_attribute('href')
if not media_url or media_url == '#' or media_url.startswith('data:'):
self.log(f"Story {story_index}: Invalid media URL", "warning")
continue
self.log(f"Story {story_index}: {media_url[:80]}...", "debug")
# Try to get higher quality version by replacing size parameter
# URLs look like: https://.../{id}.1034.IRZXSOY?...
# Try larger sizes: 2048, 1920, 1440, 1034 (original)
import re
hq_url = None
original_url = media_url
# Check if URL has a size parameter pattern
size_match = re.search(r'\.(\d+)\.IRZXSOY', media_url)
if size_match:
original_size = size_match.group(1)
# Try larger sizes (in descending order)
for test_size in ['2048', '1920', '1440']:
if int(test_size) > int(original_size):
test_url = media_url.replace(f'.{original_size}.IRZXSOY', f'.{test_size}.IRZXSOY')
# Test if this URL is accessible
try:
import requests
response = requests.head(test_url, timeout=5, allow_redirects=True)
if response.status_code == 200:
hq_url = test_url
self.log(f"Story {story_index}: Found higher quality version (size {test_size})", "info")
break
except requests.RequestException:
continue
# Use HQ URL if found, otherwise use original
if hq_url:
media_url = hq_url
# Extract media ID from URL and determine correct extension
from urllib.parse import urlparse, unquote
url_path = urlparse(media_url).path
original_name = unquote(url_path.split('/')[-1].split('?')[0])
# Determine file type from element or URL
# Snapchat CDN uses weird extensions like .IRZXSOY, so we need to detect the actual type
if media_elem.evaluate("element => element.tagName").lower() == 'video':
ext = '.mp4' # Videos are MP4
else:
ext = '.jpg' # Images are JPG
# Use the full filename as media_id (without fake extension)
media_id = original_name.split('.')[0] # Take first part before any dots
# Check if already downloaded
if media_id in self.downloaded_files or media_id in processed_stories:
self.log(f"Story {story_index}: Already downloaded ({media_id}), skipping", "debug")
story_index += 1
continue
# Extract post date from the story item on the page
story_date = None
try:
# Try multiple strategies to find the date associated with THIS specific story
# Strategy 1: Look in the immediate parent of the media element
immediate_parent = media_elem.locator('xpath=..').first
if immediate_parent.count() > 0:
date_elem = immediate_parent.locator('.text-sm').first
if date_elem.count() > 0:
date_text = date_elem.text_content()
if date_text and ("Posted on" in date_text or "at" in date_text):
self.log(f"Story {story_index}: Found date in immediate parent: '{date_text}'", "debug")
story_date = self._parse_story_date_text(date_text)
if story_date:
self.log(f"Story {story_index}: Extracted date from page: {story_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
# Strategy 2: If not found, try the closest ancestor with a limited depth
if not story_date:
# Look for a closer parent (not going all the way up)
for depth in [1, 2, 3]:
parent_xpath = 'xpath=' + '/'.join(['..'] * depth)
parent = media_elem.locator(parent_xpath).first
if parent.count() > 0:
# Get only the FIRST .text-sm in this parent
date_elem = parent.locator('.text-sm').first
if date_elem.count() > 0:
date_text = date_elem.text_content()
if date_text and ("Posted on" in date_text or "at" in date_text):
self.log(f"Story {story_index}: Found date at depth {depth}: '{date_text}'", "debug")
story_date = self._parse_story_date_text(date_text)
if story_date:
self.log(f"Story {story_index}: Extracted date from page: {story_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
break
if not story_date:
self.log(f"Story {story_index}: Could not find date text for this story", "debug")
except Exception as e:
self.log(f"Story {story_index}: Could not extract date - {e}", "debug")
import traceback
self.log(f"Story {story_index}: Traceback: {traceback.format_exc()}", "debug")
# Fallback to current time if extraction failed
if not story_date:
story_date = datetime.now()
self.log(f"Story {story_index}: Using current time as fallback", "debug")
date_str = story_date.strftime('%Y%m%d_%H%M%S')
# Build filename: {profile}_{date}_{media_id}{ext}
filename = f"{profile_name}_{date_str}_{media_id}{ext}"
filepath = output_dir / filename
# Download the story
try:
import requests
# Ensure full URL
if not media_url.startswith('http'):
media_url = f"https:{media_url}" if media_url.startswith('//') else f"https://{self.proxy_domain}{media_url}"
response = requests.get(media_url, timeout=30, headers={
'User-Agent': self.user_agent,
'Referer': f'https://{self.proxy_domain}/'
}, cookies=self._get_cookies_for_requests())
response.raise_for_status()
# Save file
with open(filepath, 'wb') as f:
f.write(response.content)
self.log(f"Downloaded story: {filename} ({len(response.content)} bytes)", "info")
downloaded_files.append(str(filepath))
# Check for duplicate hash before recording
if self.db:
from pathlib import Path as PathLib
# Check for duplicate hash (hash blacklist persists even if original deleted)
file_hash = self.db.get_file_hash(str(filepath))
if file_hash:
existing = self.db.get_download_by_file_hash(file_hash)
if existing and existing.get('file_path') and str(filepath) != existing.get('file_path'):
# Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")
# Delete the duplicate regardless of whether original file still exists
try:
filepath.unlink()
self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
continue
except Exception as e:
self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
# Update timestamps
self._update_file_timestamps(filepath, story_date)
# Add to tracking
self.downloaded_files.add(media_id)
# Record in database with media_id in metadata
self._record_download(
username=profile_name,
url=media_url,
filename=filename,
post_date=story_date,
metadata={'media_id': media_id},
file_path=str(filepath),
deferred=getattr(self, 'defer_database', False)
)
story_index += 1
except Exception as e:
self.log(f"Failed to download story {story_index}: {e}", "error")
story_index += 1
continue
except Exception as e:
self.log(f"Error processing story {story_index}: {e}", "error")
story_index += 1
continue
self.log(f"Downloaded {len(downloaded_files)} story files", "info")
except Exception as e:
self.log(f"Error downloading stories: {e}", "error")
import traceback
self.log(f"Traceback: {traceback.format_exc()}", "debug")
# Don't close browser here - reuse it for next profile
return downloaded_files
def main():
    """Run a manual command-line smoke test of the downloader.

    The username is read from ``sys.argv[1]`` and defaults to ``"testuser"``.
    ``download_stories`` is called with ``days_back=7`` and ``max_stories=50``;
    each returned path is then printed, broken down according to the naming
    scheme ``{profile}_{YYYYMMDD_HHMMSS}_{media_id}{ext}``, followed by a
    summary of ``/opt/media-downloader/downloads/<username>`` if it exists.
    """
    import re
    import sys

    banner = "=" * 60
    print(banner)
    print("Snapchat Downloader (StoryClon e) - FastDL Compatible Naming")
    print(banner)
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(banner)

    # SnapchatDownloader.__init__ accepts no ``api_key`` argument; passing one
    # raised TypeError before anything could be downloaded.
    downloader = SnapchatDownloader(
        headless=False  # Use with xvfb
    )

    # Test username (replace with actual Snapchat username)
    test_username = sys.argv[1] if len(sys.argv) > 1 else "testuser"

    # Download stories
    files = downloader.download_stories(
        username=test_username,
        days_back=7,
        max_stories=50
    )

    print("\n" + banner)
    print("RESULTS")
    print(banner)

    if files:
        print(f"Successfully downloaded {len(files)} files!")
        print("\nDownloaded files (FastDL naming format):")

        # Anchor on the fixed-width timestamp rather than splitting on the
        # first three underscores: profile names may contain underscores
        # themselves, which used to shift every field by one position.
        name_pattern = re.compile(
            r'^(?P<profile>.+)_(?P<date>\d{8}_\d{6})_(?P<media_id>[^.]+)'
        )

        for f in files:
            path = Path(f)
            name = path.name
            print(f" - {name}")

            parsed = name_pattern.match(name)
            if parsed:
                print(f" Profile: {parsed['profile']}")
                print(f" Date: {parsed['date']}")
                print(f" Media ID: {parsed['media_id']}")

            # A returned path may already be gone (duplicate-hash cleanup
            # deletes the file after it was added to the result list), so a
            # missing file must not abort the whole report.
            try:
                size = path.stat().st_size / 1024
            except OSError:
                print(" Size: unavailable (file no longer on disk)")
            else:
                print(f" Size: {size:.1f} KB")
    else:
        print("No files downloaded")

    # Check total in folder
    download_dir = Path(f"/opt/media-downloader/downloads/{test_username}")
    if download_dir.exists():
        # Only regular files count towards the "files" total; subdirectories
        # would otherwise inflate both the count and the size.
        all_files = [entry for entry in download_dir.glob("*") if entry.is_file()]
        total_size = sum(entry.stat().st_size for entry in all_files) / 1024
        print(f"\nTotal in folder: {len(all_files)} files ({total_size:.1f} KB)")
# Run the manual smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()