1302 lines
58 KiB
Python
Executable File
1302 lines
58 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Snapchat downloader module using StoryClone proxy (sn.storyclone.com)
|
|
Based on ImgInn module structure with FastDL-compatible file naming
|
|
Format: {profile}_{YYYYMMDD_HHMMSS}_{media_id}{ext}
|
|
"""
|
|
|
|
# Allow nested event loops for compatibility with asyncio contexts
|
|
try:
|
|
import nest_asyncio
|
|
nest_asyncio.apply()
|
|
except ImportError:
|
|
pass
|
|
|
|
import os
|
|
import json
|
|
import time
|
|
import re
|
|
import subprocess
|
|
import platform
|
|
import requests
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
from modules.base_module import LoggingMixin
|
|
from modules.universal_logger import get_logger
|
|
from modules.cloudflare_handler import CloudflareHandler, SiteStatus, get_flaresolverr_user_agent
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
class SnapchatDownloader(LoggingMixin):
|
|
"""Snapchat downloader using StoryClone with FastDL-compatible naming"""
|
|
|
|
def __init__(self,
             headless: bool = True,
             cookie_file: str = "/opt/media-downloader/cookies/snapchat_cookies.json",
             show_progress: bool = True,
             use_database: bool = True,
             log_callback=None,
             unified_db=None,
             proxy_domain: str = "sn.storyclone.com"):
    """Prepare downloader state, storage back-ends and the Cloudflare handler.

    Args:
        headless: Passed to the Chromium launch when the browser is started.
        cookie_file: Path of the JSON cookie file; only used when no
            ``unified_db`` is supplied.
        show_progress: Stored on the instance as-is.
        use_database: Whether downloads should be tracked through the
            database adapter (forced to False when ``unified_db`` is missing).
        log_callback: Forwarded to the logging mixin.
        unified_db: Optional unified database; source of scraper config,
            cookies and the download adapter.
        proxy_domain: Default StoryClone host; overridden by the scraper
            config's ``base_url`` when one is stored.
    """
    # Simple options and per-run bookkeeping.
    self.headless = headless
    self.show_progress = show_progress
    self.use_database = use_database
    self.unified_db = unified_db  # kept for scraper config / cookie access
    self.scraper_id = 'snapchat'  # key of this scraper in the database
    self.download_count = 0
    self.downloaded_files = set()  # media IDs already on disk
    self.file_dates = {}  # media_id -> datetime parsed from existing filenames

    # Logging comes from the mixin and must be ready before any self.log call.
    self._init_logger('Snapchat', log_callback, default_module='Download')

    # Playwright objects are created lazily and reused across profiles.
    self.playwright = None
    self.browser = None
    self.context = None
    self.page = None

    # The database adapter is only available with a unified database.
    if not (unified_db and use_database):
        self.db = None
        self.use_database = False
    else:
        from modules.unified_database import SnapchatDatabaseAdapter
        self.db = SnapchatDatabaseAdapter(unified_db)

    # Activity status manager for real-time updates.
    from modules.activity_status import get_activity_manager
    self.activity_manager = get_activity_manager(unified_db)

    # Defaults that the stored scraper configuration may override.
    self.proxy_url = None
    self.cookie_file = None  # None means "cookies live in the database"
    self.proxy_domain = proxy_domain

    stored_config = unified_db.get_scraper(self.scraper_id) if unified_db else None
    if stored_config:
        if stored_config.get('proxy_enabled') and stored_config.get('proxy_url'):
            self.proxy_url = stored_config['proxy_url']
            self.log(f"Using proxy: {self.proxy_url}", "info")
        if stored_config.get('base_url'):
            # Reduce the stored base URL to a bare host name.
            host = stored_config['base_url']
            for scheme in ('https://', 'http://'):
                host = host.replace(scheme, '')
            self.proxy_domain = host.rstrip('/')

    # Without a database, cookies are persisted in a file instead.
    if not unified_db:
        self.cookie_file = Path(cookie_file)
        self.cookie_file.parent.mkdir(parents=True, exist_ok=True)

    # The User-Agent is fetched so that it matches FlareSolverr's.
    self.user_agent = get_flaresolverr_user_agent()

    # Shared Cloudflare handler; cookie_file=None selects database storage.
    handler_cookie_path = str(self.cookie_file) if self.cookie_file else None
    self.cf_handler = CloudflareHandler(
        module_name="Snapchat",
        cookie_file=handler_cookie_path,
        user_agent=self.user_agent,
        logger=self.logger,
        aggressive_expiry=True,
        proxy_url=self.proxy_url,  # FlareSolverr goes through the same proxy
    )

    # Mirrors of handler settings, kept for backwards compatibility.
    self.flaresolverr_url = self.cf_handler.flaresolverr_url
    self.flaresolverr_enabled = self.cf_handler.flaresolverr_enabled

    self.pending_downloads = []  # downloads awaiting deferred DB recording

    # Prime the handler with any cookies already stored in the database.
    self._load_cookies_from_db()

    if not self._has_valid_cookies():
        self.log("No cookies found, will load cookies on first use", "info")
|
|
|
|
def _load_cookies_from_db(self):
|
|
"""Load cookies from database if available"""
|
|
if not self.unified_db:
|
|
return
|
|
|
|
try:
|
|
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
|
if cookies:
|
|
# Load into CloudflareHandler
|
|
self.cf_handler._cookies = cookies
|
|
self.log(f"Loaded {len(cookies)} cookies from database", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies from database: {e}", "warning")
|
|
|
|
def _save_cookies_to_db(self, cookies: list):
|
|
"""Save cookies to database"""
|
|
if not self.unified_db:
|
|
return
|
|
|
|
try:
|
|
self.unified_db.save_scraper_cookies(
|
|
self.scraper_id,
|
|
cookies,
|
|
user_agent=self.user_agent,
|
|
merge=True
|
|
)
|
|
self.log(f"Saved {len(cookies)} cookies to database", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error saving cookies to database: {e}", "warning")
|
|
|
|
def _has_valid_cookies(self):
|
|
"""Check if we have valid cookies (either in file or database)"""
|
|
if self.unified_db:
|
|
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
|
return cookies and len(cookies) > 0
|
|
elif self.cookie_file:
|
|
return self.cookie_file.exists()
|
|
return False
|
|
|
|
def _cookies_expired(self):
|
|
"""Check if cookies are expired - delegates to CloudflareHandler"""
|
|
return self.cf_handler.cookies_expired()
|
|
|
|
def _get_cookies_for_requests(self):
|
|
"""Get cookies in format for requests library - delegates to CloudflareHandler"""
|
|
return self.cf_handler.get_cookies_dict()
|
|
|
|
def _get_cookies_via_flaresolverr(self, url=None, max_retries=2):
|
|
"""Use FlareSolverr to bypass Cloudflare - delegates to CloudflareHandler
|
|
|
|
Args:
|
|
url: URL to fetch (defaults to proxy_domain)
|
|
max_retries: Maximum number of retry attempts (default: 2)
|
|
|
|
Returns:
|
|
True if cookies obtained successfully, False otherwise
|
|
"""
|
|
if url is None:
|
|
url = f"https://{self.proxy_domain}/"
|
|
success = self.cf_handler.get_cookies_via_flaresolverr(url, max_retries)
|
|
|
|
# Save cookies to database if successful
|
|
if success and self.unified_db:
|
|
cookies_list = self.cf_handler.get_cookies_list()
|
|
if cookies_list:
|
|
self._save_cookies_to_db(cookies_list)
|
|
|
|
return success
|
|
|
|
def _start_browser(self):
|
|
"""Start browser if not already running (reusable across profiles)"""
|
|
# Try to get fresh cookies via FlareSolverr if we don't have them or they're old
|
|
# Do this BEFORE the browser reuse check so cookies are always checked
|
|
if not self._has_valid_cookies() or self._cookies_expired():
|
|
self.log("Cookies missing or expired, attempting FlareSolverr bypass...", "info", module="Cloudflare")
|
|
if self._get_cookies_via_flaresolverr():
|
|
self.log("Successfully got fresh cookies from FlareSolverr", "info", module="Cloudflare")
|
|
else:
|
|
self.log("FlareSolverr unavailable, will try with Playwright", "warning", module="Cloudflare")
|
|
|
|
if self.browser is not None:
|
|
self.log("Browser already running, reusing...", "debug", module="Browser")
|
|
return
|
|
|
|
import os
|
|
# Use environment variable if set, otherwise use standard location
|
|
if 'PLAYWRIGHT_BROWSERS_PATH' not in os.environ:
|
|
os.environ['PLAYWRIGHT_BROWSERS_PATH'] = '/root/.cache/ms-playwright'
|
|
os.environ['DISPLAY'] = ':100' # Use Xvfb virtual display
|
|
|
|
self.log("Starting browser (Chromium)...", "info", module="Browser")
|
|
self.playwright = sync_playwright().start()
|
|
|
|
self.browser = self.playwright.chromium.launch(
|
|
headless=self.headless,
|
|
args=[
|
|
'--disable-blink-features=AutomationControlled',
|
|
'--disable-dev-shm-usage',
|
|
'--no-sandbox',
|
|
'--disable-setuid-sandbox',
|
|
'--disable-gpu',
|
|
'--disable-software-rasterizer',
|
|
'--disable-accelerated-2d-canvas',
|
|
'--disable-accelerated-video-decode'
|
|
]
|
|
)
|
|
|
|
# CRITICAL: User-Agent must match FlareSolverr for cookies to work
|
|
self.context = self.browser.new_context(
|
|
viewport={'width': 1920, 'height': 1080},
|
|
user_agent=self.user_agent
|
|
)
|
|
|
|
# Load cookies
|
|
self.load_cookies(self.context)
|
|
|
|
self.page = self.context.new_page()
|
|
|
|
# Add basic anti-detection
|
|
self.page.add_init_script("""
|
|
Object.defineProperty(navigator, 'webdriver', {
|
|
get: () => undefined
|
|
});
|
|
""")
|
|
|
|
self.log("Browser started and ready", "info", module="Browser")
|
|
|
|
def _stop_browser(self):
|
|
"""Stop the browser safely with proper error handling"""
|
|
# Close context first
|
|
if self.context:
|
|
try:
|
|
self.context.close()
|
|
self.log("Browser context closed", "debug", module="Browser")
|
|
except Exception as e:
|
|
self.log(f"Error closing browser context: {e}", "warning")
|
|
finally:
|
|
self.context = None
|
|
|
|
# Close browser
|
|
if self.browser:
|
|
try:
|
|
self.browser.close()
|
|
self.log("Browser closed", "debug", module="Browser")
|
|
except Exception as e:
|
|
self.log(f"Error closing browser: {e}", "warning")
|
|
finally:
|
|
self.browser = None
|
|
|
|
# Stop playwright
|
|
if self.playwright:
|
|
try:
|
|
self.playwright.stop()
|
|
except Exception as e:
|
|
self.log(f"Error stopping playwright: {e}", "warning")
|
|
finally:
|
|
self.playwright = None
|
|
|
|
self.page = None
|
|
|
|
def __del__(self):
|
|
"""Cleanup browser when instance is destroyed"""
|
|
self._stop_browser()
|
|
|
|
def __enter__(self):
|
|
"""Context manager entry - allows using 'with' statement"""
|
|
return self
|
|
|
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
"""Context manager exit - ensures browser cleanup"""
|
|
self._stop_browser()
|
|
return False # Don't suppress exceptions
|
|
|
|
def _extract_media_id_from_url(self, url: str) -> str:
|
|
"""Extract media ID from URL"""
|
|
# URL format: various formats on storyclone.com
|
|
# Try to extract meaningful ID from URL
|
|
match = re.search(r'/([^/]+)/?$', url)
|
|
if match:
|
|
return match.group(1)
|
|
return None
|
|
|
|
def _update_file_timestamps(self, filepath: Path, post_date: datetime):
|
|
"""Update all timestamps for a file to match the post date"""
|
|
try:
|
|
# Convert datetime to timestamp
|
|
timestamp = post_date.timestamp()
|
|
|
|
# 1. Update file system timestamps (access time and modification time)
|
|
os.utime(filepath, (timestamp, timestamp))
|
|
self.log(f"Updated file timestamps to {post_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
|
|
|
|
# 2. Try to update creation time (platform-specific)
|
|
if platform.system() == 'Darwin': # macOS
|
|
# Use SetFile command on macOS
|
|
date_str = post_date.strftime('%m/%d/%Y %H:%M:%S')
|
|
try:
|
|
subprocess.run(
|
|
['SetFile', '-d', date_str, str(filepath)],
|
|
capture_output=True,
|
|
text=True
|
|
)
|
|
except (subprocess.SubprocessError, FileNotFoundError, OSError):
|
|
pass # SetFile not available on this system
|
|
elif platform.system() == 'Windows':
|
|
# On Windows, use PowerShell with proper escaping to prevent injection
|
|
filepath_escaped = str(filepath).replace("'", "''")
|
|
date_escaped = post_date.isoformat().replace("'", "''")
|
|
ps_command = f"(Get-Item -LiteralPath '{filepath_escaped}').CreationTime = Get-Date '{date_escaped}'"
|
|
try:
|
|
subprocess.run(
|
|
['powershell', '-Command', ps_command],
|
|
capture_output=True,
|
|
text=True
|
|
)
|
|
except (subprocess.SubprocessError, FileNotFoundError, OSError):
|
|
pass # PowerShell command failed
|
|
# Linux doesn't support changing creation time
|
|
|
|
# 3. Update EXIF data for images
|
|
if str(filepath).lower().endswith(('.jpg', '.jpeg', '.png')):
|
|
self._update_exif_timestamps(filepath, post_date)
|
|
|
|
except Exception as e:
|
|
self.log(f"Error updating timestamps: {e}", "warning")
|
|
|
|
def _update_exif_timestamps(self, filepath: Path, post_date: datetime):
|
|
"""Update EXIF timestamps in image files"""
|
|
try:
|
|
# Check if exiftool is available
|
|
result = subprocess.run(['which', 'exiftool'], capture_output=True, text=True)
|
|
if result.returncode == 0:
|
|
# Format date for EXIF
|
|
exif_date = post_date.strftime('%Y:%m:%d %H:%M:%S')
|
|
|
|
# Update all date fields in EXIF including MetadataDate for Immich
|
|
cmd = [
|
|
'exiftool', '-overwrite_original', '-quiet',
|
|
f'-AllDates={exif_date}',
|
|
f'-MetadataDate={exif_date}',
|
|
'-HistoryWhen=',
|
|
f'-FileModifyDate={exif_date}',
|
|
str(filepath)
|
|
]
|
|
|
|
subprocess.run(cmd, capture_output=True, text=True)
|
|
self.log(f"Updated EXIF timestamps", "debug")
|
|
except (subprocess.SubprocessError, OSError, FileNotFoundError):
|
|
# Silently skip if exiftool not available
|
|
pass
|
|
|
|
def _extract_post_date(self, page) -> datetime:
|
|
"""Try to extract post date from page"""
|
|
try:
|
|
# Wait a moment for dynamic content to load
|
|
page.wait_for_timeout(500)
|
|
|
|
# Look for date elements on StoryClon e
|
|
date_selectors = [
|
|
'time[datetime]',
|
|
'time',
|
|
'.date',
|
|
'[datetime]',
|
|
'span.date',
|
|
'div.date',
|
|
'.story-date',
|
|
'.post-date'
|
|
]
|
|
|
|
for selector in date_selectors:
|
|
elem = page.locator(selector).first
|
|
if elem.count() > 0:
|
|
# Try datetime attribute first
|
|
datetime_str = elem.get_attribute('datetime')
|
|
if datetime_str:
|
|
# Parse ISO format
|
|
for fmt in ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d']:
|
|
try:
|
|
return datetime.strptime(datetime_str.split('.')[0].replace('Z', ''), fmt)
|
|
except ValueError:
|
|
continue
|
|
|
|
# Try text content
|
|
text = elem.text_content()
|
|
if text:
|
|
# Parse various date formats
|
|
if "ago" in text.lower():
|
|
# Handle relative dates
|
|
if "hour" in text:
|
|
hours = int(re.search(r'(\d+)', text).group(1))
|
|
return datetime.now() - timedelta(hours=hours)
|
|
elif "day" in text:
|
|
days = int(re.search(r'(\d+)', text).group(1))
|
|
return datetime.now() - timedelta(days=days)
|
|
elif "week" in text:
|
|
weeks = int(re.search(r'(\d+)', text).group(1))
|
|
return datetime.now() - timedelta(weeks=weeks)
|
|
else:
|
|
# Try parsing absolute date
|
|
for fmt in ['%B %d, %Y', '%b %d, %Y', '%Y-%m-%d']:
|
|
try:
|
|
return datetime.strptime(text, fmt)
|
|
except ValueError:
|
|
continue
|
|
except Exception as e:
|
|
self.log(f"Error extracting date: {e}", "debug")
|
|
|
|
return None
|
|
|
|
def _parse_storyclone_filename(self, filename: str, profile_name: str) -> datetime:
|
|
"""
|
|
Parse date from StoryClon e filename format and adjust for timezone
|
|
Format: evalongoria-2025-10-23T17-42-56.jpg
|
|
StoryClon e uses UTC, so subtract 4 hours to get local time
|
|
|
|
Args:
|
|
filename: StoryClon e filename
|
|
profile_name: Username to strip from beginning
|
|
|
|
Returns:
|
|
datetime object adjusted to local time, or None if parsing failed
|
|
"""
|
|
try:
|
|
# Remove extension
|
|
filename_no_ext = Path(filename).stem
|
|
|
|
# Check if it starts with profile name
|
|
if filename_no_ext.startswith(f"{profile_name}-"):
|
|
# Extract date part: 2025-10-23T17-42-56
|
|
date_part = filename_no_ext[len(f"{profile_name}-"):]
|
|
|
|
# Parse ISO-like format with hyphens instead of colons
|
|
# 2025-10-23T17-42-56 -> 2025-10-23 17:42:56
|
|
date_part_clean = date_part.replace('T', ' ')
|
|
|
|
# Replace only the time part hyphens with colons
|
|
parts_dt = date_part_clean.split(' ')
|
|
if len(parts_dt) == 2:
|
|
date_portion = parts_dt[0] # 2025-10-23
|
|
time_portion = parts_dt[1].replace('-', ':') # 17-42-56 -> 17:42:56
|
|
datetime_str = f"{date_portion} {time_portion}"
|
|
|
|
# Parse the datetime (this is in UTC)
|
|
parsed_date = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
|
|
|
|
# Subtract 4 hours to convert from UTC to local time
|
|
local_date = parsed_date - timedelta(hours=4)
|
|
|
|
return local_date
|
|
except Exception as e:
|
|
self.log(f"Error parsing StoryClon e filename '{filename}': {e}", "debug")
|
|
|
|
return None
|
|
|
|
def _parse_story_date_text(self, date_text: str) -> datetime:
|
|
"""
|
|
Parse StoryClon e date text format
|
|
Examples: "Posted on today at 1:42 PM"
|
|
"Posted on today at 1:44 PM"
|
|
|
|
Returns:
|
|
datetime object or None if parsing failed
|
|
"""
|
|
try:
|
|
# StoryClon e format: "Posted on today at 1:42 PM"
|
|
if "Posted on today at" in date_text:
|
|
# Extract time part (e.g., "1:42 PM")
|
|
time_match = re.search(r'(\d{1,2}):(\d{2})\s*(AM|PM)', date_text, re.IGNORECASE)
|
|
if time_match:
|
|
hour = int(time_match.group(1))
|
|
minute = int(time_match.group(2))
|
|
am_pm = time_match.group(3).upper()
|
|
|
|
# Convert to 24-hour format
|
|
if am_pm == 'PM' and hour != 12:
|
|
hour += 12
|
|
elif am_pm == 'AM' and hour == 12:
|
|
hour = 0
|
|
|
|
# Use today's date with the extracted time
|
|
now = datetime.now()
|
|
story_datetime = now.replace(hour=hour, minute=minute, second=0, microsecond=0)
|
|
|
|
return story_datetime
|
|
|
|
# Could add more date formats here if needed
|
|
|
|
except Exception as e:
|
|
self.log(f"Error parsing date text '{date_text}': {e}", "debug")
|
|
|
|
return None
|
|
|
|
def _record_download(self, username: str, url: str, filename: str,
|
|
post_date=None, metadata: dict = None, file_path: str = None,
|
|
deferred: bool = False):
|
|
"""Record a download in the database
|
|
|
|
Args:
|
|
deferred: If True, don't record to database now - add to pending_downloads list
|
|
for later recording after file move is complete
|
|
"""
|
|
# If deferred, store for later recording instead of recording now
|
|
if deferred:
|
|
self.pending_downloads.append({
|
|
'username': username,
|
|
'url': url,
|
|
'filename': filename,
|
|
'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
|
|
'file_path': file_path,
|
|
'metadata': metadata
|
|
})
|
|
self.log(f"Deferred recording for {filename}", "debug")
|
|
return True
|
|
|
|
if not self.db:
|
|
return
|
|
|
|
try:
|
|
self.db.mark_downloaded(
|
|
username=username,
|
|
url=url,
|
|
filename=filename,
|
|
post_date=post_date,
|
|
metadata=metadata,
|
|
file_path=file_path
|
|
)
|
|
except Exception as e:
|
|
self.log(f"Failed to record download: {e}", "debug")
|
|
|
|
def get_pending_downloads(self):
    """Return a shallow copy of the downloads queued for deferred recording."""
    snapshot = list(self.pending_downloads)
    return snapshot
|
|
|
|
def clear_pending_downloads(self):
    """Drop all queued downloads by rebinding the queue to a fresh empty list."""
    fresh_queue = []
    self.pending_downloads = fresh_queue
|
|
|
|
def _scan_existing_files(self, output_dir: Path, profile_name: str):
|
|
"""Scan directory for existing files and extract media IDs and dates"""
|
|
self.downloaded_files.clear()
|
|
self.file_dates = {} # Map media_id -> datetime
|
|
|
|
# Patterns: Both my format and StoryClon e format
|
|
for pattern in ["*.jpg", "*.jpeg", "*.png", "*.heic", "*.mp4", "*.mov"]:
|
|
for filepath in output_dir.glob(pattern):
|
|
# Skip corrupted/incomplete files (less than 20KB)
|
|
if filepath.stat().st_size < 20000:
|
|
self.log(f"Skipping corrupted file (size < 20KB): {filepath.name}", "debug")
|
|
continue
|
|
|
|
filename = filepath.stem
|
|
media_id = None
|
|
file_date = None
|
|
|
|
# Try my FastDL format: profile_YYYYMMDD_HHMMSS_mediaid.ext
|
|
parts = filename.split('_', 3)
|
|
if len(parts) >= 4 and parts[0] == profile_name:
|
|
media_id = parts[3] # Everything after date/time
|
|
# Parse date from filename
|
|
try:
|
|
date_str = f"{parts[1]}_{parts[2]}" # YYYYMMDD_HHMMSS
|
|
file_date = datetime.strptime(date_str, '%Y%m%d_%H%M%S')
|
|
except (ValueError, IndexError):
|
|
pass
|
|
|
|
# Try StoryClon e format: profile-YYYY-MM-DDTHH-MM-SS.ext
|
|
elif filename.startswith(f"{profile_name}-"):
|
|
# Example: evalongoria-2025-10-23T17-42-56
|
|
# Extract: 2025-10-23T17-42-56
|
|
date_part = filename[len(f"{profile_name}-"):]
|
|
try:
|
|
# Parse ISO-like format with hyphens instead of colons
|
|
# 2025-10-23T17-42-56 -> 2025-10-23 17:42:56
|
|
date_part_clean = date_part.replace('T', ' ')
|
|
# Replace only the time part hyphens with colons
|
|
# Split on space to separate date and time
|
|
parts_dt = date_part_clean.split(' ')
|
|
if len(parts_dt) == 2:
|
|
date_portion = parts_dt[0] # 2025-10-23
|
|
time_portion = parts_dt[1].replace('-', ':') # 17-42-56 -> 17:42:56
|
|
datetime_str = f"{date_portion} {time_portion}"
|
|
# Parse the datetime
|
|
parsed_date = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
|
|
# Subtract 4 hours to convert from UTC to local time
|
|
file_date = parsed_date - timedelta(hours=4)
|
|
# Use the date part as media_id
|
|
media_id = filename[len(f"{profile_name}-"):]
|
|
except Exception as e:
|
|
self.log(f"Could not parse StoryClon e date from {filename}: {e}", "debug")
|
|
# Still use as media_id for duplicate detection
|
|
media_id = filename[len(f"{profile_name}-"):]
|
|
|
|
if media_id:
|
|
self.downloaded_files.add(media_id)
|
|
if file_date:
|
|
self.file_dates[media_id] = file_date
|
|
|
|
if self.downloaded_files:
|
|
self.log(f"Found {len(self.downloaded_files)} valid existing files for {profile_name} ({len(self.file_dates)} with dates)", "debug")
|
|
|
|
def _get_processed_posts(self, username: str) -> set:
|
|
"""Get set of story IDs that have been processed from database"""
|
|
processed = set()
|
|
if not self.db:
|
|
return processed
|
|
|
|
try:
|
|
with self.db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
# Get all stories for this user from downloads table
|
|
cursor.execute('''
|
|
SELECT url, filename, metadata FROM downloads
|
|
WHERE platform = 'snapchat'
|
|
AND source = ?
|
|
''', (username,))
|
|
|
|
for row in cursor.fetchall():
|
|
url, filename, metadata_str = row
|
|
|
|
# Extract media_id from filename
|
|
if filename:
|
|
# Format: username_date_MEDIAID.ext or username_date_MEDIAID_N.ext
|
|
parts = filename.split('_')
|
|
if len(parts) >= 4:
|
|
# Get everything after date/time as media_id
|
|
media_id = '_'.join(parts[3:]).split('.')[0]
|
|
processed.add(media_id)
|
|
|
|
# Also check metadata for media_id
|
|
if metadata_str:
|
|
try:
|
|
metadata = json.loads(metadata_str)
|
|
if 'media_id' in metadata:
|
|
processed.add(metadata['media_id'])
|
|
except (json.JSONDecodeError, KeyError, TypeError):
|
|
pass
|
|
|
|
if processed:
|
|
self.log(f"Found {len(processed)} processed stories in database for {username}", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error loading processed stories from database: {e}", "debug")
|
|
|
|
return processed
|
|
|
|
def save_cookies(self, context):
    """Persist the browser context's cookies to the database or a file.

    The database is preferred when configured; if that save raises, the
    error is logged and the file fallback is attempted (only when a cookie
    file path is set). The file stores the cookies together with an ISO
    timestamp of when they were saved.

    Args:
        context: Playwright browser context whose ``cookies()`` are saved.
    """
    jar = context.cookies()

    if self.unified_db:
        try:
            self.unified_db.save_scraper_cookies(self.scraper_id, jar)
            self.log(f"Saved {len(jar)} cookies to database", "debug")
            return
        except Exception as exc:
            self.log(f"Error saving cookies to database: {exc}", "warning")

    # File-based fallback.
    if not self.cookie_file:
        return

    payload = {
        'cookies': jar,
        'timestamp': datetime.now().isoformat()
    }
    with open(self.cookie_file, 'w') as handle:
        json.dump(payload, handle, indent=2)
    self.log(f"Saved {len(jar)} cookies to file", "debug")
|
|
|
|
def load_cookies(self, context):
    """Install saved cookies into a Playwright context.

    Cookies come from the database when it has any; otherwise from the
    cookie file, provided it exists and was saved within the last 24 hours.
    The ``partitionKey`` and ``_crHasCrossSiteAncestor`` keys are removed
    before the cookies are handed to ``context.add_cookies``.

    Args:
        context: Playwright browser context receiving the cookies.

    Returns:
        True when cookies were added to the context, False otherwise.
    """
    unsupported_keys = ['partitionKey', '_crHasCrossSiteAncestor']

    def without_unsupported(raw_cookies):
        # Chrome-specific properties that Playwright doesn't support are dropped.
        return [
            {key: value for key, value in cookie.items() if key not in unsupported_keys}
            for cookie in raw_cookies
        ]

    # Database first.
    if self.unified_db:
        try:
            stored = self.unified_db.get_scraper_cookies(self.scraper_id)
            if stored:
                usable = without_unsupported(stored)
                context.add_cookies(usable)
                self.log(f"Loaded {len(usable)} cookies from database", "info")
                return True
        except Exception as exc:
            self.log(f"Error loading cookies from database: {exc}", "warning")

    # File fallback.
    if not (self.cookie_file and self.cookie_file.exists()):
        return False

    try:
        with open(self.cookie_file, 'r') as handle:
            data = json.load(handle)

        # Cookies older than 24 hours are considered expired.
        age = datetime.now() - datetime.fromisoformat(data['timestamp'])
        if age > timedelta(hours=24):
            self.log("Cookies expired", "debug")
            return False

        usable = without_unsupported(data['cookies'])
        context.add_cookies(usable)
        self.log(f"Loaded {len(usable)} cookies from file", "info")
        return True
    except Exception as exc:
        self.log(f"Failed to load cookies: {exc}", "warning")
        return False
|
|
|
|
def wait_for_cloudflare(self, page, max_wait: int = 120, poll_interval: float = 1.0):
    """Poll the page until it shows real content, an error, or time runs out.

    On the first detected Cloudflare challenge a FlareSolverr bypass is
    attempted once; afterwards the loop simply keeps polling.

    Changes from the previous version:
      * the "page loaded" check accepts the configured ``self.proxy_domain``
        in addition to the literal ``storyclone.com``. The domain can be
        overridden from the scraper configuration, and with a different host
        the old check could never succeed and always ran into the timeout.
      * the number of polls and the delay between them are parameters
        (defaults reproduce the previous fixed 120 polls, 1 second apart).

    Args:
        page: Playwright page being loaded.
        max_wait: Maximum number of polls before giving up.
        poll_interval: Seconds to sleep before each poll.

    Returns:
        True when the page content looks loaded; False on a detected server
        error or when the polls are exhausted.

    Raises:
        Exception: Errors from reading the page, except "navigating" errors,
            which are treated as "try again".
    """
    self.log("Waiting for page to load...", "debug")

    # Loop-invariant lookup tables.
    challenge_indicators = ['checking your browser', 'just a moment', 'verify you are human', 'enable javascript']
    error_indicators = ['internal server error', 'error code 500', 'error code 502', 'error code 503']
    content_indicators = ('story', 'username', 'download', 'stories')
    expected_hosts = {'storyclone.com', (self.proxy_domain or '').lower()} - {''}

    flaresolverr_attempted = False

    for i in range(max_wait):
        time.sleep(poll_interval)

        # Check current URL and content
        try:
            current_url = page.url
            content = page.content().lower()
        except Exception as e:
            if "navigating" in str(e).lower():
                self.log("Page still navigating, waiting...", "debug")
                continue
            raise

        has_challenge = any(indicator in content for indicator in challenge_indicators)
        has_error = any(indicator in content for indicator in error_indicators)

        if has_error:
            self.log("Server error detected (500/502/503) - site is likely down", "error")
            return False

        if has_challenge:
            if not flaresolverr_attempted:
                self.log("Cloudflare challenge detected, attempting FlareSolverr bypass...", "info", module="Cloudflare")
                # Try to get fresh cookies via FlareSolverr
                if self._get_cookies_via_flaresolverr(page.url):
                    self.log("Got fresh cookies from FlareSolverr, reloading page...", "info", module="Cloudflare")
                    try:
                        # Push the new cookies into the context, then reload with them.
                        self.load_cookies(self.context)
                        page.reload(wait_until='domcontentloaded', timeout=10000)
                        time.sleep(2)  # Give page time to load with new cookies
                    except Exception as e:
                        self.log(f"Error reloading page with new cookies: {e}", "debug")
                else:
                    self.log("FlareSolverr failed, waiting for challenge to resolve...", "warning", module="Cloudflare")
                flaresolverr_attempted = True
            continue

        # Loaded means: we are on the expected host and story-related words appear.
        url_lower = current_url.lower()
        if any(host in url_lower for host in expected_hosts):
            if any(word in content for word in content_indicators):
                self.log(f"Page loaded after {i+1} seconds", "info")
                return True

        # Status updates
        if i == 10:
            self.log("Still waiting (10s)... Cloudflare is checking", "debug")
        elif i == 20:
            self.log("Still waiting (20s)... Cloudflare challenge ongoing", "info")
        elif i == 30:
            self.log("Still waiting (30s)... This is normal for Cloudflare", "info")

    # Timeout reached
    self.log(f"Page load timeout. URL: {page.url}", "error")
    return False
|
|
|
|
def download(self, username: str, content_type: str = "stories", days_back: int = 14,
             max_downloads: int = 50, output_dir: str = None, phrase_config: dict = None,
             defer_database: bool = False):
    """Download content for one user (media-downloader compatible entry point).

    Args:
        username: Snapchat username.
        content_type: Only "stories" is supported.
        days_back: How many days back to search.
        max_downloads: Maximum number of stories to download.
        output_dir: Base output directory; a ``<username>`` sub-directory is
            used. Defaults to ``/opt/media-downloader/downloads/<username>``.
        phrase_config: Accepted for interface compatibility; not used here.
        defer_database: Stored on the instance for the download methods; when
            True, records are queued in ``pending_downloads`` instead of
            being written immediately.

    Returns:
        Number of files downloaded; 0 when the site is unavailable or the
        content type is unsupported.
    """
    self.defer_database = defer_database
    # The per-account cache of media IDs is reset to prevent memory growth.
    self.downloaded_files.clear()

    # Probe the site before doing any real work.
    domain = self.proxy_domain
    self.log(f"Checking {domain} site status...", "debug")
    site_status, error_msg = self.cf_handler.check_site_status(f"https://{domain}/", timeout=10)

    if self.cf_handler.should_skip_download(site_status):
        self.log(f"Skipping download - {domain} is unavailable: {error_msg}", "warning")
        return 0
    if site_status == SiteStatus.CLOUDFLARE_CHALLENGE:
        self.log("Cloudflare challenge detected, will attempt bypass during download", "info")

    # Resolve the per-user destination directory.
    if output_dir:
        destination = Path(output_dir) / username
    else:
        destination = Path(f"/opt/media-downloader/downloads/{username}")

    # Only stories can be fetched through this module.
    if content_type != "stories":
        self.log(f"Snapchat downloader does not support content type: {content_type}", "warning")
        return 0

    fetched = self.download_stories(
        username=username,
        days_back=days_back,
        max_stories=max_downloads,
        output_dir=destination
    )
    return len(fetched)
|
|
|
|
def download_stories(self, username: str, days_back: int = 14, max_stories: int = 50, output_dir: Path = None):
    """Download stories from a Snapchat user with FastDL naming

    Files are saved as ``{profile}_{YYYYMMDD_HHMMSS}_{media_id}{ext}`` where
    ``profile`` is the lower-cased username.

    Args:
        username: Snapchat username
        days_back: How many days back to search.
            NOTE(review): this value is accepted for interface compatibility
            but no date filtering is applied anywhere in this method - confirm
            whether stories older than the window should be skipped.
        max_stories: Maximum number of extracted media elements to process
        output_dir: Output directory (defaults to
            ``/opt/media-downloader/downloads/{profile}``)

    Returns:
        list[str]: Paths of the files saved by this call. A file that was
        removed again because its hash matched an earlier download is NOT
        included. An empty list is returned when the page fails to load or
        no stories are found; unexpected errors are logged, not raised.
    """
    import traceback
    from urllib.parse import urlparse, unquote

    profile_name = username.lower()
    if output_dir is None:
        output_dir = Path(f"/opt/media-downloader/downloads/{profile_name}")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Scan existing files
    self._scan_existing_files(output_dir, profile_name)

    # Get processed stories from database
    processed_stories = self._get_processed_posts(profile_name)
    self.log(f"Loaded {len(processed_stories)} processed stories for {profile_name} from database", "info")

    downloaded_files = []

    # Update activity status
    self.activity_manager.update_status(f"Checking stories from @{profile_name}")

    # Start or reuse browser
    self._start_browser()
    page = self.page

    try:
        # Navigate to user's stories page on StoryClone
        self.log(f"Navigating to @{username} on {self.proxy_domain}", "info")
        page.goto(f"https://{self.proxy_domain}/u/{username}/", wait_until='domcontentloaded')

        if not self.wait_for_cloudflare(page):
            self.log("Page didn't load properly", "error")
            return []

        # Save cookies
        self.save_cookies(self.context)

        self.log("Waiting for page to load...", "info")
        time.sleep(3)  # Give page time to load content

        # Check if "Stories" section exists - if not, there are no stories to scrape
        stories_section = page.locator('div.font-semibold.ml-6:has-text("Stories")').first
        if stories_section.count() == 0:
            self.log("No 'Stories' section found - user has no stories available", "info")
            return []

        self.log("Found 'Stories' section - proceeding to load all stories...", "info")

        self._expand_story_list(page)
        media_elements = self._collect_story_media(page)

        if not media_elements:
            self.log("No stories found for this user", "warning")
            return []

        self.log(f"Found {len(media_elements)} potential story items", "info")

        # Download each story; story_index is the 1-based position in the list
        for story_index, media_elem in enumerate(media_elements[:max_stories], start=1):
            try:
                # Prefer src, fall back to href (for download links)
                media_url = media_elem.get_attribute('src')
                if not media_url or media_url == '#':
                    media_url = media_elem.get_attribute('href')

                if not media_url or media_url == '#' or media_url.startswith('data:'):
                    self.log(f"Story {story_index}: Invalid media URL", "warning")
                    continue

                self.log(f"Story {story_index}: {media_url[:80]}...", "debug")

                # Make the URL absolute BEFORE probing for a higher quality
                # variant, otherwise the HEAD requests fail on the missing scheme.
                if not media_url.startswith('http'):
                    media_url = f"https:{media_url}" if media_url.startswith('//') else f"https://{self.proxy_domain}{media_url}"

                media_url = self._probe_hq_story_url(media_url, story_index)

                # Extract media ID from URL and determine correct extension
                url_path = urlparse(media_url).path
                original_name = unquote(url_path.split('/')[-1].split('?')[0])

                # Snapchat CDN uses weird extensions like .IRZXSOY, so the real
                # type is taken from the element's tag instead of the URL.
                if media_elem.evaluate("element => element.tagName").lower() == 'video':
                    ext = '.mp4'  # Videos are MP4
                else:
                    ext = '.jpg'  # Images are JPG

                # Media ID is the file name up to the first dot (drops the fake extension)
                media_id = original_name.split('.')[0]

                # Check if already downloaded
                if media_id in self.downloaded_files or media_id in processed_stories:
                    self.log(f"Story {story_index}: Already downloaded ({media_id}), skipping", "debug")
                    continue

                # Extract post date from the story item on the page
                story_date = self._find_story_date(media_elem, story_index)

                # Fallback to current time if extraction failed
                if not story_date:
                    story_date = datetime.now()
                    self.log(f"Story {story_index}: Using current time as fallback", "debug")

                date_str = story_date.strftime('%Y%m%d_%H%M%S')

                # Build filename: {profile}_{date}_{media_id}{ext}
                filename = f"{profile_name}_{date_str}_{media_id}{ext}"
                filepath = output_dir / filename

                # Download the story
                try:
                    response = requests.get(media_url, timeout=30, headers={
                        'User-Agent': self.user_agent,
                        'Referer': f'https://{self.proxy_domain}/'
                    }, cookies=self._get_cookies_for_requests())
                    response.raise_for_status()

                    # Save file
                    with open(filepath, 'wb') as f:
                        f.write(response.content)

                    self.log(f"Downloaded story: {filename} ({len(response.content)} bytes)", "info")

                    # Only report the file once we know it was not deleted as
                    # a duplicate; callers count and stat() the returned paths.
                    if self._discard_duplicate_story(filepath, filename):
                        continue

                    downloaded_files.append(str(filepath))

                    # Update timestamps
                    self._update_file_timestamps(filepath, story_date)

                    # Add to tracking
                    self.downloaded_files.add(media_id)

                    # Record in database with media_id in metadata
                    self._record_download(
                        username=profile_name,
                        url=media_url,
                        filename=filename,
                        post_date=story_date,
                        metadata={'media_id': media_id},
                        file_path=str(filepath),
                        deferred=getattr(self, 'defer_database', False)
                    )

                except Exception as e:
                    self.log(f"Failed to download story {story_index}: {e}", "error")
                    continue

            except Exception as e:
                self.log(f"Error processing story {story_index}: {e}", "error")
                continue

        self.log(f"Downloaded {len(downloaded_files)} story files", "info")

    except Exception as e:
        self.log(f"Error downloading stories: {e}", "error")
        self.log(f"Traceback: {traceback.format_exc()}", "debug")

    # Don't close browser here - reuse it for next profile
    return downloaded_files


def _expand_story_list(self, page, max_attempts: int = 20, max_scrolls: int = 10):
    """Scroll and click "Load More" until all stories above 'Spotlight Highlights' are loaded."""
    self.log("Scrolling to load all stories...", "info")

    load_more_clicks = 0

    for _ in range(max_attempts):
        # Step 1: Scroll until we see "Spotlight Highlights"
        self.log("Scrolling until 'Spotlight Highlights' is visible...", "debug")
        for _ in range(max_scrolls):
            if page.locator('text=Spotlight Highlights').first.count() > 0:
                self.log("Found 'Spotlight Highlights' in view", "debug")
                break
            page.evaluate("window.scrollBy(0, 400)")
            time.sleep(1)

        # Step 2: Is there a "Load More" button positioned ABOVE "Spotlight Highlights"?
        load_more_btn = page.locator('button:has-text("Load More"), button.load-more-button').first
        spotlight_highlights = page.locator('text=Spotlight Highlights').first

        load_more_visible = load_more_btn.count() > 0 and load_more_btn.is_visible()
        spotlight_visible = spotlight_highlights.count() > 0

        if not spotlight_visible:
            # Without the section marker we cannot tell where stories end; retry
            self.log("Neither 'Load More' nor 'Spotlight Highlights' found, continuing...", "debug")
            continue

        if not load_more_visible:
            # Only "Spotlight Highlights" present, no "Load More" -> done
            items_final = page.locator('.item').all()
            self.log(f"No 'Load More' button found - done! Found {len(items_final)} stories (clicked Load More {load_more_clicks} times)", "info")
            break

        # Both present - compare Y positions to see which comes first
        load_more_box = load_more_btn.bounding_box()
        spotlight_box = spotlight_highlights.bounding_box()
        if not (load_more_box and spotlight_box):
            # No geometry available this round; try again on the next attempt
            continue

        load_more_y = load_more_box['y']
        spotlight_y = spotlight_box['y']

        if load_more_y < spotlight_y:
            # "Load More" belongs to the stories list -> click it
            load_more_clicks += 1
            self.log(f"Found 'Load More' ABOVE 'Spotlight Highlights' (Y:{load_more_y:.0f} < {spotlight_y:.0f}) - clicking (click #{load_more_clicks})...", "info")
            load_more_btn.click()
            time.sleep(2.5)  # Wait for more posts to load

            items_count = len(page.locator('.item').all())
            self.log(f"Items after click: {items_count}", "debug")
            # "Spotlight Highlights" was pushed down; scroll to it again
            continue

        # "Load More" is BELOW "Spotlight Highlights" -> it belongs to another section
        items_final = page.locator('.item').all()
        self.log(f"'Load More' is BELOW 'Spotlight Highlights' (Y:{load_more_y:.0f} > {spotlight_y:.0f}) - done! Found {len(items_final)} stories (clicked Load More {load_more_clicks} times)", "info")
        break


def _collect_story_media(self, page):
    """Return the video/img locators of every .item located above 'Spotlight Highlights'."""
    # Processing each .item container ensures lazy-loaded content is triggered
    self.log("Extracting media from story items...", "info")

    # Get Y position of "Spotlight Highlights" to filter out items after it
    spotlight_y = None
    spotlight_highlights = page.locator('text=Spotlight Highlights').first
    if spotlight_highlights.count() > 0:
        spotlight_box = spotlight_highlights.bounding_box()
        if spotlight_box:
            spotlight_y = spotlight_box['y']
            self.log(f"'Spotlight Highlights' Y position: {spotlight_y:.0f}", "debug")

    all_items = page.locator('.item').all()

    if spotlight_y is None:
        # No Spotlight Highlights found, include all items
        story_items = list(all_items)
    else:
        # Compare against None above (not truthiness): a Y of exactly 0 is a
        # valid position and must still act as the cut-off line.
        story_items = []
        for item in all_items:
            item_box = item.bounding_box()
            if item_box and item_box['y'] < spotlight_y:
                story_items.append(item)

    self.log(f"Filtered to {len(story_items)} story items (before Spotlight Highlights) from {len(all_items)} total items", "info")

    media_elements = []

    for idx, item in enumerate(story_items):
        try:
            # Scroll item into view to trigger lazy loading
            item.scroll_into_view_if_needed()
            time.sleep(0.3)  # Give it a moment to load

            # Look for video first
            video = item.locator('video[src]').first
            if video.count() > 0:
                media_elements.append(video)
                self.log(f"Item {idx+1}: Found video", "debug")
                continue

            # If no video, look for image from Snapchat CDN
            img = item.locator('img[src*="sc-cdn.net"]').first
            if img.count() > 0:
                src = img.get_attribute('src')
                # Skip apple icons, favicons, and poster images
                if src and 'apple-icon' not in src and 'favicon' not in src and '/d/' in src:
                    media_elements.append(img)
                    self.log(f"Item {idx+1}: Found image", "debug")
                    continue

            self.log(f"Item {idx+1}: No media found (may be lazy-loading)", "debug")

        except Exception as e:
            self.log(f"Item {idx+1}: Error processing - {e}", "debug")

    self.log(f"Extracted {len(media_elements)} media elements from {len(story_items)} items", "info")
    return media_elements


def _probe_hq_story_url(self, media_url, story_index):
    """Return a larger-size variant of media_url if one answers HTTP 200, else media_url."""
    # URLs look like: https://.../{id}.1034.IRZXSOY?...
    size_match = re.search(r'\.(\d+)\.IRZXSOY', media_url)
    if not size_match:
        return media_url

    original_size = size_match.group(1)

    # Try larger sizes (in descending order)
    for test_size in ('2048', '1920', '1440'):
        if int(test_size) <= int(original_size):
            continue

        test_url = media_url.replace(f'.{original_size}.IRZXSOY', f'.{test_size}.IRZXSOY')
        try:
            response = requests.head(test_url, timeout=5, allow_redirects=True)
        except requests.RequestException:
            continue

        if response.status_code == 200:
            self.log(f"Story {story_index}: Found higher quality version (size {test_size})", "info")
            return test_url

    return media_url


def _find_story_date(self, media_elem, story_index):
    """Return the date parsed from the nearest ancestor's first .text-sm element, or None."""
    import traceback

    try:
        # Walk up at most three ancestors so we don't pick up another story's date
        for depth in (1, 2, 3):
            parent = media_elem.locator('xpath=' + '/'.join(['..'] * depth)).first
            if parent.count() == 0:
                continue

            # Get only the FIRST .text-sm in this parent
            date_elem = parent.locator('.text-sm').first
            if date_elem.count() == 0:
                continue

            date_text = date_elem.text_content()
            if not date_text or not ("Posted on" in date_text or "at" in date_text):
                continue

            self.log(f"Story {story_index}: Found date at depth {depth}: '{date_text}'", "debug")
            story_date = self._parse_story_date_text(date_text)
            if story_date:
                self.log(f"Story {story_index}: Extracted date from page: {story_date.strftime('%Y-%m-%d %H:%M:%S')}", "debug")
                return story_date

        self.log(f"Story {story_index}: Could not find date text for this story", "debug")
    except Exception as e:
        self.log(f"Story {story_index}: Could not extract date - {e}", "debug")
        self.log(f"Story {story_index}: Traceback: {traceback.format_exc()}", "debug")

    return None


def _discard_duplicate_story(self, filepath, filename):
    """Delete filepath if its hash matches an earlier download; return True when it was deleted."""
    if not self.db:
        return False

    # Check for duplicate hash (hash blacklist persists even if original deleted)
    file_hash = self.db.get_file_hash(str(filepath))
    if not file_hash:
        return False

    existing = self.db.get_download_by_file_hash(file_hash)
    if not (existing and existing.get('file_path') and str(filepath) != existing.get('file_path')):
        return False

    # Duplicate hash found - content was already downloaded (prevents redownload of deleted content)
    self.log(f"⚠ Duplicate content detected (hash match): {filename} matches {existing['filename']} from {existing['platform']}/{existing['source']}", "warning")

    # Delete the duplicate regardless of whether original file still exists
    try:
        filepath.unlink()
    except Exception as e:
        # The file is still on disk, so the caller keeps and records it
        self.log(f"Failed to delete duplicate {filename}: {e}", "warning")
        return False

    self.log(f"Deleted duplicate (hash blacklist): {filename}", "debug")
    return True
|
|
|
|
|
|
def main():
    """Test the downloader with FastDL naming

    Manual smoke test: downloads stories for the username given as the first
    command line argument (default "testuser"), prints the fields encoded in
    each saved file name and a summary of the profile's download folder.
    """
    import sys

    print("=" * 60)
    print("Snapchat Downloader (StoryClon e) - FastDL Compatible Naming")
    print("=" * 60)
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    # SnapchatDownloader.__init__ has no api_key parameter; passing one raised
    # TypeError before any download could start.
    downloader = SnapchatDownloader(
        headless=False  # Use with xvfb
    )

    # Test username (replace with actual Snapchat username)
    test_username = sys.argv[1] if len(sys.argv) > 1 else "testuser"

    # Download stories
    files = downloader.download_stories(
        username=test_username,
        days_back=7,
        max_stories=50
    )

    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)

    # {profile}_{YYYYMMDD_HHMMSS}_{media_id}{ext}. Anchoring on the date/time
    # digits keeps profiles and media IDs that contain underscores intact,
    # which a plain split('_', 3) does not.
    name_pattern = re.compile(r'^(?P<profile>.+?)_(?P<date>\d{8}_\d{6})_(?P<media_id>[^.]*)')

    if files:
        print(f"Successfully downloaded {len(files)} files!")
        print("\nDownloaded files (FastDL naming format):")
        for f in files:
            path = Path(f)
            name = path.name

            try:
                size_text = f"{path.stat().st_size / 1024:.1f} KB"
            except OSError:
                # The file may have been removed since it was downloaded
                size_text = "unavailable (file missing)"

            print(f" - {name}")
            match = name_pattern.match(name)
            if match:
                print(f" Profile: {match.group('profile')}")
                print(f" Date: {match.group('date')}")
                print(f" Media ID: {match.group('media_id')}")
            print(f" Size: {size_text}")
    else:
        print("No files downloaded")

    # Check total in folder. download_stories() saves into the lower-cased
    # profile directory, so look there rather than under the raw argument.
    download_dir = Path(f"/opt/media-downloader/downloads/{test_username.lower()}")
    if download_dir.exists():
        all_files = list(download_dir.glob("*"))
        total_size = sum(f.stat().st_size for f in all_files) / 1024
        print(f"\nTotal in folder: {len(all_files)} files ({total_size:.1f} KB)")
|
|
|
|
|
|
# Script entry point: run the manual smoke test only when this file is
# executed directly, never when it is imported as a module.
if __name__ == "__main__":
    main()
|