985
modules/snapchat_scraper.py
Normal file
985
modules/snapchat_scraper.py
Normal file
@@ -0,0 +1,985 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Snapchat Direct Scraper Module - Scrapes directly from Snapchat.com
|
||||
|
||||
Uses Playwright to scrape profiles and extract:
|
||||
- Spotlight videos (540x960)
|
||||
- Stories/Highlights (480x852, stitched into single videos)
|
||||
|
||||
Full metadata extraction including timestamps, media IDs, descriptions.
|
||||
Follows the same interface as the original snapchat_module.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
import tempfile
|
||||
import subprocess
|
||||
import shutil
|
||||
import platform
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List, Any, Set
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
# Set environment for Playwright
|
||||
os.environ.setdefault('PLAYWRIGHT_BROWSERS_PATH', '/root/.cache/ms-playwright')
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from modules.cloudflare_handler import (
|
||||
get_playwright_context_options,
|
||||
get_playwright_stealth_scripts,
|
||||
get_flaresolverr_user_agent
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class SnapMedia:
    """Represents a single snap media item scraped from Snapchat.com."""
    media_id: str                  # ID parsed from the CDN URL (falls back to snap ID)
    media_type: str                # 'video' or 'image'
    media_url: str                 # direct CDN download URL
    timestamp: datetime            # post time (parsed from timestampInSec)
    index: int = 0                 # position within its collection (snapIndex)
    thumbnail_url: str = ""        # preview image URL, may be empty
    duration_ms: int = 0           # video duration; 0 when unknown or for images
    description: str = ""          # creator-supplied description/caption
    view_count: int = 0            # 0 when the source metadata has no count
    width: int = 0                 # pixel width; 0 when unknown
    height: int = 0                # pixel height; 0 when unknown
    lat: Optional[float] = None    # GPS latitude when the snap carries one
    lng: Optional[float] = None    # GPS longitude when the snap carries one
|
||||
|
||||
|
||||
@dataclass
class SnapCollection:
    """Represents a spotlight or highlight collection of snaps."""
    collection_id: str             # story/highlight ID from __NEXT_DATA__
    collection_type: str           # 'spotlight' or 'highlight'
    title: str = ""                # collection title or description
    username: str = ""             # owning profile's username
    snaps: List[SnapMedia] = field(default_factory=list)  # member snaps, in snapList order
    url: str = ""                  # page URL the collection was scraped from
|
||||
|
||||
|
||||
class SnapchatDirectScraper(LoggingMixin):
    """
    Scrapes Snapchat profiles directly for media content.

    Follows the same interface as SnapchatDownloader for compatibility
    with the media-downloader system.
    """

    def __init__(self,
                 headless: bool = True,
                 show_progress: bool = True,
                 use_database: bool = True,
                 log_callback=None,
                 unified_db=None):
        """Initialize scraper compatible with media-downloader system.

        Args:
            headless: Run the Playwright browser without a visible window.
            show_progress: Stored for interface compatibility.
            use_database: Record downloads in the unified database; forced
                off when no ``unified_db`` handle is supplied.
            log_callback: Optional callable forwarded to the logging mixin.
            unified_db: Unified database handle used for cookies, proxy
                config and download bookkeeping.
        """
        self.headless = headless
        self.show_progress = show_progress
        self.use_database = use_database
        self.unified_db = unified_db
        self.scraper_id = 'snapchat_direct'
        self.download_count = 0
        self.downloaded_files: Set[str] = set()  # media IDs downloaded this run
        self.pending_downloads = []  # deferred DB records (see _record_download)

        # Initialize logging via mixin
        self._init_logger('SnapchatDirect', log_callback, default_module='Download')

        # User-Agent to match FlareSolverr (dynamically fetched for consistency)
        self.user_agent = get_flaresolverr_user_agent()

        # Browser state — created lazily by _start_browser()
        self._playwright = None
        self.browser = None
        self.context = None

        # Database adapter
        if unified_db and use_database:
            from modules.unified_database import SnapchatDatabaseAdapter
            self.db = SnapchatDatabaseAdapter(unified_db)
        else:
            self.db = None
            # No DB handle means bookkeeping cannot work, regardless of flag
            self.use_database = False

        # Activity status manager (optional dependency; absence is tolerated)
        try:
            from modules.activity_status import get_activity_manager
            self.activity_manager = get_activity_manager(unified_db)
        except ImportError:
            self.activity_manager = None

        # Load cookies from database (falls back to defaults)
        self.cookies = self._load_cookies_from_db()

        # Load proxy configuration from database (best-effort)
        self.proxy_url = None
        if unified_db:
            try:
                scraper_config = unified_db.get_scraper('snapchat')
                if scraper_config and scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")
            except Exception as e:
                self.log(f"Could not load proxy config: {e}", "debug")
|
||||
|
||||
def _load_cookies_from_db(self) -> List[Dict]:
|
||||
"""Load cookies from database"""
|
||||
if not self.unified_db:
|
||||
return self._get_default_cookies()
|
||||
|
||||
try:
|
||||
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
||||
if cookies:
|
||||
self.log(f"Loaded {len(cookies)} cookies from database", "debug")
|
||||
return cookies
|
||||
except Exception as e:
|
||||
self.log(f"Error loading cookies from database: {e}", "warning")
|
||||
|
||||
# Try loading from original snapchat scraper
|
||||
try:
|
||||
cookies = self.unified_db.get_scraper_cookies('snapchat')
|
||||
if cookies:
|
||||
self.log(f"Using cookies from 'snapchat' scraper", "debug")
|
||||
return cookies
|
||||
except Exception as e:
|
||||
self.log(f"Error loading cookies from snapchat scraper: {e}", "debug")
|
||||
|
||||
return self._get_default_cookies()
|
||||
|
||||
def _get_default_cookies(self) -> List[Dict]:
|
||||
"""Get default cookies for Snapchat"""
|
||||
return [
|
||||
{"name": "sc-cookies-accepted", "value": "true", "domain": "www.snapchat.com", "path": "/"},
|
||||
]
|
||||
|
||||
def _save_cookies_to_db(self, cookies: List[Dict], user_agent: str = None):
|
||||
"""Save cookies to database
|
||||
|
||||
Args:
|
||||
cookies: List of cookie dictionaries
|
||||
user_agent: User agent to associate with cookies (important for cf_clearance).
|
||||
If not provided, uses self.user_agent as fallback.
|
||||
"""
|
||||
if not self.unified_db:
|
||||
return
|
||||
|
||||
try:
|
||||
# Use provided user_agent or fall back to self.user_agent
|
||||
ua = user_agent or self.user_agent
|
||||
self.unified_db.save_scraper_cookies(
|
||||
self.scraper_id,
|
||||
cookies,
|
||||
user_agent=ua,
|
||||
merge=True
|
||||
)
|
||||
self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug")
|
||||
except Exception as e:
|
||||
self.log(f"Error saving cookies to database: {e}", "warning")
|
||||
|
||||
def _parse_proxy_url(self, proxy_url: str) -> Optional[Dict]:
|
||||
"""
|
||||
Parse proxy URL into Playwright proxy config.
|
||||
Supports: protocol://user:pass@host:port or protocol://host:port
|
||||
"""
|
||||
import re
|
||||
try:
|
||||
# Match: protocol://[user:pass@]host:port
|
||||
match = re.match(
|
||||
r'^(https?|socks[45]?)://(?:([^:]+):([^@]+)@)?([^:]+):(\d+)$',
|
||||
proxy_url
|
||||
)
|
||||
if match:
|
||||
protocol, username, password, host, port = match.groups()
|
||||
config = {'server': f'{protocol}://{host}:{port}'}
|
||||
if username and password:
|
||||
config['username'] = username
|
||||
config['password'] = password
|
||||
return config
|
||||
except Exception as e:
|
||||
self.log(f"Failed to parse proxy URL: {e}", "warning")
|
||||
return None
|
||||
|
||||
def __enter__(self):
|
||||
"""Context manager entry"""
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Context manager exit"""
|
||||
self._close_browser()
|
||||
return False
|
||||
|
||||
    def _start_browser(self):
        """Start Playwright, launch Chromium and build a context with cookies.

        Idempotent: returns immediately when a browser is already running.
        The context's user agent, proxy and cookies are wired up BEFORE any
        page is opened, because Cloudflare's cf_clearance cookie is only
        honored for the exact browser fingerprint that solved the challenge.
        """
        if self.browser is not None:
            return

        # NOTE(review): DISPLAY is set unconditionally, even in headless
        # mode — presumably for an Xvfb server at :100; confirm it is needed.
        os.environ['DISPLAY'] = ':100'

        # Local import keeps Playwright optional until a browser is needed.
        from playwright.sync_api import sync_playwright
        self._playwright = sync_playwright().start()
        self.browser = self._playwright.chromium.launch(
            headless=self.headless,
            args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
        )

        # Build context options - use dynamic fingerprinting from FlareSolverr
        context_options = get_playwright_context_options()

        # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent.
        # Cloudflare cf_clearance cookies are fingerprinted to the browser
        # that solved the challenge.
        try:
            if self.unified_db:
                stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
                if stored_user_agent:
                    self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug", module="Browser")
                    context_options['user_agent'] = stored_user_agent
                else:
                    self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
            else:
                self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
        except Exception as e:
            self.log(f"Error getting stored user_agent, using default: {e}", "debug", module="Browser")

        # Add proxy if configured
        if self.proxy_url:
            proxy_config = self._parse_proxy_url(self.proxy_url)
            if proxy_config:
                context_options['proxy'] = proxy_config
                self.log(f"Browser using proxy: {proxy_config.get('server')}", "info", module="Browser")

        self.context = self.browser.new_context(**context_options)

        # Add anti-detection scripts to all pages in this context
        self.context.add_init_script(get_playwright_stealth_scripts())

        # Add cookies
        if self.cookies:
            # Clean cookies for Playwright and convert expiry->expires
            cleaned = []
            for c in self.cookies:
                # Strip FlareSolverr-specific keys Playwright rejects.
                clean = {k: v for k, v in c.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
                # FlareSolverr uses 'expiry' but Playwright uses 'expires'
                if 'expiry' in clean and 'expires' not in clean:
                    clean['expires'] = clean.pop('expiry')
                cleaned.append(clean)

            # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
            try:
                self.context.clear_cookies()
            except Exception:
                pass

            self.context.add_cookies(cleaned)

        self.log("Browser started", "info", module="Browser")
|
||||
|
||||
def _close_browser(self):
|
||||
"""Close browser and cleanup"""
|
||||
if self.context:
|
||||
try:
|
||||
self.context.close()
|
||||
except Exception as e:
|
||||
self.log(f"Error closing browser context: {e}", "debug")
|
||||
self.context = None
|
||||
|
||||
if self.browser:
|
||||
try:
|
||||
self.browser.close()
|
||||
except Exception as e:
|
||||
self.log(f"Error closing browser: {e}", "debug")
|
||||
self.browser = None
|
||||
|
||||
if self._playwright:
|
||||
try:
|
||||
self._playwright.stop()
|
||||
except Exception as e:
|
||||
self.log(f"Error stopping playwright: {e}", "debug")
|
||||
self._playwright = None
|
||||
|
||||
def _get_next_data(self, page) -> Optional[Dict]:
|
||||
"""Extract __NEXT_DATA__ JSON from page"""
|
||||
try:
|
||||
next_data_elem = page.locator('script#__NEXT_DATA__').first
|
||||
if next_data_elem.count() > 0:
|
||||
return json.loads(next_data_elem.inner_text())
|
||||
except Exception as e:
|
||||
self.log(f"Error extracting __NEXT_DATA__: {e}", "debug")
|
||||
return None
|
||||
|
||||
def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
|
||||
"""Set EXIF metadata and file timestamp"""
|
||||
try:
|
||||
date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
|
||||
desc = description or snap.description or ""
|
||||
if snap.view_count:
|
||||
desc += f" [Views: {snap.view_count}]"
|
||||
desc = desc.strip()
|
||||
|
||||
ext = os.path.splitext(file_path)[1].lower()
|
||||
is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
|
||||
is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']
|
||||
|
||||
exif_args = [
|
||||
'exiftool', '-overwrite_original', '-ignoreMinorErrors',
|
||||
f'-FileModifyDate={date_str}',
|
||||
]
|
||||
|
||||
if is_image:
|
||||
exif_args.extend([
|
||||
f'-DateTimeOriginal={date_str}',
|
||||
f'-CreateDate={date_str}',
|
||||
f'-ModifyDate={date_str}',
|
||||
f'-MetadataDate={date_str}',
|
||||
])
|
||||
if desc:
|
||||
exif_args.extend([
|
||||
f'-ImageDescription={desc}',
|
||||
f'-XPComment={desc}',
|
||||
f'-UserComment={desc}',
|
||||
])
|
||||
if snap.lat and snap.lng:
|
||||
lat_ref = 'N' if snap.lat >= 0 else 'S'
|
||||
lng_ref = 'E' if snap.lng >= 0 else 'W'
|
||||
exif_args.extend([
|
||||
f'-GPSLatitude={abs(snap.lat)}',
|
||||
f'-GPSLatitudeRef={lat_ref}',
|
||||
f'-GPSLongitude={abs(snap.lng)}',
|
||||
f'-GPSLongitudeRef={lng_ref}',
|
||||
])
|
||||
|
||||
elif is_video:
|
||||
exif_args.extend([
|
||||
f'-CreateDate={date_str}',
|
||||
f'-ModifyDate={date_str}',
|
||||
f'-MediaCreateDate={date_str}',
|
||||
f'-MediaModifyDate={date_str}',
|
||||
f'-TrackCreateDate={date_str}',
|
||||
f'-TrackModifyDate={date_str}',
|
||||
])
|
||||
if desc:
|
||||
exif_args.extend([
|
||||
f'-Description={desc}',
|
||||
f'-Comment={desc}',
|
||||
])
|
||||
|
||||
exif_args.append(file_path)
|
||||
subprocess.run(exif_args, capture_output=True, timeout=30)
|
||||
|
||||
# Set filesystem modification time
|
||||
ts = snap.timestamp.timestamp()
|
||||
os.utime(file_path, (ts, ts))
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")
|
||||
|
||||
    def get_profile_content(self, username: str) -> Dict[str, List[str]]:
        """Get all spotlight and highlight URLs from a profile page.

        Navigates to https://www.snapchat.com/@<username>, scrapes spotlight
        links from the initial HTML, then clicks the "Stories" tab to reveal
        highlight links.

        Args:
            username: Snapchat username (without the leading '@').

        Returns:
            Dict with 'spotlights' and 'highlights' keys, each a list of
            absolute URLs (empty lists on any failure).
        """
        import time

        if not self.browser:
            self._start_browser()

        page = self.context.new_page()
        result = {'spotlights': [], 'highlights': []}

        try:
            url = f"https://www.snapchat.com/@{username}"
            self.log(f"Navigating to profile @{username}", "info")
            page.goto(url, wait_until='networkidle', timeout=30000)
            # Small grace period for late client-side rendering
            time.sleep(2)

            content = page.content()

            # Extract spotlight URLs (deduplicated via set)
            spotlight_pattern = rf'/@{username}/spotlight/([A-Za-z0-9_-]+)'
            spotlight_ids = list(set(re.findall(spotlight_pattern, content)))
            result['spotlights'] = [
                f"https://www.snapchat.com/@{username}/spotlight/{sid}"
                for sid in spotlight_ids
            ]
            self.log(f"Found {len(result['spotlights'])} spotlights", "info")

            # Click Stories tab to get highlights (tab may be absent)
            stories_tab = page.locator('[role="tab"]:has-text("Stories")').first
            if stories_tab.count() > 0:
                stories_tab.click()
                time.sleep(2)

                content = page.content()
                highlight_pattern = rf'/@{username}/highlight/([A-Za-z0-9-]+)'
                highlight_ids = list(set(re.findall(highlight_pattern, content)))
                result['highlights'] = [
                    f"https://www.snapchat.com/@{username}/highlight/{hid}"
                    for hid in highlight_ids
                ]
                self.log(f"Found {len(result['highlights'])} highlights", "info")

        except Exception as e:
            self.log(f"Error getting profile content: {e}", "error")
        finally:
            page.close()

        return result
|
||||
|
||||
    def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
        """Extract full metadata from a spotlight URL.

        Parses the page's __NEXT_DATA__ blob and builds a SnapCollection
        from the first spotlight story found.

        Args:
            url: Absolute spotlight URL.

        Returns:
            SnapCollection with one SnapMedia per snapList entry, or None
            when the page has no parseable spotlight data.
        """
        import time

        if not self.browser:
            self._start_browser()

        page = self.context.new_page()

        try:
            page.goto(url, wait_until='domcontentloaded', timeout=60000)
            time.sleep(2)

            data = self._get_next_data(page)
            if not data:
                return None

            # Defensive `or {}` chains: any level of the payload may be null
            props = (data.get('props') or {}).get('pageProps') or {}
            feed = props.get('spotlightFeed') or {}
            stories = feed.get('spotlightStories') or []

            if not stories:
                return None

            # Only the first story corresponds to the requested URL
            story_data = stories[0]
            story = story_data.get('story') or {}
            metadata = (story_data.get('metadata') or {}).get('videoMetadata') or {}

            story_id = (story.get('storyId') or {}).get('value', '')
            creator = (metadata.get('creator') or {}).get('personCreator') or {}
            username = creator.get('username', '')

            collection = SnapCollection(
                collection_id=story_id,
                collection_type='spotlight',
                title=metadata.get('description', ''),
                username=username,
                url=url
            )

            for snap_data in story.get('snapList') or []:
                snap_id = (snap_data.get('snapId') or {}).get('value', '')
                snap_urls = snap_data.get('snapUrls') or {}
                media_url = snap_urls.get('mediaUrl', '')

                # Media ID is the path segment after '/d/', without extension
                media_id = ''
                if '/d/' in media_url:
                    media_id = media_url.split('/d/')[1].split('.')[0]

                ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
                timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()

                # NOTE(review): duration/description/views come from the
                # story-level videoMetadata and are applied to every snap —
                # confirm per-snap values are not available.
                snap = SnapMedia(
                    media_id=media_id or snap_id,
                    media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                    media_url=media_url,
                    timestamp=timestamp,
                    index=snap_data.get('snapIndex', 0),
                    thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                    duration_ms=int(metadata.get('durationMs', 0)),
                    description=metadata.get('description', ''),
                    view_count=int(metadata.get('viewCount', 0)),
                    width=int(metadata.get('width', 540)),
                    height=int(metadata.get('height', 960))
                )
                collection.snaps.append(snap)

            return collection

        except Exception as e:
            self.log(f"Error getting spotlight metadata: {e}", "error")
            return None
        finally:
            page.close()
|
||||
|
||||
def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
|
||||
"""Extract full metadata from a highlight URL"""
|
||||
import time
|
||||
|
||||
if not self.browser:
|
||||
self._start_browser()
|
||||
|
||||
page = self.context.new_page()
|
||||
|
||||
try:
|
||||
page.goto(url, wait_until='domcontentloaded', timeout=60000)
|
||||
time.sleep(2)
|
||||
|
||||
data = self._get_next_data(page)
|
||||
if not data:
|
||||
return None
|
||||
|
||||
props = (data.get('props') or {}).get('pageProps') or {}
|
||||
highlight = props.get('highlight') or {}
|
||||
|
||||
if not highlight:
|
||||
return None
|
||||
|
||||
highlight_id = highlight.get('highlightId') or {}
|
||||
if isinstance(highlight_id, dict):
|
||||
highlight_id = highlight_id.get('value', '')
|
||||
|
||||
username_match = re.search(r'@([^/]+)', url)
|
||||
username = username_match.group(1) if username_match else ''
|
||||
|
||||
title = highlight.get('storyTitle') or {}
|
||||
if isinstance(title, dict):
|
||||
title = title.get('value', '')
|
||||
|
||||
collection = SnapCollection(
|
||||
collection_id=highlight_id,
|
||||
collection_type='highlight',
|
||||
title=title or 'Untitled Highlight',
|
||||
username=username,
|
||||
url=url
|
||||
)
|
||||
|
||||
for snap_data in highlight.get('snapList') or []:
|
||||
snap_urls = snap_data.get('snapUrls') or {}
|
||||
media_url = snap_urls.get('mediaUrl', '')
|
||||
|
||||
media_id = ''
|
||||
if '/d/' in media_url:
|
||||
media_id = media_url.split('/d/')[1].split('.')[0]
|
||||
|
||||
ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
|
||||
timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()
|
||||
|
||||
lat = snap_data.get('lat')
|
||||
lng = snap_data.get('lng')
|
||||
|
||||
snap = SnapMedia(
|
||||
media_id=media_id,
|
||||
media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
|
||||
media_url=media_url,
|
||||
timestamp=timestamp,
|
||||
index=snap_data.get('snapIndex', 0),
|
||||
thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
|
||||
lat=float(lat) if lat else None,
|
||||
lng=float(lng) if lng else None
|
||||
)
|
||||
collection.snaps.append(snap)
|
||||
|
||||
return collection
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error getting highlight metadata: {e}", "error")
|
||||
return None
|
||||
finally:
|
||||
page.close()
|
||||
|
||||
def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
|
||||
"""Download a single media file"""
|
||||
try:
|
||||
url = snap.media_url.replace('&', '&')
|
||||
|
||||
result = subprocess.run([
|
||||
'curl', '-sL', '-o', output_path,
|
||||
'-H', 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
||||
url
|
||||
], capture_output=True, timeout=60)
|
||||
|
||||
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
|
||||
self._set_metadata(output_path, snap)
|
||||
return True
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error downloading media: {e}", "error")
|
||||
return False
|
||||
|
||||
def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
|
||||
"""Generate filename with timestamp and media ID (FastDL format)"""
|
||||
date_str = snap.timestamp.strftime('%Y%m%d_%H%M%S')
|
||||
return f"{username}_{date_str}_{snap.media_id}.{ext}"
|
||||
|
||||
def _record_download(self, username: str, url: str, filename: str,
|
||||
post_date=None, metadata: dict = None, file_path: str = None,
|
||||
deferred: bool = False):
|
||||
"""Record a download in the database"""
|
||||
if deferred:
|
||||
self.pending_downloads.append({
|
||||
'username': username,
|
||||
'url': url,
|
||||
'filename': filename,
|
||||
'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
|
||||
'file_path': file_path,
|
||||
'metadata': metadata
|
||||
})
|
||||
return True
|
||||
|
||||
if not self.db:
|
||||
return
|
||||
|
||||
try:
|
||||
self.db.mark_downloaded(
|
||||
username=username,
|
||||
url=url,
|
||||
filename=filename,
|
||||
post_date=post_date,
|
||||
metadata=metadata,
|
||||
file_path=file_path
|
||||
)
|
||||
except Exception as e:
|
||||
self.log(f"Failed to record download: {e}", "debug")
|
||||
|
||||
def get_pending_downloads(self):
|
||||
"""Get list of downloads that were deferred"""
|
||||
return self.pending_downloads.copy()
|
||||
|
||||
def clear_pending_downloads(self):
|
||||
"""Clear the pending downloads list"""
|
||||
self.pending_downloads = []
|
||||
|
||||
    def _get_processed_posts(self, username: str) -> Set[str]:
        """Get the set of media IDs already recorded for this user.

        IDs are recovered two ways: parsed out of stored filenames
        (``<username>_<date>_<time>_<media_id>.<ext>``) and read from the
        ``media_id`` key of stored metadata JSON.

        Args:
            username: Profile whose download history is queried.

        Returns:
            Set of media-ID strings; empty when no database is configured
            or the query fails.
        """
        processed = set()
        if not self.db:
            return processed

        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT filename, metadata FROM downloads
                    WHERE platform = 'snapchat'
                    AND source = ?
                ''', (username,))

                for row in cursor.fetchall():
                    filename, metadata_str = row
                    if filename:
                        # parts[0]=username, [1]=date, [2]=time, [3:]=media_id
                        # NOTE(review): assumes the username contains no
                        # underscore — an underscored username shifts these
                        # indices; confirm against _generate_filename usage.
                        parts = filename.split('_')
                        if len(parts) >= 4:
                            media_id = '_'.join(parts[3:]).split('.')[0]
                            processed.add(media_id)

                    if metadata_str:
                        try:
                            metadata = json.loads(metadata_str)
                            if 'media_id' in metadata:
                                processed.add(metadata['media_id'])
                        except (json.JSONDecodeError, TypeError, KeyError):
                            pass  # Invalid metadata, skip

        except Exception as e:
            self.log(f"Error loading processed posts: {e}", "debug")

        return processed
|
||||
|
||||
    def download(self, username: str, content_type: str = "all", days_back: int = 14,
                 max_downloads: int = 50, output_dir: str = None,
                 spotlight_dir: str = None, stories_dir: str = None,
                 stitch_highlights: bool = True, defer_database: bool = False,
                 phrase_config: dict = None):
        """
        Download content from a user - compatible with media-downloader interface

        Args:
            username: Snapchat username
            content_type: "spotlight", "stories", "highlights", or "all"
            days_back: How many days back to download (filters by post date)
            max_downloads: Maximum items to download per content type
            output_dir: Default output directory (used if specific dirs not set)
            spotlight_dir: Output directory for spotlights
            stories_dir: Output directory for stories/highlights
            stitch_highlights: Ignored (kept for backwards compatibility)
            defer_database: If True, defer database recording
            phrase_config: Not used (for interface compatibility)

        Returns:
            Number of files downloaded
        """
        self.defer_database = defer_database
        self.downloaded_files.clear()

        # Set output directories
        # If specific dirs provided, use them directly
        # If only output_dir provided, use it directly (caller handles structure)
        # If nothing provided, use default with subdirectories
        if spotlight_dir:
            spotlight_output = Path(spotlight_dir)
        elif output_dir:
            spotlight_output = Path(output_dir)
        else:
            spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat/spotlight/{username}")

        if stories_dir:
            stories_output = Path(stories_dir)
        elif output_dir:
            stories_output = Path(output_dir)
        else:
            stories_output = Path(f"/opt/media-downloader/downloads/snapchat/stories/{username}")

        spotlight_output.mkdir(parents=True, exist_ok=True)
        stories_output.mkdir(parents=True, exist_ok=True)

        # Update activity status
        if self.activity_manager:
            self.activity_manager.update_status("Checking Snapchat")

        # Get processed posts (dedupe against previous runs)
        processed = self._get_processed_posts(username)
        self.log(f"Loaded {len(processed)} processed posts from database", "debug")

        cutoff_date = datetime.now() - timedelta(days=days_back)
        downloaded_count = 0

        # Crash recovery checkpoint
        from modules.task_checkpoint import TaskCheckpoint
        checkpoint = TaskCheckpoint(f'snapchat:{username}', 'scraping')

        try:
            # Start browser
            self._start_browser()

            # Get profile content
            content = self.get_profile_content(username)

            # Count total items for checkpoint
            total_items = 0
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                total_items += min(len(content['spotlights']), max_downloads)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                total_items += min(len(content['highlights']), max_downloads)
            checkpoint.start(total_items=total_items)
            if checkpoint.is_recovering():
                self.log(f"Snapchat @{username}: recovering — skipping already-processed URLs", "info")

            # Download spotlights
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                spotlight_items = content['spotlights'][:max_downloads]
                self.log(f"Processing {len(spotlight_items)} spotlights...", "info")

                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=0,
                        progress_total=len(spotlight_items)
                    )

                for spot_idx, url in enumerate(spotlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading spotlights",
                            progress_current=spot_idx + 1,
                            progress_total=len(spotlight_items)
                        )

                    if checkpoint.is_completed(url):
                        continue

                    checkpoint.set_current(url)

                    try:
                        spotlight = self.get_spotlight_metadata(url)
                        if not spotlight or not spotlight.snaps:
                            continue

                        # A spotlight page carries one primary snap
                        snap = spotlight.snaps[0]

                        # Check date filter
                        if snap.timestamp < cutoff_date:
                            self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                            continue

                        # Check if already processed
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                            continue

                        # Download
                        ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                        filename = self._generate_filename(username, snap, ext)
                        output_path = str(spotlight_output / filename)

                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            # NOTE(review): "(unknown)" looks like a scrubbed
                            # placeholder — presumably the filename; confirm.
                            self.log(f"Downloaded spotlight: (unknown)", "info")

                            self._record_download(
                                username=username,
                                url=url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'description': snap.description,
                                    'view_count': snap.view_count,
                                    'content_type': 'spotlight'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )

                    except Exception as e:
                        self.log(f"Error processing spotlight: {e}", "error")

                    # NOTE: `continue` paths above skip this, so date-filtered /
                    # already-processed URLs are re-examined on recovery.
                    checkpoint.mark_completed(url)

            # Download highlights (stories)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                highlight_items = content['highlights'][:max_downloads]
                self.log(f"Processing {len(highlight_items)} highlights...", "info")

                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=0,
                        progress_total=len(highlight_items)
                    )

                for hi_idx, url in enumerate(highlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading highlights",
                            progress_current=hi_idx + 1,
                            progress_total=len(highlight_items)
                        )

                    if checkpoint.is_completed(url):
                        continue

                    checkpoint.set_current(url)

                    try:
                        highlight = self.get_highlight_metadata(url)
                        if not highlight or not highlight.snaps:
                            continue

                        # Check if any snap is within date range
                        newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                        if newest_snap.timestamp < cutoff_date:
                            self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                            continue

                        # Check if already processed
                        if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                            self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                            continue

                        # Separate videos and images
                        videos = [s for s in highlight.snaps if s.media_type == 'video']
                        images = [s for s in highlight.snaps if s.media_type == 'image']

                        # Download images individually
                        for snap in images:
                            if snap.timestamp < cutoff_date:
                                continue
                            if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                continue

                            filename = self._generate_filename(username, snap, 'jpg')
                            output_path = str(stories_output / filename)

                            if self._download_media_file(snap, output_path):
                                self.downloaded_files.add(snap.media_id)
                                downloaded_count += 1
                                # NOTE(review): "(unknown)" — see spotlight note above.
                                self.log(f"Downloaded image: (unknown)", "info")

                                self._record_download(
                                    username=username,
                                    url=highlight.url,
                                    filename=filename,
                                    post_date=snap.timestamp,
                                    metadata={
                                        'media_id': snap.media_id,
                                        'highlight_id': highlight.collection_id,
                                        'content_type': 'highlight_image'
                                    },
                                    file_path=output_path,
                                    deferred=defer_database
                                )

                        # Handle videos - download each clip individually
                        if videos:
                            for snap in videos:
                                if snap.timestamp < cutoff_date:
                                    continue
                                if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                    continue

                                filename = self._generate_filename(username, snap, 'mp4')
                                output_path = str(stories_output / filename)

                                if self._download_media_file(snap, output_path):
                                    # NOTE(review): _download_media_file already calls
                                    # _set_metadata on success — this second call looks
                                    # redundant; confirm before removing.
                                    self._set_metadata(output_path, snap)
                                    self.downloaded_files.add(snap.media_id)
                                    downloaded_count += 1
                                    # NOTE(review): "(unknown)" — see spotlight note above.
                                    self.log(f"Downloaded video: (unknown)", "info")

                                    self._record_download(
                                        username=username,
                                        url=highlight.url,
                                        filename=filename,
                                        post_date=snap.timestamp,
                                        metadata={
                                            'media_id': snap.media_id,
                                            'highlight_id': highlight.collection_id,
                                            'content_type': 'highlight_video'
                                        },
                                        file_path=output_path,
                                        deferred=defer_database
                                    )

                    except Exception as e:
                        self.log(f"Error processing highlight: {e}", "error")

                    checkpoint.mark_completed(url)

        except Exception as e:
            self.log(f"Error during download: {e}", "error")

        checkpoint.finish()
        self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
        return downloaded_count
|
||||
|
||||
|
||||
def test_scraper():
    """Manual smoke test: scrape a public profile into /tmp (network required)."""
    banner = "=" * 60
    print(banner)
    print("SNAPCHAT DIRECT SCRAPER TEST")
    print(banner)

    with SnapchatDirectScraper(headless=True) as scraper:
        # Test download against a well-known public profile
        count = scraper.download(
            username="evalongoria",
            content_type="all",
            days_back=30,
            max_downloads=5,
            spotlight_dir="/tmp/snap_test/spotlight",
            stories_dir="/tmp/snap_test/stories",
            stitch_highlights=True,
        )

        print(f"\nDownloaded {count} files")

        # Show what was written to disk
        import os
        for root, dirs, files in os.walk("/tmp/snap_test"):
            for entry in files:
                path = os.path.join(root, entry)
                size = os.path.getsize(path) / 1024
                print(f" {path}: {size:.1f}KB")

    print(banner)
    print("TEST COMPLETE")
    print(banner)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test; requires network access and installed Playwright browsers.
    test_scraper()
|
||||
Reference in New Issue
Block a user