986 lines
38 KiB
Python
986 lines
38 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Snapchat Direct Scraper Module - Scrapes directly from Snapchat.com
|
|
|
|
Uses Playwright to scrape profiles and extract:
|
|
- Spotlight videos (540x960)
|
|
- Stories/Highlights (480x852, stitched into single videos)
|
|
|
|
Full metadata extraction including timestamps, media IDs, descriptions.
|
|
Follows the same interface as the original snapchat_module.py
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import re
|
|
import tempfile
|
|
import subprocess
|
|
import shutil
|
|
import platform
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, List, Any, Set
|
|
from dataclasses import dataclass, field
|
|
|
|
# Set environment for Playwright
|
|
os.environ.setdefault('PLAYWRIGHT_BROWSERS_PATH', '/root/.cache/ms-playwright')
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from modules.cloudflare_handler import (
|
|
get_playwright_context_options,
|
|
get_playwright_stealth_scripts,
|
|
get_flaresolverr_user_agent
|
|
)
|
|
|
|
|
|
@dataclass
class SnapMedia:
    """Represents a single snap media item (one video or one image)."""
    media_id: str             # unique identifier (CDN filename segment or snap ID)
    media_type: str           # 'video' or 'image'
    media_url: str            # direct download URL for the media
    timestamp: datetime       # time the snap was posted
    index: int = 0            # position within its parent collection
    thumbnail_url: str = ""   # preview image URL, when available
    duration_ms: int = 0      # playback length in milliseconds (videos only)
    description: str = ""     # caption/description text
    view_count: int = 0       # view counter as reported by Snapchat
    width: int = 0            # pixel width of the media
    height: int = 0           # pixel height of the media
    lat: Optional[float] = None  # geotag latitude; None when absent (0.0 is valid)
    lng: Optional[float] = None  # geotag longitude; None when absent (0.0 is valid)
|
|
|
|
|
@dataclass
class SnapCollection:
    """Represents a spotlight or highlight collection of snaps."""
    collection_id: str        # story/highlight identifier from Snapchat
    collection_type: str      # 'spotlight' or 'highlight'
    title: str = ""           # display title (highlight name or spotlight description)
    username: str = ""        # owning account's username
    snaps: List[SnapMedia] = field(default_factory=list)  # member media items
    url: str = ""             # source page URL this collection was scraped from
|
|
|
|
|
class SnapchatDirectScraper(LoggingMixin):
    """
    Scrapes Snapchat profiles directly for media content.

    Follows the same interface as SnapchatDownloader for compatibility
    with the media-downloader system.
    """

    def __init__(self,
                 headless: bool = True,
                 show_progress: bool = True,
                 use_database: bool = True,
                 log_callback=None,
                 unified_db=None):
        """Initialize scraper compatible with media-downloader system.

        Args:
            headless: Run the Playwright browser without a visible window.
            show_progress: Whether to report progress (stored; used by callers).
            use_database: Enable download-tracking via the unified database.
            log_callback: Optional callback passed to the logging mixin.
            unified_db: Unified database handle; cookies, proxy config and the
                download adapter are all sourced from it when provided.
        """
        self.headless = headless
        self.show_progress = show_progress
        self.use_database = use_database
        self.unified_db = unified_db
        self.scraper_id = 'snapchat_direct'  # key used for cookie storage in the DB
        self.download_count = 0
        self.downloaded_files: Set[str] = set()  # media IDs downloaded this run
        self.pending_downloads = []  # deferred DB records (see _record_download)

        # Initialize logging via mixin
        self._init_logger('SnapchatDirect', log_callback, default_module='Download')

        # User-Agent to match FlareSolverr (dynamically fetched for consistency)
        self.user_agent = get_flaresolverr_user_agent()

        # Browser state — created lazily by _start_browser()
        self._playwright = None
        self.browser = None
        self.context = None

        # Database adapter; if no DB is available, force use_database off so
        # later code has a single flag to check.
        if unified_db and use_database:
            from modules.unified_database import SnapchatDatabaseAdapter
            self.db = SnapchatDatabaseAdapter(unified_db)
        else:
            self.db = None
            self.use_database = False

        # Activity status manager (optional dependency — absence is tolerated)
        try:
            from modules.activity_status import get_activity_manager
            self.activity_manager = get_activity_manager(unified_db)
        except ImportError:
            self.activity_manager = None

        # Load cookies from database (falls back to defaults)
        self.cookies = self._load_cookies_from_db()

        # Load proxy configuration from database; NOTE: proxy config is read
        # from the 'snapchat' scraper entry, not 'snapchat_direct'
        self.proxy_url = None
        if unified_db:
            try:
                scraper_config = unified_db.get_scraper('snapchat')
                if scraper_config and scraper_config.get('proxy_enabled') and scraper_config.get('proxy_url'):
                    self.proxy_url = scraper_config['proxy_url']
                    self.log(f"Using proxy: {self.proxy_url}", "info")
            except Exception as e:
                self.log(f"Could not load proxy config: {e}", "debug")
|
|
|
|
def _load_cookies_from_db(self) -> List[Dict]:
|
|
"""Load cookies from database"""
|
|
if not self.unified_db:
|
|
return self._get_default_cookies()
|
|
|
|
try:
|
|
cookies = self.unified_db.get_scraper_cookies(self.scraper_id)
|
|
if cookies:
|
|
self.log(f"Loaded {len(cookies)} cookies from database", "debug")
|
|
return cookies
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies from database: {e}", "warning")
|
|
|
|
# Try loading from original snapchat scraper
|
|
try:
|
|
cookies = self.unified_db.get_scraper_cookies('snapchat')
|
|
if cookies:
|
|
self.log(f"Using cookies from 'snapchat' scraper", "debug")
|
|
return cookies
|
|
except Exception as e:
|
|
self.log(f"Error loading cookies from snapchat scraper: {e}", "debug")
|
|
|
|
return self._get_default_cookies()
|
|
|
|
def _get_default_cookies(self) -> List[Dict]:
|
|
"""Get default cookies for Snapchat"""
|
|
return [
|
|
{"name": "sc-cookies-accepted", "value": "true", "domain": "www.snapchat.com", "path": "/"},
|
|
]
|
|
|
|
def _save_cookies_to_db(self, cookies: List[Dict], user_agent: str = None):
|
|
"""Save cookies to database
|
|
|
|
Args:
|
|
cookies: List of cookie dictionaries
|
|
user_agent: User agent to associate with cookies (important for cf_clearance).
|
|
If not provided, uses self.user_agent as fallback.
|
|
"""
|
|
if not self.unified_db:
|
|
return
|
|
|
|
try:
|
|
# Use provided user_agent or fall back to self.user_agent
|
|
ua = user_agent or self.user_agent
|
|
self.unified_db.save_scraper_cookies(
|
|
self.scraper_id,
|
|
cookies,
|
|
user_agent=ua,
|
|
merge=True
|
|
)
|
|
self.log(f"Saved {len(cookies)} cookies to database (UA: {ua[:50]}...)", "debug")
|
|
except Exception as e:
|
|
self.log(f"Error saving cookies to database: {e}", "warning")
|
|
|
|
def _parse_proxy_url(self, proxy_url: str) -> Optional[Dict]:
|
|
"""
|
|
Parse proxy URL into Playwright proxy config.
|
|
Supports: protocol://user:pass@host:port or protocol://host:port
|
|
"""
|
|
import re
|
|
try:
|
|
# Match: protocol://[user:pass@]host:port
|
|
match = re.match(
|
|
r'^(https?|socks[45]?)://(?:([^:]+):([^@]+)@)?([^:]+):(\d+)$',
|
|
proxy_url
|
|
)
|
|
if match:
|
|
protocol, username, password, host, port = match.groups()
|
|
config = {'server': f'{protocol}://{host}:{port}'}
|
|
if username and password:
|
|
config['username'] = username
|
|
config['password'] = password
|
|
return config
|
|
except Exception as e:
|
|
self.log(f"Failed to parse proxy URL: {e}", "warning")
|
|
return None
|
|
|
|
def __enter__(self):
|
|
"""Context manager entry"""
|
|
return self
|
|
|
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
"""Context manager exit"""
|
|
self._close_browser()
|
|
return False
|
|
|
|
    def _start_browser(self):
        """Start the Playwright Chromium browser and create a context with
        stealth scripts, stored cookies, and (optionally) a proxy.

        Idempotent: returns immediately if a browser is already running.
        """
        if self.browser is not None:
            return

        # NOTE(review): forces an X display even in headless mode — presumably
        # required by the deployment environment; confirm before changing.
        os.environ['DISPLAY'] = ':100'

        from playwright.sync_api import sync_playwright
        self._playwright = sync_playwright().start()
        self.browser = self._playwright.chromium.launch(
            headless=self.headless,
            args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
        )

        # Build context options - use dynamic fingerprinting from FlareSolverr
        context_options = get_playwright_context_options()

        # IMPORTANT: If cookies have a stored user_agent, use THAT user_agent
        # Cloudflare cf_clearance cookies are fingerprinted to the browser that solved the challenge
        try:
            if self.unified_db:
                stored_user_agent = self.unified_db.get_scraper_cookies_user_agent(self.scraper_id)
                if stored_user_agent:
                    self.log(f"Using stored cookie user_agent: {stored_user_agent[:50]}...", "debug", module="Browser")
                    context_options['user_agent'] = stored_user_agent
                else:
                    self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
            else:
                self.log(f"Using fingerprint: Chrome {context_options.get('extra_http_headers', {}).get('Sec-Ch-Ua', 'unknown')[:30]}...", "debug", module="Browser")
        except Exception as e:
            self.log(f"Error getting stored user_agent, using default: {e}", "debug", module="Browser")

        # Add proxy if configured
        if self.proxy_url:
            proxy_config = self._parse_proxy_url(self.proxy_url)
            if proxy_config:
                context_options['proxy'] = proxy_config
                self.log(f"Browser using proxy: {proxy_config.get('server')}", "info", module="Browser")

        self.context = self.browser.new_context(**context_options)

        # Add anti-detection scripts to all pages in this context
        self.context.add_init_script(get_playwright_stealth_scripts())

        # Add cookies
        if self.cookies:
            # Clean cookies for Playwright and convert expiry->expires
            cleaned = []
            for c in self.cookies:
                # Drop Firefox/FlareSolverr-specific keys Playwright rejects
                clean = {k: v for k, v in c.items() if k not in ['partitionKey', '_crHasCrossSiteAncestor']}
                # FlareSolverr uses 'expiry' but Playwright uses 'expires'
                if 'expiry' in clean and 'expires' not in clean:
                    clean['expires'] = clean.pop('expiry')
                cleaned.append(clean)

            # CRITICAL: Clear existing cookies first to ensure new cf_clearance takes effect
            try:
                self.context.clear_cookies()
            except Exception:
                pass

            self.context.add_cookies(cleaned)

        self.log("Browser started", "info", module="Browser")
|
|
|
|
def _close_browser(self):
|
|
"""Close browser and cleanup"""
|
|
if self.context:
|
|
try:
|
|
self.context.close()
|
|
except Exception as e:
|
|
self.log(f"Error closing browser context: {e}", "debug")
|
|
self.context = None
|
|
|
|
if self.browser:
|
|
try:
|
|
self.browser.close()
|
|
except Exception as e:
|
|
self.log(f"Error closing browser: {e}", "debug")
|
|
self.browser = None
|
|
|
|
if self._playwright:
|
|
try:
|
|
self._playwright.stop()
|
|
except Exception as e:
|
|
self.log(f"Error stopping playwright: {e}", "debug")
|
|
self._playwright = None
|
|
|
|
def _get_next_data(self, page) -> Optional[Dict]:
|
|
"""Extract __NEXT_DATA__ JSON from page"""
|
|
try:
|
|
next_data_elem = page.locator('script#__NEXT_DATA__').first
|
|
if next_data_elem.count() > 0:
|
|
return json.loads(next_data_elem.inner_text())
|
|
except Exception as e:
|
|
self.log(f"Error extracting __NEXT_DATA__: {e}", "debug")
|
|
return None
|
|
|
|
def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
|
|
"""Set EXIF metadata and file timestamp"""
|
|
try:
|
|
date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
|
|
desc = description or snap.description or ""
|
|
if snap.view_count:
|
|
desc += f" [Views: {snap.view_count}]"
|
|
desc = desc.strip()
|
|
|
|
ext = os.path.splitext(file_path)[1].lower()
|
|
is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
|
|
is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']
|
|
|
|
exif_args = [
|
|
'exiftool', '-overwrite_original', '-ignoreMinorErrors',
|
|
f'-FileModifyDate={date_str}',
|
|
]
|
|
|
|
if is_image:
|
|
exif_args.extend([
|
|
f'-DateTimeOriginal={date_str}',
|
|
f'-CreateDate={date_str}',
|
|
f'-ModifyDate={date_str}',
|
|
f'-MetadataDate={date_str}',
|
|
])
|
|
if desc:
|
|
exif_args.extend([
|
|
f'-ImageDescription={desc}',
|
|
f'-XPComment={desc}',
|
|
f'-UserComment={desc}',
|
|
])
|
|
if snap.lat and snap.lng:
|
|
lat_ref = 'N' if snap.lat >= 0 else 'S'
|
|
lng_ref = 'E' if snap.lng >= 0 else 'W'
|
|
exif_args.extend([
|
|
f'-GPSLatitude={abs(snap.lat)}',
|
|
f'-GPSLatitudeRef={lat_ref}',
|
|
f'-GPSLongitude={abs(snap.lng)}',
|
|
f'-GPSLongitudeRef={lng_ref}',
|
|
])
|
|
|
|
elif is_video:
|
|
exif_args.extend([
|
|
f'-CreateDate={date_str}',
|
|
f'-ModifyDate={date_str}',
|
|
f'-MediaCreateDate={date_str}',
|
|
f'-MediaModifyDate={date_str}',
|
|
f'-TrackCreateDate={date_str}',
|
|
f'-TrackModifyDate={date_str}',
|
|
])
|
|
if desc:
|
|
exif_args.extend([
|
|
f'-Description={desc}',
|
|
f'-Comment={desc}',
|
|
])
|
|
|
|
exif_args.append(file_path)
|
|
subprocess.run(exif_args, capture_output=True, timeout=30)
|
|
|
|
# Set filesystem modification time
|
|
ts = snap.timestamp.timestamp()
|
|
os.utime(file_path, (ts, ts))
|
|
|
|
except Exception as e:
|
|
self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")
|
|
|
|
def get_profile_content(self, username: str) -> Dict[str, List[str]]:
|
|
"""Get all spotlight and highlight URLs from a profile"""
|
|
import time
|
|
|
|
if not self.browser:
|
|
self._start_browser()
|
|
|
|
page = self.context.new_page()
|
|
result = {'spotlights': [], 'highlights': []}
|
|
|
|
try:
|
|
url = f"https://www.snapchat.com/@{username}"
|
|
self.log(f"Navigating to profile @{username}", "info")
|
|
page.goto(url, wait_until='networkidle', timeout=30000)
|
|
time.sleep(2)
|
|
|
|
content = page.content()
|
|
|
|
# Extract spotlight URLs
|
|
spotlight_pattern = rf'/@{username}/spotlight/([A-Za-z0-9_-]+)'
|
|
spotlight_ids = list(set(re.findall(spotlight_pattern, content)))
|
|
result['spotlights'] = [
|
|
f"https://www.snapchat.com/@{username}/spotlight/{sid}"
|
|
for sid in spotlight_ids
|
|
]
|
|
self.log(f"Found {len(result['spotlights'])} spotlights", "info")
|
|
|
|
# Click Stories tab to get highlights
|
|
stories_tab = page.locator('[role="tab"]:has-text("Stories")').first
|
|
if stories_tab.count() > 0:
|
|
stories_tab.click()
|
|
time.sleep(2)
|
|
|
|
content = page.content()
|
|
highlight_pattern = rf'/@{username}/highlight/([A-Za-z0-9-]+)'
|
|
highlight_ids = list(set(re.findall(highlight_pattern, content)))
|
|
result['highlights'] = [
|
|
f"https://www.snapchat.com/@{username}/highlight/{hid}"
|
|
for hid in highlight_ids
|
|
]
|
|
self.log(f"Found {len(result['highlights'])} highlights", "info")
|
|
|
|
except Exception as e:
|
|
self.log(f"Error getting profile content: {e}", "error")
|
|
finally:
|
|
page.close()
|
|
|
|
return result
|
|
|
|
    def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
        """Extract full metadata from a spotlight URL.

        Loads the spotlight page and parses the embedded __NEXT_DATA__ JSON
        into a SnapCollection. Returns None when the page has no usable data
        or on any error (which is logged).
        """
        import time

        if not self.browser:
            self._start_browser()

        page = self.context.new_page()

        try:
            page.goto(url, wait_until='domcontentloaded', timeout=60000)
            time.sleep(2)  # give client-side rendering a moment to settle

            data = self._get_next_data(page)
            if not data:
                return None

            # Defensive `or {}` chaining: any level of the payload may be
            # missing or explicitly null.
            props = (data.get('props') or {}).get('pageProps') or {}
            feed = props.get('spotlightFeed') or {}
            stories = feed.get('spotlightStories') or []

            if not stories:
                return None

            # Only the first story is used — presumably it corresponds to the
            # requested spotlight URL (the rest are related/feed items).
            story_data = stories[0]
            story = story_data.get('story') or {}
            metadata = (story_data.get('metadata') or {}).get('videoMetadata') or {}

            story_id = (story.get('storyId') or {}).get('value', '')
            creator = (metadata.get('creator') or {}).get('personCreator') or {}
            username = creator.get('username', '')

            collection = SnapCollection(
                collection_id=story_id,
                collection_type='spotlight',
                title=metadata.get('description', ''),
                username=username,
                url=url
            )

            for snap_data in story.get('snapList') or []:
                snap_id = (snap_data.get('snapId') or {}).get('value', '')
                snap_urls = snap_data.get('snapUrls') or {}
                media_url = snap_urls.get('mediaUrl', '')

                # The media ID is the filename segment after '/d/' in the CDN URL
                media_id = ''
                if '/d/' in media_url:
                    media_id = media_url.split('/d/')[1].split('.')[0]

                ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
                timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()

                snap = SnapMedia(
                    media_id=media_id or snap_id,  # fall back to the snap ID
                    media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                    media_url=media_url,
                    timestamp=timestamp,
                    index=snap_data.get('snapIndex', 0),
                    thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                    duration_ms=int(metadata.get('durationMs', 0)),
                    description=metadata.get('description', ''),
                    view_count=int(metadata.get('viewCount', 0)),
                    # 540x960 is the standard spotlight resolution (see module docstring)
                    width=int(metadata.get('width', 540)),
                    height=int(metadata.get('height', 960))
                )
                collection.snaps.append(snap)

            return collection

        except Exception as e:
            self.log(f"Error getting spotlight metadata: {e}", "error")
            return None
        finally:
            page.close()
|
|
|
|
def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
|
|
"""Extract full metadata from a highlight URL"""
|
|
import time
|
|
|
|
if not self.browser:
|
|
self._start_browser()
|
|
|
|
page = self.context.new_page()
|
|
|
|
try:
|
|
page.goto(url, wait_until='domcontentloaded', timeout=60000)
|
|
time.sleep(2)
|
|
|
|
data = self._get_next_data(page)
|
|
if not data:
|
|
return None
|
|
|
|
props = (data.get('props') or {}).get('pageProps') or {}
|
|
highlight = props.get('highlight') or {}
|
|
|
|
if not highlight:
|
|
return None
|
|
|
|
highlight_id = highlight.get('highlightId') or {}
|
|
if isinstance(highlight_id, dict):
|
|
highlight_id = highlight_id.get('value', '')
|
|
|
|
username_match = re.search(r'@([^/]+)', url)
|
|
username = username_match.group(1) if username_match else ''
|
|
|
|
title = highlight.get('storyTitle') or {}
|
|
if isinstance(title, dict):
|
|
title = title.get('value', '')
|
|
|
|
collection = SnapCollection(
|
|
collection_id=highlight_id,
|
|
collection_type='highlight',
|
|
title=title or 'Untitled Highlight',
|
|
username=username,
|
|
url=url
|
|
)
|
|
|
|
for snap_data in highlight.get('snapList') or []:
|
|
snap_urls = snap_data.get('snapUrls') or {}
|
|
media_url = snap_urls.get('mediaUrl', '')
|
|
|
|
media_id = ''
|
|
if '/d/' in media_url:
|
|
media_id = media_url.split('/d/')[1].split('.')[0]
|
|
|
|
ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
|
|
timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()
|
|
|
|
lat = snap_data.get('lat')
|
|
lng = snap_data.get('lng')
|
|
|
|
snap = SnapMedia(
|
|
media_id=media_id,
|
|
media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
|
|
media_url=media_url,
|
|
timestamp=timestamp,
|
|
index=snap_data.get('snapIndex', 0),
|
|
thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
|
|
lat=float(lat) if lat else None,
|
|
lng=float(lng) if lng else None
|
|
)
|
|
collection.snaps.append(snap)
|
|
|
|
return collection
|
|
|
|
except Exception as e:
|
|
self.log(f"Error getting highlight metadata: {e}", "error")
|
|
return None
|
|
finally:
|
|
page.close()
|
|
|
|
def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
|
|
"""Download a single media file"""
|
|
try:
|
|
url = snap.media_url.replace('&', '&')
|
|
|
|
result = subprocess.run([
|
|
'curl', '-sL', '-o', output_path,
|
|
'-H', 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
|
url
|
|
], capture_output=True, timeout=60)
|
|
|
|
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
|
|
self._set_metadata(output_path, snap)
|
|
return True
|
|
return False
|
|
|
|
except Exception as e:
|
|
self.log(f"Error downloading media: {e}", "error")
|
|
return False
|
|
|
|
def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
|
|
"""Generate filename with timestamp and media ID (FastDL format)"""
|
|
date_str = snap.timestamp.strftime('%Y%m%d_%H%M%S')
|
|
return f"{username}_{date_str}_{snap.media_id}.{ext}"
|
|
|
|
def _record_download(self, username: str, url: str, filename: str,
|
|
post_date=None, metadata: dict = None, file_path: str = None,
|
|
deferred: bool = False):
|
|
"""Record a download in the database"""
|
|
if deferred:
|
|
self.pending_downloads.append({
|
|
'username': username,
|
|
'url': url,
|
|
'filename': filename,
|
|
'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
|
|
'file_path': file_path,
|
|
'metadata': metadata
|
|
})
|
|
return True
|
|
|
|
if not self.db:
|
|
return
|
|
|
|
try:
|
|
self.db.mark_downloaded(
|
|
username=username,
|
|
url=url,
|
|
filename=filename,
|
|
post_date=post_date,
|
|
metadata=metadata,
|
|
file_path=file_path
|
|
)
|
|
except Exception as e:
|
|
self.log(f"Failed to record download: {e}", "debug")
|
|
|
|
def get_pending_downloads(self):
|
|
"""Get list of downloads that were deferred"""
|
|
return self.pending_downloads.copy()
|
|
|
|
def clear_pending_downloads(self):
|
|
"""Clear the pending downloads list"""
|
|
self.pending_downloads = []
|
|
|
|
    def _get_processed_posts(self, username: str) -> Set[str]:
        """Get the set of media IDs already downloaded for this user.

        IDs are recovered two ways: parsed out of stored filenames, and read
        from the 'media_id' key of the stored metadata JSON. Errors degrade
        to an empty/partial set — this is a best-effort dedupe aid.
        """
        processed = set()
        if not self.db:
            return processed

        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT filename, metadata FROM downloads
                    WHERE platform = 'snapchat'
                    AND source = ?
                ''', (username,))

                for row in cursor.fetchall():
                    filename, metadata_str = row
                    if filename:
                        # Filename format is <user>_<YYYYMMDD>_<HHMMSS>_<media_id>.<ext>
                        # (see _generate_filename), so the media_id is
                        # everything from the 4th '_'-part onward.
                        # NOTE(review): this assumes the username itself has no
                        # underscores — an underscore in the handle shifts the
                        # split and yields a wrong ID. Verify against real data.
                        parts = filename.split('_')
                        if len(parts) >= 4:
                            media_id = '_'.join(parts[3:]).split('.')[0]
                            processed.add(media_id)

                    if metadata_str:
                        try:
                            metadata = json.loads(metadata_str)
                            if 'media_id' in metadata:
                                processed.add(metadata['media_id'])
                        except (json.JSONDecodeError, TypeError, KeyError):
                            pass  # Invalid metadata, skip

        except Exception as e:
            self.log(f"Error loading processed posts: {e}", "debug")

        return processed
|
|
|
|
    def download(self, username: str, content_type: str = "all", days_back: int = 14,
                 max_downloads: int = 50, output_dir: str = None,
                 spotlight_dir: str = None, stories_dir: str = None,
                 stitch_highlights: bool = True, defer_database: bool = False,
                 phrase_config: dict = None):
        """
        Download content from a user - compatible with media-downloader interface

        Args:
            username: Snapchat username
            content_type: "spotlight", "stories", "highlights", or "all"
            days_back: How many days back to download (filters by post date)
            max_downloads: Maximum items to download per content type
            output_dir: Default output directory (used if specific dirs not set)
            spotlight_dir: Output directory for spotlights
            stories_dir: Output directory for stories/highlights
            stitch_highlights: Ignored (kept for backwards compatibility)
            defer_database: If True, defer database recording
            phrase_config: Not used (for interface compatibility)

        Returns:
            Number of files downloaded
        """
        self.defer_database = defer_database
        self.downloaded_files.clear()

        # Set output directories
        # If specific dirs provided, use them directly
        # If only output_dir provided, use it directly (caller handles structure)
        # If nothing provided, use default with subdirectories
        if spotlight_dir:
            spotlight_output = Path(spotlight_dir)
        elif output_dir:
            spotlight_output = Path(output_dir)
        else:
            spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat/spotlight/{username}")

        if stories_dir:
            stories_output = Path(stories_dir)
        elif output_dir:
            stories_output = Path(output_dir)
        else:
            stories_output = Path(f"/opt/media-downloader/downloads/snapchat/stories/{username}")

        spotlight_output.mkdir(parents=True, exist_ok=True)
        stories_output.mkdir(parents=True, exist_ok=True)

        # Update activity status
        if self.activity_manager:
            self.activity_manager.update_status("Checking Snapchat")

        # Get processed posts (media IDs previously downloaded for this user)
        processed = self._get_processed_posts(username)
        self.log(f"Loaded {len(processed)} processed posts from database", "debug")

        cutoff_date = datetime.now() - timedelta(days=days_back)
        downloaded_count = 0

        # Crash recovery checkpoint — lets a restarted task skip URLs it
        # already fully handled
        from modules.task_checkpoint import TaskCheckpoint
        checkpoint = TaskCheckpoint(f'snapchat:{username}', 'scraping')

        try:
            # Start browser
            self._start_browser()

            # Get profile content
            content = self.get_profile_content(username)

            # Count total items for checkpoint
            total_items = 0
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                total_items += min(len(content['spotlights']), max_downloads)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                total_items += min(len(content['highlights']), max_downloads)
            checkpoint.start(total_items=total_items)
            if checkpoint.is_recovering():
                self.log(f"Snapchat @{username}: recovering — skipping already-processed URLs", "info")

            # Download spotlights
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                spotlight_items = content['spotlights'][:max_downloads]
                self.log(f"Processing {len(spotlight_items)} spotlights...", "info")

                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=0,
                        progress_total=len(spotlight_items)
                    )

                for spot_idx, url in enumerate(spotlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading spotlights",
                            progress_current=spot_idx + 1,
                            progress_total=len(spotlight_items)
                        )

                    if checkpoint.is_completed(url):
                        continue

                    checkpoint.set_current(url)

                    try:
                        spotlight = self.get_spotlight_metadata(url)
                        if not spotlight or not spotlight.snaps:
                            continue

                        # A spotlight collection carries exactly one featured snap
                        snap = spotlight.snaps[0]

                        # Check date filter
                        # NOTE(review): `continue` here also skips
                        # checkpoint.mark_completed below, so skipped URLs are
                        # re-examined on recovery — appears intentional.
                        if snap.timestamp < cutoff_date:
                            self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                            continue

                        # Check if already processed
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                            continue

                        # Download
                        ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                        filename = self._generate_filename(username, snap, ext)
                        output_path = str(spotlight_output / filename)

                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            self.log(f"Downloaded spotlight: (unknown)", "info")

                            self._record_download(
                                username=username,
                                url=url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'description': snap.description,
                                    'view_count': snap.view_count,
                                    'content_type': 'spotlight'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )

                    except Exception as e:
                        self.log(f"Error processing spotlight: {e}", "error")

                    checkpoint.mark_completed(url)

            # Download highlights (stories)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                highlight_items = content['highlights'][:max_downloads]
                self.log(f"Processing {len(highlight_items)} highlights...", "info")

                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=0,
                        progress_total=len(highlight_items)
                    )

                for hi_idx, url in enumerate(highlight_items):
                    # Update progress at start of each iteration (fires even on skips)
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading highlights",
                            progress_current=hi_idx + 1,
                            progress_total=len(highlight_items)
                        )

                    if checkpoint.is_completed(url):
                        continue

                    checkpoint.set_current(url)

                    try:
                        highlight = self.get_highlight_metadata(url)
                        if not highlight or not highlight.snaps:
                            continue

                        # Check if any snap is within date range
                        newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                        if newest_snap.timestamp < cutoff_date:
                            self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                            continue

                        # Check if already processed
                        if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                            self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                            continue

                        # Separate videos and images
                        videos = [s for s in highlight.snaps if s.media_type == 'video']
                        images = [s for s in highlight.snaps if s.media_type == 'image']

                        # Download images individually
                        for snap in images:
                            if snap.timestamp < cutoff_date:
                                continue
                            if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                continue

                            filename = self._generate_filename(username, snap, 'jpg')
                            output_path = str(stories_output / filename)

                            if self._download_media_file(snap, output_path):
                                self.downloaded_files.add(snap.media_id)
                                downloaded_count += 1
                                self.log(f"Downloaded image: (unknown)", "info")

                                self._record_download(
                                    username=username,
                                    url=highlight.url,
                                    filename=filename,
                                    post_date=snap.timestamp,
                                    metadata={
                                        'media_id': snap.media_id,
                                        'highlight_id': highlight.collection_id,
                                        'content_type': 'highlight_image'
                                    },
                                    file_path=output_path,
                                    deferred=defer_database
                                )

                        # Handle videos - download each clip individually
                        if videos:
                            for snap in videos:
                                if snap.timestamp < cutoff_date:
                                    continue
                                if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                    continue

                                filename = self._generate_filename(username, snap, 'mp4')
                                output_path = str(stories_output / filename)

                                if self._download_media_file(snap, output_path):
                                    # NOTE(review): _download_media_file already calls
                                    # _set_metadata; this second call looks redundant
                                    # but is harmless (idempotent tag write)
                                    self._set_metadata(output_path, snap)
                                    self.downloaded_files.add(snap.media_id)
                                    downloaded_count += 1
                                    self.log(f"Downloaded video: (unknown)", "info")

                                    self._record_download(
                                        username=username,
                                        url=highlight.url,
                                        filename=filename,
                                        post_date=snap.timestamp,
                                        metadata={
                                            'media_id': snap.media_id,
                                            'highlight_id': highlight.collection_id,
                                            'content_type': 'highlight_video'
                                        },
                                        file_path=output_path,
                                        deferred=defer_database
                                    )

                    except Exception as e:
                        self.log(f"Error processing highlight: {e}", "error")

                    checkpoint.mark_completed(url)

        except Exception as e:
            self.log(f"Error during download: {e}", "error")

        checkpoint.finish()
        self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
        return downloaded_count
|
|
|
|
|
|
def test_scraper():
    """Run a small end-to-end smoke test against a public profile."""
    banner = "=" * 60
    print(banner)
    print("SNAPCHAT DIRECT SCRAPER TEST")
    print(banner)

    with SnapchatDirectScraper(headless=True) as scraper:
        # Test download
        count = scraper.download(
            username="evalongoria",
            content_type="all",
            days_back=30,
            max_downloads=5,
            spotlight_dir="/tmp/snap_test/spotlight",
            stories_dir="/tmp/snap_test/stories",
            stitch_highlights=True,
        )

        print(f"\nDownloaded {count} files")

        # Show files
        import os
        for root, dirs, files in os.walk("/tmp/snap_test"):
            for entry in files:
                path = os.path.join(root, entry)
                size = os.path.getsize(path) / 1024
                print(f" {path}: {size:.1f}KB")

    print(banner)
    print("TEST COMPLETE")
    print(banner)
|
|
|
|
|
|
# Allow running this module directly as a smoke test.
if __name__ == "__main__":
    test_scraper()
|