#!/usr/bin/env python3
|
|
"""
|
|
Snapchat Client Module - Direct HTTP-based Snapchat downloader using curl_cffi.
|
|
|
|
Replaces Playwright-based scraping with direct HTTP requests. Snapchat embeds
|
|
all page data in <script id="__NEXT_DATA__"> JSON tags, so no JavaScript
|
|
execution is needed. Uses story.snapchat.com which may not require Cloudflare.
|
|
|
|
Follows the same pattern as instagram_client_module.py.
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import re
|
|
import subprocess
|
|
import time
|
|
import random
|
|
import platform
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, List, Set
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from modules.snapchat_scraper import SnapMedia, SnapCollection
|
|
|
|
|
|
class SnapchatClientDownloader(LoggingMixin):
|
|
"""Snapchat downloader using direct HTTP via curl_cffi (no Playwright)"""
|
|
|
|
def __init__(self,
|
|
show_progress: bool = True,
|
|
use_database: bool = True,
|
|
log_callback=None,
|
|
unified_db=None):
|
|
"""Initialize the Snapchat Client downloader.
|
|
|
|
Args:
|
|
show_progress: Whether to show download progress
|
|
use_database: Whether to use database for dedup
|
|
log_callback: Optional logging callback
|
|
unified_db: UnifiedDatabase instance
|
|
"""
|
|
self._init_logger('SnapchatClient', log_callback, default_module='Download')
|
|
|
|
self.scraper_id = 'snapchat_client'
|
|
self.show_progress = show_progress
|
|
self.use_database = use_database
|
|
self.download_count = 0
|
|
self.downloaded_files: Set[str] = set()
|
|
self.pending_downloads = []
|
|
|
|
# Session (lazy-initialized)
|
|
self._session = None
|
|
|
|
# Database
|
|
if unified_db and use_database:
|
|
from modules.unified_database import SnapchatDatabaseAdapter
|
|
self.db = SnapchatDatabaseAdapter(unified_db)
|
|
self.unified_db = unified_db
|
|
else:
|
|
self.db = None
|
|
self.unified_db = None
|
|
self.use_database = False
|
|
|
|
# Activity status manager
|
|
try:
|
|
from modules.activity_status import get_activity_manager
|
|
self.activity_manager = get_activity_manager(unified_db)
|
|
except ImportError:
|
|
self.activity_manager = None
|
|
|
|
# Cookie data from DB
|
|
self.cookies = []
|
|
self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
|
|
|
|
def _get_session(self):
|
|
"""Get or create a curl_cffi session with browser TLS fingerprinting."""
|
|
if self._session is None:
|
|
from curl_cffi.requests import Session
|
|
# Try multiple browser versions for curl_cffi compatibility
|
|
for _browser in ("chrome131", "chrome136", "chrome"):
|
|
try:
|
|
self._session = Session(impersonate=_browser)
|
|
break
|
|
except Exception:
|
|
continue
|
|
else:
|
|
self._session = Session()
|
|
self._session.headers.update({
|
|
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
'accept-language': 'en-US,en;q=0.9',
|
|
'cache-control': 'no-cache',
|
|
})
|
|
# Load cookies from database
|
|
self._load_cookies()
|
|
return self._session
|
|
|
|
    def _load_cookies(self):
        """Load cookies from database for authenticated requests.

        Tries the 'snapchat_client' scraper row first and falls back to
        'snapchat'. On the first row that yields cookies, the cookies (and
        the stored user-agent, if any) are applied to the live session and
        the method returns. A no-op when no unified DB is configured.
        """
        if not self.unified_db:
            return

        # Try snapchat_client cookies first, fall back to snapchat
        for scraper_id in ['snapchat_client', 'snapchat']:
            try:
                cookies = self.unified_db.get_scraper_cookies(scraper_id)
                if cookies:
                    self.log(f"Loaded {len(cookies)} cookies from '{scraper_id}' scraper", "debug")
                    self.cookies = cookies
                    # Cookies are only applied when a session already exists;
                    # _get_session() calls this after creating one.
                    for cookie in cookies:
                        name = cookie.get('name', '')
                        value = cookie.get('value', '')
                        domain = cookie.get('domain', '.snapchat.com')
                        if name and value and self._session:
                            self._session.cookies.set(name, value, domain=domain)

                    # Check if we have a stored user-agent (important for cf_clearance match)
                    try:
                        # NOTE(review): '_json' is never used below — candidate
                        # for removal.
                        import json as _json
                        with self.unified_db.get_connection() as conn:
                            cursor = conn.cursor()
                            cursor.execute(
                                "SELECT user_agent FROM scrapers WHERE id = ?",
                                (scraper_id,)
                            )
                            row = cursor.fetchone()
                            if row and row[0]:
                                # cf_clearance cookies are bound to the UA they
                                # were issued for, so mirror the stored value.
                                self.user_agent = row[0]
                                if self._session:
                                    self._session.headers['User-Agent'] = self.user_agent
                    except Exception:
                        # Best-effort: a missing user_agent column/row is fine.
                        pass

                    return
            except Exception as e:
                self.log(f"Error loading cookies from '{scraper_id}': {e}", "debug")
|
|
|
|
def _fetch_page(self, url: str) -> Optional[str]:
|
|
"""Fetch a page via HTTP and return the HTML content.
|
|
|
|
Tries story.snapchat.com first (no Cloudflare), falls back to www.snapchat.com.
|
|
"""
|
|
session = self._get_session()
|
|
|
|
# If URL uses www.snapchat.com, try story.snapchat.com first
|
|
story_url = url.replace('www.snapchat.com', 'story.snapchat.com')
|
|
www_url = url.replace('story.snapchat.com', 'www.snapchat.com')
|
|
|
|
# Try story.snapchat.com first (likely no Cloudflare)
|
|
for attempt_url in [story_url, www_url]:
|
|
try:
|
|
resp = session.get(attempt_url, timeout=30)
|
|
if resp.status_code == 200 and '__NEXT_DATA__' in resp.text:
|
|
return resp.text
|
|
elif resp.status_code == 403:
|
|
self.log(f"403 Forbidden from {attempt_url.split('/@')[0]}", "debug")
|
|
continue
|
|
elif resp.status_code != 200:
|
|
self.log(f"HTTP {resp.status_code} from {attempt_url.split('/@')[0]}", "debug")
|
|
continue
|
|
except Exception as e:
|
|
self.log(f"Error fetching {attempt_url.split('/@')[0]}: {e}", "debug")
|
|
continue
|
|
|
|
return None
|
|
|
|
def _extract_next_data(self, html: str) -> Optional[Dict]:
|
|
"""Extract __NEXT_DATA__ JSON from HTML page."""
|
|
match = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
|
|
if not match:
|
|
return None
|
|
try:
|
|
return json.loads(match.group(1))
|
|
except json.JSONDecodeError as e:
|
|
self.log(f"Failed to parse __NEXT_DATA__ JSON: {e}", "error")
|
|
return None
|
|
|
|
def get_profile_content(self, username: str) -> Dict[str, List]:
|
|
"""Get all spotlight URLs, highlight URLs, and inline story/highlight data from a profile.
|
|
|
|
Parses __NEXT_DATA__ JSON to extract:
|
|
- spotlights: list of spotlight URL strings
|
|
- highlights: list of highlight URL strings
|
|
- story_collection: SnapCollection from story.snapList (recent stories), or None
|
|
- highlight_collections: list of SnapCollection from curatedHighlights (inline data)
|
|
|
|
The inline data avoids needing separate HTTP requests for stories and highlights.
|
|
"""
|
|
result = {'spotlights': [], 'highlights': [], 'story_collection': None, 'highlight_collections': []}
|
|
|
|
url = f"https://story.snapchat.com/@{username}"
|
|
self.log(f"Fetching profile for @{username}", "info")
|
|
|
|
html = self._fetch_page(url)
|
|
if not html:
|
|
self.log(f"Failed to fetch profile page for @{username}", "warning")
|
|
return result
|
|
|
|
# Extract spotlight URLs via regex (still needed — spotlight metadata requires per-URL fetch)
|
|
spotlight_pattern = rf'/@{re.escape(username)}/spotlight/([A-Za-z0-9_-]+)'
|
|
spotlight_ids = list(set(re.findall(spotlight_pattern, html)))
|
|
result['spotlights'] = [
|
|
f"https://story.snapchat.com/@{username}/spotlight/{sid}"
|
|
for sid in spotlight_ids
|
|
]
|
|
self.log(f"Found {len(result['spotlights'])} spotlights", "info")
|
|
|
|
# Parse __NEXT_DATA__ for stories and highlights (much more reliable than regex)
|
|
data = self._extract_next_data(html)
|
|
if not data:
|
|
# Fall back to regex for highlights
|
|
highlight_pattern = rf'/@{re.escape(username)}/highlight/([A-Za-z0-9-]+)'
|
|
highlight_ids = list(set(re.findall(highlight_pattern, html)))
|
|
result['highlights'] = [
|
|
f"https://story.snapchat.com/@{username}/highlight/{hid}"
|
|
for hid in highlight_ids
|
|
]
|
|
self.log(f"Found {len(result['highlights'])} highlights (regex fallback)", "info")
|
|
return result
|
|
|
|
props = (data.get('props') or {}).get('pageProps') or {}
|
|
|
|
# Extract story snapList (recent stories — not available via individual URLs)
|
|
story = props.get('story') or {}
|
|
story_snaps = story.get('snapList') or []
|
|
if story_snaps:
|
|
story_id = story.get('storyId') or {}
|
|
if isinstance(story_id, dict):
|
|
story_id = story_id.get('value', 'story')
|
|
story_collection = SnapCollection(
|
|
collection_id=story_id or 'story',
|
|
collection_type='story',
|
|
title=story.get('storyTitle', '') or 'Stories',
|
|
username=username,
|
|
url=url
|
|
)
|
|
for snap_data in story_snaps:
|
|
snap = self._parse_snap_data(snap_data)
|
|
if snap:
|
|
story_collection.snaps.append(snap)
|
|
if story_collection.snaps:
|
|
result['story_collection'] = story_collection
|
|
self.log(f"Found {len(story_collection.snaps)} story snaps", "info")
|
|
|
|
# Extract curatedHighlights inline (avoids per-highlight HTTP requests)
|
|
curated_highlights = props.get('curatedHighlights') or []
|
|
for highlight in curated_highlights:
|
|
highlight_id = highlight.get('highlightId') or {}
|
|
if isinstance(highlight_id, dict):
|
|
highlight_id = highlight_id.get('value', '')
|
|
|
|
title = highlight.get('storyTitle') or {}
|
|
if isinstance(title, dict):
|
|
title = title.get('value', '')
|
|
|
|
collection = SnapCollection(
|
|
collection_id=highlight_id,
|
|
collection_type='highlight',
|
|
title=title or 'Untitled Highlight',
|
|
username=username,
|
|
url=f"https://story.snapchat.com/@{username}/highlight/{highlight_id}"
|
|
)
|
|
for snap_data in highlight.get('snapList') or []:
|
|
snap = self._parse_snap_data(snap_data)
|
|
if snap:
|
|
collection.snaps.append(snap)
|
|
if collection.snaps:
|
|
result['highlight_collections'].append(collection)
|
|
|
|
self.log(f"Found {len(result['highlight_collections'])} highlights (inline)", "info")
|
|
|
|
return result
|
|
|
|
def _parse_snap_data(self, snap_data: Dict) -> Optional[SnapMedia]:
|
|
"""Parse a snap from __NEXT_DATA__ snapList into a SnapMedia object."""
|
|
snap_urls = snap_data.get('snapUrls') or {}
|
|
media_url = snap_urls.get('mediaUrl', '')
|
|
if not media_url:
|
|
return None
|
|
|
|
snap_id = (snap_data.get('snapId') or {}).get('value', '')
|
|
media_id = ''
|
|
if '/d/' in media_url:
|
|
media_id = media_url.split('/d/')[1].split('.')[0]
|
|
|
|
ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
|
|
timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str and ts_str != '0' else datetime.now()
|
|
|
|
lat = snap_data.get('lat')
|
|
lng = snap_data.get('lng')
|
|
|
|
return SnapMedia(
|
|
media_id=media_id or snap_id,
|
|
media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
|
|
media_url=media_url,
|
|
timestamp=timestamp,
|
|
index=snap_data.get('snapIndex', 0),
|
|
thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
|
|
lat=float(lat) if lat else None,
|
|
lng=float(lng) if lng else None
|
|
)
|
|
|
|
def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
|
|
"""Extract full metadata from a spotlight URL via __NEXT_DATA__."""
|
|
html = self._fetch_page(url)
|
|
if not html:
|
|
return None
|
|
|
|
data = self._extract_next_data(html)
|
|
if not data:
|
|
return None
|
|
|
|
props = (data.get('props') or {}).get('pageProps') or {}
|
|
feed = props.get('spotlightFeed') or {}
|
|
stories = feed.get('spotlightStories') or []
|
|
|
|
if not stories:
|
|
return None
|
|
|
|
story_data = stories[0]
|
|
story = story_data.get('story') or {}
|
|
metadata = (story_data.get('metadata') or {}).get('videoMetadata') or {}
|
|
|
|
story_id = (story.get('storyId') or {}).get('value', '')
|
|
creator = (metadata.get('creator') or {}).get('personCreator') or {}
|
|
username = creator.get('username', '')
|
|
|
|
collection = SnapCollection(
|
|
collection_id=story_id,
|
|
collection_type='spotlight',
|
|
title=metadata.get('description', ''),
|
|
username=username,
|
|
url=url
|
|
)
|
|
|
|
for snap_data in story.get('snapList') or []:
|
|
snap_id = (snap_data.get('snapId') or {}).get('value', '')
|
|
snap_urls = snap_data.get('snapUrls') or {}
|
|
media_url = snap_urls.get('mediaUrl', '')
|
|
|
|
media_id = ''
|
|
if '/d/' in media_url:
|
|
media_id = media_url.split('/d/')[1].split('.')[0]
|
|
|
|
ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
|
|
timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()
|
|
|
|
snap = SnapMedia(
|
|
media_id=media_id or snap_id,
|
|
media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
|
|
media_url=media_url,
|
|
timestamp=timestamp,
|
|
index=snap_data.get('snapIndex', 0),
|
|
thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
|
|
duration_ms=int(metadata.get('durationMs', 0)),
|
|
description=metadata.get('description', ''),
|
|
view_count=int(metadata.get('viewCount', 0)),
|
|
width=int(metadata.get('width', 540)),
|
|
height=int(metadata.get('height', 960))
|
|
)
|
|
collection.snaps.append(snap)
|
|
|
|
return collection
|
|
|
|
def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
|
|
"""Extract full metadata from a highlight URL via __NEXT_DATA__."""
|
|
html = self._fetch_page(url)
|
|
if not html:
|
|
return None
|
|
|
|
data = self._extract_next_data(html)
|
|
if not data:
|
|
return None
|
|
|
|
props = (data.get('props') or {}).get('pageProps') or {}
|
|
highlight = props.get('highlight') or {}
|
|
|
|
if not highlight:
|
|
return None
|
|
|
|
highlight_id = highlight.get('highlightId') or {}
|
|
if isinstance(highlight_id, dict):
|
|
highlight_id = highlight_id.get('value', '')
|
|
|
|
username_match = re.search(r'@([^/]+)', url)
|
|
username = username_match.group(1) if username_match else ''
|
|
|
|
title = highlight.get('storyTitle') or {}
|
|
if isinstance(title, dict):
|
|
title = title.get('value', '')
|
|
|
|
collection = SnapCollection(
|
|
collection_id=highlight_id,
|
|
collection_type='highlight',
|
|
title=title or 'Untitled Highlight',
|
|
username=username,
|
|
url=url
|
|
)
|
|
|
|
for snap_data in highlight.get('snapList') or []:
|
|
snap_urls = snap_data.get('snapUrls') or {}
|
|
media_url = snap_urls.get('mediaUrl', '')
|
|
|
|
media_id = ''
|
|
if '/d/' in media_url:
|
|
media_id = media_url.split('/d/')[1].split('.')[0]
|
|
|
|
ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
|
|
timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str else datetime.now()
|
|
|
|
lat = snap_data.get('lat')
|
|
lng = snap_data.get('lng')
|
|
|
|
snap = SnapMedia(
|
|
media_id=media_id,
|
|
media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
|
|
media_url=media_url,
|
|
timestamp=timestamp,
|
|
index=snap_data.get('snapIndex', 0),
|
|
thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
|
|
lat=float(lat) if lat else None,
|
|
lng=float(lng) if lng else None
|
|
)
|
|
collection.snaps.append(snap)
|
|
|
|
return collection
|
|
|
|
def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
|
|
"""Download a single media file via curl_cffi."""
|
|
try:
|
|
url = snap.media_url.replace('&', '&')
|
|
session = self._get_session()
|
|
|
|
resp = session.get(url, timeout=60)
|
|
if resp.status_code == 200 and len(resp.content) > 0:
|
|
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
|
with open(output_path, 'wb') as f:
|
|
f.write(resp.content)
|
|
self._set_metadata(output_path, snap)
|
|
return True
|
|
|
|
self.log(f"Download failed: HTTP {resp.status_code}", "debug")
|
|
return False
|
|
|
|
except Exception as e:
|
|
self.log(f"Error downloading media: {e}", "error")
|
|
return False
|
|
|
|
    def _set_metadata(self, file_path: str, snap: SnapMedia, description: Optional[str] = None):
        """Set EXIF metadata and file timestamp.

        Invokes the external `exiftool` binary (best-effort; failures only
        log at debug level) to stamp capture dates, description text and,
        for images, GPS coordinates; then sets the filesystem mtime/atime
        to the snap's timestamp.

        Args:
            file_path: Downloaded media file to tag.
            snap: Source of timestamp, description, view count, geotag.
            description: Optional override for snap.description.
        """
        try:
            # EXIF date format is colon-separated: YYYY:MM:DD HH:MM:SS.
            date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
            desc = description or snap.description or ""
            if snap.view_count:
                desc += f" [Views: {snap.view_count}]"
            desc = desc.strip()

            # Tag sets differ for stills vs. video containers.
            ext = os.path.splitext(file_path)[1].lower()
            is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
            is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']

            exif_args = [
                'exiftool', '-overwrite_original', '-ignoreMinorErrors',
                f'-FileModifyDate={date_str}',
            ]

            if is_image:
                exif_args.extend([
                    f'-DateTimeOriginal={date_str}',
                    f'-CreateDate={date_str}',
                    f'-ModifyDate={date_str}',
                    f'-MetadataDate={date_str}',
                ])
                if desc:
                    # Write the caption to several fields so different
                    # viewers (Windows, XMP-aware tools) can all see it.
                    exif_args.extend([
                        f'-ImageDescription={desc}',
                        f'-XPComment={desc}',
                        f'-UserComment={desc}',
                    ])
                # NOTE(review): truthiness check skips a legitimate 0.0
                # coordinate; presumably 0 is used as the missing sentinel.
                if snap.lat and snap.lng:
                    # EXIF stores unsigned coordinates plus N/S / E/W refs.
                    lat_ref = 'N' if snap.lat >= 0 else 'S'
                    lng_ref = 'E' if snap.lng >= 0 else 'W'
                    exif_args.extend([
                        f'-GPSLatitude={abs(snap.lat)}',
                        f'-GPSLatitudeRef={lat_ref}',
                        f'-GPSLongitude={abs(snap.lng)}',
                        f'-GPSLongitudeRef={lng_ref}',
                    ])

            elif is_video:
                # QuickTime/MP4 containers carry media- and track-level dates.
                exif_args.extend([
                    f'-CreateDate={date_str}',
                    f'-ModifyDate={date_str}',
                    f'-MediaCreateDate={date_str}',
                    f'-MediaModifyDate={date_str}',
                    f'-TrackCreateDate={date_str}',
                    f'-TrackModifyDate={date_str}',
                ])
                if desc:
                    exif_args.extend([
                        f'-Description={desc}',
                        f'-Comment={desc}',
                    ])

            exif_args.append(file_path)
            # Output is discarded; a missing exiftool binary lands in the
            # outer except and is logged as a warning only.
            subprocess.run(exif_args, capture_output=True, timeout=30)

            # Set filesystem modification time
            ts = snap.timestamp.timestamp()
            os.utime(file_path, (ts, ts))

        except Exception as e:
            self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")
|
|
|
|
def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
|
|
"""Generate filename with timestamp and media ID."""
|
|
date_str = snap.timestamp.strftime('%Y%m%d_%H%M%S')
|
|
return f"{username}_{date_str}_{snap.media_id}.{ext}"
|
|
|
|
def _get_processed_posts(self, username: str) -> Set[str]:
|
|
"""Get set of media IDs that have been processed."""
|
|
processed = set()
|
|
if not self.db:
|
|
return processed
|
|
|
|
try:
|
|
with self.db.get_connection() as conn:
|
|
cursor = conn.cursor()
|
|
cursor.execute('''
|
|
SELECT filename, metadata FROM downloads
|
|
WHERE platform = 'snapchat'
|
|
AND source = ?
|
|
''', (username,))
|
|
|
|
for row in cursor.fetchall():
|
|
filename, metadata_str = row
|
|
if filename:
|
|
parts = filename.split('_')
|
|
if len(parts) >= 4:
|
|
media_id = '_'.join(parts[3:]).split('.')[0]
|
|
processed.add(media_id)
|
|
|
|
if metadata_str:
|
|
try:
|
|
metadata = json.loads(metadata_str)
|
|
if 'media_id' in metadata:
|
|
processed.add(metadata['media_id'])
|
|
except (json.JSONDecodeError, TypeError, KeyError):
|
|
pass
|
|
|
|
except Exception as e:
|
|
self.log(f"Error loading processed posts: {e}", "debug")
|
|
|
|
return processed
|
|
|
|
def _record_download(self, username: str, url: str, filename: str,
|
|
post_date=None, metadata: dict = None, file_path: str = None,
|
|
deferred: bool = False):
|
|
"""Record a download in the database."""
|
|
if deferred:
|
|
self.pending_downloads.append({
|
|
'username': username,
|
|
'url': url,
|
|
'filename': filename,
|
|
'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
|
|
'file_path': file_path,
|
|
'metadata': metadata
|
|
})
|
|
return True
|
|
|
|
if not self.db:
|
|
return
|
|
|
|
try:
|
|
self.db.mark_downloaded(
|
|
username=username,
|
|
url=url,
|
|
filename=filename,
|
|
post_date=post_date,
|
|
metadata=metadata,
|
|
file_path=file_path
|
|
)
|
|
except Exception as e:
|
|
self.log(f"Failed to record download: {e}", "debug")
|
|
|
|
    def get_pending_downloads(self) -> list:
        """Return the queue of deferred download records.

        Note: returns the live list (not a copy) accumulated by
        _record_download(deferred=True).
        """
        return self.pending_downloads
|
|
|
|
    def clear_pending_downloads(self):
        """Clear pending downloads list.

        Rebinds to a fresh list, so callers that captured the old list via
        get_pending_downloads() keep their snapshot intact.
        """
        self.pending_downloads = []
|
|
|
|
    def download(self, username: str, content_type: str = "all", days_back: int = 14,
                 max_downloads: int = 50, output_dir: str = None,
                 spotlight_dir: str = None, stories_dir: str = None,
                 stitch_highlights: bool = True, defer_database: bool = False,
                 phrase_config: dict = None) -> int:
        """Download content from a user - compatible with media-downloader interface.

        Flow: fetch the profile once, then for each selected content type
        walk the per-item URL lists, fetching per-URL metadata, filtering by
        age and dedup sets, downloading via curl_cffi, and recording each
        file (immediately or deferred). Progress is mirrored to the activity
        manager and a TaskCheckpoint enables crash recovery.

        NOTE(review): only content['spotlights'] and content['highlights']
        are consumed here — the inline story_collection/highlight_collections
        that get_profile_content builds are not used by this method.

        Args:
            username: Snapchat username
            content_type: "spotlight", "stories", "highlights", or "all"
            days_back: How many days back to download (filters by post date)
            max_downloads: Maximum items to download per content type
            output_dir: Default output directory (used if specific dirs not set)
            spotlight_dir: Output directory for spotlights
            stories_dir: Output directory for stories/highlights
            stitch_highlights: Ignored (kept for backwards compatibility)
            defer_database: If True, defer database recording
            phrase_config: Not used (for interface compatibility)

        Returns:
            Number of files downloaded
        """
        self.defer_database = defer_database
        self.downloaded_files.clear()

        # Set output directories: explicit per-type dir > shared output_dir
        # > hard-coded deployment default.
        if spotlight_dir:
            spotlight_output = Path(spotlight_dir)
        elif output_dir:
            spotlight_output = Path(output_dir)
        else:
            spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat_client/spotlight/{username}")

        if stories_dir:
            stories_output = Path(stories_dir)
        elif output_dir:
            stories_output = Path(output_dir)
        else:
            stories_output = Path(f"/opt/media-downloader/downloads/snapchat_client/stories/{username}")

        spotlight_output.mkdir(parents=True, exist_ok=True)
        stories_output.mkdir(parents=True, exist_ok=True)

        # Update activity status
        if self.activity_manager:
            self.activity_manager.update_status("Checking Snapchat")

        # Get processed posts (shared with snapchat module - both use platform='snapchat')
        processed = self._get_processed_posts(username)
        self.log(f"Loaded {len(processed)} processed posts from database", "debug")

        cutoff_date = datetime.now() - timedelta(days=days_back)
        downloaded_count = 0

        # Crash recovery checkpoint
        from modules.task_checkpoint import TaskCheckpoint
        checkpoint = TaskCheckpoint(f'snapchat_client:{username}', 'scraping')

        try:
            # Get profile content via HTTP
            content = self.get_profile_content(username)

            # Count total items for checkpoint
            total_items = 0
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                total_items += min(len(content['spotlights']), max_downloads)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                total_items += min(len(content['highlights']), max_downloads)
            checkpoint.start(total_items=total_items)
            if checkpoint.is_recovering():
                self.log(f"Snapchat Client @{username}: recovering — skipping already-processed URLs", "info")

            # Download spotlights
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                spotlight_items = content['spotlights'][:max_downloads]
                self.log(f"Processing {len(spotlight_items)} spotlights...", "info")

                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=0,
                        progress_total=len(spotlight_items)
                    )

                for spot_idx, url in enumerate(spotlight_items):
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading spotlights",
                            progress_current=spot_idx + 1,
                            progress_total=len(spotlight_items)
                        )

                    # Already handled in a previous (crashed) run.
                    if checkpoint.is_completed(url):
                        continue

                    checkpoint.set_current(url)

                    try:
                        # Rate limit between page fetches
                        if spot_idx > 0:
                            time.sleep(random.uniform(1.5, 2.5))

                        spotlight = self.get_spotlight_metadata(url)
                        if not spotlight or not spotlight.snaps:
                            continue

                        # A spotlight is a single clip — only the first snap.
                        snap = spotlight.snaps[0]

                        # Check date filter
                        if snap.timestamp < cutoff_date:
                            self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                            continue

                        # Check if already processed
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                            continue

                        # Download
                        ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                        filename = self._generate_filename(username, snap, ext)
                        output_path = str(spotlight_output / filename)

                        # Rate limit between CDN downloads
                        time.sleep(random.uniform(0.3, 0.5))

                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            # NOTE(review): literal "(unknown)" — looks like
                            # the filename was meant to be interpolated here.
                            self.log(f"Downloaded spotlight: (unknown)", "info")

                            self._record_download(
                                username=username,
                                url=url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'description': snap.description,
                                    'view_count': snap.view_count,
                                    'content_type': 'spotlight'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )

                    except Exception as e:
                        self.log(f"Error processing spotlight: {e}", "error")

                    # Mark even failed/skipped URLs so recovery doesn't loop.
                    checkpoint.mark_completed(url)

            # Rate limit between content types
            if content_type == 'all' and content['spotlights'] and content['highlights']:
                time.sleep(random.uniform(2, 3))

            # Download highlights (stories)
            if content_type in ['stories', 'highlights', 'all'] and content['highlights']:
                highlight_items = content['highlights'][:max_downloads]
                self.log(f"Processing {len(highlight_items)} highlights...", "info")

                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=0,
                        progress_total=len(highlight_items)
                    )

                for hi_idx, url in enumerate(highlight_items):
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading highlights",
                            progress_current=hi_idx + 1,
                            progress_total=len(highlight_items)
                        )

                    if checkpoint.is_completed(url):
                        continue

                    checkpoint.set_current(url)

                    try:
                        # Rate limit between page fetches
                        if hi_idx > 0:
                            time.sleep(random.uniform(1.5, 2.5))

                        highlight = self.get_highlight_metadata(url)
                        if not highlight or not highlight.snaps:
                            continue

                        # Check if any snap is within date range
                        newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                        if newest_snap.timestamp < cutoff_date:
                            self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                            continue

                        # Check if already processed (collection-level dedup;
                        # per-snap dedup happens again inside each loop below)
                        if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                            self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                            continue

                        # Separate videos and images
                        videos = [s for s in highlight.snaps if s.media_type == 'video']
                        images = [s for s in highlight.snaps if s.media_type == 'image']

                        # Download images individually
                        for snap in images:
                            if snap.timestamp < cutoff_date:
                                continue
                            if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                continue

                            time.sleep(random.uniform(0.3, 0.5))

                            filename = self._generate_filename(username, snap, 'jpg')
                            output_path = str(stories_output / filename)

                            if self._download_media_file(snap, output_path):
                                self.downloaded_files.add(snap.media_id)
                                downloaded_count += 1
                                self.log(f"Downloaded image: (unknown)", "info")

                                self._record_download(
                                    username=username,
                                    url=highlight.url,
                                    filename=filename,
                                    post_date=snap.timestamp,
                                    metadata={
                                        'media_id': snap.media_id,
                                        'highlight_id': highlight.collection_id,
                                        'content_type': 'highlight_image'
                                    },
                                    file_path=output_path,
                                    deferred=defer_database
                                )

                        # Download videos individually
                        for snap in videos:
                            if snap.timestamp < cutoff_date:
                                continue
                            if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                continue

                            time.sleep(random.uniform(0.3, 0.5))

                            filename = self._generate_filename(username, snap, 'mp4')
                            output_path = str(stories_output / filename)

                            if self._download_media_file(snap, output_path):
                                # NOTE(review): _download_media_file already
                                # calls _set_metadata on success — this second
                                # call re-runs exiftool redundantly (harmless
                                # but duplicated work; the image loop above
                                # does not do this).
                                self._set_metadata(output_path, snap)
                                self.downloaded_files.add(snap.media_id)
                                downloaded_count += 1
                                self.log(f"Downloaded video: (unknown)", "info")

                                self._record_download(
                                    username=username,
                                    url=highlight.url,
                                    filename=filename,
                                    post_date=snap.timestamp,
                                    metadata={
                                        'media_id': snap.media_id,
                                        'highlight_id': highlight.collection_id,
                                        'content_type': 'highlight_video'
                                    },
                                    file_path=output_path,
                                    deferred=defer_database
                                )

                    except Exception as e:
                        self.log(f"Error processing highlight: {e}", "error")

                    checkpoint.mark_completed(url)

        except Exception as e:
            self.log(f"Error during download: {e}", "error")

        checkpoint.finish()
        self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
        return downloaded_count
|