#!/usr/bin/env python3
"""
Snapchat Client Module - Direct HTTP-based Snapchat downloader using curl_cffi.

Replaces Playwright-based scraping with direct HTTP requests. Snapchat embeds
all page data in a <script id="__NEXT_DATA__" type="application/json"> tag,
which this module extracts and parses directly.

NOTE(review): this file was recovered from a markup-mangled copy in which
everything between the docstring and the middle of _extract_next_data was
destroyed. The import block and the SnapMedia/SnapCollection dataclasses
below are minimal reconstructions derived from their call sites; the class
__init__, log(), _get_session() and _fetch_page() helpers could not be
recovered and must be restored from the original source (see the class-level
note). Verify all reconstructed parts against the original.
"""

import json
import os
import random
import re
import subprocess
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Set


@dataclass
class SnapMedia:
    """One downloadable snap (reconstructed from constructor call sites)."""
    media_id: str
    media_type: str                      # 'video' or 'image'
    media_url: str
    timestamp: datetime
    index: int = 0
    thumbnail_url: str = ''
    duration_ms: int = 0
    description: str = ''
    view_count: int = 0
    width: int = 540                     # Snapchat's default portrait size
    height: int = 960
    lat: Optional[float] = None
    lng: Optional[float] = None


@dataclass
class SnapCollection:
    """A group of snaps: a story, a highlight, or a spotlight post."""
    collection_id: str
    collection_type: str                 # 'story' | 'highlight' | 'spotlight'
    title: str
    username: str
    url: str
    snaps: List[SnapMedia] = field(default_factory=list)


class SnapchatClient:
    """HTTP-based Snapchat profile scraper/downloader.

    NOTE(review): the original __init__, log(), _get_session() and
    _fetch_page() were lost in the mangled source and are NOT reproduced
    here. The methods below expect:
      - self.db: database wrapper (or None) providing get_connection() and
        mark_downloaded(...)
      - self.activity_manager: status reporter (or None) with update_status()
      - self.downloaded_files: set of media IDs downloaded this run
      - self.pending_downloads: list used by deferred recording
      - self.defer_database: bool flag set by download()
      - self.log(message, level): logger
      - self._get_session(): curl_cffi session for CDN requests
      - self._fetch_page(url) -> Optional[str]: fetch a page's HTML
    Restore these from the original source before use.
    """

    def _extract_next_data(self, html: str) -> Optional[Dict]:
        """Extract and parse the embedded __NEXT_DATA__ JSON blob.

        Returns the parsed dict, or None when the tag is missing or the JSON
        is malformed. NOTE(review): the regex literal was destroyed in the
        mangled source and reconstructed from the surviving tail
        (`', html, re.DOTALL)`) — confirm it matches the original.
        """
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL)
        if not match:
            return None
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError as e:
            self.log(f"Failed to parse __NEXT_DATA__ JSON: {e}", "error")
            return None

    def get_profile_content(self, username: str) -> Dict[str, List]:
        """Get all spotlight URLs, highlight URLs, and inline story/highlight data from a profile.

        Parses __NEXT_DATA__ JSON to extract:
        - spotlights: list of spotlight URL strings
        - highlights: list of highlight URL strings (regex fallback only)
        - story_collection: SnapCollection from story.snapList (recent stories), or None
        - highlight_collections: list of SnapCollection from curatedHighlights (inline data)

        The inline data avoids needing separate HTTP requests for stories and highlights.
        """
        result = {'spotlights': [], 'highlights': [], 'story_collection': None,
                  'highlight_collections': []}
        url = f"https://story.snapchat.com/@{username}"
        self.log(f"Fetching profile for @{username}", "info")
        html = self._fetch_page(url)
        if not html:
            self.log(f"Failed to fetch profile page for @{username}", "warning")
            return result

        # Extract spotlight URLs via regex (still needed — spotlight metadata requires per-URL fetch)
        spotlight_pattern = rf'/@{re.escape(username)}/spotlight/([A-Za-z0-9_-]+)'
        spotlight_ids = list(set(re.findall(spotlight_pattern, html)))
        result['spotlights'] = [
            f"https://story.snapchat.com/@{username}/spotlight/{sid}"
            for sid in spotlight_ids
        ]
        self.log(f"Found {len(result['spotlights'])} spotlights", "info")

        # Parse __NEXT_DATA__ for stories and highlights (much more reliable than regex)
        data = self._extract_next_data(html)
        if not data:
            # Fall back to regex for highlights
            highlight_pattern = rf'/@{re.escape(username)}/highlight/([A-Za-z0-9-]+)'
            highlight_ids = list(set(re.findall(highlight_pattern, html)))
            result['highlights'] = [
                f"https://story.snapchat.com/@{username}/highlight/{hid}"
                for hid in highlight_ids
            ]
            self.log(f"Found {len(result['highlights'])} highlights (regex fallback)", "info")
            return result

        props = (data.get('props') or {}).get('pageProps') or {}

        # Extract story snapList (recent stories — not available via individual URLs)
        story = props.get('story') or {}
        story_snaps = story.get('snapList') or []
        if story_snaps:
            story_id = story.get('storyId') or {}
            if isinstance(story_id, dict):
                # Some payloads wrap scalar IDs as {'value': ...}
                story_id = story_id.get('value', 'story')
            story_collection = SnapCollection(
                collection_id=story_id or 'story',
                collection_type='story',
                title=story.get('storyTitle', '') or 'Stories',
                username=username,
                url=url
            )
            for snap_data in story_snaps:
                snap = self._parse_snap_data(snap_data)
                if snap:
                    story_collection.snaps.append(snap)
            if story_collection.snaps:
                result['story_collection'] = story_collection
                self.log(f"Found {len(story_collection.snaps)} story snaps", "info")

        # Extract curatedHighlights inline (avoids per-highlight HTTP requests)
        curated_highlights = props.get('curatedHighlights') or []
        for highlight in curated_highlights:
            highlight_id = highlight.get('highlightId') or {}
            if isinstance(highlight_id, dict):
                highlight_id = highlight_id.get('value', '')
            title = highlight.get('storyTitle') or {}
            if isinstance(title, dict):
                title = title.get('value', '')
            collection = SnapCollection(
                collection_id=highlight_id,
                collection_type='highlight',
                title=title or 'Untitled Highlight',
                username=username,
                url=f"https://story.snapchat.com/@{username}/highlight/{highlight_id}"
            )
            for snap_data in highlight.get('snapList') or []:
                snap = self._parse_snap_data(snap_data)
                if snap:
                    collection.snaps.append(snap)
            if collection.snaps:
                result['highlight_collections'].append(collection)
        self.log(f"Found {len(result['highlight_collections'])} highlights (inline)", "info")
        return result

    def _parse_snap_data(self, snap_data: Dict) -> Optional[SnapMedia]:
        """Parse a snap from __NEXT_DATA__ snapList into a SnapMedia object.

        Returns None when the snap has no media URL. snapMediaType == 1 marks
        a video; anything else is treated as an image.
        """
        snap_urls = snap_data.get('snapUrls') or {}
        media_url = snap_urls.get('mediaUrl', '')
        if not media_url:
            return None
        snap_id = (snap_data.get('snapId') or {}).get('value', '')
        # Prefer the CDN media hash (".../d/<hash>.<ext>") as the stable ID
        media_id = ''
        if '/d/' in media_url:
            media_id = media_url.split('/d/')[1].split('.')[0]
        ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
        # '0' means "no timestamp" — fall back to now rather than the epoch
        timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str and ts_str != '0' else datetime.now()
        lat = snap_data.get('lat')
        lng = snap_data.get('lng')
        return SnapMedia(
            media_id=media_id or snap_id,
            media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
            media_url=media_url,
            timestamp=timestamp,
            index=snap_data.get('snapIndex', 0),
            thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
            lat=float(lat) if lat else None,
            lng=float(lng) if lng else None
        )

    def get_spotlight_metadata(self, url: str) -> Optional[SnapCollection]:
        """Extract full metadata from a spotlight URL via __NEXT_DATA__."""
        html = self._fetch_page(url)
        if not html:
            return None
        data = self._extract_next_data(html)
        if not data:
            return None
        props = (data.get('props') or {}).get('pageProps') or {}
        feed = props.get('spotlightFeed') or {}
        stories = feed.get('spotlightStories') or []
        if not stories:
            return None
        story_data = stories[0]
        story = story_data.get('story') or {}
        metadata = (story_data.get('metadata') or {}).get('videoMetadata') or {}
        story_id = (story.get('storyId') or {}).get('value', '')
        creator = (metadata.get('creator') or {}).get('personCreator') or {}
        username = creator.get('username', '')
        collection = SnapCollection(
            collection_id=story_id,
            collection_type='spotlight',
            title=metadata.get('description', ''),
            username=username,
            url=url
        )
        for snap_data in story.get('snapList') or []:
            snap_id = (snap_data.get('snapId') or {}).get('value', '')
            snap_urls = snap_data.get('snapUrls') or {}
            media_url = snap_urls.get('mediaUrl', '')
            media_id = ''
            if '/d/' in media_url:
                media_id = media_url.split('/d/')[1].split('.')[0]
            ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
            # Consistent with _parse_snap_data: '0' is "missing", not the epoch
            # (epoch timestamps would always fail the days_back cutoff later)
            timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str and ts_str != '0' else datetime.now()
            snap = SnapMedia(
                media_id=media_id or snap_id,
                media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                media_url=media_url,
                timestamp=timestamp,
                index=snap_data.get('snapIndex', 0),
                thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                duration_ms=int(metadata.get('durationMs', 0)),
                description=metadata.get('description', ''),
                view_count=int(metadata.get('viewCount', 0)),
                width=int(metadata.get('width', 540)),
                height=int(metadata.get('height', 960))
            )
            collection.snaps.append(snap)
        return collection

    def get_highlight_metadata(self, url: str) -> Optional[SnapCollection]:
        """Extract full metadata from a highlight URL via __NEXT_DATA__."""
        html = self._fetch_page(url)
        if not html:
            return None
        data = self._extract_next_data(html)
        if not data:
            return None
        props = (data.get('props') or {}).get('pageProps') or {}
        highlight = props.get('highlight') or {}
        if not highlight:
            return None
        highlight_id = highlight.get('highlightId') or {}
        if isinstance(highlight_id, dict):
            highlight_id = highlight_id.get('value', '')
        # Username is not in the payload here; pull it from the URL path
        username_match = re.search(r'@([^/]+)', url)
        username = username_match.group(1) if username_match else ''
        title = highlight.get('storyTitle') or {}
        if isinstance(title, dict):
            title = title.get('value', '')
        collection = SnapCollection(
            collection_id=highlight_id,
            collection_type='highlight',
            title=title or 'Untitled Highlight',
            username=username,
            url=url
        )
        for snap_data in highlight.get('snapList') or []:
            snap_urls = snap_data.get('snapUrls') or {}
            media_url = snap_urls.get('mediaUrl', '')
            media_id = ''
            if '/d/' in media_url:
                media_id = media_url.split('/d/')[1].split('.')[0]
            ts_str = (snap_data.get('timestampInSec') or {}).get('value', '0')
            # Consistent with _parse_snap_data: '0' is "missing", not the epoch
            timestamp = datetime.fromtimestamp(int(ts_str)) if ts_str and ts_str != '0' else datetime.now()
            lat = snap_data.get('lat')
            lng = snap_data.get('lng')
            snap = SnapMedia(
                media_id=media_id,
                media_type='video' if snap_data.get('snapMediaType') == 1 else 'image',
                media_url=media_url,
                timestamp=timestamp,
                index=snap_data.get('snapIndex', 0),
                thumbnail_url=(snap_urls.get('mediaPreviewUrl') or {}).get('value', ''),
                lat=float(lat) if lat else None,
                lng=float(lng) if lng else None
            )
            collection.snaps.append(snap)
        return collection

    def _download_media_file(self, snap: SnapMedia, output_path: str) -> bool:
        """Download a single media file via curl_cffi; returns True on success.

        On success the file is written, EXIF metadata is applied and True is
        returned; any failure (HTTP error, empty body, exception) returns False.
        """
        try:
            # __NEXT_DATA__ embeds HTML-escaped URLs; restore literal '&'.
            # (The mangled source had the no-op replace('&', '&') — the
            # entity '&amp;' had been decoded away by the mangler.)
            url = snap.media_url.replace('&amp;', '&')
            session = self._get_session()
            resp = session.get(url, timeout=60)
            if resp.status_code == 200 and len(resp.content) > 0:
                os.makedirs(os.path.dirname(output_path), exist_ok=True)
                with open(output_path, 'wb') as f:
                    f.write(resp.content)
                self._set_metadata(output_path, snap)
                return True
            self.log(f"Download failed: HTTP {resp.status_code}", "debug")
            return False
        except Exception as e:
            self.log(f"Error downloading media: {e}", "error")
            return False

    def _set_metadata(self, file_path: str, snap: SnapMedia, description: str = None):
        """Set EXIF metadata (via exiftool) and the filesystem timestamp.

        Best-effort: failures are logged at debug level, never raised.
        """
        try:
            date_str = snap.timestamp.strftime('%Y:%m:%d %H:%M:%S')
            desc = description or snap.description or ""
            if snap.view_count:
                desc += f" [Views: {snap.view_count}]"
            desc = desc.strip()
            ext = os.path.splitext(file_path)[1].lower()
            is_video = ext in ['.mp4', '.mov', '.avi', '.webm']
            is_image = ext in ['.jpg', '.jpeg', '.png', '.webp']
            exif_args = [
                'exiftool', '-overwrite_original', '-ignoreMinorErrors',
                f'-FileModifyDate={date_str}',
            ]
            if is_image:
                exif_args.extend([
                    f'-DateTimeOriginal={date_str}',
                    f'-CreateDate={date_str}',
                    f'-ModifyDate={date_str}',
                    f'-MetadataDate={date_str}',
                ])
                if desc:
                    exif_args.extend([
                        f'-ImageDescription={desc}',
                        f'-XPComment={desc}',
                        f'-UserComment={desc}',
                    ])
                # GPS tags only for images (videos get no GPS in this flow)
                if snap.lat and snap.lng:
                    lat_ref = 'N' if snap.lat >= 0 else 'S'
                    lng_ref = 'E' if snap.lng >= 0 else 'W'
                    exif_args.extend([
                        f'-GPSLatitude={abs(snap.lat)}',
                        f'-GPSLatitudeRef={lat_ref}',
                        f'-GPSLongitude={abs(snap.lng)}',
                        f'-GPSLongitudeRef={lng_ref}',
                    ])
            elif is_video:
                exif_args.extend([
                    f'-CreateDate={date_str}',
                    f'-ModifyDate={date_str}',
                    f'-MediaCreateDate={date_str}',
                    f'-MediaModifyDate={date_str}',
                    f'-TrackCreateDate={date_str}',
                    f'-TrackModifyDate={date_str}',
                ])
                if desc:
                    exif_args.extend([
                        f'-Description={desc}',
                        f'-Comment={desc}',
                    ])
            exif_args.append(file_path)
            subprocess.run(exif_args, capture_output=True, timeout=30)
            # Set filesystem modification time to the snap's timestamp
            ts = snap.timestamp.timestamp()
            os.utime(file_path, (ts, ts))
        except Exception as e:
            self.log(f"Warning: Could not set metadata for {file_path}: {e}", "debug")

    def _generate_filename(self, username: str, snap: SnapMedia, ext: str) -> str:
        """Generate filename with timestamp and media ID: user_YYYYMMDD_HHMMSS_id.ext."""
        date_str = snap.timestamp.strftime('%Y%m%d_%H%M%S')
        return f"{username}_{date_str}_{snap.media_id}.{ext}"

    def _get_processed_posts(self, username: str) -> Set[str]:
        """Get set of media IDs that have been processed (from the downloads DB).

        IDs come from both the filename convention (_generate_filename) and
        the stored metadata JSON. Assumes usernames contain no underscores
        when splitting filenames — TODO confirm.
        """
        processed = set()
        if not self.db:
            return processed
        try:
            with self.db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT filename, metadata FROM downloads
                    WHERE platform = 'snapchat' AND source = ?
                ''', (username,))
                for row in cursor.fetchall():
                    filename, metadata_str = row
                    if filename:
                        parts = filename.split('_')
                        if len(parts) >= 4:
                            # username_date_time_mediaid.ext -> mediaid
                            media_id = '_'.join(parts[3:]).split('.')[0]
                            processed.add(media_id)
                    if metadata_str:
                        try:
                            metadata = json.loads(metadata_str)
                            if 'media_id' in metadata:
                                processed.add(metadata['media_id'])
                        except (json.JSONDecodeError, TypeError, KeyError):
                            pass
        except Exception as e:
            self.log(f"Error loading processed posts: {e}", "debug")
        return processed

    def _record_download(self, username: str, url: str, filename: str,
                         post_date=None, metadata: dict = None,
                         file_path: str = None, deferred: bool = False):
        """Record a download in the database (or queue it when deferred)."""
        if deferred:
            self.pending_downloads.append({
                'username': username,
                'url': url,
                'filename': filename,
                'post_date': post_date.isoformat() if hasattr(post_date, 'isoformat') else post_date,
                'file_path': file_path,
                'metadata': metadata
            })
            return True
        if not self.db:
            return
        try:
            self.db.mark_downloaded(
                username=username,
                url=url,
                filename=filename,
                post_date=post_date,
                metadata=metadata,
                file_path=file_path
            )
        except Exception as e:
            self.log(f"Failed to record download: {e}", "debug")

    def get_pending_downloads(self) -> list:
        """Get list of pending downloads for deferred recording."""
        return self.pending_downloads

    def clear_pending_downloads(self):
        """Clear pending downloads list."""
        self.pending_downloads = []

    def download(self, username: str, content_type: str = "all",
                 days_back: int = 14, max_downloads: int = 50,
                 output_dir: str = None, spotlight_dir: str = None,
                 stories_dir: str = None, stitch_highlights: bool = True,
                 defer_database: bool = False, phrase_config: dict = None) -> int:
        """Download content from a user - compatible with media-downloader interface.

        Args:
            username: Snapchat username
            content_type: "spotlight", "stories", "highlights", or "all"
            days_back: How many days back to download (filters by post date)
            max_downloads: Maximum items to download per content type
            output_dir: Default output directory (used if specific dirs not set)
            spotlight_dir: Output directory for spotlights
            stories_dir: Output directory for stories/highlights
            stitch_highlights: Ignored (kept for backwards compatibility)
            defer_database: If True, defer database recording
            phrase_config: Not used (for interface compatibility)

        Returns:
            Number of files downloaded
        """
        self.defer_database = defer_database
        self.downloaded_files.clear()

        # Set output directories
        if spotlight_dir:
            spotlight_output = Path(spotlight_dir)
        elif output_dir:
            spotlight_output = Path(output_dir)
        else:
            spotlight_output = Path(f"/opt/media-downloader/downloads/snapchat_client/spotlight/{username}")
        if stories_dir:
            stories_output = Path(stories_dir)
        elif output_dir:
            stories_output = Path(output_dir)
        else:
            stories_output = Path(f"/opt/media-downloader/downloads/snapchat_client/stories/{username}")
        spotlight_output.mkdir(parents=True, exist_ok=True)
        stories_output.mkdir(parents=True, exist_ok=True)

        # Update activity status
        if self.activity_manager:
            self.activity_manager.update_status("Checking Snapchat")

        # Get processed posts (shared with snapchat module - both use platform='snapchat')
        processed = self._get_processed_posts(username)
        self.log(f"Loaded {len(processed)} processed posts from database", "debug")
        cutoff_date = datetime.now() - timedelta(days=days_back)
        downloaded_count = 0

        # Crash recovery checkpoint (project-local module; imported lazily)
        from modules.task_checkpoint import TaskCheckpoint
        checkpoint = TaskCheckpoint(f'snapchat_client:{username}', 'scraping')

        try:
            # Get profile content via HTTP
            content = self.get_profile_content(username)

            # Build the highlight work list. When __NEXT_DATA__ parsing
            # succeeds, highlights arrive as inline collections (no extra
            # HTTP fetch needed) and content['highlights'] is empty; the
            # URL list is only populated by the regex fallback. Previously
            # only the URL list was consumed, so highlights were silently
            # skipped in the normal path.
            highlight_work = [(c.url, c) for c in content['highlight_collections']]
            inline_urls = {u for u, _ in highlight_work}
            highlight_work += [(u, None) for u in content['highlights'] if u not in inline_urls]
            highlight_work = highlight_work[:max_downloads]
            # NOTE(review): content['story_collection'] is currently unused
            # here — confirm whether recent stories should also be downloaded.

            # Count total items for checkpoint
            total_items = 0
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                total_items += min(len(content['spotlights']), max_downloads)
            if content_type in ['stories', 'highlights', 'all'] and highlight_work:
                total_items += len(highlight_work)
            checkpoint.start(total_items=total_items)
            if checkpoint.is_recovering():
                self.log(f"Snapchat Client @{username}: recovering — skipping already-processed URLs", "info")

            # Download spotlights
            if content_type in ['spotlight', 'all'] and content['spotlights']:
                spotlight_items = content['spotlights'][:max_downloads]
                self.log(f"Processing {len(spotlight_items)} spotlights...", "info")
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading spotlights",
                        progress_current=0, progress_total=len(spotlight_items)
                    )
                for spot_idx, url in enumerate(spotlight_items):
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading spotlights",
                            progress_current=spot_idx + 1,
                            progress_total=len(spotlight_items)
                        )
                    if checkpoint.is_completed(url):
                        continue
                    checkpoint.set_current(url)
                    try:
                        # Rate limit between page fetches
                        if spot_idx > 0:
                            time.sleep(random.uniform(1.5, 2.5))
                        spotlight = self.get_spotlight_metadata(url)
                        if not spotlight or not spotlight.snaps:
                            continue
                        snap = spotlight.snaps[0]
                        # Check date filter
                        if snap.timestamp < cutoff_date:
                            self.log(f"Spotlight {snap.media_id} is older than {days_back} days, skipping", "debug")
                            continue
                        # Check if already processed
                        if snap.media_id in processed or snap.media_id in self.downloaded_files:
                            self.log(f"Spotlight {snap.media_id} already processed, skipping", "debug")
                            continue
                        # Download
                        ext = 'mp4' if snap.media_type == 'video' else 'jpg'
                        filename = self._generate_filename(username, snap, ext)
                        output_path = str(spotlight_output / filename)
                        # Rate limit between CDN downloads
                        time.sleep(random.uniform(0.3, 0.5))
                        if self._download_media_file(snap, output_path):
                            self.downloaded_files.add(snap.media_id)
                            downloaded_count += 1
                            self.log(f"Downloaded spotlight: {filename}", "info")
                            self._record_download(
                                username=username,
                                url=url,
                                filename=filename,
                                post_date=snap.timestamp,
                                metadata={
                                    'media_id': snap.media_id,
                                    'description': snap.description,
                                    'view_count': snap.view_count,
                                    'content_type': 'spotlight'
                                },
                                file_path=output_path,
                                deferred=defer_database
                            )
                    except Exception as e:
                        self.log(f"Error processing spotlight: {e}", "error")
                    # Mark done even on failure so a bad URL isn't retried forever
                    checkpoint.mark_completed(url)

            # Rate limit between content types
            if content_type == 'all' and content['spotlights'] and highlight_work:
                time.sleep(random.uniform(2, 3))

            # Download highlights (stories)
            if content_type in ['stories', 'highlights', 'all'] and highlight_work:
                self.log(f"Processing {len(highlight_work)} highlights...", "info")
                if self.activity_manager:
                    self.activity_manager.update_status(
                        "Downloading highlights",
                        progress_current=0, progress_total=len(highlight_work)
                    )
                for hi_idx, (url, highlight) in enumerate(highlight_work):
                    if self.activity_manager:
                        self.activity_manager.update_status(
                            "Downloading highlights",
                            progress_current=hi_idx + 1,
                            progress_total=len(highlight_work)
                        )
                    if checkpoint.is_completed(url):
                        continue
                    checkpoint.set_current(url)
                    try:
                        if highlight is None:
                            # URL-only entry (regex fallback) — fetch metadata,
                            # rate-limiting between page fetches
                            if hi_idx > 0:
                                time.sleep(random.uniform(1.5, 2.5))
                            highlight = self.get_highlight_metadata(url)
                        if not highlight or not highlight.snaps:
                            continue
                        # Check if any snap is within date range
                        newest_snap = max(highlight.snaps, key=lambda s: s.timestamp)
                        if newest_snap.timestamp < cutoff_date:
                            self.log(f"Highlight {highlight.collection_id} is older than {days_back} days, skipping", "debug")
                            continue
                        # Check if already processed
                        if highlight.collection_id in processed or highlight.collection_id in self.downloaded_files:
                            self.log(f"Highlight {highlight.collection_id} already processed, skipping", "debug")
                            continue
                        # Separate videos and images
                        videos = [s for s in highlight.snaps if s.media_type == 'video']
                        images = [s for s in highlight.snaps if s.media_type == 'image']
                        # Download images individually
                        for snap in images:
                            if snap.timestamp < cutoff_date:
                                continue
                            if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                continue
                            time.sleep(random.uniform(0.3, 0.5))
                            filename = self._generate_filename(username, snap, 'jpg')
                            output_path = str(stories_output / filename)
                            if self._download_media_file(snap, output_path):
                                self.downloaded_files.add(snap.media_id)
                                downloaded_count += 1
                                self.log(f"Downloaded image: {filename}", "info")
                                self._record_download(
                                    username=username,
                                    url=highlight.url,
                                    filename=filename,
                                    post_date=snap.timestamp,
                                    metadata={
                                        'media_id': snap.media_id,
                                        'highlight_id': highlight.collection_id,
                                        'content_type': 'highlight_image'
                                    },
                                    file_path=output_path,
                                    deferred=defer_database
                                )
                        # Download videos individually
                        for snap in videos:
                            if snap.timestamp < cutoff_date:
                                continue
                            if snap.media_id in processed or snap.media_id in self.downloaded_files:
                                continue
                            time.sleep(random.uniform(0.3, 0.5))
                            filename = self._generate_filename(username, snap, 'mp4')
                            output_path = str(stories_output / filename)
                            # _download_media_file already applies metadata on
                            # success; no second _set_metadata call needed
                            if self._download_media_file(snap, output_path):
                                self.downloaded_files.add(snap.media_id)
                                downloaded_count += 1
                                self.log(f"Downloaded video: {filename}", "info")
                                self._record_download(
                                    username=username,
                                    url=highlight.url,
                                    filename=filename,
                                    post_date=snap.timestamp,
                                    metadata={
                                        'media_id': snap.media_id,
                                        'highlight_id': highlight.collection_id,
                                        'content_type': 'highlight_video'
                                    },
                                    file_path=output_path,
                                    deferred=defer_database
                                )
                    except Exception as e:
                        self.log(f"Error processing highlight: {e}", "error")
                    # Mark done even on failure so a bad URL isn't retried forever
                    checkpoint.mark_completed(url)
        except Exception as e:
            self.log(f"Error during download: {e}", "error")
        checkpoint.finish()
        self.log(f"Downloaded {downloaded_count} files for @{username}", "info")
        return downloaded_count