""" XHamster Client - Fetches creator info and videos using yt-dlp Supports: - Creator profiles (xhamster.com/creators/name) - Channels (xhamster.com/channels/name) - Shorts (xhamster.com/creators/name/shorts) - Photo galleries (xhamster.com/creators/name/photos) """ import asyncio import html as html_module import json import os import re import subprocess import tempfile from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from modules.base_module import LoggingMixin from .models import Creator, Post, Attachment class XHamsterClient(LoggingMixin): """ Client for fetching XHamster creator information and videos using yt-dlp Supports: - Creator pages (xhamster.com/creators/name) - Channel pages (xhamster.com/channels/name) - Creator shorts (xhamster.com/creators/name/shorts) """ SERVICE_ID = 'xhamster' PLATFORM = 'xhamster' QUALITY_PRESETS = { 'best': 'bestvideo+bestaudio/best', '1080p': 'bestvideo[height<=1080]+bestaudio/best[height<=1080]/best', '720p': 'bestvideo[height<=720]+bestaudio/best[height<=720]/best', '480p': 'bestvideo[height<=480]+bestaudio/best[height<=480]/best', } def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None): self._init_logger('PaidContent', log_callback, default_module='XHamster') self.ytdlp_path = ytdlp_path or self._find_ytdlp() if not self.ytdlp_path: self.log("yt-dlp not found, XHamster support will be disabled", 'warning') self.unified_db = unified_db self._cookies_file = None self._profile_page_cache: Dict[str, Optional[str]] = {} def _find_ytdlp(self) -> Optional[str]: """Find yt-dlp executable""" common_paths = [ '/opt/media-downloader/venv/bin/yt-dlp', '/usr/local/bin/yt-dlp', '/usr/bin/yt-dlp', '/opt/homebrew/bin/yt-dlp', os.path.expanduser('~/.local/bin/yt-dlp'), ] for path in common_paths: if os.path.isfile(path) and os.access(path, os.X_OK): return path try: result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True) if 
result.returncode == 0: return result.stdout.strip() except Exception: pass return None def is_available(self) -> bool: """Check if yt-dlp is available""" return self.ytdlp_path is not None def _get_cookies_file(self) -> Optional[str]: """Get path to cookies file, creating it from database if needed""" if self._cookies_file and os.path.exists(self._cookies_file): return self._cookies_file if not self.unified_db: return None try: with self.unified_db.get_connection() as conn: cursor = conn.cursor() cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('xhamster',)) row = cursor.fetchone() if row and row[0]: data = json.loads(row[0]) if isinstance(data, dict) and 'cookies' in data: cookies_list = data['cookies'] elif isinstance(data, list): cookies_list = data else: cookies_list = [] if cookies_list: fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='xhamster_cookies_') with os.fdopen(fd, 'w') as f: f.write("# Netscape HTTP Cookie File\n") for cookie in cookies_list: domain = cookie.get('domain', '') include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE' path = cookie.get('path', '/') secure = 'TRUE' if cookie.get('secure', False) else 'FALSE' expiry = str(int(cookie.get('expirationDate', 0))) name = cookie.get('name', '') value = cookie.get('value', '') f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n") self.log(f"Loaded {len(cookies_list)} cookies from xhamster scraper", 'debug') return self._cookies_file except Exception as e: self.log(f"Could not load cookies: {e}", 'debug') return None def _get_base_cmd(self) -> List[str]: """Get base yt-dlp command with cookies if available""" cmd = [self.ytdlp_path] cookies_file = self._get_cookies_file() if cookies_file: cmd.extend(['--cookies', cookies_file]) return cmd def cleanup(self): """Clean up temporary files""" if self._cookies_file and os.path.exists(self._cookies_file): try: os.unlink(self._cookies_file) except Exception: pass 
self._cookies_file = None self._profile_page_cache.clear() @staticmethod def extract_creator_id(url: str) -> Optional[Tuple[str, str]]: """ Extract creator type and identifier from XHamster URL Returns: Tuple of (type, id) where type is 'creators' or 'channels' or None if not a valid XHamster creator URL """ patterns = [ (r'xhamster\d*\.com/creators/([a-zA-Z0-9_-]+)', 'creators'), (r'xhamster\d*\.com/channels/([a-zA-Z0-9_-]+)', 'channels'), ] for pattern, creator_type in patterns: match = re.search(pattern, url) if match: return (creator_type, match.group(1)) return None @staticmethod def normalize_creator_url(creator_id: str, creator_type: str = 'creators') -> str: """Convert creator ID to a consistent URL format""" if creator_id.startswith('http://') or creator_id.startswith('https://'): return creator_id if '/' in creator_id: parts = creator_id.split('/', 1) creator_type = parts[0] creator_id = parts[1] return f"https://xhamster.com/{creator_type}/{creator_id}" def _get_listing_url(self, url: str) -> str: """Get the URL to use for listing videos from a creator page. Strips /shorts suffix for the main listing, or keeps it for shorts-only. 
""" return url.rstrip('/') async def get_creator_info(self, url: str) -> Optional[Dict]: """Get creator information using yt-dlp""" if not self.is_available(): return None creator_type_id = self.extract_creator_id(url) creator_type = creator_type_id[0] if creator_type_id else 'creators' creator_name = None # Try to scrape the display name from the profile page try: page_html = await self.get_profile_page(url) if page_html: name_match = re.search(r']*class="[^"]*name[^"]*"[^>]*>\s*(.+?)\s*', page_html, re.DOTALL) if not name_match: name_match = re.search(r'([^<|]+)', page_html) if name_match: creator_name = html_module.unescape(name_match.group(1).strip()) # Clean up title suffix creator_name = re.sub(r'\s*[-|].*$', '', creator_name).strip() self.log(f"Found creator name from profile page: {creator_name}", 'debug') except Exception as e: self.log(f"Could not scrape creator name: {e}", 'debug') # If page scraping didn't find a name, try yt-dlp if not creator_name: try: listing_url = self._get_listing_url(url) cmd = self._get_base_cmd() + [ '--no-warnings', '--flat-playlist', '-j', '--playlist-items', '1', '--socket-timeout', '30', listing_url ] result = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await result.communicate() if result.returncode == 0: for line in stdout.decode('utf-8', errors='replace').strip().split('\n'): if not line: continue try: data = json.loads(line) creator_name = (data.get('channel') or data.get('uploader') or data.get('playlist_title') or None) if creator_name: creator_name = html_module.unescape(creator_name) break except json.JSONDecodeError: continue except Exception as e: self.log(f"yt-dlp creator info failed: {e}", 'debug') # Fall back to deriving name from URL slug if not creator_name and creator_type_id: creator_name = creator_type_id[1].replace('-', ' ').title() if creator_name: return { 'creator_id': creator_type_id[1] if creator_type_id else None, 
'creator_name': creator_name, 'creator_url': url, 'creator_type': creator_type, } return None async def get_creator_videos(self, url: str, since_date: str = None, max_videos: int = None, progress_callback=None) -> List[Dict]: """Get all videos from a creator page using --flat-playlist for speed.""" if not self.is_available(): return [] try: listing_url = self._get_listing_url(url) cmd = self._get_base_cmd() + [ '--no-warnings', '--flat-playlist', '-j', '--socket-timeout', '30', '--retries', '3', listing_url ] if max_videos: cmd.extend(['--playlist-items', f'1:{max_videos}']) self.log(f"Fetching videos from: {url}", 'info') result = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await result.communicate() if result.returncode != 0: error = stderr.decode('utf-8', errors='replace') self.log(f"Failed to get creator videos: {error}", 'warning') return [] videos = [] seen_ids = set() for line in stdout.decode('utf-8', errors='replace').strip().split('\n'): if not line: continue try: data = json.loads(line) if data.get('_type') == 'playlist': continue video_id = data.get('id') video_url = (data.get('webpage_url') or data.get('url') or '') # flat-playlist returns _type=url entries with id=null # Extract video_id from URL: .../videos/{slug}-{xhID} if not video_id and video_url: basename = video_url.rstrip('/').split('/')[-1] # xhamster IDs are the last segment: slug-xhXXXXX id_match = re.search(r'-(xh[A-Za-z0-9]{4,7})$', basename) if id_match: video_id = id_match.group(1) elif basename: video_id = basename if not video_id: continue if video_id in seen_ids: continue seen_ids.add(video_id) upload_date = data.get('upload_date') if upload_date: try: upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat() except ValueError: pass title = data.get('title') if not title: # Derive title from URL slug basename = video_url.rstrip('/').split('/')[-1] if video_url else '' # Remove the xhamster ID 
suffix slug = re.sub(r'-xh[A-Za-z0-9]{4,7}$', '', basename) title = slug.replace('-', ' ').title() if slug else f'Video {video_id}' else: title = html_module.unescape(title) if not video_url: video_url = f"https://xhamster.com/videos/{video_id}" videos.append({ 'video_id': str(video_id), 'title': title, 'description': data.get('description', ''), 'upload_date': upload_date, 'duration': data.get('duration'), 'view_count': data.get('view_count'), 'thumbnail': data.get('thumbnail'), 'url': video_url, }) if progress_callback: progress_callback(len(videos)) if max_videos and len(videos) >= max_videos: break except json.JSONDecodeError: continue self.log(f"Found {len(videos)} videos", 'info') return videos except Exception as e: self.log(f"Error getting creator videos: {e}", 'error') return [] async def get_creator_shorts(self, url: str, max_items: int = None, progress_callback=None) -> List[Dict]: """Get shorts/moments from a creator page by scraping HTML. Scrapes /creators/{name}/shorts pages and extracts video data from window.initials.momentsComponent.videoListProps.videoThumbProps. 
""" try: base_url = re.sub(r'/(videos|shorts|photos)/?$', '', url.rstrip('/')) shorts_url = f"{base_url}/shorts" self.log(f"Fetching shorts from: {shorts_url}", 'info') all_shorts = [] seen_ids = set() page = 1 while True: page_url = f"{shorts_url}/{page}" if page > 1 else shorts_url html = await self._fetch_page_html(page_url) if not html: break thumb_props = self._extract_initials_json(html, 'momentsComponent.videoListProps.videoThumbProps') if not thumb_props or not isinstance(thumb_props, list): if page == 1: self.log("No shorts found for this creator", 'debug') break for item in thumb_props: video_id = str(item.get('id', '')) if not video_id: continue if video_id in seen_ids: continue seen_ids.add(video_id) page_url_item = item.get('pageURL', '') # Extract xhID from moment URL: /moments/{slug}-{xhID} xh_id = None if page_url_item: id_match = re.search(r'-(xh[A-Za-z0-9]{4,7})$', page_url_item.rstrip('/').split('/')[-1]) if id_match: xh_id = id_match.group(1) title = item.get('title', '') if title: title = html_module.unescape(title) else: title = f'Short {video_id}' all_shorts.append({ 'video_id': xh_id or video_id, 'title': title, 'description': '', 'upload_date': None, # Shorts listings don't include dates 'duration': None, 'view_count': item.get('views'), 'thumbnail': item.get('thumbURL') or item.get('imageURL'), 'url': page_url_item or f"https://xhamster.com/moments/{video_id}", }) if progress_callback: progress_callback(len(all_shorts)) if max_items and len(all_shorts) >= max_items: break if max_items and len(all_shorts) >= max_items: break # Check pagination pagination = self._extract_initials_json(html, 'momentsComponent.videoListProps.pagination') if not pagination: # Also try top-level pagination pagination = self._extract_initials_json(html, 'pagination') next_page = pagination.get('next', 0) if pagination else 0 if not next_page or next_page <= page: break page = next_page await asyncio.sleep(1) self.log(f"Found {len(all_shorts)} shorts", 'info') 
return all_shorts except Exception as e: self.log(f"Error getting creator shorts: {e}", 'error') return [] async def get_creator_galleries(self, url: str, max_items: int = None, progress_callback=None) -> List[Dict]: """Get photo gallery listings from a creator page. Scrapes /creators/{name}/photos pages and extracts gallery data from window.initials.userGalleriesCollection. """ try: base_url = re.sub(r'/(videos|shorts|photos)/?$', '', url.rstrip('/')) photos_url = f"{base_url}/photos" self.log(f"Fetching galleries from: {photos_url}", 'info') all_galleries = [] seen_ids = set() page = 1 while True: page_url = f"{photos_url}/{page}" if page > 1 else photos_url html = await self._fetch_page_html(page_url) if not html: break galleries = self._extract_initials_json(html, 'userGalleriesCollection') if not galleries or not isinstance(galleries, list): if page == 1: self.log("No galleries found for this creator", 'debug') break for gallery in galleries: gallery_id = str(gallery.get('galleryID', '')) if not gallery_id: continue if gallery_id in seen_ids: continue seen_ids.add(gallery_id) title = gallery.get('title', '') if title: title = html_module.unescape(title) all_galleries.append({ 'gallery_id': gallery_id, 'title': title or f'Gallery {gallery_id}', 'url': gallery.get('pageURL', ''), 'thumbnail': gallery.get('thumbURL') or gallery.get('imageURL'), 'image_count': gallery.get('quantity', 0), 'views': gallery.get('views', 0), }) if progress_callback: progress_callback(len(all_galleries)) if max_items and len(all_galleries) >= max_items: break if max_items and len(all_galleries) >= max_items: break # Check pagination pagination = self._extract_initials_json(html, 'pagination') max_page = pagination.get('maxPage', 1) if pagination else 1 if page >= max_page: break page += 1 await asyncio.sleep(1) self.log(f"Found {len(all_galleries)} galleries", 'info') return all_galleries except Exception as e: self.log(f"Error getting creator galleries: {e}", 'error') return [] async 
def get_gallery_images(self, gallery_url: str) -> Optional[Dict]: """Get all images from a single gallery page. Scrapes the gallery page and extracts image data from window.initials.galleryPage.photoItems and metadata from window.initials.photosGalleryModel. """ try: self.log(f"Fetching gallery images: {gallery_url}", 'debug') all_images = [] seen_ids = set() gallery_id = None title = None created = None last_page = 1 page = 1 while page <= last_page: page_url = f"{gallery_url}/{page}" if page > 1 else gallery_url html = await self._fetch_page_html(page_url) if not html: break # Extract gallery metadata on first page if page == 1: gallery_model = self._extract_initials_json(html, 'photosGalleryModel') if not gallery_model: gallery_model = self._extract_initials_json(html, 'galleryPage.galleryModel') if gallery_model: gallery_id = str(gallery_model.get('galleryID') or gallery_model.get('id', '')) title = gallery_model.get('title', '') if title: title = html_module.unescape(title) created_ts = gallery_model.get('created') if created_ts: try: created = datetime.fromtimestamp(int(created_ts)).isoformat() except (ValueError, OSError): pass last_page = gallery_model.get('lastPageNumber', 1) or 1 # Extract images photo_items = self._extract_initials_json(html, 'galleryPage.photoItems') if not photo_items: photo_items = self._extract_initials_json(html, 'photosGalleryModel.photos') if not photo_items or not isinstance(photo_items, list): break for photo in photo_items: image_url = photo.get('imgSrc', '') if not image_url: continue photo_id = str(photo.get('id', '')) if not photo_id: continue if photo_id in seen_ids: continue seen_ids.add(photo_id) all_images.append({ 'id': photo_id, 'url': image_url, 'width': photo.get('originWidth'), 'height': photo.get('originHeight'), }) if page < last_page: await asyncio.sleep(0.5) page += 1 if not all_images: self.log(f"No images found in gallery: {gallery_url}", 'debug') return None # Fallback gallery_id from URL if not gallery_id: 
id_match = re.search(r'-(\d+)$', gallery_url.rstrip('/').split('/')[-1]) if id_match: gallery_id = id_match.group(1) self.log(f"Found {len(all_images)} images in gallery '{title or gallery_id}'", 'debug') return { 'gallery_id': gallery_id or '', 'title': title or '', 'created': created, 'images': all_images, } except Exception as e: self.log(f"Error getting gallery images: {e}", 'error') return None async def download_image(self, image_url: str, output_path: Path) -> Dict: """Download an image file via aiohttp. Args: image_url: Direct URL to the image output_path: Full file path to save to Returns: Dict with success, file_path, file_size """ try: import aiohttp output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'Referer': 'https://xhamster.com/', } async with aiohttp.ClientSession() as session: async with session.get(image_url, headers=headers, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=60)) as resp: if resp.status != 200: return {'success': False, 'error': f'HTTP {resp.status}'} with open(output_path, 'wb') as f: async for chunk in resp.content.iter_chunked(65536): f.write(chunk) file_size = output_path.stat().st_size if file_size == 0: output_path.unlink(missing_ok=True) return {'success': False, 'error': 'Empty file'} return { 'success': True, 'file_path': str(output_path), 'file_size': file_size, } except Exception as e: self.log(f"Image download failed: {e}", 'debug') return {'success': False, 'error': str(e)} async def download_video(self, video_url: str, output_dir: Path, quality: str = 'best', progress_callback=None) -> Dict: """Download a video - tries direct download first, falls back to yt-dlp""" self.log(f"Downloading video: {video_url}", 'debug') # Try direct download first (yt-dlp's xhamster extractor is often broken) result = await 
self._download_video_direct(video_url, output_dir, progress_callback) if result and result.get('success'): return result # Fall back to yt-dlp if self.is_available(): result = await self._download_video_ytdlp(video_url, output_dir, quality) if result and result.get('success'): return result return result or {'success': False, 'error': 'All download methods failed'} async def _download_video_direct(self, video_url: str, output_dir: Path, progress_callback=None) -> Optional[Dict]: """Download video directly by scraping the video page for HLS/MP4 URLs""" try: import aiohttp output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', } # Fetch video page async with aiohttp.ClientSession() as session: async with session.get(video_url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp: if resp.status != 200: return {'success': False, 'error': f'Page fetch failed: HTTP {resp.status}'} page_html = await resp.text() # Extract metadata from page title = None og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', page_html) if not og_match: og_match = re.search(r'<meta\s+content="([^"]+)"\s+property="og:title"', page_html) if og_match: title = html_module.unescape(og_match.group(1).strip()) # Extract upload date from page JSON data upload_date = None timestamp = None created_match = re.search(r'"id"\s*:\s*\d+[^}]*"created"\s*:\s*(\d{8,})', page_html) if not created_match: created_match = re.search(r'"created"\s*:\s*(\d{8,})[^}]*"id"\s*:\s*\d+', page_html) if created_match: timestamp = int(created_match.group(1)) try: upload_date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d') except (ValueError, OSError): pass if not upload_date: date_match = 
re.search(r'"datePublished"\s*:\s*"([^"]+)"', page_html) if date_match: upload_date = date_match.group(1)[:10] # Extract video ID from URL video_id = None id_match = re.search(r'-(xh[A-Za-z0-9]{4,7})$', video_url.rstrip('/').split('/')[-1]) if id_match: video_id = id_match.group(1) if not title: title = video_url.rstrip('/').split('/')[-1] filename = f"{video_id}.mp4" if video_id else f"{re.sub(r'[^\\w\\s-]', '', title)[:100].strip()}.mp4" file_path = output_dir / filename # Try to extract video sources from window.initials JSON hls_url_from_json = None mp4_urls_from_json = {} # quality -> url try: initials_match = re.search(r'window\.initials\s*=\s*(\{.+?\});\s*</script>', page_html, re.DOTALL) if initials_match: initials = json.loads(initials_match.group(1)) video_model = initials.get('videoModel', {}) sources = video_model.get('sources', {}) # HLS source hls_data = sources.get('hls') if isinstance(hls_data, dict): hls_url_from_json = hls_data.get('url') elif isinstance(hls_data, str): hls_url_from_json = hls_data # MP4 download sources (keyed by quality like "480p", "720p", "1080p") download_sources = sources.get('download', {}) if isinstance(download_sources, dict): for quality_key, source_data in download_sources.items(): if isinstance(source_data, dict): url = source_data.get('link') or source_data.get('url') if url: mp4_urls_from_json[quality_key] = url elif isinstance(source_data, str): mp4_urls_from_json[quality_key] = source_data # Also check mp4 sources mp4_sources = sources.get('mp4', {}) if isinstance(mp4_sources, dict): for quality_key, source_data in mp4_sources.items(): if quality_key not in mp4_urls_from_json: if isinstance(source_data, dict): url = source_data.get('link') or source_data.get('url') if url: mp4_urls_from_json[quality_key] = url elif isinstance(source_data, str): mp4_urls_from_json[quality_key] = source_data # Also check standard sources standard_sources = sources.get('standard', {}) if isinstance(standard_sources, dict): for 
quality_key, source_data in standard_sources.items(): if quality_key not in mp4_urls_from_json: if isinstance(source_data, dict): url = source_data.get('link') or source_data.get('url') if url: mp4_urls_from_json[quality_key] = url elif isinstance(source_data, str): mp4_urls_from_json[quality_key] = source_data if hls_url_from_json or mp4_urls_from_json: self.log(f"Extracted video sources from JSON: HLS={'yes' if hls_url_from_json else 'no'}, MP4 qualities={list(mp4_urls_from_json.keys())}", 'debug') except (json.JSONDecodeError, Exception) as e: self.log(f"Could not parse video JSON sources: {e}", 'debug') # Try HLS download first (best quality, up to 4K) m3u8_url = hls_url_from_json if not m3u8_url: m3u8_match = re.search(r'"(https://video[^"]*\.xhcdn\.com/[^"]+\.m3u8[^"]*)"', page_html) if m3u8_match: m3u8_url = m3u8_match.group(1) if m3u8_url: hls_result = await self._download_hls(m3u8_url, file_path) if hls_result: file_size = file_path.stat().st_size self.log(f"HLS download complete: {filename} ({file_size / 1024 / 1024:.1f}MB)", 'debug') return { 'success': True, 'file_path': str(file_path), 'filename': filename, 'file_size': file_size, 'title': title, 'video_id': video_id, 'upload_date': upload_date, 'timestamp': timestamp, } # Fallback: direct MP4 download - prefer JSON sources (highest quality) download_url = None if mp4_urls_from_json: # Select highest quality MP4 from JSON sources quality_priority = ['2160p', '1440p', '1080p', '720p', '480p', '360p', '240p'] for q in quality_priority: if q in mp4_urls_from_json: download_url = mp4_urls_from_json[q] self.log(f"Direct downloading ({q} from JSON): {filename}", 'debug') break if not download_url: # Take any available quality download_url = next(iter(mp4_urls_from_json.values())) self.log(f"Direct downloading (from JSON): {filename}", 'debug') if not download_url: # Regex fallback: extract MP4 URLs from page HTML mp4_urls = re.findall( r'"(https://video[^"]*\.xhcdn\.com/[^"]+\.(?:h264|mp4)[^"]*)"', page_html 
) mp4_urls = [u for u in mp4_urls if not u.endswith('.m3u8') and '.mp4' in u] # Filter out preview/sample URLs full_urls = [u for u in mp4_urls if not re.search(r'preview|sample|thumb', u, re.IGNORECASE)] if full_urls: mp4_urls = full_urls mp4_urls = list(dict.fromkeys(mp4_urls)) if not mp4_urls: self.log("No video URL found on video page", 'debug') return None # Take the LAST unique URL (previews tend to appear first in the HTML) download_url = mp4_urls[-1] self.log(f"Direct downloading (regex fallback): {filename}", 'debug') dl_headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 'Referer': 'https://xhamster.com/', } async with aiohttp.ClientSession() as session: async with session.get(download_url, headers=dl_headers, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=600)) as resp: if resp.status != 200: return {'success': False, 'error': f'Download failed: HTTP {resp.status}'} total_size = int(resp.headers.get('Content-Length', 0)) downloaded = 0 with open(file_path, 'wb') as f: async for chunk in resp.content.iter_chunked(65536): f.write(chunk) downloaded += len(chunk) if progress_callback and total_size > 0: progress_callback(downloaded / total_size * 100) file_size = file_path.stat().st_size self.log(f"Direct download complete: {filename} ({file_size / 1024 / 1024:.1f}MB)", 'debug') return { 'success': True, 'file_path': str(file_path), 'filename': filename, 'file_size': file_size, 'title': title, 'video_id': video_id, 'upload_date': upload_date, 'timestamp': timestamp, } except Exception as e: self.log(f"Direct download failed: {e}", 'debug') return None async def _download_hls(self, m3u8_url: str, output_path: Path) -> bool: """Download HLS stream using ffmpeg, selecting best quality""" try: import aiohttp from urllib.parse import urlparse from yarl import URL as YarlURL # Fetch master playlist to find best quality stream headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 
'Referer': 'https://xhamster.com/', } best_stream_url = None best_bandwidth = 0 async with aiohttp.ClientSession() as session: # Use encoded=True to preserve %2B/%3D in CloudFront signed URLs async with session.get(YarlURL(m3u8_url, encoded=True), headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp: if resp.status != 200: self.log(f"HLS master playlist fetch failed: HTTP {resp.status}", 'debug') return False playlist = await resp.text() # Extract query params from master URL for forwarding to variant URLs parsed_master = urlparse(m3u8_url) master_query = parsed_master.query # Parse master playlist for best quality variant lines = playlist.strip().split('\n') # Check if this is already a media playlist (no STREAM-INF) has_variants = any(line.startswith('#EXT-X-STREAM-INF:') for line in lines) if not has_variants: # This is already a media playlist — download directly with ffmpeg self.log("HLS: single stream (no variants), downloading directly", 'debug') best_stream_url = m3u8_url else: for i, line in enumerate(lines): if line.startswith('#EXT-X-STREAM-INF:'): bw_match = re.search(r'BANDWIDTH=(\d+)', line) bandwidth = int(bw_match.group(1)) if bw_match else 0 if bandwidth > best_bandwidth and i + 1 < len(lines): stream_path = lines[i + 1].strip() if stream_path.startswith('http'): best_stream_url = stream_path elif stream_path.startswith('//'): # Protocol-relative URL (different CDN domain) best_stream_url = f"{parsed_master.scheme}:{stream_path}" elif stream_path.startswith('/'): best_stream_url = f"{parsed_master.scheme}://{parsed_master.netloc}{stream_path}" else: m3u8_base = m3u8_url.split('?')[0].rsplit('/', 1)[0] best_stream_url = f"{m3u8_base}/{stream_path}" # Forward signed query params only if variant URL doesn't have its own if master_query and '?' 
not in best_stream_url: best_stream_url = f"{best_stream_url}?{master_query}" best_bandwidth = bandwidth if not best_stream_url: self.log("No HLS variant found in master playlist", 'debug') return False quality_label = '' for i, line in enumerate(lines): if line.startswith('#EXT-X-STREAM-INF:') and i + 1 < len(lines): rm = re.search(r'RESOLUTION=(\d+x\d+)', line) if rm and int(re.search(r'BANDWIDTH=(\d+)', line).group(1)) == best_bandwidth: quality_label = f" ({rm.group(1)})" break self.log(f"HLS downloading best quality{quality_label}", 'debug') # Use ffmpeg to download cmd = [ 'ffmpeg', '-y', '-headers', 'Referer: https://xhamster.com/\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\r\n', '-i', best_stream_url, '-c', 'copy', '-movflags', '+faststart', str(output_path) ] process = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) _, stderr = await process.communicate() if process.returncode != 0: error = stderr.decode('utf-8', errors='replace')[-500:] self.log(f"ffmpeg HLS download failed: {error}", 'debug') return False return output_path.exists() and output_path.stat().st_size > 0 except Exception as e: self.log(f"HLS download error: {e}", 'debug') return False async def _download_video_ytdlp(self, video_url: str, output_dir: Path, quality: str = 'best') -> Dict: """Download video using yt-dlp (fallback)""" if not self.is_available(): return {'success': False, 'error': 'yt-dlp not available'} try: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s') format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best']) cmd = self._get_base_cmd() + [ '--no-warnings', '-f', format_str, '-o', output_template, '--print-json', '--no-playlist', '--user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 
'--concurrent-fragments', '4', '--no-part', '--retries', '20', '--socket-timeout', '30', video_url ] result = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await result.communicate() if result.returncode != 0: error_msg = stderr.decode('utf-8', errors='replace').strip() if 'Video unavailable' in error_msg or 'not available' in error_msg: error_msg = 'Video unavailable or private' elif 'premium' in error_msg.lower(): error_msg = 'Video requires premium access' elif len(error_msg) > 200: error_msg = error_msg[:200] + '...' return {'success': False, 'error': error_msg} # Parse output JSON video_info = None for line in stdout.decode('utf-8', errors='replace').strip().split('\n'): try: video_info = json.loads(line) break except json.JSONDecodeError: continue if not video_info: files = list(output_dir.glob('*.mp4')) if files: file_path = max(files, key=lambda f: f.stat().st_mtime) return { 'success': True, 'file_path': str(file_path), 'filename': file_path.name, 'file_size': file_path.stat().st_size } return {'success': False, 'error': 'Could not find downloaded file'} file_path = video_info.get('_filename') or video_info.get('filename') if file_path: file_path = Path(file_path) return { 'success': True, 'file_path': str(file_path) if file_path else None, 'filename': file_path.name if file_path else None, 'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'), 'title': video_info.get('title'), 'duration': video_info.get('duration'), 'video_id': video_info.get('id'), 'upload_date': video_info.get('upload_date'), 'timestamp': video_info.get('timestamp'), 'thumbnail': video_info.get('thumbnail'), } except Exception as e: self.log(f"Error downloading video via yt-dlp: {e}", 'error') return {'success': False, 'error': str(e)} async def get_profile_page(self, url: str) -> Optional[str]: """Fetch profile page HTML via aiohttp. 
Results are cached.""" base_url = re.sub(r'/(videos|shorts)/?$', '', url) if base_url in self._profile_page_cache: return self._profile_page_cache[base_url] try: import aiohttp headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', } async with aiohttp.ClientSession() as session: async with session.get( base_url, headers=headers, timeout=aiohttp.ClientTimeout(total=15) ) as resp: if resp.status == 200: text = await resp.text() self._profile_page_cache[base_url] = text return text except Exception as e: self.log(f"Could not fetch profile page: {e}", 'debug') self._profile_page_cache[base_url] = None return None async def _fetch_page_html(self, url: str) -> Optional[str]: """Fetch an arbitrary page's HTML via aiohttp (not cached).""" try: import aiohttp headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', } async with aiohttp.ClientSession() as session: async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp: if resp.status == 200: return await resp.text() self.log(f"Page fetch failed: HTTP {resp.status} for {url}", 'debug') except Exception as e: self.log(f"Could not fetch page: {e}", 'debug') return None def _extract_initials_json(self, html: str, key_path: str) -> Optional[Any]: """Extract a value from window.initials JSON embedded in page HTML. Args: html: Page HTML containing window.initials = {...} key_path: Dot-separated path, e.g. 'galleryPage.photoItems' Returns: The extracted value, or None if not found. 
""" try: match = re.search(r'window\.initials\s*=\s*(\{.+?\});\s*</script>', html, re.DOTALL) if not match: return None data = json.loads(match.group(1)) for key in key_path.split('.'): if isinstance(data, dict): data = data.get(key) else: return None if data is None: return None return data except (json.JSONDecodeError, Exception) as e: self.log(f"Failed to extract initials JSON for '{key_path}': {e}", 'debug') return None async def get_profile_image(self, url: str) -> Optional[str]: """Scrape profile page for avatar/photo URL""" try: page_html = await self.get_profile_page(url) if not page_html: return None # XHamster embeds creator data as JSON in the page. # Look for the main creator's thumbUrl in the pornstarTop JSON block thumb_match = re.search( r'"pornstarTop"\s*:\s*\{[\s\S]*?"thumbUrl"\s*:\s*"([^"]+)"', page_html ) if thumb_match: avatar_url = thumb_match.group(1).replace('\\/', '/') self.log("Found XHamster profile avatar from JSON data", 'debug') return avatar_url # Fallback: CSS background-image on landing-info__logo-image bg_match = re.search( r'landing-info__logo-image["\'][^>]*style="[^"]*url\([\'"]?([^\'")]+)', page_html ) if bg_match: self.log("Found XHamster profile avatar from CSS", 'debug') return bg_match.group(1) # Fallback: og:image meta tag og_match = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_html) if not og_match: og_match = re.search(r'<meta\s+content="([^"]+)"\s+property="og:image"', page_html) if og_match: return og_match.group(1) except Exception as e: self.log(f"Could not fetch profile image: {e}", 'debug') return None async def get_profile_bio(self, url: str) -> Optional[str]: """Scrape bio/about section from profile page""" try: page_html = await self.get_profile_page(url) if not page_html: return None # Look for description/bio sections bio_match = re.search( r'<div[^>]*class="[^"]*about[^"]*"[^>]*>\s*(.*?)\s*</div>', page_html, re.DOTALL ) if bio_match: bio_text = re.sub(r'<[^>]+>', '', 
async def get_creator(self, url: str) -> Optional[Creator]:
    """Build a Creator object for the profile at *url*.

    The creator id is "<first>/<second>" from the value returned by
    ``extract_creator_id(url)`` when that value is truthy; otherwise it is
    the ``creator_id`` entry of the fetched info (default: empty string).
    NOTE(review): the two elements are presumably (type, name) - confirm
    against ``extract_creator_id``.

    Args:
        url: Creator or channel URL.

    Returns:
        A Creator with the scraped name and avatar URL, or ``None`` when
        ``get_creator_info`` returns nothing for the URL.
    """
    info = await self.get_creator_info(url)
    if not info:
        return None

    id_parts = self.extract_creator_id(url)
    creator_id = (
        f"{id_parts[0]}/{id_parts[1]}" if id_parts else info.get('creator_id', '')
    )

    # The avatar is scraped separately from the profile page; may be None.
    avatar_url = await self.get_profile_image(url)

    return Creator(
        creator_id=creator_id,
        service_id='xhamster',
        platform='xhamster',
        username=info.get('creator_name', 'Unknown'),
        display_name=info.get('creator_name'),
        profile_image_url=avatar_url,
    )
async def get_posts(self, url: str, since_date: str = None, max_videos: int = None, progress_callback=None) -> List[Post]:
    """Get all content (videos, shorts, galleries) as Post objects.

    Aggregates regular videos, shorts/moments, and photo galleries into one
    list. Posts are deduplicated by post_id, so a video and a short that
    share an id are only emitted once (the video wins because it is
    processed first). Gallery post ids are prefixed with ``gallery-``.

    Args:
        url: Creator or channel URL.
        since_date: Forwarded to ``get_creator_videos`` only.
        max_videos: Forwarded to ``get_creator_videos`` and, as
            ``max_items``, to ``get_creator_shorts``. It limits each source
            separately, not the combined total.
        progress_callback: Forwarded to ``get_creator_videos`` only.

    Returns:
        Posts in source order: videos, then shorts, then galleries.

    Raises:
        Whatever ``get_creator_videos`` raises. Failures while collecting
        shorts or galleries are logged as warnings and the posts gathered
        so far are kept.
    """
    creator_type_id = self.extract_creator_id(url)
    creator_id = f"{creator_type_id[0]}/{creator_type_id[1]}" if creator_type_id else ''

    posts = []
    seen_post_ids = set()

    # 1. Regular videos (via yt-dlp --flat-playlist)
    videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)
    self._append_video_posts(posts, seen_post_ids, videos, creator_id)

    # 2. Shorts / Moments (HTML scraping)
    try:
        shorts = await self.get_creator_shorts(url, max_items=max_videos)
        self._append_video_posts(posts, seen_post_ids, shorts, creator_id)
    except Exception as e:
        self.log(f"Failed to fetch shorts (continuing with videos): {e}", 'warning')

    # 3. Photo galleries (HTML scraping)
    try:
        galleries = await self.get_creator_galleries(url)
        for gallery in galleries or ():
            gallery_post_id = f"gallery-{gallery['gallery_id']}"
            if gallery_post_id in seen_post_ids:
                continue
            seen_post_ids.add(gallery_post_id)

            gallery_data = await self.get_gallery_images(gallery['url'])
            if not gallery_data or not gallery_data.get('images'):
                continue

            posts.append(
                self._build_gallery_post(gallery_post_id, creator_id, gallery, gallery_data)
            )

            # Small delay between gallery fetches
            await asyncio.sleep(0.5)
    except Exception as e:
        self.log(f"Failed to fetch galleries (continuing with videos/shorts): {e}", 'warning')

    self.log(f"Total posts: {len(posts)} (videos + shorts + galleries)", 'info')
    return posts


def _append_video_posts(self, posts: list, seen_post_ids: set, entries, creator_id: str) -> None:
    """Append one single-attachment video Post per unseen entry, in place.

    Each entry is a dict that must provide 'video_id', 'url' and 'title';
    'duration', 'description' and 'upload_date' are optional. Ids are added
    to ``seen_post_ids`` as they are consumed; a ``None`` listing is treated
    as empty.
    """
    for entry in entries or ():
        vid = entry['video_id']
        if vid in seen_post_ids:
            continue
        seen_post_ids.add(vid)

        attachment = Attachment(
            name=f"{vid}.mp4",
            file_type='video',
            extension='mp4',
            server_path=entry['url'],
            download_url=entry['url'],
            duration=entry.get('duration'),
        )
        posts.append(Post(
            post_id=vid,
            service_id='xhamster',
            platform='xhamster',
            creator_id=creator_id,
            title=entry['title'],
            # Fall back to the title when there is no description.
            content=entry.get('description') or entry['title'],
            published_at=entry.get('upload_date'),
            attachments=[attachment],
        ))


def _build_gallery_post(self, post_id: str, creator_id: str, gallery: dict, gallery_data: dict):
    """Turn one gallery's image list into a Post with one image Attachment each.

    The title from the gallery page is preferred; the listing title (or an
    empty string) is the fallback, and the same text is used as content.
    """
    attachments = []
    for img in gallery_data['images']:
        ext = self._image_extension_from_url(img['url'])
        attachments.append(Attachment(
            name=f"{img['id']}.{ext}",
            file_type='image',
            extension=ext,
            server_path=img['url'],
            download_url=img['url'],
            width=img.get('width'),
            height=img.get('height'),
        ))

    title = gallery_data.get('title') or gallery.get('title', '')
    return Post(
        post_id=post_id,
        service_id='xhamster',
        platform='xhamster',
        creator_id=creator_id,
        title=title,
        content=title,
        published_at=gallery_data.get('created'),
        attachments=attachments,
    )


def _image_extension_from_url(self, image_url, default: str = 'jpg') -> str:
    """Return the lower-cased image extension of a URL's path, or *default*.

    The query string and fragment are removed before the suffix is read, so
    a dot inside them (``a.png?v=1.2``) cannot hide the real extension.
    Only jpg, jpeg, png, gif and webp are accepted.
    """
    if not image_url:
        return default
    path_only = image_url.split('?', 1)[0].split('#', 1)[0]
    suffix = os.path.splitext(path_only)[1].lstrip('.').lower()
    return suffix if suffix in ('jpg', 'jpeg', 'png', 'gif', 'webp') else default