""" Pornhub Client - Fetches creator info and videos using yt-dlp """ import asyncio import html as html_module import json import os import re import subprocess import tempfile from datetime import datetime from pathlib import Path from typing import Dict, List, Optional, Tuple from modules.base_module import LoggingMixin from .models import Creator, Post, Attachment class PornhubClient(LoggingMixin): """ Client for fetching Pornhub creator information and videos using yt-dlp Supports: - Pornstar pages (pornhub.com/pornstar/name) - Channel pages (pornhub.com/channels/name) - User pages (pornhub.com/users/name) - Model pages (pornhub.com/model/name) """ SERVICE_ID = 'pornhub' PLATFORM = 'pornhub' # Quality presets for yt-dlp # Pornhub serves single combined streams with IDs like '1080p', '720p', etc. # NOT separate video+audio streams like YouTube QUALITY_PRESETS = { 'best': 'bestvideo+bestaudio/best', '1080p': 'bestvideo[height<=1080]+bestaudio/best[height<=1080]/best', '720p': 'bestvideo[height<=720]+bestaudio/best[height<=720]/best', '480p': 'bestvideo[height<=480]+bestaudio/best[height<=480]/best', } def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None): self._init_logger('PaidContent', log_callback, default_module='Pornhub') # Find yt-dlp executable self.ytdlp_path = ytdlp_path or self._find_ytdlp() if not self.ytdlp_path: self.log("yt-dlp not found, Pornhub support will be disabled", 'warning') # Store database reference for cookie access self.unified_db = unified_db self._cookies_file = None # Cache for profile page HTML (avoid re-fetching for avatar/banner/bio) self._profile_page_cache: Dict[str, Optional[str]] = {} def _find_ytdlp(self) -> Optional[str]: """Find yt-dlp executable""" common_paths = [ '/opt/media-downloader/venv/bin/yt-dlp', '/usr/local/bin/yt-dlp', '/usr/bin/yt-dlp', '/opt/homebrew/bin/yt-dlp', os.path.expanduser('~/.local/bin/yt-dlp'), ] for path in common_paths: if os.path.isfile(path) and os.access(path, os.X_OK): return path try: result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True) if result.returncode == 0: return result.stdout.strip() except Exception: pass return None def is_available(self) -> bool: """Check if yt-dlp is available""" return self.ytdlp_path is not None def _get_cookies_file(self) -> Optional[str]: """Get path to cookies file, creating it from database if needed""" if self._cookies_file and os.path.exists(self._cookies_file): return self._cookies_file if not self.unified_db: return None try: with self.unified_db.get_connection() as conn: cursor = conn.cursor() cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',)) row = cursor.fetchone() if row and row[0]: data = json.loads(row[0]) # Support both {"cookies": [...]} and [...] formats if isinstance(data, dict) and 'cookies' in data: cookies_list = data['cookies'] elif isinstance(data, list): cookies_list = data else: cookies_list = [] if cookies_list: # Write cookies to temp file in Netscape format fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='pornhub_cookies_') with os.fdopen(fd, 'w') as f: f.write("# Netscape HTTP Cookie File\n") for cookie in cookies_list: domain = cookie.get('domain', '') include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE' path = cookie.get('path', '/') secure = 'TRUE' if cookie.get('secure', False) else 'FALSE' expiry = str(int(cookie.get('expirationDate', 0))) name = cookie.get('name', '') value = cookie.get('value', '') f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n") self.log(f"Loaded {len(cookies_list)} cookies from pornhub scraper", 'debug') return self._cookies_file except Exception as e: self.log(f"Could not load cookies: {e}", 'debug') return None def _get_cookies_list(self) -> Optional[list]: """Get cookies as a list of dicts for aiohttp requests""" if not self.unified_db: return None try: with self.unified_db.get_connection() as conn: cursor = conn.cursor() cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',)) row = cursor.fetchone() if row and row[0]: data = json.loads(row[0]) if isinstance(data, dict) and 'cookies' in data: return data['cookies'] elif isinstance(data, list): return data except Exception as e: self.log(f"Could not load cookies list: {e}", 'debug') return None def _get_base_cmd(self) -> List[str]: """Get base yt-dlp command with cookies if available""" cmd = [self.ytdlp_path] cookies_file = self._get_cookies_file() if cookies_file: cmd.extend(['--cookies', cookies_file]) return cmd def cleanup(self): """Clean up temporary files""" if self._cookies_file and os.path.exists(self._cookies_file): try: os.unlink(self._cookies_file) except Exception: pass self._cookies_file = None self._profile_page_cache.clear() @staticmethod def extract_creator_id(url: str) -> Optional[Tuple[str, str]]: """ Extract creator type and identifier from Pornhub URL Returns: Tuple of (type, id) where type is 'pornstar', 'channels', 'users', or 'model' or None if not a valid Pornhub creator URL """ patterns = [ (r'pornhub\.com/pornstar/([a-zA-Z0-9_-]+)', 'pornstar'), (r'pornhub\.com/channels/([a-zA-Z0-9_-]+)', 'channels'), (r'pornhub\.com/users/([a-zA-Z0-9_-]+)', 'users'), (r'pornhub\.com/model/([a-zA-Z0-9_-]+)', 'model'), ] for pattern, creator_type in patterns: match = re.search(pattern, url) if match: return (creator_type, match.group(1)) return None @staticmethod def normalize_creator_url(creator_id: str, creator_type: str = 'pornstar') -> str: """Convert creator ID to a consistent URL format Args: creator_id: Creator name/identifier (may be 'type/name' format) creator_type: Default type if not embedded in creator_id """ # Already a full URL if creator_id.startswith('http://') or creator_id.startswith('https://'): return creator_id # Handle 'type/name' format from URL parser if '/' in creator_id: parts = creator_id.split('/', 1) creator_type = parts[0] creator_id = parts[1] return f"https://www.pornhub.com/{creator_type}/{creator_id}" def _get_listing_url(self, url: str) -> str: """Get the URL to use for listing videos from a creator page. For pornstars and models, append /videos to get the video listing. For channels and users, the base URL already lists videos. """ # Parse out the type parsed = self.extract_creator_id(url) if parsed: creator_type, _ = parsed if creator_type in ('pornstar', 'model'): # Strip any trailing slash and append /videos url = url.rstrip('/') if not url.endswith('/videos'): url = f"{url}/videos" return url async def get_creator_info(self, url: str) -> Optional[Dict]: """ Get creator information using yt-dlp + profile page scraping Returns dict with creator metadata or None if not found """ if not self.is_available(): return None creator_type_id = self.extract_creator_id(url) creator_type = creator_type_id[0] if creator_type_id else 'pornstar' # Try to scrape the display name from the profile page first creator_name = None try: page_html = await self.get_profile_page(url) if page_html: # Look for

Name

inside nameSubscribe div name_match = re.search(r'
.*?]*>\s*(.+?)\s*', page_html, re.DOTALL) if name_match: creator_name = html_module.unescape(name_match.group(1).strip()) self.log(f"Found creator name from profile page: {creator_name}", 'debug') except Exception as e: self.log(f"Could not scrape creator name: {e}", 'debug') # If page scraping didn't find a name, try yt-dlp if not creator_name: try: listing_url = self._get_listing_url(url) cmd = self._get_base_cmd() + [ '--no-warnings', '--flat-playlist', '-j', '--playlist-items', '1', listing_url ] result = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await result.communicate() if result.returncode == 0: for line in stdout.decode('utf-8', errors='replace').strip().split('\n'): if not line: continue try: data = json.loads(line) playlist_title = data.get('playlist_title') or '' creator_name = (data.get('channel') or data.get('uploader') or playlist_title.replace(' - Videos', '') or None) if creator_name: creator_name = html_module.unescape(creator_name) break except json.JSONDecodeError: continue except Exception as e: self.log(f"yt-dlp creator info failed: {e}", 'debug') # Fall back to deriving name from URL slug if not creator_name and creator_type_id: creator_name = creator_type_id[1].replace('-', ' ').title() if creator_name: return { 'creator_id': creator_type_id[1] if creator_type_id else None, 'creator_name': creator_name, 'creator_url': url, 'creator_type': creator_type, } return None async def get_creator_videos(self, url: str, since_date: str = None, max_videos: int = None, progress_callback=None) -> List[Dict]: """ Get all videos from a creator page using --flat-playlist for speed. Args: url: Pornhub creator URL since_date: Only fetch videos published after this date (ISO format) max_videos: Maximum number of videos to fetch progress_callback: Callback function(count) for progress updates Returns: List of video metadata dicts """ if not self.is_available(): return [] try: listing_url = self._get_listing_url(url) # Use --flat-playlist for fast listing (avoids per-video HTTP requests) cmd = self._get_base_cmd() + [ '--no-warnings', '--flat-playlist', '-j', '--socket-timeout', '30', '--retries', '3', listing_url ] if max_videos: cmd.extend(['--playlist-items', f'1:{max_videos}']) self.log(f"Fetching videos from: {url}", 'info') result = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await result.communicate() if result.returncode != 0: error = stderr.decode('utf-8', errors='replace') self.log(f"Failed to get creator videos: {error}", 'warning') return [] videos = [] for line in stdout.decode('utf-8', errors='replace').strip().split('\n'): if not line: continue try: data = json.loads(line) # Skip non-video entries if data.get('_type') == 'playlist': continue video_id = data.get('id') if not video_id: continue # Flat-playlist doesn't provide upload_date for Pornhub, but check anyway upload_date = data.get('upload_date') if upload_date: try: upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat() except ValueError: pass # Decode HTML entities in title (flat-playlist returns them encoded) title = html_module.unescape(data.get('title', f'Video {video_id}')) # Build video URL video_url = (data.get('webpage_url') or data.get('url') or f"https://www.pornhub.com/view_video.php?viewkey={video_id}") videos.append({ 'video_id': video_id, 'title': title, 'description': data.get('description', ''), 'upload_date': upload_date, 'duration': data.get('duration'), 'view_count': data.get('view_count'), 'thumbnail': data.get('thumbnail'), 'url': video_url, }) if progress_callback: progress_callback(len(videos)) if max_videos and len(videos) >= max_videos: break except json.JSONDecodeError: continue self.log(f"Found {len(videos)} videos", 'info') return videos except Exception as e: self.log(f"Error getting creator videos: {e}", 'error') return [] async def download_video(self, video_url: str, output_dir: Path, quality: str = 'best', progress_callback=None) -> Dict: """ Download a video Args: video_url: Pornhub video URL output_dir: Directory to save the video quality: Quality preset progress_callback: Callback for download progress Returns: Dict with success status and file info """ if not self.is_available(): return {'success': False, 'error': 'yt-dlp not available'} try: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s') format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best']) cmd = self._get_base_cmd() + [ '--no-warnings', '-f', format_str, '-o', output_template, '--print-json', '--no-playlist', '--user-agent', 'Mozilla/5.0', '--referer', 'https://www.pornhub.com/', '--merge-output-format', 'mp4', '--concurrent-fragments', '4', '--no-part', '--retries', '20', video_url ] self.log(f"Downloading video: {video_url}", 'debug') result = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await result.communicate() if result.returncode != 0: error_msg = stderr.decode('utf-8', errors='replace').strip() if 'Video unavailable' in error_msg or 'not available' in error_msg: error_msg = 'Video unavailable or private' elif 'premium' in error_msg.lower(): error_msg = 'Video requires premium access' elif len(error_msg) > 200: error_msg = error_msg[:200] + '...' return {'success': False, 'error': error_msg} # Parse output JSON video_info = None for line in stdout.decode('utf-8', errors='replace').strip().split('\n'): try: video_info = json.loads(line) break except json.JSONDecodeError: continue if not video_info: # Try to find downloaded file files = list(output_dir.glob('*.mp4')) if files: file_path = max(files, key=lambda f: f.stat().st_mtime) return { 'success': True, 'file_path': str(file_path), 'filename': file_path.name, 'file_size': file_path.stat().st_size } return {'success': False, 'error': 'Could not find downloaded file'} file_path = video_info.get('_filename') or video_info.get('filename') if file_path: file_path = Path(file_path) return { 'success': True, 'file_path': str(file_path) if file_path else None, 'filename': file_path.name if file_path else None, 'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'), 'title': video_info.get('title'), 'duration': video_info.get('duration'), 'video_id': video_info.get('id'), 'upload_date': video_info.get('upload_date'), 'timestamp': video_info.get('timestamp'), 'thumbnail': video_info.get('thumbnail'), } except Exception as e: self.log(f"Error downloading video: {e}", 'error') return {'success': False, 'error': str(e)} async def get_profile_page(self, url: str) -> Optional[str]: """Fetch profile page HTML via aiohttp (with cookies if available). Results are cached to avoid re-fetching for avatar/banner/bio.""" # Strip /videos suffix for profile page base_url = re.sub(r'/videos/?$', '', url) if base_url in self._profile_page_cache: return self._profile_page_cache[base_url] try: import aiohttp headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', } # Build simple cookies dict for the session cookies_dict = {} cookies_list = self._get_cookies_list() if cookies_list: for cookie in cookies_list: name = cookie.get('name', '') value = cookie.get('value', '') if name: cookies_dict[name] = value async with aiohttp.ClientSession(cookies=cookies_dict) as session: async with session.get( base_url, headers=headers, timeout=aiohttp.ClientTimeout(total=15) ) as resp: if resp.status == 200: text = await resp.text() self._profile_page_cache[base_url] = text return text except Exception as e: self.log(f"Could not fetch profile page: {e}", 'debug') self._profile_page_cache[base_url] = None return None async def get_profile_image(self, url: str) -> Optional[str]: """Scrape profile page for avatar/photo URL""" try: page_html = await self.get_profile_page(url) if not page_html: return None # Look for avatar image: avatar_match = re.search(r']*id=["\']getAvatar["\'][^>]*src=["\']([^"\']+)["\']', page_html) if avatar_match: self.log("Found Pornhub profile avatar", 'debug') return avatar_match.group(1) # Try og:image meta tag og_match = re.search(r' Optional[str]: """Scrape bio/about section from profile page""" try: page_html = await self.get_profile_page(url) if not page_html: return None # Look for aboutMeSection -> div with the actual text # Structure:
About Name
Bio text
about_match = re.search( r']*>.*?
[^<]*
\s*
\s*(.*?)\s*
', page_html, re.DOTALL ) if about_match: bio_text = re.sub(r'<[^>]+>', '', about_match.group(1)).strip() if bio_text: self.log("Found Pornhub profile bio", 'debug') return html_module.unescape(bio_text) # Fallback: look for biographyAbout section bio_match = re.search( r'class="biographyAbout[^"]*"[^>]*>.*?
(.*?)
', page_html, re.DOTALL ) if bio_match: bio_text = re.sub(r'<[^>]+>', '', bio_match.group(1)).strip() if bio_text: self.log("Found Pornhub profile bio (fallback)", 'debug') return html_module.unescape(bio_text) except Exception as e: self.log(f"Could not fetch profile bio: {e}", 'debug') return None async def get_profile_banner(self, url: str) -> Optional[str]: """Scrape banner/cover image if available""" try: page_html = await self.get_profile_page(url) if not page_html: return None # Look for cover image: cover_match = re.search( r']*id=["\']coverPictureDefault["\'][^>]*src=["\']([^"\']+)["\']', page_html ) if cover_match: self.log("Found Pornhub profile banner", 'debug') return cover_match.group(1) # Fallback: any img inside coverImage div cover_match = re.search( r'
\s*]*src=["\']([^"\']+)["\']', page_html, re.DOTALL ) if cover_match: self.log("Found Pornhub profile banner (div)", 'debug') return cover_match.group(1) except Exception as e: self.log(f"Could not fetch profile banner: {e}", 'debug') return None async def get_profile_info(self, url: str) -> Optional[Dict]: """Scrape all profile info from the page in one pass""" page_html = await self.get_profile_page(url) if not page_html: return None info = {} # Extract infoPiece data (Gender, Birth Place, Height, etc.) info_pieces = re.findall( r'
\s*\s*(.*?)\s*\s*(.*?)\s*
', page_html, re.DOTALL ) for label, value in info_pieces: label = re.sub(r'<[^>]+>', '', label).strip().rstrip(':') value = re.sub(r'<[^>]+>', '', value).strip() if label and value: info[label.lower().replace(' ', '_')] = value return info if info else None async def get_joined_date(self, url: str) -> Optional[str]: """Extract a joined/career start date from profile info""" try: profile_info = await self.get_profile_info(url) if not profile_info: return None # Pornstar pages have "Career Start and End: 2011 to Present" career = profile_info.get('career_start_and_end') if career: # Extract start year: "2011 to Present" -> "2011" match = re.match(r'(\d{4})', career) if match: return match.group(1) # User/model pages might not have career info but could have other dates return None except Exception as e: self.log(f"Could not get joined date: {e}", 'debug') return None async def get_creator(self, url: str) -> Optional[Creator]: """ Get Creator object from creator URL """ info = await self.get_creator_info(url) if not info: return None # Build creator_id as 'type/name' format creator_type_id = self.extract_creator_id(url) if creator_type_id: creator_id = f"{creator_type_id[0]}/{creator_type_id[1]}" else: creator_id = info.get('creator_id', '') # Profile image is already fetched during get_creator_info (page was cached) profile_image = await self.get_profile_image(url) return Creator( creator_id=creator_id, service_id='pornhub', platform='pornhub', username=info.get('creator_name', 'Unknown'), display_name=info.get('creator_name'), profile_image_url=profile_image, ) async def get_posts(self, url: str, since_date: str = None, max_videos: int = None, progress_callback=None) -> List[Post]: """ Get videos as Post objects """ videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback) # Get creator_id from URL creator_type_id = self.extract_creator_id(url) creator_id = f"{creator_type_id[0]}/{creator_type_id[1]}" if creator_type_id else '' posts = [] for video in videos: # Create attachment for the video attachment = Attachment( name=f"{video['title']}.mp4", file_type='video', extension='.mp4', server_path=video['url'], download_url=video['url'], duration=video.get('duration'), ) post = Post( post_id=video['video_id'], service_id='pornhub', platform='pornhub', creator_id=creator_id, title=video['title'], content=video.get('description') or video['title'], published_at=video.get('upload_date'), attachments=[attachment], ) posts.append(post) return posts