755
modules/paid_content/pornhub_client.py
Normal file
755
modules/paid_content/pornhub_client.py
Normal file
@@ -0,0 +1,755 @@
|
||||
"""
|
||||
Pornhub Client - Fetches creator info and videos using yt-dlp
|
||||
"""
|
||||
|
||||
import asyncio
import html as html_module
import json
import os
import re
import shutil
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment
|
||||
|
||||
|
||||
class PornhubClient(LoggingMixin):
    """
    Client for fetching Pornhub creator information and videos using yt-dlp

    Supports:
    - Pornstar pages (pornhub.com/pornstar/name)
    - Channel pages (pornhub.com/channels/name)
    - User pages (pornhub.com/users/name)
    - Model pages (pornhub.com/model/name)
    """

    # Identifiers stamped onto Creator/Post records built by this client
    SERVICE_ID = 'pornhub'
    PLATFORM = 'pornhub'

    # Quality presets for yt-dlp
    # Pornhub serves single combined streams with IDs like '1080p', '720p', etc.
    # NOT separate video+audio streams like YouTube
    # NOTE(review): the 'bestvideo+bestaudio' selectors below contradict the
    # note above; they only work via the trailing '/best...' fallbacks when no
    # separate streams exist — confirm against yt-dlp's Pornhub extractor.
    QUALITY_PRESETS = {
        'best': 'bestvideo+bestaudio/best',
        '1080p': 'bestvideo[height<=1080]+bestaudio/best[height<=1080]/best',
        '720p': 'bestvideo[height<=720]+bestaudio/best[height<=720]/best',
        '480p': 'bestvideo[height<=480]+bestaudio/best[height<=480]/best',
    }
|
||||
|
||||
    def __init__(self, ytdlp_path: Optional[str] = None, unified_db=None, log_callback=None):
        """Initialize the client.

        Args:
            ytdlp_path: Explicit path to the yt-dlp executable; auto-detected
                via _find_ytdlp() when not given.
            unified_db: Handle exposing get_connection(); used to load stored
                Pornhub cookies from the 'scrapers' table. Optional.
            log_callback: Forwarded to LoggingMixin's logger setup.
        """
        self._init_logger('PaidContent', log_callback, default_module='Pornhub')

        # Find yt-dlp executable
        self.ytdlp_path = ytdlp_path or self._find_ytdlp()
        if not self.ytdlp_path:
            self.log("yt-dlp not found, Pornhub support will be disabled", 'warning')

        # Store database reference for cookie access
        self.unified_db = unified_db
        # Lazily-created Netscape-format cookie file (see _get_cookies_file)
        self._cookies_file = None

        # Cache for profile page HTML (avoid re-fetching for avatar/banner/bio)
        self._profile_page_cache: Dict[str, Optional[str]] = {}
|
||||
|
||||
def _find_ytdlp(self) -> Optional[str]:
|
||||
"""Find yt-dlp executable"""
|
||||
common_paths = [
|
||||
'/opt/media-downloader/venv/bin/yt-dlp',
|
||||
'/usr/local/bin/yt-dlp',
|
||||
'/usr/bin/yt-dlp',
|
||||
'/opt/homebrew/bin/yt-dlp',
|
||||
os.path.expanduser('~/.local/bin/yt-dlp'),
|
||||
]
|
||||
|
||||
for path in common_paths:
|
||||
if os.path.isfile(path) and os.access(path, os.X_OK):
|
||||
return path
|
||||
|
||||
try:
|
||||
result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
    def is_available(self) -> bool:
        """Check if yt-dlp is available (an executable path was resolved)."""
        return self.ytdlp_path is not None
|
||||
|
||||
def _get_cookies_file(self) -> Optional[str]:
|
||||
"""Get path to cookies file, creating it from database if needed"""
|
||||
if self._cookies_file and os.path.exists(self._cookies_file):
|
||||
return self._cookies_file
|
||||
|
||||
if not self.unified_db:
|
||||
return None
|
||||
|
||||
try:
|
||||
with self.unified_db.get_connection() as conn:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
|
||||
row = cursor.fetchone()
|
||||
if row and row[0]:
|
||||
data = json.loads(row[0])
|
||||
# Support both {"cookies": [...]} and [...] formats
|
||||
if isinstance(data, dict) and 'cookies' in data:
|
||||
cookies_list = data['cookies']
|
||||
elif isinstance(data, list):
|
||||
cookies_list = data
|
||||
else:
|
||||
cookies_list = []
|
||||
|
||||
if cookies_list:
|
||||
# Write cookies to temp file in Netscape format
|
||||
fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='pornhub_cookies_')
|
||||
with os.fdopen(fd, 'w') as f:
|
||||
f.write("# Netscape HTTP Cookie File\n")
|
||||
for cookie in cookies_list:
|
||||
domain = cookie.get('domain', '')
|
||||
include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
|
||||
path = cookie.get('path', '/')
|
||||
secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
|
||||
expiry = str(int(cookie.get('expirationDate', 0)))
|
||||
name = cookie.get('name', '')
|
||||
value = cookie.get('value', '')
|
||||
f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
|
||||
self.log(f"Loaded {len(cookies_list)} cookies from pornhub scraper", 'debug')
|
||||
return self._cookies_file
|
||||
except Exception as e:
|
||||
self.log(f"Could not load cookies: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
    def _get_cookies_list(self) -> Optional[list]:
        """Get cookies as a list of dicts for aiohttp requests.

        Reads the 'pornhub' row from the scrapers table and accepts either
        the {"cookies": [...]} wrapper or a bare [...] JSON list.

        Returns:
            List of cookie dicts, or None when there is no DB, no stored
            cookies, or loading fails (errors are logged at debug level).
        """
        if not self.unified_db:
            return None

        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('pornhub',))
                row = cursor.fetchone()
                if row and row[0]:
                    data = json.loads(row[0])
                    # Support both {"cookies": [...]} and [...] formats
                    if isinstance(data, dict) and 'cookies' in data:
                        return data['cookies']
                    elif isinstance(data, list):
                        return data
        except Exception as e:
            self.log(f"Could not load cookies list: {e}", 'debug')

        return None
|
||||
|
||||
def _get_base_cmd(self) -> List[str]:
|
||||
"""Get base yt-dlp command with cookies if available"""
|
||||
cmd = [self.ytdlp_path]
|
||||
cookies_file = self._get_cookies_file()
|
||||
if cookies_file:
|
||||
cmd.extend(['--cookies', cookies_file])
|
||||
return cmd
|
||||
|
||||
def cleanup(self):
|
||||
"""Clean up temporary files"""
|
||||
if self._cookies_file and os.path.exists(self._cookies_file):
|
||||
try:
|
||||
os.unlink(self._cookies_file)
|
||||
except Exception:
|
||||
pass
|
||||
self._cookies_file = None
|
||||
self._profile_page_cache.clear()
|
||||
|
||||
@staticmethod
|
||||
def extract_creator_id(url: str) -> Optional[Tuple[str, str]]:
|
||||
"""
|
||||
Extract creator type and identifier from Pornhub URL
|
||||
|
||||
Returns:
|
||||
Tuple of (type, id) where type is 'pornstar', 'channels', 'users', or 'model'
|
||||
or None if not a valid Pornhub creator URL
|
||||
"""
|
||||
patterns = [
|
||||
(r'pornhub\.com/pornstar/([a-zA-Z0-9_-]+)', 'pornstar'),
|
||||
(r'pornhub\.com/channels/([a-zA-Z0-9_-]+)', 'channels'),
|
||||
(r'pornhub\.com/users/([a-zA-Z0-9_-]+)', 'users'),
|
||||
(r'pornhub\.com/model/([a-zA-Z0-9_-]+)', 'model'),
|
||||
]
|
||||
|
||||
for pattern, creator_type in patterns:
|
||||
match = re.search(pattern, url)
|
||||
if match:
|
||||
return (creator_type, match.group(1))
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def normalize_creator_url(creator_id: str, creator_type: str = 'pornstar') -> str:
|
||||
"""Convert creator ID to a consistent URL format
|
||||
|
||||
Args:
|
||||
creator_id: Creator name/identifier (may be 'type/name' format)
|
||||
creator_type: Default type if not embedded in creator_id
|
||||
"""
|
||||
# Already a full URL
|
||||
if creator_id.startswith('http://') or creator_id.startswith('https://'):
|
||||
return creator_id
|
||||
|
||||
# Handle 'type/name' format from URL parser
|
||||
if '/' in creator_id:
|
||||
parts = creator_id.split('/', 1)
|
||||
creator_type = parts[0]
|
||||
creator_id = parts[1]
|
||||
|
||||
return f"https://www.pornhub.com/{creator_type}/{creator_id}"
|
||||
|
||||
def _get_listing_url(self, url: str) -> str:
|
||||
"""Get the URL to use for listing videos from a creator page.
|
||||
|
||||
For pornstars and models, append /videos to get the video listing.
|
||||
For channels and users, the base URL already lists videos.
|
||||
"""
|
||||
# Parse out the type
|
||||
parsed = self.extract_creator_id(url)
|
||||
if parsed:
|
||||
creator_type, _ = parsed
|
||||
if creator_type in ('pornstar', 'model'):
|
||||
# Strip any trailing slash and append /videos
|
||||
url = url.rstrip('/')
|
||||
if not url.endswith('/videos'):
|
||||
url = f"{url}/videos"
|
||||
return url
|
||||
|
||||
    async def get_creator_info(self, url: str) -> Optional[Dict]:
        """
        Get creator information using yt-dlp + profile page scraping

        Resolution order for the display name:
        1. scrape <h1> from the profile page (cached by get_profile_page),
        2. ask yt-dlp for the first playlist entry's channel/uploader,
        3. derive a Title-Cased name from the URL slug.

        Returns dict with creator metadata (creator_id, creator_name,
        creator_url, creator_type) or None if not found
        """
        if not self.is_available():
            return None

        creator_type_id = self.extract_creator_id(url)
        creator_type = creator_type_id[0] if creator_type_id else 'pornstar'

        # Try to scrape the display name from the profile page first
        creator_name = None
        try:
            page_html = await self.get_profile_page(url)
            if page_html:
                # Look for <h1 itemprop="name">Name</h1> inside nameSubscribe div
                name_match = re.search(r'<div class="nameSubscribe">.*?<h1[^>]*>\s*(.+?)\s*</h1>', page_html, re.DOTALL)
                if name_match:
                    creator_name = html_module.unescape(name_match.group(1).strip())
                    self.log(f"Found creator name from profile page: {creator_name}", 'debug')
        except Exception as e:
            self.log(f"Could not scrape creator name: {e}", 'debug')

        # If page scraping didn't find a name, try yt-dlp
        if not creator_name:
            try:
                listing_url = self._get_listing_url(url)

                # Fetch only the first entry, flat, just for its metadata
                cmd = self._get_base_cmd() + [
                    '--no-warnings',
                    '--flat-playlist',
                    '-j',
                    '--playlist-items', '1',
                    listing_url
                ]

                result = await asyncio.create_subprocess_exec(
                    *cmd,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE
                )

                stdout, stderr = await result.communicate()

                if result.returncode == 0:
                    for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                        if not line:
                            continue
                        try:
                            data = json.loads(line)
                            # Playlist title looks like "Name - Videos"
                            playlist_title = data.get('playlist_title') or ''
                            creator_name = (data.get('channel') or data.get('uploader')
                                            or playlist_title.replace(' - Videos', '') or None)
                            if creator_name:
                                creator_name = html_module.unescape(creator_name)
                            break
                        except json.JSONDecodeError:
                            continue
            except Exception as e:
                self.log(f"yt-dlp creator info failed: {e}", 'debug')

        # Fall back to deriving name from URL slug
        if not creator_name and creator_type_id:
            creator_name = creator_type_id[1].replace('-', ' ').title()

        if creator_name:
            return {
                'creator_id': creator_type_id[1] if creator_type_id else None,
                'creator_name': creator_name,
                'creator_url': url,
                'creator_type': creator_type,
            }

        return None
|
||||
|
||||
async def get_creator_videos(self, url: str, since_date: str = None,
|
||||
max_videos: int = None,
|
||||
progress_callback=None) -> List[Dict]:
|
||||
"""
|
||||
Get all videos from a creator page using --flat-playlist for speed.
|
||||
|
||||
Args:
|
||||
url: Pornhub creator URL
|
||||
since_date: Only fetch videos published after this date (ISO format)
|
||||
max_videos: Maximum number of videos to fetch
|
||||
progress_callback: Callback function(count) for progress updates
|
||||
|
||||
Returns:
|
||||
List of video metadata dicts
|
||||
"""
|
||||
if not self.is_available():
|
||||
return []
|
||||
|
||||
try:
|
||||
listing_url = self._get_listing_url(url)
|
||||
|
||||
# Use --flat-playlist for fast listing (avoids per-video HTTP requests)
|
||||
cmd = self._get_base_cmd() + [
|
||||
'--no-warnings',
|
||||
'--flat-playlist',
|
||||
'-j',
|
||||
'--socket-timeout', '30',
|
||||
'--retries', '3',
|
||||
listing_url
|
||||
]
|
||||
|
||||
if max_videos:
|
||||
cmd.extend(['--playlist-items', f'1:{max_videos}'])
|
||||
|
||||
self.log(f"Fetching videos from: {url}", 'info')
|
||||
|
||||
result = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
|
||||
stdout, stderr = await result.communicate()
|
||||
|
||||
if result.returncode != 0:
|
||||
error = stderr.decode('utf-8', errors='replace')
|
||||
self.log(f"Failed to get creator videos: {error}", 'warning')
|
||||
return []
|
||||
|
||||
videos = []
|
||||
for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
|
||||
# Skip non-video entries
|
||||
if data.get('_type') == 'playlist':
|
||||
continue
|
||||
|
||||
video_id = data.get('id')
|
||||
if not video_id:
|
||||
continue
|
||||
|
||||
# Flat-playlist doesn't provide upload_date for Pornhub, but check anyway
|
||||
upload_date = data.get('upload_date')
|
||||
if upload_date:
|
||||
try:
|
||||
upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Decode HTML entities in title (flat-playlist returns them encoded)
|
||||
title = html_module.unescape(data.get('title', f'Video {video_id}'))
|
||||
|
||||
# Build video URL
|
||||
video_url = (data.get('webpage_url') or data.get('url')
|
||||
or f"https://www.pornhub.com/view_video.php?viewkey={video_id}")
|
||||
|
||||
videos.append({
|
||||
'video_id': video_id,
|
||||
'title': title,
|
||||
'description': data.get('description', ''),
|
||||
'upload_date': upload_date,
|
||||
'duration': data.get('duration'),
|
||||
'view_count': data.get('view_count'),
|
||||
'thumbnail': data.get('thumbnail'),
|
||||
'url': video_url,
|
||||
})
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(len(videos))
|
||||
|
||||
if max_videos and len(videos) >= max_videos:
|
||||
break
|
||||
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
self.log(f"Found {len(videos)} videos", 'info')
|
||||
return videos
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error getting creator videos: {e}", 'error')
|
||||
return []
|
||||
|
||||
    async def download_video(self, video_url: str, output_dir: Path, quality: str = 'best',
                             progress_callback=None) -> Dict:
        """
        Download a video

        Args:
            video_url: Pornhub video URL
            output_dir: Directory to save the video
            quality: Quality preset (key into QUALITY_PRESETS; unknown keys
                fall back to 'best')
            progress_callback: Callback for download progress
                NOTE(review): currently accepted but never invoked here —
                confirm whether streaming progress parsing is planned.

        Returns:
            Dict with success status and file info
        """
        if not self.is_available():
            return {'success': False, 'error': 'yt-dlp not available'}

        try:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

            # Title capped at 100 chars; id suffix keeps filenames unique
            output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s')

            format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])

            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '-f', format_str,
                '-o', output_template,
                '--print-json',
                '--no-playlist',
                '--user-agent', 'Mozilla/5.0',
                '--referer', 'https://www.pornhub.com/',
                '--merge-output-format', 'mp4',
                '--concurrent-fragments', '4',
                '--no-part',
                '--retries', '20',
                video_url
            ]

            self.log(f"Downloading video: {video_url}", 'debug')

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            if result.returncode != 0:
                # Map common yt-dlp failures to short, user-facing messages
                error_msg = stderr.decode('utf-8', errors='replace').strip()
                if 'Video unavailable' in error_msg or 'not available' in error_msg:
                    error_msg = 'Video unavailable or private'
                elif 'premium' in error_msg.lower():
                    error_msg = 'Video requires premium access'
                elif len(error_msg) > 200:
                    error_msg = error_msg[:200] + '...'

                return {'success': False, 'error': error_msg}

            # Parse output JSON (--print-json emits one metadata object)
            video_info = None
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                try:
                    video_info = json.loads(line)
                    break
                except json.JSONDecodeError:
                    continue

            if not video_info:
                # Try to find downloaded file (newest .mp4 in the output dir)
                files = list(output_dir.glob('*.mp4'))
                if files:
                    file_path = max(files, key=lambda f: f.stat().st_mtime)
                    return {
                        'success': True,
                        'file_path': str(file_path),
                        'filename': file_path.name,
                        'file_size': file_path.stat().st_size
                    }
                return {'success': False, 'error': 'Could not find downloaded file'}

            file_path = video_info.get('_filename') or video_info.get('filename')
            if file_path:
                file_path = Path(file_path)

            return {
                'success': True,
                'file_path': str(file_path) if file_path else None,
                'filename': file_path.name if file_path else None,
                'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'),
                'title': video_info.get('title'),
                'duration': video_info.get('duration'),
                'video_id': video_info.get('id'),
                'upload_date': video_info.get('upload_date'),
                'timestamp': video_info.get('timestamp'),
                'thumbnail': video_info.get('thumbnail'),
            }

        except Exception as e:
            self.log(f"Error downloading video: {e}", 'error')
            return {'success': False, 'error': str(e)}
|
||||
|
||||
    async def get_profile_page(self, url: str) -> Optional[str]:
        """Fetch profile page HTML via aiohttp (with cookies if available).

        Results are cached per base URL — both successes and failures (the
        latter as None) — to avoid re-fetching for avatar/banner/bio."""
        # Strip /videos suffix for profile page
        base_url = re.sub(r'/videos/?$', '', url)

        if base_url in self._profile_page_cache:
            return self._profile_page_cache[base_url]

        try:
            # Imported lazily so the client works without aiohttp installed
            # until a profile page is actually requested
            import aiohttp

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
            }

            # Build simple cookies dict for the session
            cookies_dict = {}
            cookies_list = self._get_cookies_list()
            if cookies_list:
                for cookie in cookies_list:
                    name = cookie.get('name', '')
                    value = cookie.get('value', '')
                    if name:
                        cookies_dict[name] = value

            async with aiohttp.ClientSession(cookies=cookies_dict) as session:
                async with session.get(
                    base_url,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=15)
                ) as resp:
                    if resp.status == 200:
                        text = await resp.text()
                        self._profile_page_cache[base_url] = text
                        return text

        except Exception as e:
            self.log(f"Could not fetch profile page: {e}", 'debug')

        # Non-200 responses and errors are negatively cached as None
        self._profile_page_cache[base_url] = None
        return None
|
||||
|
||||
async def get_profile_image(self, url: str) -> Optional[str]:
|
||||
"""Scrape profile page for avatar/photo URL"""
|
||||
try:
|
||||
page_html = await self.get_profile_page(url)
|
||||
if not page_html:
|
||||
return None
|
||||
|
||||
# Look for avatar image: <img id="getAvatar" src="...">
|
||||
avatar_match = re.search(r'<img[^>]*id=["\']getAvatar["\'][^>]*src=["\']([^"\']+)["\']', page_html)
|
||||
if avatar_match:
|
||||
self.log("Found Pornhub profile avatar", 'debug')
|
||||
return avatar_match.group(1)
|
||||
|
||||
# Try og:image meta tag
|
||||
og_match = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_html)
|
||||
if not og_match:
|
||||
og_match = re.search(r'<meta\s+content="([^"]+)"\s+property="og:image"', page_html)
|
||||
if og_match:
|
||||
return og_match.group(1)
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Could not fetch profile image: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
async def get_profile_bio(self, url: str) -> Optional[str]:
|
||||
"""Scrape bio/about section from profile page"""
|
||||
try:
|
||||
page_html = await self.get_profile_page(url)
|
||||
if not page_html:
|
||||
return None
|
||||
|
||||
# Look for aboutMeSection -> div with the actual text
|
||||
# Structure: <section class="aboutMeSection ..."><div class="title">About Name</div><div>Bio text</div></section>
|
||||
about_match = re.search(
|
||||
r'<section\s+class="aboutMeSection[^"]*"[^>]*>.*?<div class="title">[^<]*</div>\s*<div>\s*(.*?)\s*</div>',
|
||||
page_html, re.DOTALL
|
||||
)
|
||||
if about_match:
|
||||
bio_text = re.sub(r'<[^>]+>', '', about_match.group(1)).strip()
|
||||
if bio_text:
|
||||
self.log("Found Pornhub profile bio", 'debug')
|
||||
return html_module.unescape(bio_text)
|
||||
|
||||
# Fallback: look for biographyAbout section
|
||||
bio_match = re.search(
|
||||
r'class="biographyAbout[^"]*"[^>]*>.*?<div class="content[^"]*">(.*?)</div>',
|
||||
page_html, re.DOTALL
|
||||
)
|
||||
if bio_match:
|
||||
bio_text = re.sub(r'<[^>]+>', '', bio_match.group(1)).strip()
|
||||
if bio_text:
|
||||
self.log("Found Pornhub profile bio (fallback)", 'debug')
|
||||
return html_module.unescape(bio_text)
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Could not fetch profile bio: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
async def get_profile_banner(self, url: str) -> Optional[str]:
|
||||
"""Scrape banner/cover image if available"""
|
||||
try:
|
||||
page_html = await self.get_profile_page(url)
|
||||
if not page_html:
|
||||
return None
|
||||
|
||||
# Look for cover image: <img id="coverPictureDefault" src="...">
|
||||
cover_match = re.search(
|
||||
r'<img[^>]*id=["\']coverPictureDefault["\'][^>]*src=["\']([^"\']+)["\']',
|
||||
page_html
|
||||
)
|
||||
if cover_match:
|
||||
self.log("Found Pornhub profile banner", 'debug')
|
||||
return cover_match.group(1)
|
||||
|
||||
# Fallback: any img inside coverImage div
|
||||
cover_match = re.search(
|
||||
r'<div class="coverImage">\s*<img[^>]*src=["\']([^"\']+)["\']',
|
||||
page_html, re.DOTALL
|
||||
)
|
||||
if cover_match:
|
||||
self.log("Found Pornhub profile banner (div)", 'debug')
|
||||
return cover_match.group(1)
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Could not fetch profile banner: {e}", 'debug')
|
||||
|
||||
return None
|
||||
|
||||
async def get_profile_info(self, url: str) -> Optional[Dict]:
|
||||
"""Scrape all profile info from the page in one pass"""
|
||||
page_html = await self.get_profile_page(url)
|
||||
if not page_html:
|
||||
return None
|
||||
|
||||
info = {}
|
||||
|
||||
# Extract infoPiece data (Gender, Birth Place, Height, etc.)
|
||||
info_pieces = re.findall(
|
||||
r'<div class="infoPiece">\s*<span>\s*(.*?)\s*</span>\s*(.*?)\s*</div>',
|
||||
page_html, re.DOTALL
|
||||
)
|
||||
for label, value in info_pieces:
|
||||
label = re.sub(r'<[^>]+>', '', label).strip().rstrip(':')
|
||||
value = re.sub(r'<[^>]+>', '', value).strip()
|
||||
if label and value:
|
||||
info[label.lower().replace(' ', '_')] = value
|
||||
|
||||
return info if info else None
|
||||
|
||||
async def get_joined_date(self, url: str) -> Optional[str]:
|
||||
"""Extract a joined/career start date from profile info"""
|
||||
try:
|
||||
profile_info = await self.get_profile_info(url)
|
||||
if not profile_info:
|
||||
return None
|
||||
|
||||
# Pornstar pages have "Career Start and End: 2011 to Present"
|
||||
career = profile_info.get('career_start_and_end')
|
||||
if career:
|
||||
# Extract start year: "2011 to Present" -> "2011"
|
||||
match = re.match(r'(\d{4})', career)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
# User/model pages might not have career info but could have other dates
|
||||
return None
|
||||
except Exception as e:
|
||||
self.log(f"Could not get joined date: {e}", 'debug')
|
||||
return None
|
||||
|
||||
async def get_creator(self, url: str) -> Optional[Creator]:
|
||||
"""
|
||||
Get Creator object from creator URL
|
||||
"""
|
||||
info = await self.get_creator_info(url)
|
||||
if not info:
|
||||
return None
|
||||
|
||||
# Build creator_id as 'type/name' format
|
||||
creator_type_id = self.extract_creator_id(url)
|
||||
if creator_type_id:
|
||||
creator_id = f"{creator_type_id[0]}/{creator_type_id[1]}"
|
||||
else:
|
||||
creator_id = info.get('creator_id', '')
|
||||
|
||||
# Profile image is already fetched during get_creator_info (page was cached)
|
||||
profile_image = await self.get_profile_image(url)
|
||||
|
||||
return Creator(
|
||||
creator_id=creator_id,
|
||||
service_id='pornhub',
|
||||
platform='pornhub',
|
||||
username=info.get('creator_name', 'Unknown'),
|
||||
display_name=info.get('creator_name'),
|
||||
profile_image_url=profile_image,
|
||||
)
|
||||
|
||||
async def get_posts(self, url: str, since_date: str = None,
|
||||
max_videos: int = None, progress_callback=None) -> List[Post]:
|
||||
"""
|
||||
Get videos as Post objects
|
||||
"""
|
||||
videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)
|
||||
|
||||
# Get creator_id from URL
|
||||
creator_type_id = self.extract_creator_id(url)
|
||||
creator_id = f"{creator_type_id[0]}/{creator_type_id[1]}" if creator_type_id else ''
|
||||
|
||||
posts = []
|
||||
for video in videos:
|
||||
# Create attachment for the video
|
||||
attachment = Attachment(
|
||||
name=f"{video['title']}.mp4",
|
||||
file_type='video',
|
||||
extension='.mp4',
|
||||
server_path=video['url'],
|
||||
download_url=video['url'],
|
||||
duration=video.get('duration'),
|
||||
)
|
||||
|
||||
post = Post(
|
||||
post_id=video['video_id'],
|
||||
service_id='pornhub',
|
||||
platform='pornhub',
|
||||
creator_id=creator_id,
|
||||
title=video['title'],
|
||||
content=video.get('description') or video['title'],
|
||||
published_at=video.get('upload_date'),
|
||||
attachments=[attachment],
|
||||
)
|
||||
posts.append(post)
|
||||
|
||||
return posts
|
||||
Reference in New Issue
Block a user