1415 lines
58 KiB
Python
1415 lines
58 KiB
Python
"""
|
|
XHamster Client - Fetches creator info and videos using yt-dlp
|
|
|
|
Supports:
|
|
- Creator profiles (xhamster.com/creators/name)
|
|
- Channels (xhamster.com/channels/name)
|
|
- Shorts (xhamster.com/creators/name/shorts)
|
|
- Photo galleries (xhamster.com/creators/name/photos)
|
|
"""
|
|
|
|
import asyncio
|
|
import html as html_module
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import tempfile
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from .models import Creator, Post, Attachment
|
|
|
|
|
|
class XHamsterClient(LoggingMixin):
|
|
"""
|
|
Client for fetching XHamster creator information and videos using yt-dlp
|
|
|
|
Supports:
|
|
- Creator pages (xhamster.com/creators/name)
|
|
- Channel pages (xhamster.com/channels/name)
|
|
- Creator shorts (xhamster.com/creators/name/shorts)
|
|
"""
|
|
|
|
SERVICE_ID = 'xhamster'
|
|
PLATFORM = 'xhamster'
|
|
|
|
QUALITY_PRESETS = {
|
|
'best': 'bestvideo+bestaudio/best',
|
|
'1080p': 'bestvideo[height<=1080]+bestaudio/best[height<=1080]/best',
|
|
'720p': 'bestvideo[height<=720]+bestaudio/best[height<=720]/best',
|
|
'480p': 'bestvideo[height<=480]+bestaudio/best[height<=480]/best',
|
|
}
|
|
|
|
def __init__(self, ytdlp_path: Optional[str] = None, unified_db=None, log_callback=None):
    """Initialize the client.

    Args:
        ytdlp_path: Explicit path to the yt-dlp binary; auto-discovered
            via _find_ytdlp() when None.
        unified_db: Optional database handle used to load stored cookies
            (see _get_cookies_file).
        log_callback: Optional callable forwarded to the logging mixin.
    """
    self._init_logger('PaidContent', log_callback, default_module='XHamster')

    self.ytdlp_path = ytdlp_path or self._find_ytdlp()
    if not self.ytdlp_path:
        self.log("yt-dlp not found, XHamster support will be disabled", 'warning')

    self.unified_db = unified_db
    # Lazily created Netscape-format cookie file; removed by cleanup().
    self._cookies_file: Optional[str] = None
    # Cache of profile page HTML keyed by URL; cleared by cleanup().
    self._profile_page_cache: Dict[str, Optional[str]] = {}
|
|
|
|
def _find_ytdlp(self) -> Optional[str]:
    """Locate the yt-dlp executable.

    Probes a list of well-known install locations first, then falls back
    to a PATH search via shutil.which.  (The previous implementation
    spawned `which yt-dlp` via subprocess, which is slower and does not
    work on Windows; shutil.which is the portable stdlib equivalent.)

    Returns:
        Absolute path to an executable yt-dlp, or None if not found.
    """
    import shutil

    common_paths = [
        '/opt/media-downloader/venv/bin/yt-dlp',
        '/usr/local/bin/yt-dlp',
        '/usr/bin/yt-dlp',
        '/opt/homebrew/bin/yt-dlp',
        os.path.expanduser('~/.local/bin/yt-dlp'),
    ]

    for path in common_paths:
        if os.path.isfile(path) and os.access(path, os.X_OK):
            return path

    # Fall back to searching PATH (returns None when absent).
    return shutil.which('yt-dlp')
|
|
|
|
def is_available(self) -> bool:
    """Return True when a usable yt-dlp binary has been located."""
    has_binary = self.ytdlp_path is not None
    return has_binary
|
|
|
|
def _get_cookies_file(self) -> Optional[str]:
    """Get path to cookies file, creating it from database if needed.

    Reads the JSON cookie blob stored for the 'xhamster' row of the
    scrapers table and materializes it as a Netscape-format cookie file
    (the format yt-dlp's --cookies flag expects).  The temp file path is
    cached on the instance and removed by cleanup().

    Returns:
        Path to the cookie file, or None when no cookies are available.
    """
    # Reuse the previously written temp file if it still exists.
    if self._cookies_file and os.path.exists(self._cookies_file):
        return self._cookies_file

    if not self.unified_db:
        return None

    try:
        with self.unified_db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('xhamster',))
            row = cursor.fetchone()
            if row and row[0]:
                data = json.loads(row[0])
                # Accept either {'cookies': [...]} or a bare list.
                if isinstance(data, dict) and 'cookies' in data:
                    cookies_list = data['cookies']
                elif isinstance(data, list):
                    cookies_list = data
                else:
                    cookies_list = []

                if cookies_list:
                    fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='xhamster_cookies_')
                    with os.fdopen(fd, 'w') as f:
                        # Header line required by the Netscape cookie format.
                        f.write("# Netscape HTTP Cookie File\n")
                        for cookie in cookies_list:
                            domain = cookie.get('domain', '')
                            # A leading dot means the cookie covers subdomains.
                            include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                            path = cookie.get('path', '/')
                            secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                            # Missing expiry becomes 0 (treated as a session cookie).
                            expiry = str(int(cookie.get('expirationDate', 0)))
                            name = cookie.get('name', '')
                            value = cookie.get('value', '')
                            f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
                    self.log(f"Loaded {len(cookies_list)} cookies from xhamster scraper", 'debug')
                    return self._cookies_file
    except Exception as e:
        # Best-effort: downloads still work without cookies.
        self.log(f"Could not load cookies: {e}", 'debug')

    return None
|
|
|
|
def _get_base_cmd(self) -> List[str]:
|
|
"""Get base yt-dlp command with cookies if available"""
|
|
cmd = [self.ytdlp_path]
|
|
cookies_file = self._get_cookies_file()
|
|
if cookies_file:
|
|
cmd.extend(['--cookies', cookies_file])
|
|
return cmd
|
|
|
|
def cleanup(self):
    """Remove the temporary cookie file and drop cached profile-page HTML."""
    cookie_path = self._cookies_file
    if cookie_path and os.path.exists(cookie_path):
        try:
            os.unlink(cookie_path)
        except Exception:
            # Best-effort removal; a leftover temp file is harmless.
            pass
    self._cookies_file = None
    self._profile_page_cache.clear()
|
|
|
|
@staticmethod
|
|
def extract_creator_id(url: str) -> Optional[Tuple[str, str]]:
|
|
"""
|
|
Extract creator type and identifier from XHamster URL
|
|
|
|
Returns:
|
|
Tuple of (type, id) where type is 'creators' or 'channels'
|
|
or None if not a valid XHamster creator URL
|
|
"""
|
|
patterns = [
|
|
(r'xhamster\d*\.com/creators/([a-zA-Z0-9_-]+)', 'creators'),
|
|
(r'xhamster\d*\.com/channels/([a-zA-Z0-9_-]+)', 'channels'),
|
|
]
|
|
|
|
for pattern, creator_type in patterns:
|
|
match = re.search(pattern, url)
|
|
if match:
|
|
return (creator_type, match.group(1))
|
|
|
|
return None
|
|
|
|
@staticmethod
|
|
def normalize_creator_url(creator_id: str, creator_type: str = 'creators') -> str:
|
|
"""Convert creator ID to a consistent URL format"""
|
|
if creator_id.startswith('http://') or creator_id.startswith('https://'):
|
|
return creator_id
|
|
|
|
if '/' in creator_id:
|
|
parts = creator_id.split('/', 1)
|
|
creator_type = parts[0]
|
|
creator_id = parts[1]
|
|
|
|
return f"https://xhamster.com/{creator_type}/{creator_id}"
|
|
|
|
def _get_listing_url(self, url: str) -> str:
    """Return the URL used when listing a creator's videos.

    Currently this only strips a trailing slash.  NOTE(review): the
    previous docstring claimed a '/shorts' suffix is stripped here, but
    no such stripping happens — shorts URLs are normalized separately in
    get_creator_shorts().  Confirm whether that claim was simply stale.
    """
    return url.rstrip('/')
|
|
|
|
async def get_creator_info(self, url: str) -> Optional[Dict]:
    """Resolve display name and identifiers for a creator/channel URL.

    Resolution order:
      1. Scrape the profile page HTML for an <h1 class="...name...">
         heading, falling back to the <title> tag.
      2. Ask yt-dlp for the first playlist entry's channel/uploader.
      3. Derive a title-cased name from the URL slug.

    Returns:
        Dict with creator_id, creator_name, creator_url and creator_type,
        or None when yt-dlp is unavailable or no name could be resolved.
    """
    if not self.is_available():
        return None

    creator_type_id = self.extract_creator_id(url)
    creator_type = creator_type_id[0] if creator_type_id else 'creators'

    creator_name = None

    # Try to scrape the display name from the profile page
    try:
        page_html = await self.get_profile_page(url)
        if page_html:
            name_match = re.search(r'<h1[^>]*class="[^"]*name[^"]*"[^>]*>\s*(.+?)\s*</h1>', page_html, re.DOTALL)
            if not name_match:
                # Fall back to the page <title>, stopping at '<' or '|'.
                name_match = re.search(r'<title>([^<|]+)', page_html)
            if name_match:
                creator_name = html_module.unescape(name_match.group(1).strip())
                # Clean up title suffix
                creator_name = re.sub(r'\s*[-|].*$', '', creator_name).strip()
                self.log(f"Found creator name from profile page: {creator_name}", 'debug')
    except Exception as e:
        self.log(f"Could not scrape creator name: {e}", 'debug')

    # If page scraping didn't find a name, try yt-dlp
    if not creator_name:
        try:
            listing_url = self._get_listing_url(url)

            # Only the first entry is needed — we want its channel/uploader
            # fields, not the full listing.
            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '--flat-playlist',
                '-j',
                '--playlist-items', '1',
                '--socket-timeout', '30',
                listing_url
            ]

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            if result.returncode == 0:
                # yt-dlp with -j emits one JSON object per line.
                for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                        creator_name = (data.get('channel') or data.get('uploader')
                                        or data.get('playlist_title') or None)
                        if creator_name:
                            creator_name = html_module.unescape(creator_name)
                            break
                    except json.JSONDecodeError:
                        continue
        except Exception as e:
            self.log(f"yt-dlp creator info failed: {e}", 'debug')

    # Fall back to deriving name from URL slug
    if not creator_name and creator_type_id:
        creator_name = creator_type_id[1].replace('-', ' ').title()

    if creator_name:
        return {
            'creator_id': creator_type_id[1] if creator_type_id else None,
            'creator_name': creator_name,
            'creator_url': url,
            'creator_type': creator_type,
        }

    return None
|
|
|
|
async def get_creator_videos(self, url: str, since_date: str = None,
                             max_videos: int = None,
                             progress_callback=None) -> List[Dict]:
    """Get all videos from a creator page using --flat-playlist for speed.

    Args:
        url: Creator or channel page URL.
        since_date: UNUSED — accepted for interface compatibility; no
            date filtering is performed in this method.  NOTE(review):
            confirm whether callers expect filtering to happen here.
        max_videos: Stop after this many entries; also forwarded to
            yt-dlp via --playlist-items so extra pages aren't fetched.
        progress_callback: Optional callable invoked with the running
            count after each accepted entry.

    Returns:
        List of dicts with video_id, title, description, upload_date
        (ISO string when known), duration, view_count, thumbnail and
        url.  Empty list when yt-dlp is unavailable or the listing fails.
    """
    if not self.is_available():
        return []

    try:
        listing_url = self._get_listing_url(url)

        cmd = self._get_base_cmd() + [
            '--no-warnings',
            '--flat-playlist',
            '-j',
            '--socket-timeout', '30',
            '--retries', '3',
            listing_url
        ]

        if max_videos:
            cmd.extend(['--playlist-items', f'1:{max_videos}'])

        self.log(f"Fetching videos from: {url}", 'info')

        result = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        stdout, stderr = await result.communicate()

        if result.returncode != 0:
            error = stderr.decode('utf-8', errors='replace')
            self.log(f"Failed to get creator videos: {error}", 'warning')
            return []

        videos = []
        seen_ids = set()
        # yt-dlp with -j emits one JSON object per line.
        for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
            if not line:
                continue
            try:
                data = json.loads(line)

                # Skip the playlist wrapper object itself.
                if data.get('_type') == 'playlist':
                    continue

                video_id = data.get('id')
                video_url = (data.get('webpage_url') or data.get('url') or '')

                # flat-playlist returns _type=url entries with id=null
                # Extract video_id from URL: .../videos/{slug}-{xhID}
                if not video_id and video_url:
                    basename = video_url.rstrip('/').split('/')[-1]
                    # xhamster IDs are the last segment: slug-xhXXXXX
                    id_match = re.search(r'-(xh[A-Za-z0-9]{4,7})$', basename)
                    if id_match:
                        video_id = id_match.group(1)
                    elif basename:
                        video_id = basename

                if not video_id:
                    continue

                if video_id in seen_ids:
                    continue
                seen_ids.add(video_id)

                upload_date = data.get('upload_date')
                if upload_date:
                    # yt-dlp dates are YYYYMMDD; normalize to ISO format.
                    try:
                        upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
                    except ValueError:
                        pass

                title = data.get('title')
                if not title:
                    # Derive title from URL slug
                    basename = video_url.rstrip('/').split('/')[-1] if video_url else ''
                    # Remove the xhamster ID suffix
                    slug = re.sub(r'-xh[A-Za-z0-9]{4,7}$', '', basename)
                    title = slug.replace('-', ' ').title() if slug else f'Video {video_id}'
                else:
                    title = html_module.unescape(title)

                if not video_url:
                    video_url = f"https://xhamster.com/videos/{video_id}"

                videos.append({
                    'video_id': str(video_id),
                    'title': title,
                    'description': data.get('description', ''),
                    'upload_date': upload_date,
                    'duration': data.get('duration'),
                    'view_count': data.get('view_count'),
                    'thumbnail': data.get('thumbnail'),
                    'url': video_url,
                })

                if progress_callback:
                    progress_callback(len(videos))

                if max_videos and len(videos) >= max_videos:
                    break

            except json.JSONDecodeError:
                continue

        self.log(f"Found {len(videos)} videos", 'info')
        return videos

    except Exception as e:
        self.log(f"Error getting creator videos: {e}", 'error')
        return []
|
|
|
|
async def get_creator_shorts(self, url: str, max_items: int = None,
                             progress_callback=None) -> List[Dict]:
    """Get shorts/moments from a creator page by scraping HTML.

    Scrapes /creators/{name}/shorts pages and extracts video data from
    window.initials.momentsComponent.videoListProps.videoThumbProps.

    Args:
        url: Creator page URL; any trailing /videos, /shorts or /photos
            segment is stripped before appending /shorts.
        max_items: Stop once this many shorts have been collected.
        progress_callback: Called with the running count after each item.

    Returns:
        List of dicts shaped like get_creator_videos() entries.
        upload_date/duration are always None (not present in listings).
    """
    try:
        base_url = re.sub(r'/(videos|shorts|photos)/?$', '', url.rstrip('/'))
        shorts_url = f"{base_url}/shorts"

        self.log(f"Fetching shorts from: {shorts_url}", 'info')

        all_shorts = []
        seen_ids = set()
        page = 1

        while True:
            # Page 1 is the bare URL; later pages append /N.
            page_url = f"{shorts_url}/{page}" if page > 1 else shorts_url
            html = await self._fetch_page_html(page_url)
            if not html:
                break

            thumb_props = self._extract_initials_json(html, 'momentsComponent.videoListProps.videoThumbProps')
            if not thumb_props or not isinstance(thumb_props, list):
                if page == 1:
                    self.log("No shorts found for this creator", 'debug')
                break

            for item in thumb_props:
                video_id = str(item.get('id', ''))
                if not video_id:
                    continue

                if video_id in seen_ids:
                    continue
                seen_ids.add(video_id)

                page_url_item = item.get('pageURL', '')
                # Extract xhID from moment URL: /moments/{slug}-{xhID}
                xh_id = None
                if page_url_item:
                    id_match = re.search(r'-(xh[A-Za-z0-9]{4,7})$', page_url_item.rstrip('/').split('/')[-1])
                    if id_match:
                        xh_id = id_match.group(1)

                title = item.get('title', '')
                if title:
                    title = html_module.unescape(title)
                else:
                    title = f'Short {video_id}'

                all_shorts.append({
                    'video_id': xh_id or video_id,
                    'title': title,
                    'description': '',
                    'upload_date': None,  # Shorts listings don't include dates
                    'duration': None,
                    'view_count': item.get('views'),
                    'thumbnail': item.get('thumbURL') or item.get('imageURL'),
                    'url': page_url_item or f"https://xhamster.com/moments/{video_id}",
                })

                if progress_callback:
                    progress_callback(len(all_shorts))

                if max_items and len(all_shorts) >= max_items:
                    break

            if max_items and len(all_shorts) >= max_items:
                break

            # Check pagination
            pagination = self._extract_initials_json(html, 'momentsComponent.videoListProps.pagination')
            if not pagination:
                # Also try top-level pagination
                pagination = self._extract_initials_json(html, 'pagination')

            # Stop unless 'next' strictly advances — guards against loops.
            next_page = pagination.get('next', 0) if pagination else 0
            if not next_page or next_page <= page:
                break

            page = next_page
            # Small delay between listing pages to be polite to the server.
            await asyncio.sleep(1)

        self.log(f"Found {len(all_shorts)} shorts", 'info')
        return all_shorts

    except Exception as e:
        self.log(f"Error getting creator shorts: {e}", 'error')
        return []
|
|
|
|
async def get_creator_galleries(self, url: str, max_items: int = None,
                                progress_callback=None) -> List[Dict]:
    """Get photo gallery listings from a creator page.

    Scrapes /creators/{name}/photos pages and extracts gallery data from
    window.initials.userGalleriesCollection.

    Args:
        url: Creator page URL; any trailing /videos, /shorts or /photos
            segment is stripped before appending /photos.
        max_items: Stop once this many galleries have been collected.
        progress_callback: Called with the running count after each item.

    Returns:
        List of dicts: gallery_id, title, url, thumbnail, image_count,
        views.  Empty list on failure.
    """
    try:
        base_url = re.sub(r'/(videos|shorts|photos)/?$', '', url.rstrip('/'))
        photos_url = f"{base_url}/photos"

        self.log(f"Fetching galleries from: {photos_url}", 'info')

        all_galleries = []
        seen_ids = set()
        page = 1

        while True:
            # Page 1 is the bare URL; later pages append /N.
            page_url = f"{photos_url}/{page}" if page > 1 else photos_url
            html = await self._fetch_page_html(page_url)
            if not html:
                break

            galleries = self._extract_initials_json(html, 'userGalleriesCollection')
            if not galleries or not isinstance(galleries, list):
                if page == 1:
                    self.log("No galleries found for this creator", 'debug')
                break

            for gallery in galleries:
                gallery_id = str(gallery.get('galleryID', ''))
                if not gallery_id:
                    continue

                if gallery_id in seen_ids:
                    continue
                seen_ids.add(gallery_id)

                title = gallery.get('title', '')
                if title:
                    title = html_module.unescape(title)

                all_galleries.append({
                    'gallery_id': gallery_id,
                    'title': title or f'Gallery {gallery_id}',
                    'url': gallery.get('pageURL', ''),
                    'thumbnail': gallery.get('thumbURL') or gallery.get('imageURL'),
                    'image_count': gallery.get('quantity', 0),
                    'views': gallery.get('views', 0),
                })

                if progress_callback:
                    progress_callback(len(all_galleries))

                if max_items and len(all_galleries) >= max_items:
                    break

            if max_items and len(all_galleries) >= max_items:
                break

            # Check pagination
            pagination = self._extract_initials_json(html, 'pagination')
            max_page = pagination.get('maxPage', 1) if pagination else 1
            if page >= max_page:
                break

            page += 1
            # Small delay between listing pages to be polite to the server.
            await asyncio.sleep(1)

        self.log(f"Found {len(all_galleries)} galleries", 'info')
        return all_galleries

    except Exception as e:
        self.log(f"Error getting creator galleries: {e}", 'error')
        return []
|
|
|
|
async def get_gallery_images(self, gallery_url: str) -> Optional[Dict]:
    """Get all images from a single gallery page.

    Scrapes the gallery page and extracts image data from
    window.initials.galleryPage.photoItems and metadata from
    window.initials.photosGalleryModel, following pagination up to the
    model's lastPageNumber.

    Args:
        gallery_url: URL of the gallery's first page.

    Returns:
        Dict with gallery_id, title, created (ISO timestamp or None) and
        'images' — a list of {id, url, width, height} dicts — or None
        when no images could be extracted.
    """
    try:
        self.log(f"Fetching gallery images: {gallery_url}", 'debug')

        all_images = []
        seen_ids = set()
        gallery_id = None
        title = None
        created = None
        last_page = 1
        page = 1

        while page <= last_page:
            # Page 1 is the bare URL; later pages append /N.
            page_url = f"{gallery_url}/{page}" if page > 1 else gallery_url
            html = await self._fetch_page_html(page_url)
            if not html:
                break

            # Extract gallery metadata on first page
            if page == 1:
                gallery_model = self._extract_initials_json(html, 'photosGalleryModel')
                if not gallery_model:
                    # Alternate location used by some page templates.
                    gallery_model = self._extract_initials_json(html, 'galleryPage.galleryModel')

                if gallery_model:
                    gallery_id = str(gallery_model.get('galleryID') or gallery_model.get('id', ''))
                    title = gallery_model.get('title', '')
                    if title:
                        title = html_module.unescape(title)

                    # 'created' is a unix timestamp.
                    created_ts = gallery_model.get('created')
                    if created_ts:
                        try:
                            created = datetime.fromtimestamp(int(created_ts)).isoformat()
                        except (ValueError, OSError):
                            pass

                    last_page = gallery_model.get('lastPageNumber', 1) or 1

            # Extract images
            photo_items = self._extract_initials_json(html, 'galleryPage.photoItems')
            if not photo_items:
                photo_items = self._extract_initials_json(html, 'photosGalleryModel.photos')

            if not photo_items or not isinstance(photo_items, list):
                break

            for photo in photo_items:
                image_url = photo.get('imgSrc', '')
                if not image_url:
                    continue

                photo_id = str(photo.get('id', ''))
                if not photo_id:
                    continue

                if photo_id in seen_ids:
                    continue
                seen_ids.add(photo_id)

                all_images.append({
                    'id': photo_id,
                    'url': image_url,
                    'width': photo.get('originWidth'),
                    'height': photo.get('originHeight'),
                })

            if page < last_page:
                # Small delay between pages to be polite to the server.
                await asyncio.sleep(0.5)
            page += 1

        if not all_images:
            self.log(f"No images found in gallery: {gallery_url}", 'debug')
            return None

        # Fallback gallery_id from URL
        if not gallery_id:
            id_match = re.search(r'-(\d+)$', gallery_url.rstrip('/').split('/')[-1])
            if id_match:
                gallery_id = id_match.group(1)

        self.log(f"Found {len(all_images)} images in gallery '{title or gallery_id}'", 'debug')

        return {
            'gallery_id': gallery_id or '',
            'title': title or '',
            'created': created,
            'images': all_images,
        }

    except Exception as e:
        self.log(f"Error getting gallery images: {e}", 'error')
        return None
|
|
|
|
async def download_image(self, image_url: str, output_path: Path) -> Dict:
    """Download an image file via aiohttp.

    Args:
        image_url: Direct URL to the image
        output_path: Full file path to save to

    Returns:
        Dict with success, file_path, file_size (or 'error' on failure).
    """
    try:
        import aiohttp

        destination = Path(output_path)
        destination.parent.mkdir(parents=True, exist_ok=True)

        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
            'Referer': 'https://xhamster.com/',
        }

        timeout = aiohttp.ClientTimeout(total=60)
        async with aiohttp.ClientSession() as session:
            async with session.get(image_url, headers=request_headers,
                                   allow_redirects=True, timeout=timeout) as resp:
                if resp.status != 200:
                    return {'success': False, 'error': f'HTTP {resp.status}'}

                # Stream to disk in 64 KiB chunks.
                with open(destination, 'wb') as out:
                    async for chunk in resp.content.iter_chunked(65536):
                        out.write(chunk)

        size = destination.stat().st_size
        if size == 0:
            # Treat a zero-byte download as a failure and clean up.
            destination.unlink(missing_ok=True)
            return {'success': False, 'error': 'Empty file'}

        return {
            'success': True,
            'file_path': str(destination),
            'file_size': size,
        }

    except Exception as e:
        self.log(f"Image download failed: {e}", 'debug')
        return {'success': False, 'error': str(e)}
|
|
|
|
async def download_video(self, video_url: str, output_dir: Path, quality: str = 'best',
                         progress_callback=None) -> Dict:
    """Download a video - tries direct download first, falls back to yt-dlp"""
    self.log(f"Downloading video: {video_url}", 'debug')

    # Direct page-scrape download first (yt-dlp's xhamster extractor is
    # often broken); keep the latest result around for error reporting.
    outcome = await self._download_video_direct(video_url, output_dir, progress_callback)
    if outcome and outcome.get('success'):
        return outcome

    # Fall back to yt-dlp when the binary is available.
    if self.is_available():
        outcome = await self._download_video_ytdlp(video_url, output_dir, quality)
        if outcome and outcome.get('success'):
            return outcome

    return outcome or {'success': False, 'error': 'All download methods failed'}
|
|
|
|
async def _download_video_direct(self, video_url: str, output_dir: Path, progress_callback=None) -> Optional[Dict]:
    """Download video directly by scraping the video page for HLS/MP4 URLs.

    Tries, in order:
      1. The HLS master playlist (best quality, up to 4K) found either in
         the window.initials JSON or via regex on the page HTML.
      2. A direct MP4 URL from the JSON sources, highest quality first.
      3. MP4 URLs regex-scraped from the page HTML (previews filtered out).

    Args:
        video_url: The xhamster video page URL.
        output_dir: Directory to save into (created if missing).
        progress_callback: Called with percent complete during MP4 download.

    Returns:
        Result dict with success/file metadata, a failure dict with
        'error', or None when no downloadable source was found (the
        caller then falls back to yt-dlp).
    """
    try:
        import aiohttp

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }

        # Fetch video page
        async with aiohttp.ClientSession() as session:
            async with session.get(video_url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                if resp.status != 200:
                    return {'success': False, 'error': f'Page fetch failed: HTTP {resp.status}'}
                page_html = await resp.text()

        # Extract metadata from page (og:title attribute order varies).
        title = None
        og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', page_html)
        if not og_match:
            og_match = re.search(r'<meta\s+content="([^"]+)"\s+property="og:title"', page_html)
        if og_match:
            title = html_module.unescape(og_match.group(1).strip())

        # Extract upload date from page JSON data
        upload_date = None
        timestamp = None
        created_match = re.search(r'"id"\s*:\s*\d+[^}]*"created"\s*:\s*(\d{8,})', page_html)
        if not created_match:
            created_match = re.search(r'"created"\s*:\s*(\d{8,})[^}]*"id"\s*:\s*\d+', page_html)
        if created_match:
            timestamp = int(created_match.group(1))
            try:
                upload_date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')
            except (ValueError, OSError):
                pass
        if not upload_date:
            date_match = re.search(r'"datePublished"\s*:\s*"([^"]+)"', page_html)
            if date_match:
                upload_date = date_match.group(1)[:10]

        # Extract video ID from URL (slug ends in -xhXXXXX)
        video_id = None
        id_match = re.search(r'-(xh[A-Za-z0-9]{4,7})$', video_url.rstrip('/').split('/')[-1])
        if id_match:
            video_id = id_match.group(1)

        if not title:
            title = video_url.rstrip('/').split('/')[-1]

        # BUGFIX: the character class was written with doubled backslashes
        # (r'[^\\w\\s-]'), which stripped every character except '\', 'w',
        # 's' and '-' from fallback filenames.  r'[^\w\s-]' keeps word
        # characters, whitespace and hyphens as intended.
        safe_title = re.sub(r'[^\w\s-]', '', title)[:100].strip()
        filename = f"{video_id}.mp4" if video_id else f"{safe_title}.mp4"
        file_path = output_dir / filename

        # Try to extract video sources from window.initials JSON
        hls_url_from_json = None
        mp4_urls_from_json = {}  # quality -> url

        def _collect(source_map, dest):
            """Merge {quality: url-or-dict} entries into dest; existing keys win."""
            if not isinstance(source_map, dict):
                return
            for quality_key, source_data in source_map.items():
                if quality_key in dest:
                    continue
                if isinstance(source_data, dict):
                    link = source_data.get('link') or source_data.get('url')
                    if link:
                        dest[quality_key] = link
                elif isinstance(source_data, str):
                    dest[quality_key] = source_data

        try:
            initials_match = re.search(r'window\.initials\s*=\s*(\{.+?\});\s*</script>', page_html, re.DOTALL)
            if initials_match:
                initials = json.loads(initials_match.group(1))
                sources = initials.get('videoModel', {}).get('sources', {})

                # HLS source
                hls_data = sources.get('hls')
                if isinstance(hls_data, dict):
                    hls_url_from_json = hls_data.get('url')
                elif isinstance(hls_data, str):
                    hls_url_from_json = hls_data

                # MP4 sources keyed by quality ("480p", "720p", "1080p", ...).
                # 'download' entries take priority, then 'mp4', then 'standard'.
                _collect(sources.get('download', {}), mp4_urls_from_json)
                _collect(sources.get('mp4', {}), mp4_urls_from_json)
                _collect(sources.get('standard', {}), mp4_urls_from_json)

                if hls_url_from_json or mp4_urls_from_json:
                    self.log(f"Extracted video sources from JSON: HLS={'yes' if hls_url_from_json else 'no'}, MP4 qualities={list(mp4_urls_from_json.keys())}", 'debug')
        except Exception as e:  # json.JSONDecodeError is already an Exception
            self.log(f"Could not parse video JSON sources: {e}", 'debug')

        # Try HLS download first (best quality, up to 4K)
        m3u8_url = hls_url_from_json
        if not m3u8_url:
            m3u8_match = re.search(r'"(https://video[^"]*\.xhcdn\.com/[^"]+\.m3u8[^"]*)"', page_html)
            if m3u8_match:
                m3u8_url = m3u8_match.group(1)

        if m3u8_url:
            hls_result = await self._download_hls(m3u8_url, file_path)
            if hls_result:
                file_size = file_path.stat().st_size
                self.log(f"HLS download complete: (unknown) ({file_size / 1024 / 1024:.1f}MB)", 'debug')
                return {
                    'success': True,
                    'file_path': str(file_path),
                    'filename': filename,
                    'file_size': file_size,
                    'title': title,
                    'video_id': video_id,
                    'upload_date': upload_date,
                    'timestamp': timestamp,
                }

        # Fallback: direct MP4 download - prefer JSON sources (highest quality)
        download_url = None

        if mp4_urls_from_json:
            # Select highest quality MP4 from JSON sources
            quality_priority = ['2160p', '1440p', '1080p', '720p', '480p', '360p', '240p']
            for q in quality_priority:
                if q in mp4_urls_from_json:
                    download_url = mp4_urls_from_json[q]
                    self.log(f"Direct downloading ({q} from JSON): (unknown)", 'debug')
                    break
            if not download_url:
                # Take any available quality
                download_url = next(iter(mp4_urls_from_json.values()))
                self.log(f"Direct downloading (from JSON): (unknown)", 'debug')

        if not download_url:
            # Regex fallback: extract MP4 URLs from page HTML
            mp4_urls = re.findall(
                r'"(https://video[^"]*\.xhcdn\.com/[^"]+\.(?:h264|mp4)[^"]*)"',
                page_html
            )
            mp4_urls = [u for u in mp4_urls if not u.endswith('.m3u8') and '.mp4' in u]
            # Filter out preview/sample URLs
            full_urls = [u for u in mp4_urls if not re.search(r'preview|sample|thumb', u, re.IGNORECASE)]
            if full_urls:
                mp4_urls = full_urls
            # Deduplicate while preserving order.
            mp4_urls = list(dict.fromkeys(mp4_urls))

            if not mp4_urls:
                self.log("No video URL found on video page", 'debug')
                return None

            # Take the LAST unique URL (previews tend to appear first in the HTML)
            download_url = mp4_urls[-1]
            self.log(f"Direct downloading (regex fallback): (unknown)", 'debug')

        dl_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://xhamster.com/',
        }

        async with aiohttp.ClientSession() as session:
            async with session.get(download_url, headers=dl_headers,
                                   allow_redirects=True,
                                   timeout=aiohttp.ClientTimeout(total=600)) as resp:
                if resp.status != 200:
                    return {'success': False, 'error': f'Download failed: HTTP {resp.status}'}

                total_size = int(resp.headers.get('Content-Length', 0))
                downloaded = 0

                with open(file_path, 'wb') as f:
                    async for chunk in resp.content.iter_chunked(65536):
                        f.write(chunk)
                        downloaded += len(chunk)
                        if progress_callback and total_size > 0:
                            progress_callback(downloaded / total_size * 100)

        file_size = file_path.stat().st_size
        self.log(f"Direct download complete: (unknown) ({file_size / 1024 / 1024:.1f}MB)", 'debug')

        return {
            'success': True,
            'file_path': str(file_path),
            'filename': filename,
            'file_size': file_size,
            'title': title,
            'video_id': video_id,
            'upload_date': upload_date,
            'timestamp': timestamp,
        }

    except Exception as e:
        self.log(f"Direct download failed: {e}", 'debug')
        return None
|
|
|
|
async def _download_hls(self, m3u8_url: str, output_path: Path) -> bool:
|
|
"""Download HLS stream using ffmpeg, selecting best quality"""
|
|
try:
|
|
import aiohttp
|
|
from urllib.parse import urlparse
|
|
from yarl import URL as YarlURL
|
|
|
|
# Fetch master playlist to find best quality stream
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
|
'Referer': 'https://xhamster.com/',
|
|
}
|
|
|
|
best_stream_url = None
|
|
best_bandwidth = 0
|
|
|
|
async with aiohttp.ClientSession() as session:
|
|
# Use encoded=True to preserve %2B/%3D in CloudFront signed URLs
|
|
async with session.get(YarlURL(m3u8_url, encoded=True), headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
|
|
if resp.status != 200:
|
|
self.log(f"HLS master playlist fetch failed: HTTP {resp.status}", 'debug')
|
|
return False
|
|
playlist = await resp.text()
|
|
|
|
# Extract query params from master URL for forwarding to variant URLs
|
|
parsed_master = urlparse(m3u8_url)
|
|
master_query = parsed_master.query
|
|
|
|
# Parse master playlist for best quality variant
|
|
lines = playlist.strip().split('\n')
|
|
|
|
# Check if this is already a media playlist (no STREAM-INF)
|
|
has_variants = any(line.startswith('#EXT-X-STREAM-INF:') for line in lines)
|
|
|
|
if not has_variants:
|
|
# This is already a media playlist — download directly with ffmpeg
|
|
self.log("HLS: single stream (no variants), downloading directly", 'debug')
|
|
best_stream_url = m3u8_url
|
|
else:
|
|
for i, line in enumerate(lines):
|
|
if line.startswith('#EXT-X-STREAM-INF:'):
|
|
bw_match = re.search(r'BANDWIDTH=(\d+)', line)
|
|
bandwidth = int(bw_match.group(1)) if bw_match else 0
|
|
if bandwidth > best_bandwidth and i + 1 < len(lines):
|
|
stream_path = lines[i + 1].strip()
|
|
if stream_path.startswith('http'):
|
|
best_stream_url = stream_path
|
|
elif stream_path.startswith('//'):
|
|
# Protocol-relative URL (different CDN domain)
|
|
best_stream_url = f"{parsed_master.scheme}:{stream_path}"
|
|
elif stream_path.startswith('/'):
|
|
best_stream_url = f"{parsed_master.scheme}://{parsed_master.netloc}{stream_path}"
|
|
else:
|
|
m3u8_base = m3u8_url.split('?')[0].rsplit('/', 1)[0]
|
|
best_stream_url = f"{m3u8_base}/{stream_path}"
|
|
# Forward signed query params only if variant URL doesn't have its own
|
|
if master_query and '?' not in best_stream_url:
|
|
best_stream_url = f"{best_stream_url}?{master_query}"
|
|
best_bandwidth = bandwidth
|
|
|
|
if not best_stream_url:
|
|
self.log("No HLS variant found in master playlist", 'debug')
|
|
return False
|
|
|
|
quality_label = ''
|
|
for i, line in enumerate(lines):
|
|
if line.startswith('#EXT-X-STREAM-INF:') and i + 1 < len(lines):
|
|
rm = re.search(r'RESOLUTION=(\d+x\d+)', line)
|
|
if rm and int(re.search(r'BANDWIDTH=(\d+)', line).group(1)) == best_bandwidth:
|
|
quality_label = f" ({rm.group(1)})"
|
|
break
|
|
|
|
self.log(f"HLS downloading best quality{quality_label}", 'debug')
|
|
|
|
# Use ffmpeg to download
|
|
cmd = [
|
|
'ffmpeg', '-y',
|
|
'-headers', 'Referer: https://xhamster.com/\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\r\n',
|
|
'-i', best_stream_url,
|
|
'-c', 'copy',
|
|
'-movflags', '+faststart',
|
|
str(output_path)
|
|
]
|
|
|
|
process = await asyncio.create_subprocess_exec(
|
|
*cmd,
|
|
stdout=asyncio.subprocess.PIPE,
|
|
stderr=asyncio.subprocess.PIPE
|
|
)
|
|
_, stderr = await process.communicate()
|
|
|
|
if process.returncode != 0:
|
|
error = stderr.decode('utf-8', errors='replace')[-500:]
|
|
self.log(f"ffmpeg HLS download failed: {error}", 'debug')
|
|
return False
|
|
|
|
return output_path.exists() and output_path.stat().st_size > 0
|
|
|
|
except Exception as e:
|
|
self.log(f"HLS download error: {e}", 'debug')
|
|
return False
|
|
|
|
    async def _download_video_ytdlp(self, video_url: str, output_dir: Path, quality: str = 'best') -> Dict:
        """Download video using yt-dlp (fallback).

        Args:
            video_url: URL of the video page to download.
            output_dir: Directory to write the file into (created if missing).
            quality: Key into QUALITY_PRESETS; unknown values fall back to 'best'.

        Returns:
            Dict with 'success': True plus file metadata (path, name, size,
            and — when yt-dlp emitted JSON — title/duration/ids/dates), or
            'success': False with an 'error' message.
        """
        if not self.is_available():
            return {'success': False, 'error': 'yt-dlp not available'}

        try:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

            # Title truncated to 100 chars to keep filenames filesystem-safe.
            output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s')

            format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])

            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '-f', format_str,
                '-o', output_template,
                '--print-json',    # emit full metadata JSON on stdout
                '--no-playlist',
                '--user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                '--concurrent-fragments', '4',
                '--no-part',       # write the final file directly, no .part temp
                '--retries', '20',
                '--socket-timeout', '30',
                video_url
            ]

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            if result.returncode != 0:
                # Map common yt-dlp failures onto friendlier messages,
                # otherwise truncate long stderr output.
                error_msg = stderr.decode('utf-8', errors='replace').strip()
                if 'Video unavailable' in error_msg or 'not available' in error_msg:
                    error_msg = 'Video unavailable or private'
                elif 'premium' in error_msg.lower():
                    error_msg = 'Video requires premium access'
                elif len(error_msg) > 200:
                    error_msg = error_msg[:200] + '...'

                return {'success': False, 'error': error_msg}

            # Parse output JSON: take the first stdout line that parses.
            video_info = None
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                try:
                    video_info = json.loads(line)
                    break
                except json.JSONDecodeError:
                    continue

            if not video_info:
                # No JSON metadata — fall back to the most recently modified
                # .mp4 in the output directory.
                files = list(output_dir.glob('*.mp4'))
                if files:
                    file_path = max(files, key=lambda f: f.stat().st_mtime)
                    return {
                        'success': True,
                        'file_path': str(file_path),
                        'filename': file_path.name,
                        'file_size': file_path.stat().st_size
                    }
                return {'success': False, 'error': 'Could not find downloaded file'}

            # yt-dlp versions differ on the key name for the output path.
            file_path = video_info.get('_filename') or video_info.get('filename')
            if file_path:
                file_path = Path(file_path)

            return {
                'success': True,
                'file_path': str(file_path) if file_path else None,
                'filename': file_path.name if file_path else None,
                # Prefer on-disk size; fall back to the metadata estimate.
                'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'),
                'title': video_info.get('title'),
                'duration': video_info.get('duration'),
                'video_id': video_info.get('id'),
                'upload_date': video_info.get('upload_date'),
                'timestamp': video_info.get('timestamp'),
                'thumbnail': video_info.get('thumbnail'),
            }

        except Exception as e:
            self.log(f"Error downloading video via yt-dlp: {e}", 'error')
            return {'success': False, 'error': str(e)}
|
|
|
|
async def get_profile_page(self, url: str) -> Optional[str]:
|
|
"""Fetch profile page HTML via aiohttp. Results are cached."""
|
|
base_url = re.sub(r'/(videos|shorts)/?$', '', url)
|
|
|
|
if base_url in self._profile_page_cache:
|
|
return self._profile_page_cache[base_url]
|
|
|
|
try:
|
|
import aiohttp
|
|
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
'Accept-Language': 'en-US,en;q=0.5',
|
|
}
|
|
|
|
async with aiohttp.ClientSession() as session:
|
|
async with session.get(
|
|
base_url,
|
|
headers=headers,
|
|
timeout=aiohttp.ClientTimeout(total=15)
|
|
) as resp:
|
|
if resp.status == 200:
|
|
text = await resp.text()
|
|
self._profile_page_cache[base_url] = text
|
|
return text
|
|
|
|
except Exception as e:
|
|
self.log(f"Could not fetch profile page: {e}", 'debug')
|
|
|
|
self._profile_page_cache[base_url] = None
|
|
return None
|
|
|
|
async def _fetch_page_html(self, url: str) -> Optional[str]:
|
|
"""Fetch an arbitrary page's HTML via aiohttp (not cached)."""
|
|
try:
|
|
import aiohttp
|
|
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
'Accept-Language': 'en-US,en;q=0.5',
|
|
}
|
|
|
|
async with aiohttp.ClientSession() as session:
|
|
async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
|
|
if resp.status == 200:
|
|
return await resp.text()
|
|
self.log(f"Page fetch failed: HTTP {resp.status} for {url}", 'debug')
|
|
|
|
except Exception as e:
|
|
self.log(f"Could not fetch page: {e}", 'debug')
|
|
|
|
return None
|
|
|
|
def _extract_initials_json(self, html: str, key_path: str) -> Optional[Any]:
|
|
"""Extract a value from window.initials JSON embedded in page HTML.
|
|
|
|
Args:
|
|
html: Page HTML containing window.initials = {...}
|
|
key_path: Dot-separated path, e.g. 'galleryPage.photoItems'
|
|
|
|
Returns:
|
|
The extracted value, or None if not found.
|
|
"""
|
|
try:
|
|
match = re.search(r'window\.initials\s*=\s*(\{.+?\});\s*</script>', html, re.DOTALL)
|
|
if not match:
|
|
return None
|
|
|
|
data = json.loads(match.group(1))
|
|
|
|
for key in key_path.split('.'):
|
|
if isinstance(data, dict):
|
|
data = data.get(key)
|
|
else:
|
|
return None
|
|
if data is None:
|
|
return None
|
|
|
|
return data
|
|
|
|
except (json.JSONDecodeError, Exception) as e:
|
|
self.log(f"Failed to extract initials JSON for '{key_path}': {e}", 'debug')
|
|
return None
|
|
|
|
async def get_profile_image(self, url: str) -> Optional[str]:
|
|
"""Scrape profile page for avatar/photo URL"""
|
|
try:
|
|
page_html = await self.get_profile_page(url)
|
|
if not page_html:
|
|
return None
|
|
|
|
# XHamster embeds creator data as JSON in the page.
|
|
# Look for the main creator's thumbUrl in the pornstarTop JSON block
|
|
thumb_match = re.search(
|
|
r'"pornstarTop"\s*:\s*\{[\s\S]*?"thumbUrl"\s*:\s*"([^"]+)"',
|
|
page_html
|
|
)
|
|
if thumb_match:
|
|
avatar_url = thumb_match.group(1).replace('\\/', '/')
|
|
self.log("Found XHamster profile avatar from JSON data", 'debug')
|
|
return avatar_url
|
|
|
|
# Fallback: CSS background-image on landing-info__logo-image
|
|
bg_match = re.search(
|
|
r'landing-info__logo-image["\'][^>]*style="[^"]*url\([\'"]?([^\'")]+)',
|
|
page_html
|
|
)
|
|
if bg_match:
|
|
self.log("Found XHamster profile avatar from CSS", 'debug')
|
|
return bg_match.group(1)
|
|
|
|
# Fallback: og:image meta tag
|
|
og_match = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_html)
|
|
if not og_match:
|
|
og_match = re.search(r'<meta\s+content="([^"]+)"\s+property="og:image"', page_html)
|
|
if og_match:
|
|
return og_match.group(1)
|
|
|
|
except Exception as e:
|
|
self.log(f"Could not fetch profile image: {e}", 'debug')
|
|
|
|
return None
|
|
|
|
async def get_profile_bio(self, url: str) -> Optional[str]:
|
|
"""Scrape bio/about section from profile page"""
|
|
try:
|
|
page_html = await self.get_profile_page(url)
|
|
if not page_html:
|
|
return None
|
|
|
|
# Look for description/bio sections
|
|
bio_match = re.search(
|
|
r'<div[^>]*class="[^"]*about[^"]*"[^>]*>\s*(.*?)\s*</div>',
|
|
page_html, re.DOTALL
|
|
)
|
|
if bio_match:
|
|
bio_text = re.sub(r'<[^>]+>', '', bio_match.group(1)).strip()
|
|
if bio_text:
|
|
self.log("Found XHamster profile bio", 'debug')
|
|
return html_module.unescape(bio_text)
|
|
|
|
# Try meta description
|
|
desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', page_html)
|
|
if desc_match:
|
|
bio_text = html_module.unescape(desc_match.group(1).strip())
|
|
if bio_text and len(bio_text) > 20:
|
|
return bio_text
|
|
|
|
except Exception as e:
|
|
self.log(f"Could not fetch profile bio: {e}", 'debug')
|
|
|
|
return None
|
|
|
|
async def get_creator(self, url: str) -> Optional[Creator]:
|
|
"""Get Creator object from creator URL"""
|
|
info = await self.get_creator_info(url)
|
|
if not info:
|
|
return None
|
|
|
|
creator_type_id = self.extract_creator_id(url)
|
|
if creator_type_id:
|
|
creator_id = f"{creator_type_id[0]}/{creator_type_id[1]}"
|
|
else:
|
|
creator_id = info.get('creator_id', '')
|
|
|
|
profile_image = await self.get_profile_image(url)
|
|
|
|
return Creator(
|
|
creator_id=creator_id,
|
|
service_id='xhamster',
|
|
platform='xhamster',
|
|
username=info.get('creator_name', 'Unknown'),
|
|
display_name=info.get('creator_name'),
|
|
profile_image_url=profile_image,
|
|
)
|
|
|
|
    async def get_posts(self, url: str, since_date: str = None,
                        max_videos: int = None, progress_callback=None) -> List[Post]:
        """Get all content (videos, shorts, galleries) as Post objects.

        Aggregates regular videos, shorts/moments, and photo galleries into a
        unified list of Post objects. Deduplicates by post_id so videos and
        shorts that share an xhID are not counted twice.

        Args:
            url: Creator/channel URL.
            since_date: Optional lower bound forwarded to the video fetcher.
            max_videos: Optional cap forwarded to the video/shorts fetchers.
            progress_callback: Optional callable forwarded to the video fetcher.

        Returns:
            List of Post objects, in order: videos, then shorts, then galleries.
        """
        creator_type_id = self.extract_creator_id(url)
        creator_id = f"{creator_type_id[0]}/{creator_type_id[1]}" if creator_type_id else ''

        posts = []
        seen_post_ids = set()  # dedupe across videos and shorts sharing an xhID

        # 1. Regular videos (via yt-dlp --flat-playlist)
        videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)
        for video in videos:
            vid = video['video_id']
            if vid in seen_post_ids:
                continue
            seen_post_ids.add(vid)

            attachment = Attachment(
                name=f"{vid}.mp4",
                file_type='video',
                extension='mp4',
                server_path=video['url'],
                download_url=video['url'],
                duration=video.get('duration'),
            )
            posts.append(Post(
                post_id=vid,
                service_id='xhamster',
                platform='xhamster',
                creator_id=creator_id,
                title=video['title'],
                content=video.get('description') or video['title'],
                published_at=video.get('upload_date'),
                attachments=[attachment],
            ))

        # 2. Shorts / Moments (HTML scraping) — best-effort: a failure here
        # must not discard the videos already collected.
        try:
            shorts = await self.get_creator_shorts(url, max_items=max_videos)
            for short in shorts:
                vid = short['video_id']
                if vid in seen_post_ids:
                    continue
                seen_post_ids.add(vid)

                attachment = Attachment(
                    name=f"{vid}.mp4",
                    file_type='video',
                    extension='mp4',
                    server_path=short['url'],
                    download_url=short['url'],
                    duration=short.get('duration'),
                )
                posts.append(Post(
                    post_id=vid,
                    service_id='xhamster',
                    platform='xhamster',
                    creator_id=creator_id,
                    title=short['title'],
                    content=short.get('description') or short['title'],
                    published_at=short.get('upload_date'),
                    attachments=[attachment],
                ))
        except Exception as e:
            self.log(f"Failed to fetch shorts (continuing with videos): {e}", 'warning')

        # 3. Photo galleries (HTML scraping) — also best-effort.
        try:
            galleries = await self.get_creator_galleries(url)
            for gallery in galleries:
                # Prefix gallery ids so they can never collide with video ids.
                gallery_post_id = f"gallery-{gallery['gallery_id']}"
                if gallery_post_id in seen_post_ids:
                    continue
                seen_post_ids.add(gallery_post_id)

                # Each gallery needs a second fetch for its image list; skip
                # galleries that come back empty.
                gallery_data = await self.get_gallery_images(gallery['url'])
                if not gallery_data or not gallery_data.get('images'):
                    continue

                attachments = []
                for img in gallery_data['images']:
                    # Determine extension from URL (default jpg)
                    ext = 'jpg'
                    if img['url']:
                        url_ext = img['url'].rsplit('.', 1)[-1].split('?')[0].lower()
                        if url_ext in ('jpg', 'jpeg', 'png', 'gif', 'webp'):
                            ext = url_ext

                    attachments.append(Attachment(
                        name=f"{img['id']}.{ext}",
                        file_type='image',
                        extension=ext,
                        server_path=img['url'],
                        download_url=img['url'],
                        width=img.get('width'),
                        height=img.get('height'),
                    ))

                posts.append(Post(
                    post_id=gallery_post_id,
                    service_id='xhamster',
                    platform='xhamster',
                    creator_id=creator_id,
                    title=gallery_data.get('title') or gallery.get('title', ''),
                    content=gallery_data.get('title') or gallery.get('title', ''),
                    published_at=gallery_data.get('created'),
                    attachments=attachments,
                ))

                # Small delay between gallery fetches to avoid hammering the site
                await asyncio.sleep(0.5)

        except Exception as e:
            self.log(f"Failed to fetch galleries (continuing with videos/shorts): {e}", 'warning')

        self.log(f"Total posts: {len(posts)} (videos + shorts + galleries)", 'info')
        return posts
|