media-downloader/modules/paid_content/xhamster_client.py

"""
XHamster Client - Fetches creator info and videos using yt-dlp

Supports:
- Creator profiles (xhamster.com/creators/name)
- Channels (xhamster.com/channels/name)
- Shorts (xhamster.com/creators/name/shorts)
- Photo galleries (xhamster.com/creators/name/photos)
"""

import asyncio
import html as html_module
import json
import os
import re
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from modules.base_module import LoggingMixin
from .models import Creator, Post, Attachment


class XHamsterClient(LoggingMixin):
    """
    Client for fetching XHamster creator information and videos using yt-dlp

    Supports:
    - Creator pages (xhamster.com/creators/name)
    - Channel pages (xhamster.com/channels/name)
    - Creator shorts (xhamster.com/creators/name/shorts)
    """

    SERVICE_ID = 'xhamster'
    PLATFORM = 'xhamster'

    QUALITY_PRESETS = {
        'best': 'bestvideo+bestaudio/best',
        '1080p': 'bestvideo[height<=1080]+bestaudio/best[height<=1080]/best',
        '720p': 'bestvideo[height<=720]+bestaudio/best[height<=720]/best',
        '480p': 'bestvideo[height<=480]+bestaudio/best[height<=480]/best',
    }

    def __init__(self, ytdlp_path: str = None, unified_db=None, log_callback=None):
        self._init_logger('PaidContent', log_callback, default_module='XHamster')

        self.ytdlp_path = ytdlp_path or self._find_ytdlp()
        if not self.ytdlp_path:
            self.log("yt-dlp not found, XHamster support will be disabled", 'warning')

        self.unified_db = unified_db
        self._cookies_file = None
        self._profile_page_cache: Dict[str, Optional[str]] = {}

    def _find_ytdlp(self) -> Optional[str]:
        """Find yt-dlp executable"""
        common_paths = [
            '/opt/media-downloader/venv/bin/yt-dlp',
            '/usr/local/bin/yt-dlp',
            '/usr/bin/yt-dlp',
            '/opt/homebrew/bin/yt-dlp',
            os.path.expanduser('~/.local/bin/yt-dlp'),
        ]

        for path in common_paths:
            if os.path.isfile(path) and os.access(path, os.X_OK):
                return path

        try:
            result = subprocess.run(['which', 'yt-dlp'], capture_output=True, text=True)
            if result.returncode == 0:
                return result.stdout.strip()
        except Exception:
            pass

        return None

    def is_available(self) -> bool:
        """Check if yt-dlp is available"""
        return self.ytdlp_path is not None

    def _get_cookies_file(self) -> Optional[str]:
        """Get path to cookies file, creating it from database if needed"""
        if self._cookies_file and os.path.exists(self._cookies_file):
            return self._cookies_file

        if not self.unified_db:
            return None

        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?", ('xhamster',))
                row = cursor.fetchone()
                if row and row[0]:
                    data = json.loads(row[0])
                    if isinstance(data, dict) and 'cookies' in data:
                        cookies_list = data['cookies']
                    elif isinstance(data, list):
                        cookies_list = data
                    else:
                        cookies_list = []

                    if cookies_list:
                        fd, self._cookies_file = tempfile.mkstemp(suffix='.txt', prefix='xhamster_cookies_')
                        with os.fdopen(fd, 'w') as f:
                            f.write("# Netscape HTTP Cookie File\n")
                            for cookie in cookies_list:
                                domain = cookie.get('domain', '')
                                include_subdomains = 'TRUE' if domain.startswith('.') else 'FALSE'
                                path = cookie.get('path', '/')
                                secure = 'TRUE' if cookie.get('secure', False) else 'FALSE'
                                expiry = str(int(cookie.get('expirationDate', 0)))
                                name = cookie.get('name', '')
                                value = cookie.get('value', '')
                                f.write(f"{domain}\t{include_subdomains}\t{path}\t{secure}\t{expiry}\t{name}\t{value}\n")
                        self.log(f"Loaded {len(cookies_list)} cookies from xhamster scraper", 'debug')
                        return self._cookies_file
        except Exception as e:
            self.log(f"Could not load cookies: {e}", 'debug')

        return None

    def _get_base_cmd(self) -> List[str]:
        """Get base yt-dlp command with cookies if available"""
        cmd = [self.ytdlp_path]
        cookies_file = self._get_cookies_file()
        if cookies_file:
            cmd.extend(['--cookies', cookies_file])
        return cmd

    def cleanup(self):
        """Clean up temporary files"""
        if self._cookies_file and os.path.exists(self._cookies_file):
            try:
                os.unlink(self._cookies_file)
            except Exception:
                pass
            self._cookies_file = None
        self._profile_page_cache.clear()

    @staticmethod
    def extract_creator_id(url: str) -> Optional[Tuple[str, str]]:
        """
        Extract creator type and identifier from XHamster URL

        Returns:
            Tuple of (type, id) where type is 'creators' or 'channels'
            or None if not a valid XHamster creator URL
        """
        patterns = [
            (r'xhamster\d*\.com/creators/([a-zA-Z0-9_-]+)', 'creators'),
            (r'xhamster\d*\.com/channels/([a-zA-Z0-9_-]+)', 'channels'),
        ]

        for pattern, creator_type in patterns:
            match = re.search(pattern, url)
            if match:
                return (creator_type, match.group(1))

        return None

    @staticmethod
    def normalize_creator_url(creator_id: str, creator_type: str = 'creators') -> str:
        """Convert creator ID to a consistent URL format"""
        if creator_id.startswith('http://') or creator_id.startswith('https://'):
            return creator_id

        if '/' in creator_id:
            parts = creator_id.split('/', 1)
            creator_type = parts[0]
            creator_id = parts[1]

        return f"https://xhamster.com/{creator_type}/{creator_id}"

    def _get_listing_url(self, url: str) -> str:
        """Get the URL to use for listing videos from a creator page.

        Strips /shorts suffix for the main listing, or keeps it for shorts-only.
        """
        return url.rstrip('/')

    async def get_creator_info(self, url: str) -> Optional[Dict]:
        """Get creator information using yt-dlp"""
        if not self.is_available():
            return None

        creator_type_id = self.extract_creator_id(url)
        creator_type = creator_type_id[0] if creator_type_id else 'creators'

        creator_name = None

        # Try to scrape the display name from the profile page
        try:
            page_html = await self.get_profile_page(url)
            if page_html:
                name_match = re.search(r'<h1[^>]*class="[^"]*name[^"]*"[^>]*>\s*(.+?)\s*</h1>', page_html, re.DOTALL)
                if not name_match:
                    name_match = re.search(r'<title>([^<|]+)', page_html)
                if name_match:
                    creator_name = html_module.unescape(name_match.group(1).strip())
                    # Clean up title suffix
                    creator_name = re.sub(r'\s*[-|].*$', '', creator_name).strip()
                    self.log(f"Found creator name from profile page: {creator_name}", 'debug')
        except Exception as e:
            self.log(f"Could not scrape creator name: {e}", 'debug')

        # If page scraping didn't find a name, try yt-dlp
        if not creator_name:
            try:
                listing_url = self._get_listing_url(url)

                cmd = self._get_base_cmd() + [
                    '--no-warnings',
                    '--flat-playlist',
                    '-j',
                    '--playlist-items', '1',
                    '--socket-timeout', '30',
                    listing_url
                ]

                result = await asyncio.create_subprocess_exec(
                    *cmd,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE
                )

                stdout, stderr = await result.communicate()

                if result.returncode == 0:
                    for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                        if not line:
                            continue
                        try:
                            data = json.loads(line)
                            creator_name = (data.get('channel') or data.get('uploader')
                                            or data.get('playlist_title') or None)
                            if creator_name:
                                creator_name = html_module.unescape(creator_name)
                                break
                        except json.JSONDecodeError:
                            continue
            except Exception as e:
                self.log(f"yt-dlp creator info failed: {e}", 'debug')

        # Fall back to deriving name from URL slug
        if not creator_name and creator_type_id:
            creator_name = creator_type_id[1].replace('-', ' ').title()

        if creator_name:
            return {
                'creator_id': creator_type_id[1] if creator_type_id else None,
                'creator_name': creator_name,
                'creator_url': url,
                'creator_type': creator_type,
            }

        return None

    async def get_creator_videos(self, url: str, since_date: str = None,
                                  max_videos: int = None,
                                  progress_callback=None) -> List[Dict]:
        """Get all videos from a creator page using --flat-playlist for speed."""
        if not self.is_available():
            return []

        try:
            listing_url = self._get_listing_url(url)

            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '--flat-playlist',
                '-j',
                '--socket-timeout', '30',
                '--retries', '3',
                listing_url
            ]

            if max_videos:
                cmd.extend(['--playlist-items', f'1:{max_videos}'])

            self.log(f"Fetching videos from: {url}", 'info')

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            if result.returncode != 0:
                error = stderr.decode('utf-8', errors='replace')
                self.log(f"Failed to get creator videos: {error}", 'warning')
                return []

            videos = []
            seen_ids = set()
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                if not line:
                    continue
                try:
                    data = json.loads(line)

                    if data.get('_type') == 'playlist':
                        continue

                    video_id = data.get('id')
                    video_url = (data.get('webpage_url') or data.get('url') or '')

                    # flat-playlist returns _type=url entries with id=null
                    # Extract video_id from URL: .../videos/{slug}-{xhID}
                    if not video_id and video_url:
                        basename = video_url.rstrip('/').split('/')[-1]
                        # xhamster IDs are the last segment: slug-xhXXXXX
                        id_match = re.search(r'-(xh[A-Za-z0-9]{4,7})$', basename)
                        if id_match:
                            video_id = id_match.group(1)
                        elif basename:
                            video_id = basename

                    if not video_id:
                        continue

                    if video_id in seen_ids:
                        continue
                    seen_ids.add(video_id)

                    upload_date = data.get('upload_date')
                    if upload_date:
                        try:
                            upload_date = datetime.strptime(upload_date, '%Y%m%d').isoformat()
                        except ValueError:
                            pass

                    title = data.get('title')
                    if not title:
                        # Derive title from URL slug
                        basename = video_url.rstrip('/').split('/')[-1] if video_url else ''
                        # Remove the xhamster ID suffix
                        slug = re.sub(r'-xh[A-Za-z0-9]{4,7}$', '', basename)
                        title = slug.replace('-', ' ').title() if slug else f'Video {video_id}'
                    else:
                        title = html_module.unescape(title)

                    if not video_url:
                        video_url = f"https://xhamster.com/videos/{video_id}"

                    videos.append({
                        'video_id': str(video_id),
                        'title': title,
                        'description': data.get('description', ''),
                        'upload_date': upload_date,
                        'duration': data.get('duration'),
                        'view_count': data.get('view_count'),
                        'thumbnail': data.get('thumbnail'),
                        'url': video_url,
                    })

                    if progress_callback:
                        progress_callback(len(videos))

                    if max_videos and len(videos) >= max_videos:
                        break

                except json.JSONDecodeError:
                    continue

            self.log(f"Found {len(videos)} videos", 'info')
            return videos

        except Exception as e:
            self.log(f"Error getting creator videos: {e}", 'error')
            return []

    async def get_creator_shorts(self, url: str, max_items: int = None,
                                progress_callback=None) -> List[Dict]:
        """Get shorts/moments from a creator page by scraping HTML.

        Scrapes /creators/{name}/shorts pages and extracts video data from
        window.initials.momentsComponent.videoListProps.videoThumbProps.
        """
        try:
            base_url = re.sub(r'/(videos|shorts|photos)/?$', '', url.rstrip('/'))
            shorts_url = f"{base_url}/shorts"

            self.log(f"Fetching shorts from: {shorts_url}", 'info')

            all_shorts = []
            seen_ids = set()
            page = 1

            while True:
                page_url = f"{shorts_url}/{page}" if page > 1 else shorts_url
                html = await self._fetch_page_html(page_url)
                if not html:
                    break

                thumb_props = self._extract_initials_json(html, 'momentsComponent.videoListProps.videoThumbProps')
                if not thumb_props or not isinstance(thumb_props, list):
                    if page == 1:
                        self.log("No shorts found for this creator", 'debug')
                    break

                for item in thumb_props:
                    video_id = str(item.get('id', ''))
                    if not video_id:
                        continue

                    if video_id in seen_ids:
                        continue
                    seen_ids.add(video_id)

                    page_url_item = item.get('pageURL', '')
                    # Extract xhID from moment URL: /moments/{slug}-{xhID}
                    xh_id = None
                    if page_url_item:
                        id_match = re.search(r'-(xh[A-Za-z0-9]{4,7})$', page_url_item.rstrip('/').split('/')[-1])
                        if id_match:
                            xh_id = id_match.group(1)

                    title = item.get('title', '')
                    if title:
                        title = html_module.unescape(title)
                    else:
                        title = f'Short {video_id}'

                    all_shorts.append({
                        'video_id': xh_id or video_id,
                        'title': title,
                        'description': '',
                        'upload_date': None,  # Shorts listings don't include dates
                        'duration': None,
                        'view_count': item.get('views'),
                        'thumbnail': item.get('thumbURL') or item.get('imageURL'),
                        'url': page_url_item or f"https://xhamster.com/moments/{video_id}",
                    })

                    if progress_callback:
                        progress_callback(len(all_shorts))

                    if max_items and len(all_shorts) >= max_items:
                        break

                if max_items and len(all_shorts) >= max_items:
                    break

                # Check pagination
                pagination = self._extract_initials_json(html, 'momentsComponent.videoListProps.pagination')
                if not pagination:
                    # Also try top-level pagination
                    pagination = self._extract_initials_json(html, 'pagination')

                next_page = pagination.get('next', 0) if pagination else 0
                if not next_page or next_page <= page:
                    break

                page = next_page
                await asyncio.sleep(1)

            self.log(f"Found {len(all_shorts)} shorts", 'info')
            return all_shorts

        except Exception as e:
            self.log(f"Error getting creator shorts: {e}", 'error')
            return []

    async def get_creator_galleries(self, url: str, max_items: int = None,
                                     progress_callback=None) -> List[Dict]:
        """Get photo gallery listings from a creator page.

        Scrapes /creators/{name}/photos pages and extracts gallery data from
        window.initials.userGalleriesCollection.
        """
        try:
            base_url = re.sub(r'/(videos|shorts|photos)/?$', '', url.rstrip('/'))
            photos_url = f"{base_url}/photos"

            self.log(f"Fetching galleries from: {photos_url}", 'info')

            all_galleries = []
            seen_ids = set()
            page = 1

            while True:
                page_url = f"{photos_url}/{page}" if page > 1 else photos_url
                html = await self._fetch_page_html(page_url)
                if not html:
                    break

                galleries = self._extract_initials_json(html, 'userGalleriesCollection')
                if not galleries or not isinstance(galleries, list):
                    if page == 1:
                        self.log("No galleries found for this creator", 'debug')
                    break

                for gallery in galleries:
                    gallery_id = str(gallery.get('galleryID', ''))
                    if not gallery_id:
                        continue

                    if gallery_id in seen_ids:
                        continue
                    seen_ids.add(gallery_id)

                    title = gallery.get('title', '')
                    if title:
                        title = html_module.unescape(title)

                    all_galleries.append({
                        'gallery_id': gallery_id,
                        'title': title or f'Gallery {gallery_id}',
                        'url': gallery.get('pageURL', ''),
                        'thumbnail': gallery.get('thumbURL') or gallery.get('imageURL'),
                        'image_count': gallery.get('quantity', 0),
                        'views': gallery.get('views', 0),
                    })

                    if progress_callback:
                        progress_callback(len(all_galleries))

                    if max_items and len(all_galleries) >= max_items:
                        break

                if max_items and len(all_galleries) >= max_items:
                    break

                # Check pagination
                pagination = self._extract_initials_json(html, 'pagination')
                max_page = pagination.get('maxPage', 1) if pagination else 1
                if page >= max_page:
                    break

                page += 1
                await asyncio.sleep(1)

            self.log(f"Found {len(all_galleries)} galleries", 'info')
            return all_galleries

        except Exception as e:
            self.log(f"Error getting creator galleries: {e}", 'error')
            return []

    async def get_gallery_images(self, gallery_url: str) -> Optional[Dict]:
        """Get all images from a single gallery page.

        Scrapes the gallery page and extracts image data from
        window.initials.galleryPage.photoItems and metadata from
        window.initials.photosGalleryModel.
        """
        try:
            self.log(f"Fetching gallery images: {gallery_url}", 'debug')

            all_images = []
            seen_ids = set()
            gallery_id = None
            title = None
            created = None
            last_page = 1
            page = 1

            while page <= last_page:
                page_url = f"{gallery_url}/{page}" if page > 1 else gallery_url
                html = await self._fetch_page_html(page_url)
                if not html:
                    break

                # Extract gallery metadata on first page
                if page == 1:
                    gallery_model = self._extract_initials_json(html, 'photosGalleryModel')
                    if not gallery_model:
                        gallery_model = self._extract_initials_json(html, 'galleryPage.galleryModel')

                    if gallery_model:
                        gallery_id = str(gallery_model.get('galleryID') or gallery_model.get('id', ''))
                        title = gallery_model.get('title', '')
                        if title:
                            title = html_module.unescape(title)

                        created_ts = gallery_model.get('created')
                        if created_ts:
                            try:
                                created = datetime.fromtimestamp(int(created_ts)).isoformat()
                            except (ValueError, OSError):
                                pass

                        last_page = gallery_model.get('lastPageNumber', 1) or 1

                # Extract images
                photo_items = self._extract_initials_json(html, 'galleryPage.photoItems')
                if not photo_items:
                    photo_items = self._extract_initials_json(html, 'photosGalleryModel.photos')

                if not photo_items or not isinstance(photo_items, list):
                    break

                for photo in photo_items:
                    image_url = photo.get('imgSrc', '')
                    if not image_url:
                        continue

                    photo_id = str(photo.get('id', ''))
                    if not photo_id:
                        continue

                    if photo_id in seen_ids:
                        continue
                    seen_ids.add(photo_id)

                    all_images.append({
                        'id': photo_id,
                        'url': image_url,
                        'width': photo.get('originWidth'),
                        'height': photo.get('originHeight'),
                    })

                if page < last_page:
                    await asyncio.sleep(0.5)
                page += 1

            if not all_images:
                self.log(f"No images found in gallery: {gallery_url}", 'debug')
                return None

            # Fallback gallery_id from URL
            if not gallery_id:
                id_match = re.search(r'-(\d+)$', gallery_url.rstrip('/').split('/')[-1])
                if id_match:
                    gallery_id = id_match.group(1)

            self.log(f"Found {len(all_images)} images in gallery '{title or gallery_id}'", 'debug')

            return {
                'gallery_id': gallery_id or '',
                'title': title or '',
                'created': created,
                'images': all_images,
            }

        except Exception as e:
            self.log(f"Error getting gallery images: {e}", 'error')
            return None

    async def download_image(self, image_url: str, output_path: Path) -> Dict:
        """Download an image file via aiohttp.

        Args:
            image_url: Direct URL to the image
            output_path: Full file path to save to

        Returns:
            Dict with success, file_path, file_size
        """
        try:
            import aiohttp

            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                'Referer': 'https://xhamster.com/',
            }

            async with aiohttp.ClientSession() as session:
                async with session.get(image_url, headers=headers,
                                       allow_redirects=True,
                                       timeout=aiohttp.ClientTimeout(total=60)) as resp:
                    if resp.status != 200:
                        return {'success': False, 'error': f'HTTP {resp.status}'}

                    with open(output_path, 'wb') as f:
                        async for chunk in resp.content.iter_chunked(65536):
                            f.write(chunk)

            file_size = output_path.stat().st_size
            if file_size == 0:
                output_path.unlink(missing_ok=True)
                return {'success': False, 'error': 'Empty file'}

            return {
                'success': True,
                'file_path': str(output_path),
                'file_size': file_size,
            }

        except Exception as e:
            self.log(f"Image download failed: {e}", 'debug')
            return {'success': False, 'error': str(e)}

    async def download_video(self, video_url: str, output_dir: Path, quality: str = 'best',
                            progress_callback=None) -> Dict:
        """Download a video - tries direct download first, falls back to yt-dlp"""
        self.log(f"Downloading video: {video_url}", 'debug')

        # Try direct download first (yt-dlp's xhamster extractor is often broken)
        result = await self._download_video_direct(video_url, output_dir, progress_callback)
        if result and result.get('success'):
            return result

        # Fall back to yt-dlp
        if self.is_available():
            result = await self._download_video_ytdlp(video_url, output_dir, quality)
            if result and result.get('success'):
                return result

        return result or {'success': False, 'error': 'All download methods failed'}

    async def _download_video_direct(self, video_url: str, output_dir: Path, progress_callback=None) -> Optional[Dict]:
        """Download video directly by scraping the video page for HLS/MP4 URLs"""
        try:
            import aiohttp

            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
            }

            # Fetch video page
            async with aiohttp.ClientSession() as session:
                async with session.get(video_url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                    if resp.status != 200:
                        return {'success': False, 'error': f'Page fetch failed: HTTP {resp.status}'}
                    page_html = await resp.text()

            # Extract metadata from page
            title = None
            og_match = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', page_html)
            if not og_match:
                og_match = re.search(r'<meta\s+content="([^"]+)"\s+property="og:title"', page_html)
            if og_match:
                title = html_module.unescape(og_match.group(1).strip())

            # Extract upload date from page JSON data
            upload_date = None
            timestamp = None
            created_match = re.search(r'"id"\s*:\s*\d+[^}]*"created"\s*:\s*(\d{8,})', page_html)
            if not created_match:
                created_match = re.search(r'"created"\s*:\s*(\d{8,})[^}]*"id"\s*:\s*\d+', page_html)
            if created_match:
                timestamp = int(created_match.group(1))
                try:
                    upload_date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')
                except (ValueError, OSError):
                    pass
            if not upload_date:
                date_match = re.search(r'"datePublished"\s*:\s*"([^"]+)"', page_html)
                if date_match:
                    upload_date = date_match.group(1)[:10]

            # Extract video ID from URL
            video_id = None
            id_match = re.search(r'-(xh[A-Za-z0-9]{4,7})$', video_url.rstrip('/').split('/')[-1])
            if id_match:
                video_id = id_match.group(1)

            if not title:
                title = video_url.rstrip('/').split('/')[-1]

            filename = f"{video_id}.mp4" if video_id else f"{re.sub(r'[^\\w\\s-]', '', title)[:100].strip()}.mp4"
            file_path = output_dir / filename

            # Try to extract video sources from window.initials JSON
            hls_url_from_json = None
            mp4_urls_from_json = {}  # quality -> url
            try:
                initials_match = re.search(r'window\.initials\s*=\s*(\{.+?\});\s*</script>', page_html, re.DOTALL)
                if initials_match:
                    initials = json.loads(initials_match.group(1))
                    video_model = initials.get('videoModel', {})
                    sources = video_model.get('sources', {})

                    # HLS source
                    hls_data = sources.get('hls')
                    if isinstance(hls_data, dict):
                        hls_url_from_json = hls_data.get('url')
                    elif isinstance(hls_data, str):
                        hls_url_from_json = hls_data

                    # MP4 download sources (keyed by quality like "480p", "720p", "1080p")
                    download_sources = sources.get('download', {})
                    if isinstance(download_sources, dict):
                        for quality_key, source_data in download_sources.items():
                            if isinstance(source_data, dict):
                                url = source_data.get('link') or source_data.get('url')
                                if url:
                                    mp4_urls_from_json[quality_key] = url
                            elif isinstance(source_data, str):
                                mp4_urls_from_json[quality_key] = source_data

                    # Also check mp4 sources
                    mp4_sources = sources.get('mp4', {})
                    if isinstance(mp4_sources, dict):
                        for quality_key, source_data in mp4_sources.items():
                            if quality_key not in mp4_urls_from_json:
                                if isinstance(source_data, dict):
                                    url = source_data.get('link') or source_data.get('url')
                                    if url:
                                        mp4_urls_from_json[quality_key] = url
                                elif isinstance(source_data, str):
                                    mp4_urls_from_json[quality_key] = source_data

                    # Also check standard sources
                    standard_sources = sources.get('standard', {})
                    if isinstance(standard_sources, dict):
                        for quality_key, source_data in standard_sources.items():
                            if quality_key not in mp4_urls_from_json:
                                if isinstance(source_data, dict):
                                    url = source_data.get('link') or source_data.get('url')
                                    if url:
                                        mp4_urls_from_json[quality_key] = url
                                elif isinstance(source_data, str):
                                    mp4_urls_from_json[quality_key] = source_data

                    if hls_url_from_json or mp4_urls_from_json:
                        self.log(f"Extracted video sources from JSON: HLS={'yes' if hls_url_from_json else 'no'}, MP4 qualities={list(mp4_urls_from_json.keys())}", 'debug')
            except (json.JSONDecodeError, Exception) as e:
                self.log(f"Could not parse video JSON sources: {e}", 'debug')

            # Try HLS download first (best quality, up to 4K)
            m3u8_url = hls_url_from_json
            if not m3u8_url:
                m3u8_match = re.search(r'"(https://video[^"]*\.xhcdn\.com/[^"]+\.m3u8[^"]*)"', page_html)
                if m3u8_match:
                    m3u8_url = m3u8_match.group(1)

            if m3u8_url:
                hls_result = await self._download_hls(m3u8_url, file_path)
                if hls_result:
                    file_size = file_path.stat().st_size
                    self.log(f"HLS download complete: {filename} ({file_size / 1024 / 1024:.1f}MB)", 'debug')
                    return {
                        'success': True,
                        'file_path': str(file_path),
                        'filename': filename,
                        'file_size': file_size,
                        'title': title,
                        'video_id': video_id,
                        'upload_date': upload_date,
                        'timestamp': timestamp,
                    }

            # Fallback: direct MP4 download - prefer JSON sources (highest quality)
            download_url = None

            if mp4_urls_from_json:
                # Select highest quality MP4 from JSON sources
                quality_priority = ['2160p', '1440p', '1080p', '720p', '480p', '360p', '240p']
                for q in quality_priority:
                    if q in mp4_urls_from_json:
                        download_url = mp4_urls_from_json[q]
                        self.log(f"Direct downloading ({q} from JSON): {filename}", 'debug')
                        break
                if not download_url:
                    # Take any available quality
                    download_url = next(iter(mp4_urls_from_json.values()))
                    self.log(f"Direct downloading (from JSON): {filename}", 'debug')

            if not download_url:
                # Regex fallback: extract MP4 URLs from page HTML
                mp4_urls = re.findall(
                    r'"(https://video[^"]*\.xhcdn\.com/[^"]+\.(?:h264|mp4)[^"]*)"',
                    page_html
                )
                mp4_urls = [u for u in mp4_urls if not u.endswith('.m3u8') and '.mp4' in u]
                # Filter out preview/sample URLs
                full_urls = [u for u in mp4_urls if not re.search(r'preview|sample|thumb', u, re.IGNORECASE)]
                if full_urls:
                    mp4_urls = full_urls
                mp4_urls = list(dict.fromkeys(mp4_urls))

                if not mp4_urls:
                    self.log("No video URL found on video page", 'debug')
                    return None

                # Take the LAST unique URL (previews tend to appear first in the HTML)
                download_url = mp4_urls[-1]
                self.log(f"Direct downloading (regex fallback): {filename}", 'debug')
            dl_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Referer': 'https://xhamster.com/',
            }

            async with aiohttp.ClientSession() as session:
                async with session.get(download_url, headers=dl_headers,
                                       allow_redirects=True,
                                       timeout=aiohttp.ClientTimeout(total=600)) as resp:
                    if resp.status != 200:
                        return {'success': False, 'error': f'Download failed: HTTP {resp.status}'}

                    total_size = int(resp.headers.get('Content-Length', 0))
                    downloaded = 0

                    with open(file_path, 'wb') as f:
                        async for chunk in resp.content.iter_chunked(65536):
                            f.write(chunk)
                            downloaded += len(chunk)
                            if progress_callback and total_size > 0:
                                progress_callback(downloaded / total_size * 100)

            file_size = file_path.stat().st_size
            self.log(f"Direct download complete: {filename} ({file_size / 1024 / 1024:.1f}MB)", 'debug')

            return {
                'success': True,
                'file_path': str(file_path),
                'filename': filename,
                'file_size': file_size,
                'title': title,
                'video_id': video_id,
                'upload_date': upload_date,
                'timestamp': timestamp,
            }

        except Exception as e:
            self.log(f"Direct download failed: {e}", 'debug')
            return None

    async def _download_hls(self, m3u8_url: str, output_path: Path) -> bool:
        """Download HLS stream using ffmpeg, selecting best quality"""
        try:
            import aiohttp
            from urllib.parse import urlparse
            from yarl import URL as YarlURL

            # Fetch master playlist to find best quality stream
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Referer': 'https://xhamster.com/',
            }

            best_stream_url = None
            best_bandwidth = 0

            async with aiohttp.ClientSession() as session:
                # Use encoded=True to preserve %2B/%3D in CloudFront signed URLs
                async with session.get(YarlURL(m3u8_url, encoded=True), headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                    if resp.status != 200:
                        self.log(f"HLS master playlist fetch failed: HTTP {resp.status}", 'debug')
                        return False
                    playlist = await resp.text()

            # Extract query params from master URL for forwarding to variant URLs
            parsed_master = urlparse(m3u8_url)
            master_query = parsed_master.query

            # Parse master playlist for best quality variant
            lines = playlist.strip().split('\n')

            # Check if this is already a media playlist (no STREAM-INF)
            has_variants = any(line.startswith('#EXT-X-STREAM-INF:') for line in lines)

            if not has_variants:
                # This is already a media playlist — download directly with ffmpeg
                self.log("HLS: single stream (no variants), downloading directly", 'debug')
                best_stream_url = m3u8_url
            else:
                for i, line in enumerate(lines):
                    if line.startswith('#EXT-X-STREAM-INF:'):
                        bw_match = re.search(r'BANDWIDTH=(\d+)', line)
                        bandwidth = int(bw_match.group(1)) if bw_match else 0
                        if bandwidth > best_bandwidth and i + 1 < len(lines):
                            stream_path = lines[i + 1].strip()
                            if stream_path.startswith('http'):
                                best_stream_url = stream_path
                            elif stream_path.startswith('//'):
                                # Protocol-relative URL (different CDN domain)
                                best_stream_url = f"{parsed_master.scheme}:{stream_path}"
                            elif stream_path.startswith('/'):
                                best_stream_url = f"{parsed_master.scheme}://{parsed_master.netloc}{stream_path}"
                            else:
                                m3u8_base = m3u8_url.split('?')[0].rsplit('/', 1)[0]
                                best_stream_url = f"{m3u8_base}/{stream_path}"
                            # Forward signed query params only if variant URL doesn't have its own
                            if master_query and '?' not in best_stream_url:
                                best_stream_url = f"{best_stream_url}?{master_query}"
                            best_bandwidth = bandwidth

            if not best_stream_url:
                self.log("No HLS variant found in master playlist", 'debug')
                return False

            quality_label = ''
            for i, line in enumerate(lines):
                if line.startswith('#EXT-X-STREAM-INF:') and i + 1 < len(lines):
                    rm = re.search(r'RESOLUTION=(\d+x\d+)', line)
                    if rm and int(re.search(r'BANDWIDTH=(\d+)', line).group(1)) == best_bandwidth:
                        quality_label = f" ({rm.group(1)})"
                        break

            self.log(f"HLS downloading best quality{quality_label}", 'debug')

            # Use ffmpeg to download
            cmd = [
                'ffmpeg', '-y',
                '-headers', 'Referer: https://xhamster.com/\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\r\n',
                '-i', best_stream_url,
                '-c', 'copy',
                '-movflags', '+faststart',
                str(output_path)
            ]

            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            _, stderr = await process.communicate()

            if process.returncode != 0:
                error = stderr.decode('utf-8', errors='replace')[-500:]
                self.log(f"ffmpeg HLS download failed: {error}", 'debug')
                return False

            return output_path.exists() and output_path.stat().st_size > 0

        except Exception as e:
            self.log(f"HLS download error: {e}", 'debug')
            return False

    async def _download_video_ytdlp(self, video_url: str, output_dir: Path, quality: str = 'best') -> Dict:
        """Download video using yt-dlp (fallback)"""
        if not self.is_available():
            return {'success': False, 'error': 'yt-dlp not available'}

        try:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

            output_template = str(output_dir / '%(title).100s_%(id)s.%(ext)s')

            format_str = self.QUALITY_PRESETS.get(quality, self.QUALITY_PRESETS['best'])

            cmd = self._get_base_cmd() + [
                '--no-warnings',
                '-f', format_str,
                '-o', output_template,
                '--print-json',
                '--no-playlist',
                '--user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                '--concurrent-fragments', '4',
                '--no-part',
                '--retries', '20',
                '--socket-timeout', '30',
                video_url
            ]

            result = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            stdout, stderr = await result.communicate()

            if result.returncode != 0:
                error_msg = stderr.decode('utf-8', errors='replace').strip()
                if 'Video unavailable' in error_msg or 'not available' in error_msg:
                    error_msg = 'Video unavailable or private'
                elif 'premium' in error_msg.lower():
                    error_msg = 'Video requires premium access'
                elif len(error_msg) > 200:
                    error_msg = error_msg[:200] + '...'

                return {'success': False, 'error': error_msg}

            # Parse output JSON
            video_info = None
            for line in stdout.decode('utf-8', errors='replace').strip().split('\n'):
                try:
                    video_info = json.loads(line)
                    break
                except json.JSONDecodeError:
                    continue

            if not video_info:
                files = list(output_dir.glob('*.mp4'))
                if files:
                    file_path = max(files, key=lambda f: f.stat().st_mtime)
                    return {
                        'success': True,
                        'file_path': str(file_path),
                        'filename': file_path.name,
                        'file_size': file_path.stat().st_size
                    }
                return {'success': False, 'error': 'Could not find downloaded file'}

            file_path = video_info.get('_filename') or video_info.get('filename')
            if file_path:
                file_path = Path(file_path)

            return {
                'success': True,
                'file_path': str(file_path) if file_path else None,
                'filename': file_path.name if file_path else None,
                'file_size': file_path.stat().st_size if file_path and file_path.exists() else video_info.get('filesize'),
                'title': video_info.get('title'),
                'duration': video_info.get('duration'),
                'video_id': video_info.get('id'),
                'upload_date': video_info.get('upload_date'),
                'timestamp': video_info.get('timestamp'),
                'thumbnail': video_info.get('thumbnail'),
            }

        except Exception as e:
            self.log(f"Error downloading video via yt-dlp: {e}", 'error')
            return {'success': False, 'error': str(e)}

    async def get_profile_page(self, url: str) -> Optional[str]:
        """Fetch profile page HTML via aiohttp. Results are cached."""
        base_url = re.sub(r'/(videos|shorts)/?$', '', url)

        if base_url in self._profile_page_cache:
            return self._profile_page_cache[base_url]

        try:
            import aiohttp

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
            }

            async with aiohttp.ClientSession() as session:
                async with session.get(
                    base_url,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=15)
                ) as resp:
                    if resp.status == 200:
                        text = await resp.text()
                        self._profile_page_cache[base_url] = text
                        return text

        except Exception as e:
            self.log(f"Could not fetch profile page: {e}", 'debug')

        self._profile_page_cache[base_url] = None
        return None

    async def _fetch_page_html(self, url: str) -> Optional[str]:
        """Fetch an arbitrary page's HTML via aiohttp (not cached)."""
        try:
            import aiohttp

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
            }

            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                    if resp.status == 200:
                        return await resp.text()
                    self.log(f"Page fetch failed: HTTP {resp.status} for {url}", 'debug')

        except Exception as e:
            self.log(f"Could not fetch page: {e}", 'debug')

        return None

    def _extract_initials_json(self, html: str, key_path: str) -> Optional[Any]:
        """Extract a value from window.initials JSON embedded in page HTML.

        Args:
            html: Page HTML containing window.initials = {...}
            key_path: Dot-separated path, e.g. 'galleryPage.photoItems'

        Returns:
            The extracted value, or None if not found.
        """
        try:
            match = re.search(r'window\.initials\s*=\s*(\{.+?\});\s*</script>', html, re.DOTALL)
            if not match:
                return None

            data = json.loads(match.group(1))

            for key in key_path.split('.'):
                if isinstance(data, dict):
                    data = data.get(key)
                else:
                    return None
                if data is None:
                    return None

            return data

        except (json.JSONDecodeError, Exception) as e:
            self.log(f"Failed to extract initials JSON for '{key_path}': {e}", 'debug')
            return None

    async def get_profile_image(self, url: str) -> Optional[str]:
        """Scrape profile page for avatar/photo URL"""
        try:
            page_html = await self.get_profile_page(url)
            if not page_html:
                return None

            # XHamster embeds creator data as JSON in the page.
            # Look for the main creator's thumbUrl in the pornstarTop JSON block
            thumb_match = re.search(
                r'"pornstarTop"\s*:\s*\{[\s\S]*?"thumbUrl"\s*:\s*"([^"]+)"',
                page_html
            )
            if thumb_match:
                avatar_url = thumb_match.group(1).replace('\\/', '/')
                self.log("Found XHamster profile avatar from JSON data", 'debug')
                return avatar_url

            # Fallback: CSS background-image on landing-info__logo-image
            bg_match = re.search(
                r'landing-info__logo-image["\'][^>]*style="[^"]*url\([\'"]?([^\'")]+)',
                page_html
            )
            if bg_match:
                self.log("Found XHamster profile avatar from CSS", 'debug')
                return bg_match.group(1)

            # Fallback: og:image meta tag
            og_match = re.search(r'<meta\s+property="og:image"\s+content="([^"]+)"', page_html)
            if not og_match:
                og_match = re.search(r'<meta\s+content="([^"]+)"\s+property="og:image"', page_html)
            if og_match:
                return og_match.group(1)

        except Exception as e:
            self.log(f"Could not fetch profile image: {e}", 'debug')

        return None

    async def get_profile_bio(self, url: str) -> Optional[str]:
        """Scrape bio/about section from profile page"""
        try:
            page_html = await self.get_profile_page(url)
            if not page_html:
                return None

            # Look for description/bio sections
            bio_match = re.search(
                r'<div[^>]*class="[^"]*about[^"]*"[^>]*>\s*(.*?)\s*</div>',
                page_html, re.DOTALL
            )
            if bio_match:
                bio_text = re.sub(r'<[^>]+>', '', bio_match.group(1)).strip()
                if bio_text:
                    self.log("Found XHamster profile bio", 'debug')
                    return html_module.unescape(bio_text)

            # Try meta description
            desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]+)"', page_html)
            if desc_match:
                bio_text = html_module.unescape(desc_match.group(1).strip())
                if bio_text and len(bio_text) > 20:
                    return bio_text

        except Exception as e:
            self.log(f"Could not fetch profile bio: {e}", 'debug')

        return None

    async def get_creator(self, url: str) -> Optional[Creator]:
        """Get Creator object from creator URL"""
        info = await self.get_creator_info(url)
        if not info:
            return None

        creator_type_id = self.extract_creator_id(url)
        if creator_type_id:
            creator_id = f"{creator_type_id[0]}/{creator_type_id[1]}"
        else:
            creator_id = info.get('creator_id', '')

        profile_image = await self.get_profile_image(url)

        return Creator(
            creator_id=creator_id,
            service_id='xhamster',
            platform='xhamster',
            username=info.get('creator_name', 'Unknown'),
            display_name=info.get('creator_name'),
            profile_image_url=profile_image,
        )

    async def get_posts(self, url: str, since_date: str = None,
                        max_videos: int = None, progress_callback=None) -> List[Post]:
        """Get all content (videos, shorts, galleries) as Post objects.

        Aggregates regular videos, shorts/moments, and photo galleries into a
        unified list of Post objects.  Deduplicates by post_id so videos and
        shorts that share an xhID are not counted twice.
        """
        creator_type_id = self.extract_creator_id(url)
        creator_id = f"{creator_type_id[0]}/{creator_type_id[1]}" if creator_type_id else ''

        posts = []
        seen_post_ids = set()

        # 1. Regular videos (via yt-dlp --flat-playlist)
        videos = await self.get_creator_videos(url, since_date, max_videos, progress_callback)
        for video in videos:
            vid = video['video_id']
            if vid in seen_post_ids:
                continue
            seen_post_ids.add(vid)

            attachment = Attachment(
                name=f"{vid}.mp4",
                file_type='video',
                extension='mp4',
                server_path=video['url'],
                download_url=video['url'],
                duration=video.get('duration'),
            )
            posts.append(Post(
                post_id=vid,
                service_id='xhamster',
                platform='xhamster',
                creator_id=creator_id,
                title=video['title'],
                content=video.get('description') or video['title'],
                published_at=video.get('upload_date'),
                attachments=[attachment],
            ))

        # 2. Shorts / Moments (HTML scraping)
        try:
            shorts = await self.get_creator_shorts(url, max_items=max_videos)
            for short in shorts:
                vid = short['video_id']
                if vid in seen_post_ids:
                    continue
                seen_post_ids.add(vid)

                attachment = Attachment(
                    name=f"{vid}.mp4",
                    file_type='video',
                    extension='mp4',
                    server_path=short['url'],
                    download_url=short['url'],
                    duration=short.get('duration'),
                )
                posts.append(Post(
                    post_id=vid,
                    service_id='xhamster',
                    platform='xhamster',
                    creator_id=creator_id,
                    title=short['title'],
                    content=short.get('description') or short['title'],
                    published_at=short.get('upload_date'),
                    attachments=[attachment],
                ))
        except Exception as e:
            self.log(f"Failed to fetch shorts (continuing with videos): {e}", 'warning')

        # 3. Photo galleries (HTML scraping)
        try:
            galleries = await self.get_creator_galleries(url)
            for gallery in galleries:
                gallery_post_id = f"gallery-{gallery['gallery_id']}"
                if gallery_post_id in seen_post_ids:
                    continue
                seen_post_ids.add(gallery_post_id)

                gallery_data = await self.get_gallery_images(gallery['url'])
                if not gallery_data or not gallery_data.get('images'):
                    continue

                attachments = []
                for img in gallery_data['images']:
                    # Determine extension from URL
                    ext = 'jpg'
                    if img['url']:
                        url_ext = img['url'].rsplit('.', 1)[-1].split('?')[0].lower()
                        if url_ext in ('jpg', 'jpeg', 'png', 'gif', 'webp'):
                            ext = url_ext

                    attachments.append(Attachment(
                        name=f"{img['id']}.{ext}",
                        file_type='image',
                        extension=ext,
                        server_path=img['url'],
                        download_url=img['url'],
                        width=img.get('width'),
                        height=img.get('height'),
                    ))

                posts.append(Post(
                    post_id=gallery_post_id,
                    service_id='xhamster',
                    platform='xhamster',
                    creator_id=creator_id,
                    title=gallery_data.get('title') or gallery.get('title', ''),
                    content=gallery_data.get('title') or gallery.get('title', ''),
                    published_at=gallery_data.get('created'),
                    attachments=attachments,
                ))

                # Small delay between gallery fetches
                await asyncio.sleep(0.5)

        except Exception as e:
            self.log(f"Failed to fetch galleries (continuing with videos/shorts): {e}", 'warning')

        self.log(f"Total posts: {len(posts)} (videos + shorts + galleries)", 'info')
        return posts