# media-downloader/modules/paid_content/besteyecandy_client.py

"""
BestEyeCandy.com Client for Paid Content

Scrapes celebrity photo galleries from BestEyeCandy.com.
Each celeb has a unique CID and paginated photo listings.

Optimization: Full-res URLs follow a predictable pattern. We visit ONE
detail page to determine the pattern (server hostname + name format),
then construct all remaining URLs from photo IDs found on listing pages.
"""

import asyncio
import html
import json
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse

import aiohttp

from modules.base_module import LoggingMixin
from .models import Post, Attachment


class BestEyeCandyClient(LoggingMixin):
    """Client for scraping BestEyeCandy.com celebrity photo galleries."""

    # Identifier for this scraper: the `scrapers` table row id used for the
    # cookie lookup, and the service_id set on every Post built here.
    SERVICE_ID = 'besteyecandy'
    # Platform label set on every Post built here.
    PLATFORM = 'besteyecandy'
    # Site root; listing and detail page URLs are built from it.
    BASE_URL = 'https://besteyecandy.com'

    # Browser-like headers sent with every page request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    def __init__(self, unified_db=None, log_callback=None):
        """Set up logging and remember the database handle.

        Args:
            unified_db: Optional database wrapper. When given, stored cookies
                are read through its ``get_connection()``; without it the
                client runs without cookies.
            log_callback: Passed through to ``_init_logger``.
        """
        self._init_logger('PaidContent', log_callback, default_module='BestEyeCandy')
        self.unified_db = unified_db

    # ------------------------------------------------------------------
    # Cookie support
    # ------------------------------------------------------------------

    def _get_cookies(self) -> Optional[list]:
        """Load cookies from the scrapers table for besteyecandy."""
        if not self.unified_db:
            return None

        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?",
                               (self.SERVICE_ID,))
                row = cursor.fetchone()
                if row and row[0]:
                    data = json.loads(row[0])
                    if isinstance(data, dict) and 'cookies' in data:
                        return data['cookies']
                    elif isinstance(data, list):
                        return data
        except Exception as e:
            self.log(f"Could not load cookies: {e}", 'debug')

        return None

    def _build_cookie_jar(self, cookies_list: list) -> aiohttp.CookieJar:
        """Build an aiohttp CookieJar from a list of cookie dicts.

        Args:
            cookies_list: Cookie dicts as loaded from the DB. The keys read
                are ``name``, ``value``, ``domain``, ``path`` and ``secure``.

        Returns:
            A jar (created with ``unsafe=True``) holding every usable cookie.
            Entries that are not dicts, have no name, or have a name the
            stdlib cookie parser rejects are skipped and logged at debug
            level instead of aborting the whole jar.
        """
        from http.cookies import CookieError, SimpleCookie

        # Cookies stored without a domain are scoped to the site itself.
        default_domain = urlparse(self.BASE_URL).hostname or ''

        jar = aiohttp.CookieJar(unsafe=True)
        for cookie in cookies_list:
            if not isinstance(cookie, dict):
                self.log(f"Skipping malformed cookie entry: {cookie!r}", 'debug')
                continue

            name = cookie.get('name', '')
            if not name:
                self.log("Skipping cookie without a name", 'debug')
                continue

            value = cookie.get('value', '')
            domain = cookie.get('domain') or default_domain
            path = cookie.get('path') or '/'

            sc = SimpleCookie()
            try:
                sc[name] = value
            except CookieError as e:
                # Illegal cookie names must not take the whole session down.
                self.log(f"Skipping invalid cookie {name!r}: {e}", 'debug')
                continue
            sc[name]['domain'] = domain
            sc[name]['path'] = path
            if cookie.get('secure'):
                sc[name]['secure'] = True

            # The Morsel already carries its domain and path, so no response
            # URL is passed. update_cookies() expects a yarl.URL there and
            # reads .raw_host from it; a urllib ParseResult has no such
            # attribute and would raise AttributeError.
            jar.update_cookies(sc)

        return jar

    def _create_session(self, timeout: aiohttp.ClientTimeout = None) -> aiohttp.ClientSession:
        """Open a new aiohttp session, attaching stored cookies when present.

        ``timeout`` defaults to a 60-second total timeout. When the DB yields
        cookies they are loaded into a jar for the session; otherwise a plain
        session is returned and a warning is logged.
        """
        effective_timeout = (aiohttp.ClientTimeout(total=60)
                             if timeout is None else timeout)

        stored_cookies = self._get_cookies()
        if not stored_cookies:
            self.log("No cookies found for besteyecandy, requests may fail", 'warning')
            return aiohttp.ClientSession(timeout=effective_timeout)

        cookie_jar = self._build_cookie_jar(stored_cookies)
        self.log(f"Loaded {len(stored_cookies)} cookies for session", 'debug')
        return aiohttp.ClientSession(timeout=effective_timeout, cookie_jar=cookie_jar)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_profile_info(self, cid: str, celeb_slug: str) -> Optional[Dict]:
        """Fetch page 1 of a celeb's listing and return profile-like info."""
        url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
               f'sortedby-age/page-1/{celeb_slug}.html')

        try:
            async with self._create_session() as session:
                async with session.get(url, headers=self.HEADERS,
                                       allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"BestEyeCandy cid {cid} returned HTTP {resp.status}",
                                 'warning')
                        return None
                    page_html = await resp.text()
        except Exception as e:
            self.log(f"Failed to fetch BestEyeCandy cid {cid}: {e}", 'error')
            return None

        # Extract celeb name from page title or heading
        celeb_name = self._extract_celeb_name(page_html) or celeb_slug.replace('-', ' ')

        # Extract total photos and pages
        total_photos = self._extract_total_photos(page_html)
        photos_per_page = len(self._extract_photo_ids(page_html)) or 48
        page_count = self._extract_page_count(page_html,
                                               photos_per_page=photos_per_page)

        celeb_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                     f'sortedby-age/page-1/{celeb_slug}.html')

        return {
            'username': celeb_slug,
            'display_name': celeb_name,
            'post_count': total_photos,
            'page_count': page_count,
            'celeb_url': celeb_url,
        }

    async def get_posts(self, cid: str, celeb_slug: str,
                        known_post_ids: Optional[Set[str]] = None,
                        progress_callback=None) -> List[Post]:
        """Scrape all listing pages and return posts with full-res image URLs.

        Each listing page becomes one Post with ~48 Attachments (one per photo).
        Post IDs are "page_N" (e.g. "page_1", "page_2", ...).

        Phase 1: Fetch page 1, get first photo ID, visit detail page to learn
                 the full-res URL pattern.
        Phase 2: Paginate all listing pages, build one Post per page.
        """
        known = known_post_ids or set()
        posts: List[Post] = []
        total_photos = 0
        url_pattern = None

        try:
            async with self._create_session() as session:
                # -- Phase 1: Fetch page 1 and determine full-res URL pattern --
                page1_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                             f'sortedby-age/page-1/{celeb_slug}.html')

                page_html = await self._fetch_page(session, page1_url)
                if page_html is None:
                    return []

                # Estimate page count for progress display
                photos_per_page = len(self._extract_photo_ids(page_html)) or 48
                estimated_pages = self._extract_page_count(
                    page_html, photos_per_page=photos_per_page)
                self.log(f"Estimated {estimated_pages} pages of photos "
                         f"({photos_per_page}/page)", 'info')

                # Discover full-res URL pattern from first photo
                first_page_ids = self._extract_photo_ids(page_html)
                if first_page_ids:
                    url_pattern = await self._discover_url_pattern(
                        session, first_page_ids[0], cid, celeb_slug)

                if not url_pattern:
                    self.log("Could not determine full-res URL pattern", 'error')
                    return []

                self.log(f"URL pattern: server={url_pattern['server']}, "
                         f"name_format={url_pattern['name_format']}, "
                         f"ext={url_pattern['ext']}", 'info')

                # -- Phase 2: Paginate all pages, one Post per page --
                page_num = 0
                has_next = True  # start with page 1

                while has_next:
                    page_num += 1

                    if page_num == 1:
                        # Already fetched page 1
                        pass
                    else:
                        await asyncio.sleep(2)  # Rate limit

                        page_url = (
                            f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                            f'sortedby-age/page-{page_num}/{celeb_slug}.html')

                        page_html = await self._fetch_page(session, page_url)
                        if page_html is None:
                            self.log(f"Failed to fetch page {page_num}, stopping",
                                     'warning')
                            break

                    page_ids = self._extract_photo_ids(page_html)
                    if not page_ids:
                        self.log(f"Page {page_num}: no photos, stopping", 'info')
                        break

                    total_photos += len(page_ids)
                    has_next = self._has_next_page(page_html)

                    # Check if this page-post is already known
                    post_id = f"page_{page_num}"
                    if post_id in known:
                        self.log(f"Page {page_num}: already known, skipping",
                                 'debug')
                        if progress_callback:
                            progress_callback(
                                f"Page {page_num}/~{estimated_pages} — "
                                f"{total_photos} photos (skipped known)")
                        continue

                    # Build attachments for all photos on this page
                    attachments = []
                    for photo_id in page_ids:
                        dl_url = self._construct_full_res_url(url_pattern, photo_id)
                        filename = dl_url.rsplit('/', 1)[-1]

                        attachments.append(Attachment(
                            name=filename,
                            file_type='image',
                            extension=url_pattern.get('ext', 'jpg'),
                            server_path=dl_url,
                            download_url=dl_url,
                        ))

                    post = Post(
                        post_id=post_id,
                        service_id=self.SERVICE_ID,
                        platform=self.PLATFORM,
                        creator_id=cid,
                        title=f"Page {page_num}",
                        content=f"{len(page_ids)} photos",
                        published_at=datetime.now(tz=timezone.utc).isoformat(),
                        attachments=attachments,
                    )
                    posts.append(post)

                    if progress_callback:
                        progress_callback(
                            f"Page {page_num}/~{estimated_pages} — "
                            f"{total_photos} photos")

                    self.log(f"Page {page_num}/~{estimated_pages}: "
                             f"{len(page_ids)} photos", 'debug')

        except Exception as e:
            self.log(f"Error scraping BestEyeCandy: {e}", 'error')

        self.log(f"Total: {len(posts)} new page-posts with "
                 f"{total_photos} photos across all pages", 'info')
        return posts

    # ------------------------------------------------------------------
    # URL pattern discovery
    # ------------------------------------------------------------------

    async def _discover_url_pattern(self, session: aiohttp.ClientSession,
                                    photo_id: str, cid: str,
                                    celeb_slug: str) -> Optional[Dict]:
        """Visit a detail page to discover the full-res URL pattern.

        Returns dict with keys: server, dir_pattern, name_format, ext
        """
        detail_url = (f'{self.BASE_URL}/section/celeb-photogallery/'
                      f'cid-{cid}/{celeb_slug}/photo-{photo_id}.html')

        await asyncio.sleep(2)  # Rate limit
        page_html = await self._fetch_page(session, detail_url)
        if page_html is None:
            return None

        # Look for full-res image URL in the detail page
        # Pattern: <img src="https://euX.besteyecandy.com/section/large-photos/area-female/besteyecandy-{ID}/{Name}_{ID}_BestEyeCandyCOM.jpg">
        # or <a href="..."> with similar pattern
        patterns = [
            r'(https?://[a-z0-9]+\.besteyecandy\.com/section/large-photos/[^"\'>\s]+)',
            r'(https?://[a-z0-9]+\.besteyecandy\.com/[^"\'>\s]*besteyecandy-' + re.escape(photo_id) + r'[^"\'>\s]*)',
        ]

        full_res_url = None
        for pattern in patterns:
            match = re.search(pattern, page_html)
            if match:
                full_res_url = match.group(1)
                break

        if not full_res_url:
            self.log(f"Could not find full-res URL on detail page for photo {photo_id}",
                     'error')
            return None

        self.log(f"Found full-res URL: {full_res_url}", 'debug')

        # Parse the URL to extract the pattern components
        parsed = urlparse(full_res_url)
        server = parsed.netloc  # e.g., eu4.besteyecandy.com

        # Extract name format from the filename
        # e.g., Myleene_Klass_7727820_BestEyeCandyCOM.jpg
        filename = parsed.path.rsplit('/', 1)[-1]
        ext = filename.rsplit('.', 1)[-1] if '.' in filename else 'jpg'

        # Extract the path pattern (everything before the filename)
        path_dir = parsed.path.rsplit('/', 1)[0]  # e.g., /section/large-photos/area-female/besteyecandy-7727820

        # The directory pattern includes the photo ID, extract the base
        # e.g., /section/large-photos/area-female/besteyecandy-{ID}
        dir_pattern = re.sub(re.escape(photo_id), '{ID}', path_dir)

        # Extract the name format by removing the photo ID
        # e.g., Myleene_Klass_{ID}_BestEyeCandyCOM.jpg -> Myleene_Klass_{ID}_BestEyeCandyCOM
        name_without_ext = filename.rsplit('.', 1)[0]
        name_format = name_without_ext.replace(photo_id, '{ID}')

        return {
            'server': server,
            'dir_pattern': dir_pattern,
            'name_format': name_format,
            'ext': ext,
            'example_url': full_res_url,
        }

    def _construct_full_res_url(self, url_pattern: Dict, photo_id: str) -> str:
        """Construct the full-res URL for a photo ID using the discovered pattern."""
        dir_path = url_pattern['dir_pattern'].replace('{ID}', photo_id)
        filename = url_pattern['name_format'].replace('{ID}', photo_id) + '.' + url_pattern['ext']
        return f"https://{url_pattern['server']}{dir_path}/{filename}"

    # ------------------------------------------------------------------
    # HTML parsing helpers
    # ------------------------------------------------------------------

    def _extract_photo_ids(self, page_html: str) -> List[str]:
        """Extract photo IDs from a listing page.

        Photo links look like: href="...photo-12345.html"
        """
        ids = re.findall(r'href="[^"]*photo-(\d+)\.html"', page_html)
        # Deduplicate while preserving order
        seen = set()
        unique_ids = []
        for pid in ids:
            if pid not in seen:
                seen.add(pid)
                unique_ids.append(pid)
        return unique_ids

    @staticmethod
    def _extract_celeb_name(page_html: str) -> Optional[str]:
        """Extract celebrity name from the page."""
        # Try <title> tag: "Myleene Klass Photo Collection @ ...::: BestEyeCandy.com :::..."
        m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
        if m:
            title = html.unescape(m.group(1).strip())
            # Remove everything from "Photo Collection" or "@" onwards
            title = re.sub(r'\s*Photo\s+Collection.*$', '', title,
                           flags=re.IGNORECASE).strip()
            title = re.sub(r'\s*@.*$', '', title).strip()
            # Fallback: remove BestEyeCandy suffix
            title = re.sub(r'\s*[-\u2013\u2014|]?\s*\.{0,3}:{0,3}\s*BestEyeCandy.*$', '',
                           title, flags=re.IGNORECASE).strip()
            if title:
                return title

        # Try <h1> or <h2>
        m = re.search(r'<h[12][^>]*>([^<]+)</h[12]>', page_html)
        if m:
            return html.unescape(m.group(1).strip())

        return None

    @staticmethod
    def _extract_total_photos(page_html: str) -> int:
        """Extract total photo count from the page.

        Handles European format (15.660) and US format (15,660).
        """
        # Look for "N.NNN photos" or "N,NNN photos" or "NNN photos"
        # Require leading digit to avoid matching ", photo" from keywords
        m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
        if m:
            num_str = m.group(1)
            # European format uses dots as thousands separators: 15.660
            # US format uses commas: 15,660
            # Remove both dots and commas (they're thousands separators)
            num_str = num_str.replace('.', '').replace(',', '')
            try:
                return int(num_str)
            except ValueError:
                pass
        return 0

    @staticmethod
    def _extract_page_count(page_html: str, photos_per_page: int = 48) -> int:
        """Extract total page count from the listing page.

        Uses total photo count divided by photos per page, or falls back
        to finding the maximum page number in pagination links.
        """
        # Method 1: Calculate from total photos
        m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
        if m:
            num_str = m.group(1).replace('.', '').replace(',', '')
            try:
                total = int(num_str)
                if total > 0:
                    return (total + photos_per_page - 1) // photos_per_page
            except ValueError:
                pass

        # Method 2: Find max page-N in pagination links for same celeb
        page_nums = [int(x) for x in re.findall(r'/page-(\d+)/', page_html)]
        if page_nums:
            return max(page_nums)

        return 1

    @staticmethod
    def _has_next_page(page_html: str) -> bool:
        """Check if there's a 'Next Page' link on the current page."""
        return 'alt="Next Page"' in page_html

    # ------------------------------------------------------------------
    # Utility helpers
    # ------------------------------------------------------------------

    async def _fetch_page(self, session: aiohttp.ClientSession,
                          url: str) -> Optional[str]:
        """Fetch a single page, return HTML or None."""
        try:
            async with session.get(url, headers=self.HEADERS,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} for {url}", 'warning')
                    return None
                return await resp.text()
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None