Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions
--- a/modules/paid_content/besteyecandy_client.py
+++ b/modules/paid_content/besteyecandy_client.py
@@ -0,0 +1,468 @@
+"""
+BestEyeCandy.com Client for Paid Content
+
+Scrapes celebrity photo galleries from BestEyeCandy.com.
+Each celeb has a unique CID and paginated photo listings.
+
+Optimization: Full-res URLs follow a predictable pattern. We visit ONE
+detail page to determine the pattern (server hostname + name format),
+then construct all remaining URLs from photo IDs found on listing pages.
+"""
+
+import asyncio
+import html
+import json
+import re
+from datetime import datetime, timezone
+from typing import Dict, List, Optional, Set
+from urllib.parse import urlparse
+
+import aiohttp
+
+from modules.base_module import LoggingMixin
+from .models import Post, Attachment
+
+
+class BestEyeCandyClient(LoggingMixin):
+    """Client for scraping BestEyeCandy.com celebrity photo galleries.
+
+    The public entry points are ``get_profile_info`` (summary data taken
+    from listing page 1) and ``get_posts`` (one Post per listing page).
+    """
+
+    # Key used to look up stored cookies in the ``scrapers`` table and
+    # written into every Post as ``service_id``.
+    SERVICE_ID = 'besteyecandy'
+    # Written into every Post as ``platform``.
+    PLATFORM = 'besteyecandy'
+    # Prefix for all listing-page and detail-page URLs built by this client.
+    BASE_URL = 'https://besteyecandy.com'
+
+    # Request headers sent with every HTTP request this client makes.
+    HEADERS = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+                       '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.9',
+    }
+
+    def __init__(self, unified_db=None, log_callback=None):
+        """Set up logging and remember the optional database handle.
+
+        Args:
+            unified_db: Optional database object. When set, its
+                ``get_connection()`` is used to load stored cookies;
+                when falsy, no cookies are loaded.
+            log_callback: Forwarded unchanged to ``_init_logger``.
+        """
+        self._init_logger('PaidContent', log_callback, default_module='BestEyeCandy')
+        self.unified_db = unified_db
+
+    # ------------------------------------------------------------------
+    # Cookie support
+    # ------------------------------------------------------------------
+
+    def _get_cookies(self) -> Optional[list]:
+        """Return the cookies stored for this service, or None.
+
+        Reads ``cookies_json`` from the ``scrapers`` row whose ``id`` equals
+        ``SERVICE_ID``. The JSON may be either a bare list of cookies or a
+        dict holding that list under the ``'cookies'`` key. Any failure
+        (no database, no row, bad JSON, query error) yields None; errors
+        are logged at debug level rather than raised.
+        """
+        db = self.unified_db
+        if not db:
+            return None
+
+        try:
+            with db.get_connection() as conn:
+                cur = conn.cursor()
+                cur.execute("SELECT cookies_json FROM scrapers WHERE id = ?",
+                            (self.SERVICE_ID,))
+                record = cur.fetchone()
+                raw_json = record[0] if record else None
+                if not raw_json:
+                    return None
+
+                payload = json.loads(raw_json)
+                if isinstance(payload, list):
+                    return payload
+                if isinstance(payload, dict) and 'cookies' in payload:
+                    return payload['cookies']
+        except Exception as e:
+            # Best-effort: a missing table or malformed JSON just means
+            # the session is created without cookies.
+            self.log(f"Could not load cookies: {e}", 'debug')
+
+        return None
+
+    def _build_cookie_jar(self, cookies_list: list) -> aiohttp.CookieJar:
+        """Build an aiohttp CookieJar from a list of cookie dicts.
+
+        Each dict may carry ``name``, ``value``, ``domain``, ``path`` and
+        ``secure``. Entries without a usable name are skipped instead of
+        aborting the whole jar.
+
+        Args:
+            cookies_list: Cookie dicts as returned by ``_get_cookies``.
+
+        Returns:
+            A CookieJar (``unsafe=True``) populated with the valid cookies.
+        """
+        # Imported once per call; the file does not import these at top level.
+        from http.cookies import CookieError, SimpleCookie
+
+        # Cookies stored without a domain are scoped to the site itself
+        # rather than being left domain-less.
+        default_domain = urlparse(self.BASE_URL).hostname or ''
+
+        jar = aiohttp.CookieJar(unsafe=True)
+        for cookie in cookies_list:
+            name = cookie.get('name', '')
+            if not name:
+                continue
+
+            sc = SimpleCookie()
+            try:
+                sc[name] = cookie.get('value', '')
+            except CookieError as e:
+                # Illegal cookie names (e.g. containing spaces) are rejected
+                # by http.cookies; drop that one cookie and carry on.
+                self.log(f"Skipping invalid cookie {name!r}: {e}", 'debug')
+                continue
+
+            sc[name]['domain'] = cookie.get('domain') or default_domain
+            sc[name]['path'] = cookie.get('path') or '/'
+            if cookie.get('secure'):
+                sc[name]['secure'] = True
+
+            # No response_url is passed: update_cookies expects a yarl.URL
+            # there, and the urllib ParseResult used previously has no
+            # ``raw_host`` attribute. The explicit domain/path set on the
+            # morsel above already scope the cookie.
+            jar.update_cookies(sc)
+
+        return jar
+
+    def _create_session(self, timeout: aiohttp.ClientTimeout = None) -> aiohttp.ClientSession:
+        """Return a new aiohttp session, carrying stored cookies if any exist.
+
+        Args:
+            timeout: Session timeout; when None a 60-second total timeout
+                is used.
+
+        Returns:
+            A ClientSession. It has a cookie jar built from the stored
+            cookies when some were found, otherwise it is a plain session
+            (and a warning is logged).
+        """
+        effective_timeout = (aiohttp.ClientTimeout(total=60)
+                             if timeout is None else timeout)
+
+        stored = self._get_cookies()
+        if not stored:
+            self.log("No cookies found for besteyecandy, requests may fail", 'warning')
+            return aiohttp.ClientSession(timeout=effective_timeout)
+
+        cookie_jar = self._build_cookie_jar(stored)
+        self.log(f"Loaded {len(stored)} cookies for session", 'debug')
+        return aiohttp.ClientSession(timeout=effective_timeout, cookie_jar=cookie_jar)
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    async def get_profile_info(self, cid: str, celeb_slug: str) -> Optional[Dict]:
+        """Fetch listing page 1 for a celeb and summarise it as a profile dict.
+
+        Args:
+            cid: The celeb's numeric ID as used in the site's URLs.
+            celeb_slug: The celeb's URL slug (e.g. ``Myleene-Klass``).
+
+        Returns:
+            A dict with ``username``, ``display_name``, ``post_count``,
+            ``page_count`` and ``celeb_url``, or None when the page could
+            not be fetched or did not return HTTP 200.
+        """
+        listing_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
+                       f'sortedby-age/page-1/{celeb_slug}.html')
+
+        try:
+            async with self._create_session() as session:
+                async with session.get(listing_url, headers=self.HEADERS,
+                                       allow_redirects=True) as resp:
+                    if resp.status != 200:
+                        self.log(f"BestEyeCandy cid {cid} returned HTTP {resp.status}",
+                                 'warning')
+                        return None
+                    body = await resp.text()
+        except Exception as e:
+            self.log(f"Failed to fetch BestEyeCandy cid {cid}: {e}", 'error')
+            return None
+
+        # Fall back to a de-hyphenated slug when the page yields no name.
+        display_name = self._extract_celeb_name(body)
+        if not display_name:
+            display_name = celeb_slug.replace('-', ' ')
+
+        # Page size is measured from this page; 48 is the fallback when
+        # no photo links were found.
+        per_page = len(self._extract_photo_ids(body)) or 48
+
+        return {
+            'username': celeb_slug,
+            'display_name': display_name,
+            'post_count': self._extract_total_photos(body),
+            'page_count': self._extract_page_count(body, photos_per_page=per_page),
+            'celeb_url': listing_url,
+        }
+
+    async def get_posts(self, cid: str, celeb_slug: str,
+                        known_post_ids: Optional[Set[str]] = None,
+                        progress_callback=None) -> List[Post]:
+        """Walk a celeb's listing pages and turn each new page into a Post.
+
+        Every listing page becomes one Post whose ID is ``page_N`` and
+        whose attachments are the full-resolution images for the photos
+        linked from that page.
+
+        Steps:
+          1. Fetch page 1 and visit the detail page of its first photo to
+             learn the full-res URL pattern.
+          2. Follow "Next Page" links, pausing 2 seconds between fetches,
+             building a Post for each page not listed in ``known_post_ids``.
+
+        Args:
+            cid: The celeb's numeric ID as used in the site's URLs.
+            celeb_slug: The celeb's URL slug.
+            known_post_ids: Post IDs to skip (still counted for progress).
+            progress_callback: Optional callable receiving a status string
+                after each page.
+
+        Returns:
+            The newly built Posts. Empty when page 1 or the URL pattern
+            cannot be obtained; partial when a later page fails or an
+            unexpected error interrupts the crawl.
+        """
+        already_seen = known_post_ids or set()
+        collected: List[Post] = []
+        photo_total = 0
+        url_pattern = None
+
+        def listing_url(number: int) -> str:
+            # URL of listing page ``number`` for this celeb.
+            return (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
+                    f'sortedby-age/page-{number}/{celeb_slug}.html')
+
+        def to_attachment(photo_id: str) -> Attachment:
+            # One image attachment; its name is the last URL path segment.
+            full_url = self._construct_full_res_url(url_pattern, photo_id)
+            return Attachment(
+                name=full_url.rsplit('/', 1)[-1],
+                file_type='image',
+                extension=url_pattern.get('ext', 'jpg'),
+                server_path=full_url,
+                download_url=full_url,
+            )
+
+        try:
+            async with self._create_session() as session:
+                # -- Step 1: page 1 + full-res URL pattern --
+                page_html = await self._fetch_page(session, listing_url(1))
+                if page_html is None:
+                    return []
+
+                first_page_ids = self._extract_photo_ids(page_html)
+
+                # Page estimate is only used for progress/log output.
+                photos_per_page = len(first_page_ids) or 48
+                estimated_pages = self._extract_page_count(
+                    page_html, photos_per_page=photos_per_page)
+                self.log(f"Estimated {estimated_pages} pages of photos "
+                         f"({photos_per_page}/page)", 'info')
+
+                if first_page_ids:
+                    url_pattern = await self._discover_url_pattern(
+                        session, first_page_ids[0], cid, celeb_slug)
+
+                if not url_pattern:
+                    self.log("Could not determine full-res URL pattern", 'error')
+                    return []
+
+                self.log(f"URL pattern: server={url_pattern['server']}, "
+                         f"name_format={url_pattern['name_format']}, "
+                         f"ext={url_pattern['ext']}", 'info')
+
+                # -- Step 2: one Post per listing page --
+                page_num = 1
+                while True:
+                    if page_num > 1:
+                        # Page 1 is already in hand; later pages are
+                        # fetched after a rate-limiting pause.
+                        await asyncio.sleep(2)
+                        page_html = await self._fetch_page(
+                            session, listing_url(page_num))
+                        if page_html is None:
+                            self.log(f"Failed to fetch page {page_num}, stopping",
+                                     'warning')
+                            break
+
+                    page_ids = self._extract_photo_ids(page_html)
+                    if not page_ids:
+                        self.log(f"Page {page_num}: no photos, stopping", 'info')
+                        break
+
+                    photo_total += len(page_ids)
+                    more_pages = self._has_next_page(page_html)
+
+                    post_id = f"page_{page_num}"
+                    if post_id in already_seen:
+                        self.log(f"Page {page_num}: already known, skipping",
+                                 'debug')
+                        if progress_callback:
+                            progress_callback(
+                                f"Page {page_num}/~{estimated_pages} — "
+                                f"{photo_total} photos (skipped known)")
+                    else:
+                        page_attachments = [to_attachment(pid) for pid in page_ids]
+                        collected.append(Post(
+                            post_id=post_id,
+                            service_id=self.SERVICE_ID,
+                            platform=self.PLATFORM,
+                            creator_id=cid,
+                            title=f"Page {page_num}",
+                            content=f"{len(page_ids)} photos",
+                            published_at=datetime.now(tz=timezone.utc).isoformat(),
+                            attachments=page_attachments,
+                        ))
+
+                        if progress_callback:
+                            progress_callback(
+                                f"Page {page_num}/~{estimated_pages} — "
+                                f"{photo_total} photos")
+
+                        self.log(f"Page {page_num}/~{estimated_pages}: "
+                                 f"{len(page_ids)} photos", 'debug')
+
+                    if not more_pages:
+                        break
+                    page_num += 1
+
+        except Exception as e:
+            # Best-effort crawl: keep whatever was collected so far.
+            self.log(f"Error scraping BestEyeCandy: {e}", 'error')
+
+        self.log(f"Total: {len(collected)} new page-posts with "
+                 f"{photo_total} photos across all pages", 'info')
+        return collected
+
+    # ------------------------------------------------------------------
+    # URL pattern discovery
+    # ------------------------------------------------------------------
+
+    async def _discover_url_pattern(self, session: aiohttp.ClientSession,
+                                    photo_id: str, cid: str,
+                                    celeb_slug: str) -> Optional[Dict]:
+        """Visit one photo's detail page to learn the full-res URL pattern.
+
+        The full-res URL found on the page is split into a template in
+        which every occurrence of ``photo_id`` in the path is replaced by
+        the placeholder ``{ID}``.
+
+        Args:
+            session: Open aiohttp session used for the request.
+            photo_id: ID of the photo whose detail page is visited.
+            cid: The celeb's numeric ID as used in the site's URLs.
+            celeb_slug: The celeb's URL slug.
+
+        Returns:
+            A dict with keys ``server``, ``dir_pattern``, ``name_format``,
+            ``ext`` and ``example_url``, or None when the page cannot be
+            fetched or no full-res URL for *this* photo is found on it.
+        """
+        detail_url = (f'{self.BASE_URL}/section/celeb-photogallery/'
+                      f'cid-{cid}/{celeb_slug}/photo-{photo_id}.html')
+
+        await asyncio.sleep(2)  # Rate limit
+        page_html = await self._fetch_page(session, detail_url)
+        if page_html is None:
+            return None
+
+        # Candidate full-res URLs, e.g.
+        # https://euX.besteyecandy.com/section/large-photos/area-female/besteyecandy-{ID}/{Name}_{ID}_BestEyeCandyCOM.jpg
+        patterns = [
+            r'(https?://[a-z0-9]+\.besteyecandy\.com/section/large-photos/[^"\'>\s]+)',
+            r'(https?://[a-z0-9]+\.besteyecandy\.com/[^"\'>\s]*besteyecandy-' + re.escape(photo_id) + r'[^"\'>\s]*)',
+        ]
+
+        candidates: List[str] = []
+        for pattern in patterns:
+            candidates.extend(re.findall(pattern, page_html))
+
+        if not candidates:
+            self.log(f"Could not find full-res URL on detail page for photo {photo_id}",
+                     'error')
+            return None
+
+        # Only a URL whose path contains this photo's ID can be turned into
+        # a template. Taking an unrelated image (e.g. another photo shown
+        # on the same page) would yield a pattern without an {ID}
+        # placeholder, and every constructed URL would be identical.
+        full_res_url = next(
+            (url for url in candidates if photo_id in urlparse(url).path), None)
+        if full_res_url is None:
+            self.log(f"Full-res URLs on detail page do not contain photo ID "
+                     f"{photo_id}; cannot derive URL pattern", 'error')
+            return None
+
+        self.log(f"Found full-res URL: {full_res_url}", 'debug')
+
+        parsed = urlparse(full_res_url)
+        server = parsed.netloc  # e.g., eu4.besteyecandy.com
+
+        # Split the path into directory and filename,
+        # e.g. /section/large-photos/area-female/besteyecandy-7727820
+        # and Myleene_Klass_7727820_BestEyeCandyCOM.jpg
+        path_dir, _, filename = parsed.path.rpartition('/')
+        if '.' in filename:
+            name_without_ext, ext = filename.rsplit('.', 1)
+        else:
+            name_without_ext, ext = filename, 'jpg'
+
+        return {
+            'server': server,
+            'dir_pattern': path_dir.replace(photo_id, '{ID}'),
+            'name_format': name_without_ext.replace(photo_id, '{ID}'),
+            'ext': ext,
+            'example_url': full_res_url,
+        }
+
+    def _construct_full_res_url(self, url_pattern: Dict, photo_id: str) -> str:
+        """Fill a discovered URL pattern with a concrete photo ID.
+
+        Args:
+            url_pattern: Dict with ``server``, ``dir_pattern``,
+                ``name_format`` and ``ext`` as produced by
+                ``_discover_url_pattern``.
+            photo_id: ID substituted for every ``{ID}`` placeholder.
+
+        Returns:
+            The ``https://`` URL of the full-resolution image.
+        """
+        directory = url_pattern['dir_pattern'].replace('{ID}', photo_id)
+        stem = url_pattern['name_format'].replace('{ID}', photo_id)
+        file_name = '.'.join((stem, url_pattern['ext']))
+        host = url_pattern['server']
+        return f"https://{host}{directory}/{file_name}"
+
+    # ------------------------------------------------------------------
+    # HTML parsing helpers
+    # ------------------------------------------------------------------
+
+    def _extract_photo_ids(self, page_html: str) -> List[str]:
+        """Return the distinct photo IDs linked from a listing page.
+
+        IDs are taken from double-quoted links of the form
+        ``href="...photo-12345.html"`` and are returned in order of first
+        appearance.
+        """
+        linked_ids = re.findall(r'href="[^"]*photo-(\d+)\.html"', page_html)
+        # dict keys keep insertion order, so this drops repeats while
+        # preserving the first-seen ordering.
+        return list(dict.fromkeys(linked_ids))
+
+    @staticmethod
+    def _extract_celeb_name(page_html: str) -> Optional[str]:
+        """Return the celebrity's name found in the page, or None.
+
+        The ``<title>`` text is tried first, with the site's boilerplate
+        stripped (e.g. "Myleene Klass Photo Collection @ ...:::
+        BestEyeCandy.com :::..." becomes "Myleene Klass"). If nothing is
+        left, the first ``<h1>``/``<h2>`` text is used instead.
+        """
+        title_match = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
+        if title_match is not None:
+            name = html.unescape(title_match.group(1).strip())
+            # Applied in order: drop "Photo Collection ..." onwards, then
+            # "@ ..." onwards, then any trailing BestEyeCandy suffix.
+            cleanup_steps = (
+                (r'\s*Photo\s+Collection.*$', re.IGNORECASE),
+                (r'\s*@.*$', 0),
+                (r'\s*[-\u2013\u2014|]?\s*\.{0,3}:{0,3}\s*BestEyeCandy.*$', re.IGNORECASE),
+            )
+            for regex, flags in cleanup_steps:
+                name = re.sub(regex, '', name, flags=flags).strip()
+            if name:
+                return name
+
+        heading_match = re.search(r'<h[12][^>]*>([^<]+)</h[12]>', page_html)
+        if heading_match is None:
+            return None
+        return html.unescape(heading_match.group(1).strip())
+
+    @staticmethod
+    def _extract_total_photos(page_html: str) -> int:
+        """Return the photo count stated on the page, or 0 if none is found.
+
+        Uses the first "<number> photo(s)" phrase in the page. Both dots
+        (European style, 15.660) and commas (US style, 15,660) are treated
+        as thousands separators and discarded.
+        """
+        # A leading digit is required so that text such as ", photos" in a
+        # keyword list is not mistaken for a count.
+        found = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
+        if found is None:
+            return 0
+
+        digits_only = ''.join(ch for ch in found.group(1) if ch not in '.,')
+        try:
+            return int(digits_only)
+        except ValueError:
+            return 0
+
+    @staticmethod
+    def _extract_page_count(page_html: str, photos_per_page: int = 48) -> int:
+        """Estimate how many listing pages a celeb has.
+
+        Args:
+            page_html: HTML of a listing page.
+            photos_per_page: Number of photos shown per listing page.
+
+        Returns:
+            The stated photo total divided by ``photos_per_page`` (rounded
+            up) when both are positive; otherwise the highest ``/page-N/``
+            number linked from the page; otherwise 1.
+        """
+        # Method 1: calculate from the total photo count. Skipped for a
+        # non-positive page size, which would otherwise divide by zero or
+        # produce a meaningless count.
+        if photos_per_page > 0:
+            m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
+            if m:
+                # Dots and commas are thousands separators (15.660 / 15,660).
+                num_str = m.group(1).replace('.', '').replace(',', '')
+                try:
+                    total = int(num_str)
+                except ValueError:
+                    total = 0
+                if total > 0:
+                    return (total + photos_per_page - 1) // photos_per_page
+
+        # Method 2: highest page-N seen in pagination links.
+        page_nums = [int(x) for x in re.findall(r'/page-(\d+)/', page_html)]
+        return max(page_nums, default=1)
+
+    @staticmethod
+    def _has_next_page(page_html: str) -> bool:
+        """Return True when the page contains a "Next Page" image link.
+
+        The check is a literal, case-sensitive search for the attribute
+        text ``alt="Next Page"``.
+        """
+        return page_html.find('alt="Next Page"') != -1
+
+    # ------------------------------------------------------------------
+    # Utility helpers
+    # ------------------------------------------------------------------
+
+    async def _fetch_page(self, session: aiohttp.ClientSession,
+                          url: str) -> Optional[str]:
+        """GET ``url`` and return the response body as text.
+
+        Args:
+            session: Open aiohttp session used for the request.
+            url: Page to fetch; redirects are followed.
+
+        Returns:
+            The body text on HTTP 200, otherwise None. Non-200 statuses
+            and request errors are logged as warnings, never raised.
+        """
+        try:
+            async with session.get(url, headers=self.HEADERS,
+                                   allow_redirects=True) as response:
+                if response.status == 200:
+                    return await response.text()
+                self.log(f"HTTP {response.status} for {url}", 'warning')
+                return None
+        except Exception as e:
+            self.log(f"Error fetching {url}: {e}", 'warning')
+            return None