"""
BestEyeCandy.com Client for Paid Content

Scrapes celebrity photo galleries from BestEyeCandy.com. Each celeb has a
unique CID and paginated photo listings.

Optimization: Full-res URLs follow a predictable pattern. We visit ONE detail
page to determine the pattern (server hostname + name format), then construct
all remaining URLs from photo IDs found on listing pages.
"""
import asyncio
import html
import json
import re
from datetime import datetime, timezone
from http.cookies import SimpleCookie
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse

import aiohttp

from modules.base_module import LoggingMixin

from .models import Post, Attachment


class BestEyeCandyClient(LoggingMixin):
    """Client for scraping BestEyeCandy.com celebrity photo galleries."""

    SERVICE_ID = 'besteyecandy'
    PLATFORM = 'besteyecandy'
    BASE_URL = 'https://besteyecandy.com'

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    def __init__(self, unified_db=None, log_callback=None):
        self._init_logger('PaidContent', log_callback, default_module='BestEyeCandy')
        self.unified_db = unified_db

    # ------------------------------------------------------------------
    # Cookie support
    # ------------------------------------------------------------------

    def _get_cookies(self) -> Optional[list]:
        """Load cookies from the scrapers table for besteyecandy.

        Supports both storage shapes: a dict with a 'cookies' key wrapping
        the list, or a bare list of cookie dicts. Returns None when the DB
        is unavailable or no cookies are stored.
        """
        if not self.unified_db:
            return None
        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?",
                               (self.SERVICE_ID,))
                row = cursor.fetchone()
                if row and row[0]:
                    data = json.loads(row[0])
                    if isinstance(data, dict) and 'cookies' in data:
                        return data['cookies']
                    elif isinstance(data, list):
                        return data
        except Exception as e:
            # Best-effort: missing cookies only degrade requests, so log at debug
            self.log(f"Could not load cookies: {e}", 'debug')
        return None

    def _build_cookie_jar(self, cookies_list: list) -> aiohttp.CookieJar:
        """Build an aiohttp CookieJar from a list of cookie dicts.

        Each dict is expected to carry 'name'/'value' and optionally
        'domain', 'path' and 'secure' (browser-export format).
        """
        # unsafe=True allows cookies for IP addresses / non-public suffixes
        jar = aiohttp.CookieJar(unsafe=True)
        for cookie in cookies_list:
            name = cookie.get('name', '')
            value = cookie.get('value', '')
            domain = cookie.get('domain', '')
            path = cookie.get('path', '/')
            sc = SimpleCookie()
            sc[name] = value
            sc[name]['domain'] = domain
            sc[name]['path'] = path
            if cookie.get('secure'):
                sc[name]['secure'] = True
            # Scope the cookie to its origin; strip the leading '.' that
            # browser exports put on domain-wide cookies.
            jar.update_cookies(sc, urlparse(f"https://{domain.lstrip('.')}"))
        return jar

    def _create_session(self, timeout: aiohttp.ClientTimeout = None) -> aiohttp.ClientSession:
        """Create an aiohttp session with cookies loaded from DB."""
        if timeout is None:
            timeout = aiohttp.ClientTimeout(total=60)
        cookies_list = self._get_cookies()
        if cookies_list:
            jar = self._build_cookie_jar(cookies_list)
            self.log(f"Loaded {len(cookies_list)} cookies for session", 'debug')
            return aiohttp.ClientSession(timeout=timeout, cookie_jar=jar)
        else:
            self.log("No cookies found for besteyecandy, requests may fail", 'warning')
            return aiohttp.ClientSession(timeout=timeout)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_profile_info(self, cid: str, celeb_slug: str) -> Optional[Dict]:
        """Fetch page 1 of a celeb's listing and return profile-like info.

        Returns a dict with username/display_name/post_count/page_count/
        celeb_url, or None when the page cannot be fetched.
        """
        url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
               f'sortedby-age/page-1/{celeb_slug}.html')
        try:
            async with self._create_session() as session:
                async with session.get(url, headers=self.HEADERS,
                                       allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"BestEyeCandy cid {cid} returned HTTP {resp.status}",
                                 'warning')
                        return None
                    page_html = await resp.text()
        except Exception as e:
            self.log(f"Failed to fetch BestEyeCandy cid {cid}: {e}", 'error')
            return None

        # Extract celeb name from page title or heading
        celeb_name = self._extract_celeb_name(page_html) or celeb_slug.replace('-', ' ')

        # Extract total photos and pages; fall back to 48/page when page 1
        # yields no photo links (site default listing size)
        total_photos = self._extract_total_photos(page_html)
        photos_per_page = len(self._extract_photo_ids(page_html)) or 48
        page_count = self._extract_page_count(page_html, photos_per_page=photos_per_page)

        return {
            'username': celeb_slug,
            'display_name': celeb_name,
            'post_count': total_photos,
            'page_count': page_count,
            'celeb_url': url,
        }

    async def get_posts(self, cid: str, celeb_slug: str,
                        known_post_ids: Optional[Set[str]] = None,
                        progress_callback=None) -> List[Post]:
        """Scrape all listing pages and return posts with full-res image URLs.

        Each listing page becomes one Post with ~48 Attachments (one per photo).
        Post IDs are "page_N" (e.g. "page_1", "page_2", ...).

        Phase 1: Fetch page 1, get first photo ID, visit detail page to learn
                 the full-res URL pattern.
        Phase 2: Paginate all listing pages, build one Post per page.
        """
        known = known_post_ids or set()
        posts: List[Post] = []
        total_photos = 0
        url_pattern = None

        try:
            async with self._create_session() as session:
                # -- Phase 1: Fetch page 1 and determine full-res URL pattern --
                page1_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                             f'sortedby-age/page-1/{celeb_slug}.html')
                page_html = await self._fetch_page(session, page1_url)
                if page_html is None:
                    return []

                # Estimate page count for progress display
                photos_per_page = len(self._extract_photo_ids(page_html)) or 48
                estimated_pages = self._extract_page_count(
                    page_html, photos_per_page=photos_per_page)
                self.log(f"Estimated {estimated_pages} pages of photos "
                         f"({photos_per_page}/page)", 'info')

                # Discover full-res URL pattern from first photo
                first_page_ids = self._extract_photo_ids(page_html)
                if first_page_ids:
                    url_pattern = await self._discover_url_pattern(
                        session, first_page_ids[0], cid, celeb_slug)
                if not url_pattern:
                    self.log("Could not determine full-res URL pattern", 'error')
                    return []

                self.log(f"URL pattern: server={url_pattern['server']}, "
                         f"name_format={url_pattern['name_format']}, "
                         f"ext={url_pattern['ext']}", 'info')

                # -- Phase 2: Paginate all pages, one Post per page --
                page_num = 0
                has_next = True  # start with page 1
                while has_next:
                    page_num += 1
                    if page_num == 1:
                        # Already fetched page 1 during phase 1; reuse its HTML
                        pass
                    else:
                        await asyncio.sleep(2)  # Rate limit
                        page_url = (
                            f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                            f'sortedby-age/page-{page_num}/{celeb_slug}.html')
                        page_html = await self._fetch_page(session, page_url)
                        if page_html is None:
                            self.log(f"Failed to fetch page {page_num}, stopping",
                                     'warning')
                            break

                    page_ids = self._extract_photo_ids(page_html)
                    if not page_ids:
                        self.log(f"Page {page_num}: no photos, stopping", 'info')
                        break

                    total_photos += len(page_ids)
                    # Evaluate pagination BEFORE the known-skip so the loop
                    # still advances past already-scraped pages
                    has_next = self._has_next_page(page_html)

                    # Check if this page-post is already known
                    post_id = f"page_{page_num}"
                    if post_id in known:
                        self.log(f"Page {page_num}: already known, skipping", 'debug')
                        if progress_callback:
                            progress_callback(
                                f"Page {page_num}/~{estimated_pages} — "
                                f"{total_photos} photos (skipped known)")
                        continue

                    # Build attachments for all photos on this page
                    attachments = []
                    for photo_id in page_ids:
                        dl_url = self._construct_full_res_url(url_pattern, photo_id)
                        filename = dl_url.rsplit('/', 1)[-1]
                        attachments.append(Attachment(
                            name=filename,
                            file_type='image',
                            extension=url_pattern.get('ext', 'jpg'),
                            server_path=dl_url,
                            download_url=dl_url,
                        ))

                    post = Post(
                        post_id=post_id,
                        service_id=self.SERVICE_ID,
                        platform=self.PLATFORM,
                        creator_id=cid,
                        title=f"Page {page_num}",
                        content=f"{len(page_ids)} photos",
                        # Listing pages carry no publish date; record scrape time
                        published_at=datetime.now(tz=timezone.utc).isoformat(),
                        attachments=attachments,
                    )
                    posts.append(post)

                    if progress_callback:
                        progress_callback(
                            f"Page {page_num}/~{estimated_pages} — "
                            f"{total_photos} photos")
                    self.log(f"Page {page_num}/~{estimated_pages}: "
                             f"{len(page_ids)} photos", 'debug')

        except Exception as e:
            # Best-effort: return whatever pages were scraped before the error
            self.log(f"Error scraping BestEyeCandy: {e}", 'error')

        self.log(f"Total: {len(posts)} new page-posts with "
                 f"{total_photos} photos across all pages", 'info')
        return posts

    # ------------------------------------------------------------------
    # URL pattern discovery
    # ------------------------------------------------------------------

    async def _discover_url_pattern(self, session: aiohttp.ClientSession,
                                    photo_id: str, cid: str,
                                    celeb_slug: str) -> Optional[Dict]:
        """Visit a detail page to discover the full-res URL pattern.

        Returns dict with keys: server, dir_pattern, name_format, ext,
        example_url — or None when the page / URL cannot be found.
        """
        detail_url = (f'{self.BASE_URL}/section/celeb-photogallery/'
                      f'cid-{cid}/{celeb_slug}/photo-{photo_id}.html')
        await asyncio.sleep(2)  # Rate limit
        page_html = await self._fetch_page(session, detail_url)
        if page_html is None:
            return None

        # Look for a full-res image URL in the detail page, e.g.
        #   https://eu4.besteyecandy.com/section/large-photos/.../Name_7727820_BestEyeCandyCOM.jpg
        # First try the canonical /section/large-photos/ path, then any URL on
        # a besteyecandy subdomain that embeds this photo's ID.
        patterns = [
            r'(https?://[a-z0-9]+\.besteyecandy\.com/section/large-photos/[^"\'>\s]+)',
            r'(https?://[a-z0-9]+\.besteyecandy\.com/[^"\'>\s]*besteyecandy-'
            + re.escape(photo_id) + r'[^"\'>\s]*)',
        ]
        full_res_url = None
        for pattern in patterns:
            match = re.search(pattern, page_html)
            if match:
                full_res_url = match.group(1)
                break

        if not full_res_url:
            self.log(f"Could not find full-res URL on detail page for photo {photo_id}",
                     'error')
            return None

        self.log(f"Found full-res URL: {full_res_url}", 'debug')

        # Parse the URL to extract the pattern components
        parsed = urlparse(full_res_url)
        server = parsed.netloc  # e.g., eu4.besteyecandy.com

        # Extract name format from the filename
        # e.g., Myleene_Klass_7727820_BestEyeCandyCOM.jpg
        filename = parsed.path.rsplit('/', 1)[-1]
        ext = filename.rsplit('.', 1)[-1] if '.' in filename else 'jpg'

        # Extract the path pattern (everything before the filename)
        # e.g., /section/large-photos/area-female/besteyecandy-7727820
        path_dir = parsed.path.rsplit('/', 1)[0]

        # The directory pattern includes the photo ID, templatize it
        # e.g., /section/large-photos/area-female/besteyecandy-{ID}
        dir_pattern = re.sub(re.escape(photo_id), '{ID}', path_dir)

        # Extract the name format by removing the photo ID
        # e.g., Myleene_Klass_7727820_BestEyeCandyCOM -> Myleene_Klass_{ID}_BestEyeCandyCOM
        name_without_ext = filename.rsplit('.', 1)[0]
        name_format = name_without_ext.replace(photo_id, '{ID}')

        return {
            'server': server,
            'dir_pattern': dir_pattern,
            'name_format': name_format,
            'ext': ext,
            'example_url': full_res_url,
        }

    def _construct_full_res_url(self, url_pattern: Dict, photo_id: str) -> str:
        """Construct the full-res URL for a photo ID using the discovered pattern."""
        dir_path = url_pattern['dir_pattern'].replace('{ID}', photo_id)
        filename = url_pattern['name_format'].replace('{ID}', photo_id) + '.' + url_pattern['ext']
        # BUG FIX: the computed filename was previously dropped from the URL,
        # producing an unusable path for every photo.
        return f"https://{url_pattern['server']}{dir_path}/{filename}"

    # ------------------------------------------------------------------
    # HTML parsing helpers
    # ------------------------------------------------------------------

    def _extract_photo_ids(self, page_html: str) -> List[str]:
        """Extract photo IDs from a listing page.

        Photo links look like: href="...photo-12345.html"
        """
        ids = re.findall(r'href="[^"]*photo-(\d+)\.html"', page_html)
        # Deduplicate while preserving first-seen order
        return list(dict.fromkeys(ids))

    @staticmethod
    def _extract_celeb_name(page_html: str) -> Optional[str]:
        """Extract celebrity name from the page."""
        # Try the <title> tag:
        #   "Myleene Klass Photo Collection @ ...::: BestEyeCandy.com :::..."
        m = re.search(r'<title>([^<]+)', page_html, re.IGNORECASE)
        if m:
            title = html.unescape(m.group(1).strip())
            # Remove everything from "Photo Collection" or "@" onwards
            title = re.sub(r'\s*Photo\s+Collection.*$', '', title,
                           flags=re.IGNORECASE).strip()
            title = re.sub(r'\s*@.*$', '', title).strip()
            # Fallback: remove BestEyeCandy suffix
            title = re.sub(r'\s*[-\u2013\u2014|]?\s*\.{0,3}:{0,3}\s*BestEyeCandy.*$',
                           '', title, flags=re.IGNORECASE).strip()
            if title:
                return title

        # Fallback: try an <h1>/<h2> heading.
        # NOTE(review): the original regex here was corrupted to r']*>([^<]+)'
        # (leading tag text lost); reconstructed as a heading match — confirm
        # against live page markup.
        m = re.search(r'<h[12][^>]*>([^<]+)', page_html)
        if m:
            return html.unescape(m.group(1).strip())
        return None

    @staticmethod
    def _extract_total_photos(page_html: str) -> int:
        """Extract total photo count from the page.

        Handles European format (15.660) and US format (15,660).
        Returns 0 when no count is found.
        """
        # Look for "N.NNN photos" or "N,NNN photos" or "NNN photos"
        # Require leading digit to avoid matching ", photo" from keywords
        m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
        if m:
            num_str = m.group(1)
            # European format uses dots as thousands separators: 15.660
            # US format uses commas: 15,660
            # Remove both dots and commas (they're thousands separators)
            num_str = num_str.replace('.', '').replace(',', '')
            try:
                return int(num_str)
            except ValueError:
                pass
        return 0

    @staticmethod
    def _extract_page_count(page_html: str, photos_per_page: int = 48) -> int:
        """Extract total page count from the listing page.

        Uses total photo count divided by photos per page, or falls back to
        finding the maximum page number in pagination links. Returns 1 when
        neither method succeeds.
        """
        # Method 1: Calculate from total photos (ceiling division)
        m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
        if m:
            num_str = m.group(1).replace('.', '').replace(',', '')
            try:
                total = int(num_str)
                if total > 0:
                    return (total + photos_per_page - 1) // photos_per_page
            except ValueError:
                pass

        # Method 2: Find max page-N in pagination links for same celeb
        page_nums = [int(x) for x in re.findall(r'/page-(\d+)/', page_html)]
        if page_nums:
            return max(page_nums)
        return 1

    @staticmethod
    def _has_next_page(page_html: str) -> bool:
        """Check if there's a 'Next Page' link on the current page."""
        return 'alt="Next Page"' in page_html

    # ------------------------------------------------------------------
    # Utility helpers
    # ------------------------------------------------------------------

    async def _fetch_page(self, session: aiohttp.ClientSession,
                          url: str) -> Optional[str]:
        """Fetch a single page, return HTML or None."""
        try:
            async with session.get(url, headers=self.HEADERS,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} for {url}", 'warning')
                    return None
                return await resp.text()
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None