468
modules/paid_content/besteyecandy_client.py
Normal file
468
modules/paid_content/besteyecandy_client.py
Normal file
@@ -0,0 +1,468 @@
|
||||
"""
|
||||
BestEyeCandy.com Client for Paid Content
|
||||
|
||||
Scrapes celebrity photo galleries from BestEyeCandy.com.
|
||||
Each celeb has a unique CID and paginated photo listings.
|
||||
|
||||
Optimization: Full-res URLs follow a predictable pattern. We visit ONE
|
||||
detail page to determine the pattern (server hostname + name format),
|
||||
then construct all remaining URLs from photo IDs found on listing pages.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class BestEyeCandyClient(LoggingMixin):
    """Client for scraping BestEyeCandy.com celebrity photo galleries."""

    # Row id in the `scrapers` table (see _get_cookies) and the service_id
    # stamped onto every Post produced by get_posts().
    SERVICE_ID = 'besteyecandy'
    # Platform label stamped onto Posts.
    PLATFORM = 'besteyecandy'
    # Root of all listing / detail page URLs.
    BASE_URL = 'https://besteyecandy.com'

    # Browser-like headers sent with every request (the site may serve
    # different content to non-browser user agents).
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }
|
||||
|
||||
def __init__(self, unified_db=None, log_callback=None):
    """Create a client.

    Args:
        unified_db: optional DB facade exposing ``get_connection()``;
            used only to load stored cookies. May be None (no cookies).
        log_callback: optional log sink forwarded to the LoggingMixin.
    """
    self._init_logger('PaidContent', log_callback, default_module='BestEyeCandy')
    # Kept for cookie lookup in _get_cookies(); None disables cookie loading.
    self.unified_db = unified_db
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Cookie support
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _get_cookies(self) -> Optional[list]:
    """Load cookies from the scrapers table for besteyecandy.

    Returns the stored list of cookie dicts, or None when no DB handle
    is configured, no row/payload exists, or the payload is unusable.
    """
    if not self.unified_db:
        return None

    try:
        with self.unified_db.get_connection() as conn:
            cur = conn.cursor()
            cur.execute("SELECT cookies_json FROM scrapers WHERE id = ?",
                        (self.SERVICE_ID,))
            record = cur.fetchone()
        if not record or not record[0]:
            return None
        payload = json.loads(record[0])
    except Exception as e:
        # Best-effort: a missing table or malformed JSON simply means
        # "no cookies available" — log at debug and carry on.
        self.log(f"Could not load cookies: {e}", 'debug')
        return None

    # Accept either a {"cookies": [...]} wrapper or a bare list.
    if isinstance(payload, dict) and 'cookies' in payload:
        return payload['cookies']
    if isinstance(payload, list):
        return payload
    return None
|
||||
|
||||
def _build_cookie_jar(self, cookies_list: list) -> aiohttp.CookieJar:
    """Build an aiohttp CookieJar from a list of cookie dicts.

    Each dict is expected to carry 'name' and 'value', with optional
    'domain', 'path' and 'secure' keys (browser-export format).

    Fixes: the unused `Morsel`/`types` imports were removed, and the
    `SimpleCookie` import is hoisted out of the per-cookie loop.
    """
    from http.cookies import SimpleCookie

    # unsafe=True lets the jar accept cookies for IP/localhost hosts too.
    jar = aiohttp.CookieJar(unsafe=True)
    for cookie in cookies_list:
        name = cookie.get('name', '')
        value = cookie.get('value', '')
        domain = cookie.get('domain', '')
        path = cookie.get('path', '/')

        sc = SimpleCookie()
        sc[name] = value
        sc[name]['domain'] = domain
        sc[name]['path'] = path
        if cookie.get('secure'):
            sc[name]['secure'] = True

        # Register against the cookie's own domain (leading dot stripped
        # so urlparse yields a plain hostname).
        jar.update_cookies(sc, urlparse(f"https://{domain.lstrip('.')}"))

    return jar
|
||||
|
||||
def _create_session(self, timeout: Optional[aiohttp.ClientTimeout] = None) -> aiohttp.ClientSession:
    """Create an aiohttp session with cookies loaded from DB.

    Args:
        timeout: optional client timeout; defaults to 60s total.
            (Annotation fixed to Optional — the default is None.)

    Returns:
        A new ClientSession, with a cookie jar attached when stored
        cookies are available.
    """
    if timeout is None:
        timeout = aiohttp.ClientTimeout(total=60)

    cookies_list = self._get_cookies()
    if not cookies_list:
        # Gallery pages may be gated behind a login cookie.
        self.log("No cookies found for besteyecandy, requests may fail", 'warning')
        return aiohttp.ClientSession(timeout=timeout)

    jar = self._build_cookie_jar(cookies_list)
    self.log(f"Loaded {len(cookies_list)} cookies for session", 'debug')
    return aiohttp.ClientSession(timeout=timeout, cookie_jar=jar)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_profile_info(self, cid: str, celeb_slug: str) -> Optional[Dict]:
    """Fetch page 1 of a celeb's listing and return profile-like info.

    Args:
        cid: numeric celeb id used in BestEyeCandy URLs.
        celeb_slug: hyphenated celeb name used in URLs.

    Returns:
        Dict with username / display_name / post_count / page_count /
        celeb_url, or None when the page cannot be fetched.
    """
    # The page-1 listing URL doubles as the canonical celeb URL returned
    # below (previously the identical f-string was built twice).
    celeb_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                 f'sortedby-age/page-1/{celeb_slug}.html')

    try:
        async with self._create_session() as session:
            async with session.get(celeb_url, headers=self.HEADERS,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"BestEyeCandy cid {cid} returned HTTP {resp.status}",
                             'warning')
                    return None
                page_html = await resp.text()
    except Exception as e:
        self.log(f"Failed to fetch BestEyeCandy cid {cid}: {e}", 'error')
        return None

    # Prefer the scraped name; fall back to the de-hyphenated slug.
    celeb_name = self._extract_celeb_name(page_html) or celeb_slug.replace('-', ' ')

    # Extract total photos and pages; page 1's photo count doubles as
    # the per-page size (assume 48 when the page parses oddly).
    total_photos = self._extract_total_photos(page_html)
    photos_per_page = len(self._extract_photo_ids(page_html)) or 48
    page_count = self._extract_page_count(page_html,
                                          photos_per_page=photos_per_page)

    return {
        'username': celeb_slug,
        'display_name': celeb_name,
        'post_count': total_photos,
        'page_count': page_count,
        'celeb_url': celeb_url,
    }
|
||||
|
||||
async def get_posts(self, cid: str, celeb_slug: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> List[Post]:
    """Scrape all listing pages and return posts with full-res image URLs.

    Each listing page becomes one Post with ~48 Attachments (one per photo).
    Post IDs are "page_N" (e.g. "page_1", "page_2", ...).

    Phase 1: Fetch page 1, get first photo ID, visit detail page to learn
    the full-res URL pattern.
    Phase 2: Paginate all listing pages, build one Post per page.

    Args:
        cid: numeric celeb id used in listing URLs.
        celeb_slug: hyphenated celeb name used in listing URLs.
        known_post_ids: set of "page_N" ids to skip (already stored).
        progress_callback: optional callable receiving a status string.

    Returns:
        List of new Post objects; known pages are counted but skipped.

    Fix: page 1's HTML is now reused from Phase 1 instead of being
    fetched a second time on the first loop iteration.
    """
    known = known_post_ids or set()
    posts: List[Post] = []
    total_photos = 0
    url_pattern = None

    try:
        async with self._create_session() as session:
            # -- Phase 1: Fetch page 1 and determine full-res URL pattern --
            page1_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                         f'sortedby-age/page-1/{celeb_slug}.html')

            page_html = await self._fetch_page(session, page1_url)
            if page_html is None:
                return []

            # Estimate page count for progress display
            photos_per_page = len(self._extract_photo_ids(page_html)) or 48
            estimated_pages = self._extract_page_count(
                page_html, photos_per_page=photos_per_page)
            self.log(f"Estimated {estimated_pages} pages of photos "
                     f"({photos_per_page}/page)", 'info')

            # Discover full-res URL pattern from first photo
            first_page_ids = self._extract_photo_ids(page_html)
            if first_page_ids:
                url_pattern = await self._discover_url_pattern(
                    session, first_page_ids[0], cid, celeb_slug)

            if not url_pattern:
                self.log("Could not determine full-res URL pattern", 'error')
                return []

            self.log(f"URL pattern: server={url_pattern['server']}, "
                     f"name_format={url_pattern['name_format']}, "
                     f"ext={url_pattern['ext']}", 'info')

            # -- Phase 2: Paginate all pages, one Post per page --
            page_num = 0
            has_next = True  # start with page 1

            while has_next:
                page_num += 1

                if page_num > 1:
                    # Page 1 HTML is already in `page_html` from Phase 1;
                    # only later pages need a fresh (rate-limited) request.
                    await asyncio.sleep(2)  # Rate limit

                    page_url = (
                        f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                        f'sortedby-age/page-{page_num}/{celeb_slug}.html')

                    page_html = await self._fetch_page(session, page_url)
                    if page_html is None:
                        self.log(f"Failed to fetch page {page_num}, stopping",
                                 'warning')
                        break

                page_ids = self._extract_photo_ids(page_html)
                if not page_ids:
                    self.log(f"Page {page_num}: no photos, stopping", 'info')
                    break

                total_photos += len(page_ids)
                has_next = self._has_next_page(page_html)

                # Check if this page-post is already known
                post_id = f"page_{page_num}"
                if post_id in known:
                    self.log(f"Page {page_num}: already known, skipping",
                             'debug')
                    if progress_callback:
                        progress_callback(
                            f"Page {page_num}/~{estimated_pages} — "
                            f"{total_photos} photos (skipped known)")
                    continue

                # Build attachments for all photos on this page
                attachments = []
                for photo_id in page_ids:
                    dl_url = self._construct_full_res_url(url_pattern, photo_id)
                    filename = dl_url.rsplit('/', 1)[-1]

                    attachments.append(Attachment(
                        name=filename,
                        file_type='image',
                        extension=url_pattern.get('ext', 'jpg'),
                        server_path=dl_url,
                        download_url=dl_url,
                    ))

                post = Post(
                    post_id=post_id,
                    service_id=self.SERVICE_ID,
                    platform=self.PLATFORM,
                    creator_id=cid,
                    title=f"Page {page_num}",
                    content=f"{len(page_ids)} photos",
                    # Listing pages carry no dates; stamp scrape time (UTC).
                    published_at=datetime.now(tz=timezone.utc).isoformat(),
                    attachments=attachments,
                )
                posts.append(post)

                if progress_callback:
                    progress_callback(
                        f"Page {page_num}/~{estimated_pages} — "
                        f"{total_photos} photos")

                self.log(f"Page {page_num}/~{estimated_pages}: "
                         f"{len(page_ids)} photos", 'debug')

    except Exception as e:
        self.log(f"Error scraping BestEyeCandy: {e}", 'error')

    self.log(f"Total: {len(posts)} new page-posts with "
             f"{total_photos} photos across all pages", 'info')
    return posts
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# URL pattern discovery
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _discover_url_pattern(self, session: aiohttp.ClientSession,
                                photo_id: str, cid: str,
                                celeb_slug: str) -> Optional[Dict]:
    """Visit a detail page to discover the full-res URL pattern.

    Args:
        session: open aiohttp session to reuse.
        photo_id: numeric photo id whose detail page is sampled.
        cid: celeb id used in the detail URL.
        celeb_slug: hyphenated celeb name used in the detail URL.

    Returns:
        Dict with keys: server, dir_pattern, name_format, ext, example_url
        (dir_pattern / name_format carry a literal '{ID}' placeholder),
        or None when the page or a full-res URL cannot be found.
    """
    detail_url = (f'{self.BASE_URL}/section/celeb-photogallery/'
                  f'cid-{cid}/{celeb_slug}/photo-{photo_id}.html')

    await asyncio.sleep(2)  # Rate limit
    page_html = await self._fetch_page(session, detail_url)
    if page_html is None:
        return None

    # Look for full-res image URL in the detail page
    # Pattern: <img src="https://euX.besteyecandy.com/section/large-photos/area-female/besteyecandy-{ID}/{Name}_{ID}_BestEyeCandyCOM.jpg">
    # or <a href="..."> with similar pattern
    patterns = [
        r'(https?://[a-z0-9]+\.besteyecandy\.com/section/large-photos/[^"\'>\s]+)',
        r'(https?://[a-z0-9]+\.besteyecandy\.com/[^"\'>\s]*besteyecandy-' + re.escape(photo_id) + r'[^"\'>\s]*)',
    ]

    full_res_url = None
    for pattern in patterns:
        match = re.search(pattern, page_html)
        if match:
            full_res_url = match.group(1)
            break

    if not full_res_url:
        self.log(f"Could not find full-res URL on detail page for photo {photo_id}",
                 'error')
        return None

    self.log(f"Found full-res URL: {full_res_url}", 'debug')

    # Parse the URL to extract the pattern components
    parsed = urlparse(full_res_url)
    server = parsed.netloc  # e.g., eu4.besteyecandy.com

    # Filename, e.g. Myleene_Klass_7727820_BestEyeCandyCOM.jpg
    filename = parsed.path.rsplit('/', 1)[-1]
    ext = filename.rsplit('.', 1)[-1] if '.' in filename else 'jpg'

    # Directory portion, e.g. /section/large-photos/area-female/besteyecandy-7727820
    path_dir = parsed.path.rsplit('/', 1)[0]

    # Templatize by substituting the sampled photo id with '{ID}'.
    # Plain str.replace — consistent with name_format below; the previous
    # re.sub over an escaped all-digit id was equivalent but needless regex.
    dir_pattern = path_dir.replace(photo_id, '{ID}')

    # e.g. Myleene_Klass_{ID}_BestEyeCandyCOM
    name_without_ext = filename.rsplit('.', 1)[0]
    name_format = name_without_ext.replace(photo_id, '{ID}')

    return {
        'server': server,
        'dir_pattern': dir_pattern,
        'name_format': name_format,
        'ext': ext,
        'example_url': full_res_url,
    }
|
||||
|
||||
def _construct_full_res_url(self, url_pattern: Dict, photo_id: str) -> str:
    """Construct the full-res URL for a photo ID using the discovered pattern.

    Substitutes the photo id into both the directory template and the
    filename template produced by _discover_url_pattern().

    Bug fix: `filename` was computed but never used — the returned URL
    previously ended with a literal "(unknown)" instead of the filename.
    """
    dir_path = url_pattern['dir_pattern'].replace('{ID}', photo_id)
    filename = url_pattern['name_format'].replace('{ID}', photo_id) + '.' + url_pattern['ext']
    return f"https://{url_pattern['server']}{dir_path}/{filename}"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTML parsing helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _extract_photo_ids(self, page_html: str) -> List[str]:
    """Extract photo IDs from a listing page.

    Photo links look like: href="...photo-12345.html"
    Duplicates are dropped; order of first appearance is preserved.
    """
    matches = re.findall(r'href="[^"]*photo-(\d+)\.html"', page_html)
    # dict preserves insertion order, so this dedupes while keeping order.
    return list(dict.fromkeys(matches))
|
||||
|
||||
@staticmethod
def _extract_celeb_name(page_html: str) -> Optional[str]:
    """Extract celebrity name from the page.

    Tries the <title> tag first, stripping the site's boilerplate
    suffixes in order; falls back to the first <h1>/<h2> heading.
    Returns None when neither yields a non-empty name.
    """
    # Try <title> tag: "Myleene Klass Photo Collection @ ...::: BestEyeCandy.com :::..."
    m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
    if m:
        title = html.unescape(m.group(1).strip())
        # Remove everything from "Photo Collection" or "@" onwards
        title = re.sub(r'\s*Photo\s+Collection.*$', '', title,
                       flags=re.IGNORECASE).strip()
        title = re.sub(r'\s*@.*$', '', title).strip()
        # Fallback: remove a "BestEyeCandy..." suffix plus any dash/pipe/
        # dot/colon decoration preceding it (\u2013/\u2014 are en/em dash)
        title = re.sub(r'\s*[-\u2013\u2014|]?\s*\.{0,3}:{0,3}\s*BestEyeCandy.*$', '',
                       title, flags=re.IGNORECASE).strip()
        # Only accept a non-empty remainder; otherwise try headings below.
        if title:
            return title

    # Try <h1> or <h2>
    m = re.search(r'<h[12][^>]*>([^<]+)</h[12]>', page_html)
    if m:
        return html.unescape(m.group(1).strip())

    return None
|
||||
|
||||
@staticmethod
def _extract_total_photos(page_html: str) -> int:
    """Extract total photo count from the page.

    Handles European format (15.660) and US format (15,660).
    Returns 0 when no count can be parsed.
    """
    # Match "N photos" with optional dot/comma thousands separators;
    # a leading digit is required so ", photo" in keywords can't match.
    found = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
    if not found:
        return 0

    # Both '.' (European) and ',' (US) act as thousands separators here.
    digits = found.group(1).replace(',', '').replace('.', '')
    try:
        return int(digits)
    except ValueError:
        return 0
|
||||
|
||||
@staticmethod
def _extract_page_count(page_html: str, photos_per_page: int = 48) -> int:
    """Extract total page count from the listing page.

    Uses total photo count divided by photos per page, or falls back
    to finding the maximum page number in pagination links. Returns 1
    when neither method yields anything.
    """
    # Method 1: derive from the total photo count (ceiling division).
    # Delegates to _extract_total_photos instead of duplicating its
    # "N photos" regex + separator stripping (it returns 0 on failure).
    total = BestEyeCandyClient._extract_total_photos(page_html)
    if total > 0:
        return (total + photos_per_page - 1) // photos_per_page

    # Method 2: Find max page-N in pagination links for same celeb
    page_nums = [int(x) for x in re.findall(r'/page-(\d+)/', page_html)]
    if page_nums:
        return max(page_nums)

    return 1
|
||||
|
||||
@staticmethod
|
||||
def _has_next_page(page_html: str) -> bool:
|
||||
"""Check if there's a 'Next Page' link on the current page."""
|
||||
return 'alt="Next Page"' in page_html
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Utility helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _fetch_page(self, session: aiohttp.ClientSession,
                      url: str) -> Optional[str]:
    """Fetch a single page, return HTML or None.

    Any non-200 status or transport error is logged as a warning and
    reported as None — callers treat that as "stop paginating".
    """
    try:
        async with session.get(url, headers=self.HEADERS,
                               allow_redirects=True) as resp:
            if resp.status == 200:
                return await resp.text()
            self.log(f"HTTP {resp.status} for {url}", 'warning')
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
    return None
|
||||
Reference in New Issue
Block a user