469 lines
19 KiB
Python
469 lines
19 KiB
Python
"""
|
|
BestEyeCandy.com Client for Paid Content
|
|
|
|
Scrapes celebrity photo galleries from BestEyeCandy.com.
|
|
Each celeb has a unique CID and paginated photo listings.
|
|
|
|
Optimization: Full-res URLs follow a predictable pattern. We visit ONE
|
|
detail page to determine the pattern (server hostname + name format),
|
|
then construct all remaining URLs from photo IDs found on listing pages.
|
|
"""
|
|
|
|
import asyncio
import html
import json
import re
from datetime import datetime, timezone
from http.cookies import CookieError, SimpleCookie
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse

import aiohttp

from modules.base_module import LoggingMixin
from .models import Post, Attachment
|
|
|
|
|
|
class BestEyeCandyClient(LoggingMixin):
    """Client for scraping BestEyeCandy.com celebrity photo galleries.

    Listing pages are fetched one at a time. Full-resolution image URLs are
    not scraped individually: one photo detail page is visited to learn the
    URL layout, and every other URL is built from the photo IDs found on the
    listing pages.
    """

    SERVICE_ID = 'besteyecandy'
    PLATFORM = 'besteyecandy'
    BASE_URL = 'https://besteyecandy.com'

    # Page size assumed when a listing page yields no photo links.
    DEFAULT_PHOTOS_PER_PAGE = 48
    # Seconds to sleep between consecutive requests (rate limiting).
    REQUEST_DELAY = 2

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    def __init__(self, unified_db=None, log_callback=None):
        """Set up logging and remember the database used for cookie lookup.

        Args:
            unified_db: Optional object exposing ``get_connection()``. When
                it is falsy, sessions are created without cookies.
            log_callback: Forwarded to ``_init_logger``.
        """
        self._init_logger('PaidContent', log_callback, default_module='BestEyeCandy')
        self.unified_db = unified_db

    # ------------------------------------------------------------------
    # Cookie support
    # ------------------------------------------------------------------

    def _get_cookies(self) -> Optional[list]:
        """Load cookies from the scrapers table for besteyecandy.

        The stored JSON may be a bare list of cookie dicts or an object
        whose ``cookies`` key holds that list.

        Returns:
            The stored cookies, or ``None`` when no database is configured,
            nothing usable is stored, or loading fails. Failures are logged
            at debug level only; cookie loading is best-effort.
        """
        if not self.unified_db:
            return None

        try:
            with self.unified_db.get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?",
                               (self.SERVICE_ID,))
                row = cursor.fetchone()
                if row and row[0]:
                    data = json.loads(row[0])
                    if isinstance(data, dict) and 'cookies' in data:
                        return data['cookies']
                    if isinstance(data, list):
                        return data
        except Exception as e:
            self.log(f"Could not load cookies: {e}", 'debug')

        return None

    def _build_cookie_jar(self, cookies_list: list) -> aiohttp.CookieJar:
        """Build an aiohttp CookieJar from a list of cookie dicts.

        Each dict may carry ``name``, ``value``, ``domain``, ``path`` and
        ``secure``. Entries that are not dicts, have no name, or have a name
        that ``http.cookies`` rejects are skipped so that one bad entry does
        not discard the whole jar. A missing domain defaults to the host of
        ``BASE_URL``.

        Args:
            cookies_list: Cookie dicts as returned by ``_get_cookies``.

        Returns:
            A jar created with ``unsafe=True`` holding the accepted cookies.
        """
        jar = aiohttp.CookieJar(unsafe=True)
        default_domain = urlparse(self.BASE_URL).hostname or ''

        for cookie in cookies_list:
            if not isinstance(cookie, dict):
                continue
            name = cookie.get('name', '')
            if not name:
                continue

            sc = SimpleCookie()
            try:
                sc[name] = cookie.get('value', '')
            except CookieError as e:
                self.log(f"Skipping invalid cookie {name!r}: {e}", 'debug')
                continue

            morsel = sc[name]
            morsel['domain'] = cookie.get('domain') or default_domain
            morsel['path'] = cookie.get('path') or '/'
            if cookie.get('secure'):
                morsel['secure'] = True

            # No response_url is passed: update_cookies() expects a yarl.URL
            # there, so a urllib ParseResult cannot be used, and the morsel's
            # own domain/path attributes already scope the cookie.
            jar.update_cookies(sc)

        return jar

    def _create_session(
            self, timeout: Optional[aiohttp.ClientTimeout] = None) -> aiohttp.ClientSession:
        """Create an aiohttp session with cookies loaded from DB.

        Args:
            timeout: Client timeout; defaults to a 60-second total timeout.

        Returns:
            A new session. It carries a cookie jar when cookies were found;
            otherwise a warning is logged and a plain session is returned.
        """
        if timeout is None:
            timeout = aiohttp.ClientTimeout(total=60)

        cookies_list = self._get_cookies()
        if cookies_list:
            jar = self._build_cookie_jar(cookies_list)
            self.log(f"Loaded {len(cookies_list)} cookies for session", 'debug')
            return aiohttp.ClientSession(timeout=timeout, cookie_jar=jar)

        self.log("No cookies found for besteyecandy, requests may fail", 'warning')
        return aiohttp.ClientSession(timeout=timeout)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def _listing_url(self, cid: str, celeb_slug: str, page_num: int = 1) -> str:
        """Return the URL of listing page ``page_num`` for the given celeb."""
        return (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                f'sortedby-age/page-{page_num}/{celeb_slug}.html')

    async def get_profile_info(self, cid: str, celeb_slug: str) -> Optional[Dict]:
        """Fetch page 1 of a celeb's listing and return profile-like info.

        Args:
            cid: The celeb's ID as used in the site's URLs.
            celeb_slug: The celeb's URL slug.

        Returns:
            A dict with ``username``, ``display_name``, ``post_count``,
            ``page_count`` and ``celeb_url``, or ``None`` when the page
            cannot be fetched or does not answer with HTTP 200.
        """
        url = self._listing_url(cid, celeb_slug, 1)

        try:
            async with self._create_session() as session:
                async with session.get(url, headers=self.HEADERS,
                                       allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"BestEyeCandy cid {cid} returned HTTP {resp.status}",
                                 'warning')
                        return None
                    page_html = await resp.text()
        except Exception as e:
            self.log(f"Failed to fetch BestEyeCandy cid {cid}: {e}", 'error')
            return None

        # Fall back to a name derived from the slug when the page has none.
        celeb_name = self._extract_celeb_name(page_html) or celeb_slug.replace('-', ' ')

        total_photos = self._extract_total_photos(page_html)
        photos_per_page = (len(self._extract_photo_ids(page_html))
                           or self.DEFAULT_PHOTOS_PER_PAGE)
        page_count = self._extract_page_count(page_html,
                                              photos_per_page=photos_per_page)

        return {
            'username': celeb_slug,
            'display_name': celeb_name,
            'post_count': total_photos,
            'page_count': page_count,
            'celeb_url': url,
        }

    async def get_posts(self, cid: str, celeb_slug: str,
                        known_post_ids: Optional[Set[str]] = None,
                        progress_callback=None) -> List[Post]:
        """Scrape all listing pages and return posts with full-res image URLs.

        Each listing page becomes one Post with one Attachment per photo.
        Post IDs are "page_N" (e.g. "page_1", "page_2", ...).

        Phase 1: Fetch page 1, take its first photo ID and visit that detail
                 page to learn the full-res URL pattern.
        Phase 2: Walk the listing pages while a "Next Page" marker is
                 present, building one Post per page not already known.

        Args:
            cid: The celeb's ID as used in the site's URLs.
            celeb_slug: The celeb's URL slug.
            known_post_ids: Post IDs to skip. Skipped pages are still
                fetched and counted, because pagination depends on them.
            progress_callback: Optional callable that receives a
                human-readable progress string after each page.

        Returns:
            The new posts. The list is empty when page 1 or the URL pattern
            cannot be obtained. It may be partial when a later page fails or
            an exception occurs; the exception is logged, not raised.
        """
        # NOTE(review): "page_N" IDs are positional. If the sortedby-age
        # listing shifts photos between pages as new ones are added, a known
        # page ID may hide new photos - confirm against the caller.
        known = known_post_ids or set()
        posts: List[Post] = []
        total_photos = 0

        try:
            async with self._create_session() as session:
                # -- Phase 1: Fetch page 1 and determine full-res URL pattern --
                page_html = await self._fetch_page(
                    session, self._listing_url(cid, celeb_slug, 1))
                if page_html is None:
                    return []

                first_page_ids = self._extract_photo_ids(page_html)

                # Estimate page count for progress display
                photos_per_page = len(first_page_ids) or self.DEFAULT_PHOTOS_PER_PAGE
                estimated_pages = self._extract_page_count(
                    page_html, photos_per_page=photos_per_page)
                self.log(f"Estimated {estimated_pages} pages of photos "
                         f"({photos_per_page}/page)", 'info')

                # Discover full-res URL pattern from first photo
                url_pattern = None
                if first_page_ids:
                    url_pattern = await self._discover_url_pattern(
                        session, first_page_ids[0], cid, celeb_slug)

                if not url_pattern:
                    self.log("Could not determine full-res URL pattern", 'error')
                    return []

                self.log(f"URL pattern: server={url_pattern['server']}, "
                         f"name_format={url_pattern['name_format']}, "
                         f"ext={url_pattern['ext']}", 'info')

                # -- Phase 2: Paginate all pages, one Post per page --
                page_num = 0
                has_next = True  # start with page 1

                while has_next:
                    page_num += 1

                    # Page 1 was already fetched in Phase 1; reuse its HTML.
                    if page_num > 1:
                        await asyncio.sleep(self.REQUEST_DELAY)  # Rate limit

                        page_html = await self._fetch_page(
                            session, self._listing_url(cid, celeb_slug, page_num))
                        if page_html is None:
                            self.log(f"Failed to fetch page {page_num}, stopping",
                                     'warning')
                            break

                    page_ids = self._extract_photo_ids(page_html)
                    if not page_ids:
                        self.log(f"Page {page_num}: no photos, stopping", 'info')
                        break

                    total_photos += len(page_ids)
                    has_next = self._has_next_page(page_html)

                    # Check if this page-post is already known
                    post_id = f"page_{page_num}"
                    if post_id in known:
                        self.log(f"Page {page_num}: already known, skipping",
                                 'debug')
                        if progress_callback:
                            progress_callback(
                                f"Page {page_num}/~{estimated_pages} — "
                                f"{total_photos} photos (skipped known)")
                        continue

                    posts.append(
                        self._build_page_post(cid, page_num, page_ids, url_pattern))

                    if progress_callback:
                        progress_callback(
                            f"Page {page_num}/~{estimated_pages} — "
                            f"{total_photos} photos")

                    self.log(f"Page {page_num}/~{estimated_pages}: "
                             f"{len(page_ids)} photos", 'debug')

        except Exception as e:
            # Best-effort: keep whatever was collected before the failure.
            self.log(f"Error scraping BestEyeCandy: {e}", 'error')

        self.log(f"Total: {len(posts)} new page-posts with "
                 f"{total_photos} photos across all pages", 'info')
        return posts

    def _build_page_post(self, cid: str, page_num: int, page_ids: List[str],
                         url_pattern: Dict) -> Post:
        """Build the Post for one listing page, one Attachment per photo ID."""
        extension = url_pattern.get('ext', 'jpg')
        attachments = []
        for photo_id in page_ids:
            dl_url = self._construct_full_res_url(url_pattern, photo_id)
            attachments.append(Attachment(
                name=dl_url.rsplit('/', 1)[-1],
                file_type='image',
                extension=extension,
                server_path=dl_url,
                download_url=dl_url,
            ))

        return Post(
            post_id=f"page_{page_num}",
            service_id=self.SERVICE_ID,
            platform=self.PLATFORM,
            creator_id=cid,
            title=f"Page {page_num}",
            content=f"{len(page_ids)} photos",
            published_at=datetime.now(tz=timezone.utc).isoformat(),
            attachments=attachments,
        )

    # ------------------------------------------------------------------
    # URL pattern discovery
    # ------------------------------------------------------------------

    async def _discover_url_pattern(self, session: aiohttp.ClientSession,
                                    photo_id: str, cid: str,
                                    celeb_slug: str) -> Optional[Dict]:
        """Visit a detail page to discover the full-res URL pattern.

        Only URLs that contain ``photo_id`` are accepted, so the ID can be
        swapped for a ``{ID}`` placeholder. A URL without it would yield a
        pattern that maps every photo to the same address.

        Args:
            session: Open session used for the request.
            photo_id: ID of the photo whose detail page is probed.
            cid: The celeb's ID as used in the site's URLs.
            celeb_slug: The celeb's URL slug.

        Returns:
            A dict with keys ``server``, ``dir_pattern``, ``name_format``,
            ``ext`` and ``example_url``, or ``None`` when the page cannot be
            fetched or no usable URL is found.
        """
        detail_url = (f'{self.BASE_URL}/section/celeb-photogallery/'
                      f'cid-{cid}/{celeb_slug}/photo-{photo_id}.html')

        await asyncio.sleep(self.REQUEST_DELAY)  # Rate limit
        page_html = await self._fetch_page(session, detail_url)
        if page_html is None:
            return None

        # Look for full-res image URL in the detail page
        # Pattern: <img src="https://euX.besteyecandy.com/section/large-photos/area-female/besteyecandy-{ID}/{Name}_{ID}_BestEyeCandyCOM.jpg">
        # or <a href="..."> with similar pattern
        patterns = [
            r'(https?://[a-z0-9]+\.besteyecandy\.com/section/large-photos/[^"\'>\s]+)',
            r'(https?://[a-z0-9]+\.besteyecandy\.com/[^"\'>\s]*besteyecandy-'
            + re.escape(photo_id) + r'[^"\'>\s]*)',
        ]

        full_res_url = None
        for pattern in patterns:
            full_res_url = next(
                (url for url in re.findall(pattern, page_html) if photo_id in url),
                None)
            if full_res_url:
                break

        if not full_res_url:
            self.log(f"Could not find full-res URL on detail page for photo {photo_id}",
                     'error')
            return None

        self.log(f"Found full-res URL: {full_res_url}", 'debug')

        # Parse the URL to extract the pattern components
        parsed = urlparse(full_res_url)
        server = parsed.netloc  # e.g., eu4.besteyecandy.com

        # Split the path into directory and filename, e.g.
        #   /section/large-photos/area-female/besteyecandy-7727820
        #   Myleene_Klass_7727820_BestEyeCandyCOM.jpg
        path_dir, _, filename = parsed.path.rpartition('/')
        name_without_ext, dot, suffix = filename.rpartition('.')
        if dot:
            ext = suffix
        else:
            # No extension in the filename: keep the whole name, assume jpg.
            name_without_ext, ext = filename, 'jpg'

        # Replace the concrete photo ID with a placeholder in both parts.
        dir_pattern = path_dir.replace(photo_id, '{ID}')
        name_format = name_without_ext.replace(photo_id, '{ID}')

        if '{ID}' not in dir_pattern and '{ID}' not in name_format:
            self.log(f"Full-res URL path does not contain photo ID {photo_id}",
                     'error')
            return None

        return {
            'server': server,
            'dir_pattern': dir_pattern,
            'name_format': name_format,
            'ext': ext,
            'example_url': full_res_url,
        }

    def _construct_full_res_url(self, url_pattern: Dict, photo_id: str) -> str:
        """Construct the full-res URL for a photo ID using the discovered pattern.

        Args:
            url_pattern: Dict produced by ``_discover_url_pattern``.
            photo_id: ID substituted for every ``{ID}`` placeholder.

        Returns:
            An ``https://`` URL on the pattern's server.
        """
        dir_path = url_pattern['dir_pattern'].replace('{ID}', photo_id)
        filename = (url_pattern['name_format'].replace('{ID}', photo_id)
                    + '.' + url_pattern['ext'])
        return f"https://{url_pattern['server']}{dir_path}/{filename}"

    # ------------------------------------------------------------------
    # HTML parsing helpers
    # ------------------------------------------------------------------

    def _extract_photo_ids(self, page_html: str) -> List[str]:
        """Extract photo IDs from a listing page.

        Photo links look like: href="...photo-12345.html"

        Returns:
            The IDs in order of first appearance, without duplicates.
        """
        ids = re.findall(r'href="[^"]*photo-(\d+)\.html"', page_html)
        # dict preserves insertion order, so this deduplicates in place.
        return list(dict.fromkeys(ids))

    @staticmethod
    def _extract_celeb_name(page_html: str) -> Optional[str]:
        """Extract celebrity name from the page.

        Tries the ``<title>`` tag first, stripping the site's suffixes, then
        falls back to the first ``<h1>``/``<h2>`` that contains only text.

        Returns:
            The name, or ``None`` when neither source yields a non-empty one.
        """
        # Try <title> tag: "Myleene Klass Photo Collection @ ...::: BestEyeCandy.com :::..."
        m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
        if m:
            title = html.unescape(m.group(1).strip())
            # Remove everything from "Photo Collection" or "@" onwards
            title = re.sub(r'\s*Photo\s+Collection.*$', '', title,
                           flags=re.IGNORECASE).strip()
            title = re.sub(r'\s*@.*$', '', title).strip()
            # Fallback: remove BestEyeCandy suffix
            title = re.sub(r'\s*[-\u2013\u2014|]?\s*\.{0,3}:{0,3}\s*BestEyeCandy.*$', '',
                           title, flags=re.IGNORECASE).strip()
            if title:
                return title

        # Try <h1> or <h2>
        m = re.search(r'<h[12][^>]*>([^<]+)</h[12]>', page_html, re.IGNORECASE)
        if m:
            return html.unescape(m.group(1).strip()) or None

        return None

    @staticmethod
    def _extract_total_photos(page_html: str) -> int:
        """Extract total photo count from the page.

        Handles European format (15.660) and US format (15,660).

        Returns:
            The first "<number> photo(s)" figure on the page, or 0 when
            none is found.
        """
        # Look for "N.NNN photos" or "N,NNN photos" or "NNN photos"
        # Require leading digit to avoid matching ", photo" from keywords
        m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
        if not m:
            return 0

        # Dots and commas are both thousands separators here; drop them.
        num_str = m.group(1).replace('.', '').replace(',', '')
        try:
            return int(num_str)
        except ValueError:
            return 0

    @staticmethod
    def _extract_page_count(page_html: str,
                            photos_per_page: int = DEFAULT_PHOTOS_PER_PAGE) -> int:
        """Extract total page count from the listing page.

        Uses total photo count divided by photos per page, or falls back
        to finding the maximum page number in pagination links.

        Args:
            page_html: HTML of a listing page.
            photos_per_page: Page size used for the division. A
                non-positive value skips the division.

        Returns:
            The page count; 1 when nothing on the page gives a hint.
        """
        # Method 1: Calculate from total photos
        total = BestEyeCandyClient._extract_total_photos(page_html)
        if total > 0 and photos_per_page > 0:
            return (total + photos_per_page - 1) // photos_per_page  # ceil

        # Method 2: Find max page-N in pagination links
        page_nums = [int(x) for x in re.findall(r'/page-(\d+)/', page_html)]
        if page_nums:
            return max(page_nums)

        return 1

    @staticmethod
    def _has_next_page(page_html: str) -> bool:
        """Check if there's a 'Next Page' link on the current page."""
        return 'alt="Next Page"' in page_html

    # ------------------------------------------------------------------
    # Utility helpers
    # ------------------------------------------------------------------

    async def _fetch_page(self, session: aiohttp.ClientSession,
                          url: str) -> Optional[str]:
        """Fetch a single page, return HTML or None.

        ``None`` is returned for any non-200 status and for any exception;
        both are logged as warnings.
        """
        try:
            async with session.get(url, headers=self.HEADERS,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} for {url}", 'warning')
                    return None
                return await resp.text()
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None
|