Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

View File

@@ -0,0 +1,468 @@
"""
BestEyeCandy.com Client for Paid Content
Scrapes celebrity photo galleries from BestEyeCandy.com.
Each celeb has a unique CID and paginated photo listings.
Optimization: Full-res URLs follow a predictable pattern. We visit ONE
detail page to determine the pattern (server hostname + name format),
then construct all remaining URLs from photo IDs found on listing pages.
"""
import asyncio
import html
import json
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class BestEyeCandyClient(LoggingMixin):
    """Scraper client for BestEyeCandy.com celebrity photo galleries."""

    # Identifier used to look up this scraper's row in the ``scrapers`` table
    # and written into every Post this client produces.
    SERVICE_ID = 'besteyecandy'
    PLATFORM = 'besteyecandy'

    # Root of every listing / detail page URL built by this client.
    BASE_URL = 'https://besteyecandy.com'

    # Browser-like request headers sent with every page fetch.
    HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        ),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }
def __init__(self, unified_db=None, log_callback=None):
self._init_logger('PaidContent', log_callback, default_module='BestEyeCandy')
self.unified_db = unified_db
# ------------------------------------------------------------------
# Cookie support
# ------------------------------------------------------------------
def _get_cookies(self) -> Optional[list]:
"""Load cookies from the scrapers table for besteyecandy."""
if not self.unified_db:
return None
try:
with self.unified_db.get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT cookies_json FROM scrapers WHERE id = ?",
(self.SERVICE_ID,))
row = cursor.fetchone()
if row and row[0]:
data = json.loads(row[0])
if isinstance(data, dict) and 'cookies' in data:
return data['cookies']
elif isinstance(data, list):
return data
except Exception as e:
self.log(f"Could not load cookies: {e}", 'debug')
return None
def _build_cookie_jar(self, cookies_list: list) -> aiohttp.CookieJar:
    """Build an aiohttp CookieJar from a list of cookie dicts.

    Args:
        cookies_list: Cookie dicts (as returned by ``_get_cookies``). The
            keys read are ``name``, ``value``, ``domain``, ``path`` and
            ``secure``.

    Returns:
        A ``CookieJar(unsafe=True)`` containing every usable cookie. Cookies
        without a name, or with a name ``http.cookies`` rejects, are skipped
        (logged at debug level) instead of aborting the whole jar.
    """
    from http.cookies import CookieError, SimpleCookie

    jar = aiohttp.CookieJar(unsafe=True)
    for cookie in cookies_list:
        name = cookie.get('name') or ''
        if not name:
            self.log("Skipping stored cookie without a name", 'debug')
            continue

        # One SimpleCookie per entry: two cookies may share a name across
        # different domains, which a single SimpleCookie cannot hold.
        sc = SimpleCookie()
        try:
            sc[name] = cookie.get('value') or ''
        except CookieError as e:
            self.log(f"Skipping invalid cookie {name!r}: {e}", 'debug')
            continue

        morsel = sc[name]
        morsel['domain'] = cookie.get('domain') or ''
        morsel['path'] = cookie.get('path') or '/'
        if cookie.get('secure'):
            morsel['secure'] = True

        # update_cookies() expects a yarl.URL as its second argument; a
        # urllib ParseResult has no ``raw_host`` and makes it raise. Each
        # morsel already carries its own domain and path, so no response URL
        # is needed to scope the cookie.
        jar.update_cookies(sc)
    return jar
def _create_session(self, timeout: aiohttp.ClientTimeout = None) -> aiohttp.ClientSession:
    """Open an aiohttp session, attaching stored cookies when available.

    Args:
        timeout: Client timeout to use; when ``None`` a 60-second total
            timeout is applied.

    Returns:
        A new ``aiohttp.ClientSession``. The caller is responsible for
        closing it (all callers in this class use ``async with``).
    """
    effective_timeout = (
        timeout if timeout is not None else aiohttp.ClientTimeout(total=60)
    )

    stored = self._get_cookies()
    if not stored:
        # Not fatal: continue without cookies, but make it visible.
        self.log("No cookies found for besteyecandy, requests may fail", 'warning')
        return aiohttp.ClientSession(timeout=effective_timeout)

    jar = self._build_cookie_jar(stored)
    self.log(f"Loaded {len(stored)} cookies for session", 'debug')
    return aiohttp.ClientSession(timeout=effective_timeout, cookie_jar=jar)
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
async def get_profile_info(self, cid: str, celeb_slug: str) -> Optional[Dict]:
    """Fetch listing page 1 of a celeb and summarise it as a profile dict.

    Args:
        cid: Celeb ID used in the listing URL.
        celeb_slug: URL slug of the celeb; also the fallback display name
            (with dashes turned into spaces).

    Returns:
        A dict with ``username``, ``display_name``, ``post_count``,
        ``page_count`` and ``celeb_url``, or ``None`` when the page cannot
        be fetched or does not answer with HTTP 200.
    """
    listing_url = (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                   f'sortedby-age/page-1/{celeb_slug}.html')
    try:
        async with self._create_session() as session, \
                session.get(listing_url, headers=self.HEADERS,
                            allow_redirects=True) as resp:
            if resp.status != 200:
                self.log(f"BestEyeCandy cid {cid} returned HTTP {resp.status}",
                         'warning')
                return None
            page_html = await resp.text()
    except Exception as e:
        self.log(f"Failed to fetch BestEyeCandy cid {cid}: {e}", 'error')
        return None

    # Name from the page itself, falling back to a prettified slug.
    display_name = self._extract_celeb_name(page_html)
    if not display_name:
        display_name = celeb_slug.replace('-', ' ')

    photo_total = self._extract_total_photos(page_html)
    # Page size is taken from page 1; 48 is assumed if nothing was found.
    per_page = len(self._extract_photo_ids(page_html)) or 48
    pages = self._extract_page_count(page_html, photos_per_page=per_page)

    return {
        'username': celeb_slug,
        'display_name': display_name,
        'post_count': photo_total,
        'page_count': pages,
        'celeb_url': listing_url,
    }
async def get_posts(self, cid: str, celeb_slug: str,
                    known_post_ids: Optional[Set[str]] = None,
                    progress_callback=None) -> List[Post]:
    """Scrape every listing page of a celeb and return one Post per page.

    Phase 1 fetches listing page 1 and visits the detail page of its first
    photo to learn the full-res URL pattern. Phase 2 walks the listing pages
    in order, following the "Next Page" marker, and turns each page into a
    Post whose attachments are full-res URLs constructed from the photo IDs
    found on that page.

    Args:
        cid: Celeb ID used in the listing URLs.
        celeb_slug: URL slug of the celeb.
        known_post_ids: Post IDs ("page_1", "page_2", ...) already stored.
            Those pages are still fetched (to follow pagination and count
            photos) but produce no Post.
        progress_callback: Optional callable receiving one status string per
            processed page.

    Returns:
        The new Posts in page order. An empty list is returned when page 1
        cannot be fetched or no URL pattern can be determined. If an
        unexpected error occurs mid-run it is logged and the Posts collected
        so far are returned.
    """
    known = known_post_ids or set()
    posts: List[Post] = []
    total_photos = 0

    def listing_url(page: int) -> str:
        return (f'{self.BASE_URL}/section/celeb-photogallery/cid-{cid}/'
                f'sortedby-age/page-{page}/{celeb_slug}.html')

    try:
        async with self._create_session() as session:
            # -- Phase 1: fetch page 1 and determine full-res URL pattern --
            page_html = await self._fetch_page(session, listing_url(1))
            if page_html is None:
                return []

            # Parse page 1 once; the IDs feed the estimate, the pattern
            # discovery and the first Post.
            page_ids = self._extract_photo_ids(page_html)
            photos_per_page = len(page_ids) or 48
            estimated_pages = self._extract_page_count(
                page_html, photos_per_page=photos_per_page)
            self.log(f"Estimated {estimated_pages} pages of photos "
                     f"({photos_per_page}/page)", 'info')

            url_pattern = None
            if page_ids:
                url_pattern = await self._discover_url_pattern(
                    session, page_ids[0], cid, celeb_slug)
            if not url_pattern:
                self.log("Could not determine full-res URL pattern", 'error')
                return []
            self.log(f"URL pattern: server={url_pattern['server']}, "
                     f"name_format={url_pattern['name_format']}, "
                     f"ext={url_pattern['ext']}", 'info')

            # -- Phase 2: walk all pages, one Post per page --
            ext = url_pattern.get('ext', 'jpg')
            page_num = 1
            while True:
                if not page_ids:
                    self.log(f"Page {page_num}: no photos, stopping", 'info')
                    break

                # Known pages count towards the running total as well.
                total_photos += len(page_ids)
                # Separator keeps the page estimate and the photo count from
                # running together in the status text.
                progress = (f"Page {page_num}/~{estimated_pages} - "
                            f"{total_photos} photos")

                post_id = f"page_{page_num}"
                if post_id in known:
                    self.log(f"Page {page_num}: already known, skipping",
                             'debug')
                    if progress_callback:
                        progress_callback(f"{progress} (skipped known)")
                else:
                    attachments = []
                    for photo_id in page_ids:
                        dl_url = self._construct_full_res_url(
                            url_pattern, photo_id)
                        attachments.append(Attachment(
                            name=dl_url.rsplit('/', 1)[-1],
                            file_type='image',
                            extension=ext,
                            server_path=dl_url,
                            download_url=dl_url,
                        ))
                    posts.append(Post(
                        post_id=post_id,
                        service_id=self.SERVICE_ID,
                        platform=self.PLATFORM,
                        creator_id=cid,
                        title=f"Page {page_num}",
                        content=f"{len(page_ids)} photos",
                        # The scrape time is recorded as the publish time.
                        published_at=datetime.now(tz=timezone.utc).isoformat(),
                        attachments=attachments,
                    ))
                    if progress_callback:
                        progress_callback(progress)
                    self.log(f"Page {page_num}/~{estimated_pages}: "
                             f"{len(page_ids)} photos", 'debug')

                if not self._has_next_page(page_html):
                    break

                page_num += 1
                await asyncio.sleep(2)  # Rate limit
                page_html = await self._fetch_page(
                    session, listing_url(page_num))
                if page_html is None:
                    self.log(f"Failed to fetch page {page_num}, stopping",
                             'warning')
                    break
                page_ids = self._extract_photo_ids(page_html)
    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        self.log(f"Error scraping BestEyeCandy: {e}", 'error')

    self.log(f"Total: {len(posts)} new page-posts with "
             f"{total_photos} photos across all pages", 'info')
    return posts
# ------------------------------------------------------------------
# URL pattern discovery
# ------------------------------------------------------------------
async def _discover_url_pattern(self, session: aiohttp.ClientSession,
                                photo_id: str, cid: str,
                                celeb_slug: str) -> Optional[Dict]:
    """Visit one photo's detail page and derive the full-res URL template.

    Args:
        session: Open aiohttp session used for the request.
        photo_id: ID of the photo whose detail page is inspected.
        cid: Celeb ID used in the detail-page URL.
        celeb_slug: URL slug of the celeb.

    Returns:
        A dict with keys ``server``, ``dir_pattern``, ``name_format``,
        ``ext`` and ``example_url``, where ``{ID}`` marks the photo ID in
        ``dir_pattern`` / ``name_format``. ``None`` when the page cannot be
        fetched or no full-res URL containing ``photo_id`` is found.
    """
    detail_url = (f'{self.BASE_URL}/section/celeb-photogallery/'
                  f'cid-{cid}/{celeb_slug}/photo-{photo_id}.html')
    await asyncio.sleep(2)  # Rate limit
    page_html = await self._fetch_page(session, detail_url)
    if page_html is None:
        return None

    # Expected shape of the full-res image URL on the detail page:
    #   https://euX.besteyecandy.com/section/large-photos/area-female/
    #       besteyecandy-{ID}/{Name}_{ID}_BestEyeCandyCOM.jpg
    # (either in an <img src> or an <a href>).
    patterns = (
        r'(https?://[a-z0-9]+\.besteyecandy\.com/section/large-photos/[^"\'>\s]+)',
        r'(https?://[a-z0-9]+\.besteyecandy\.com/[^"\'>\s]*besteyecandy-' + re.escape(photo_id) + r'[^"\'>\s]*)',
    )
    # Match the ID only as a whole number, never inside a longer digit run.
    id_re = re.compile(r'(?<!\d)' + re.escape(photo_id) + r'(?!\d)')

    # Only a URL whose path contains this photo's ID can be turned into a
    # template; a URL for some other image would give every photo the same
    # constructed address.
    full_res_url = None
    for pattern in patterns:
        for match in re.finditer(pattern, page_html):
            if id_re.search(urlparse(match.group(1)).path):
                full_res_url = match.group(1)
                break
        if full_res_url:
            break
    if not full_res_url:
        self.log(f"Could not find full-res URL on detail page for photo {photo_id}",
                 'error')
        return None
    self.log(f"Found full-res URL: {full_res_url}", 'debug')

    parsed = urlparse(full_res_url)
    # e.g. /section/large-photos/area-female/besteyecandy-7727820
    path_dir, _, filename = parsed.path.rpartition('/')
    # e.g. Myleene_Klass_7727820_BestEyeCandyCOM + jpg
    stem, dot, suffix = filename.rpartition('.')
    if not dot:
        stem, suffix = filename, 'jpg'

    return {
        'server': parsed.netloc,  # e.g. eu4.besteyecandy.com
        'dir_pattern': id_re.sub('{ID}', path_dir),
        'name_format': id_re.sub('{ID}', stem),
        'ext': suffix,
        'example_url': full_res_url,
    }
def _construct_full_res_url(self, url_pattern: Dict, photo_id: str) -> str:
"""Construct the full-res URL for a photo ID using the discovered pattern."""
dir_path = url_pattern['dir_pattern'].replace('{ID}', photo_id)
filename = url_pattern['name_format'].replace('{ID}', photo_id) + '.' + url_pattern['ext']
return f"https://{url_pattern['server']}{dir_path}/{filename}"
# ------------------------------------------------------------------
# HTML parsing helpers
# ------------------------------------------------------------------
def _extract_photo_ids(self, page_html: str) -> List[str]:
"""Extract photo IDs from a listing page.
Photo links look like: href="...photo-12345.html"
"""
ids = re.findall(r'href="[^"]*photo-(\d+)\.html"', page_html)
# Deduplicate while preserving order
seen = set()
unique_ids = []
for pid in ids:
if pid not in seen:
seen.add(pid)
unique_ids.append(pid)
return unique_ids
@staticmethod
def _extract_celeb_name(page_html: str) -> Optional[str]:
"""Extract celebrity name from the page."""
# Try <title> tag: "Myleene Klass Photo Collection @ ...::: BestEyeCandy.com :::..."
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
if m:
title = html.unescape(m.group(1).strip())
# Remove everything from "Photo Collection" or "@" onwards
title = re.sub(r'\s*Photo\s+Collection.*$', '', title,
flags=re.IGNORECASE).strip()
title = re.sub(r'\s*@.*$', '', title).strip()
# Fallback: remove BestEyeCandy suffix
title = re.sub(r'\s*[-\u2013\u2014|]?\s*\.{0,3}:{0,3}\s*BestEyeCandy.*$', '',
title, flags=re.IGNORECASE).strip()
if title:
return title
# Try <h1> or <h2>
m = re.search(r'<h[12][^>]*>([^<]+)</h[12]>', page_html)
if m:
return html.unescape(m.group(1).strip())
return None
@staticmethod
def _extract_total_photos(page_html: str) -> int:
"""Extract total photo count from the page.
Handles European format (15.660) and US format (15,660).
"""
# Look for "N.NNN photos" or "N,NNN photos" or "NNN photos"
# Require leading digit to avoid matching ", photo" from keywords
m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
if m:
num_str = m.group(1)
# European format uses dots as thousands separators: 15.660
# US format uses commas: 15,660
# Remove both dots and commas (they're thousands separators)
num_str = num_str.replace('.', '').replace(',', '')
try:
return int(num_str)
except ValueError:
pass
return 0
@staticmethod
def _extract_page_count(page_html: str, photos_per_page: int = 48) -> int:
"""Extract total page count from the listing page.
Uses total photo count divided by photos per page, or falls back
to finding the maximum page number in pagination links.
"""
# Method 1: Calculate from total photos
m = re.search(r'(\d[\d.,]*)\s+photos?', page_html, re.IGNORECASE)
if m:
num_str = m.group(1).replace('.', '').replace(',', '')
try:
total = int(num_str)
if total > 0:
return (total + photos_per_page - 1) // photos_per_page
except ValueError:
pass
# Method 2: Find max page-N in pagination links for same celeb
page_nums = [int(x) for x in re.findall(r'/page-(\d+)/', page_html)]
if page_nums:
return max(page_nums)
return 1
@staticmethod
def _has_next_page(page_html: str) -> bool:
"""Check if there's a 'Next Page' link on the current page."""
return 'alt="Next Page"' in page_html
# ------------------------------------------------------------------
# Utility helpers
# ------------------------------------------------------------------
async def _fetch_page(self, session: aiohttp.ClientSession,
                      url: str) -> Optional[str]:
    """GET one page and return its body text, or ``None`` on any failure.

    Args:
        session: Open aiohttp session to issue the request on.
        url: Address of the page to fetch.

    Returns:
        The response text for an HTTP 200 answer. Any other status, and any
        exception raised while requesting or reading, is logged as a warning
        and reported as ``None``.
    """
    try:
        async with session.get(url, headers=self.HEADERS,
                               allow_redirects=True) as resp:
            if resp.status == 200:
                return await resp.text()
            self.log(f"HTTP {resp.status} for {url}", 'warning')
    except Exception as e:
        self.log(f"Error fetching {url}: {e}", 'warning')
    return None