Files
media-downloader/modules/paid_content/bellazon_client.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

390 lines
15 KiB
Python

"""
Bellazon Forum Thread Client for Paid Content
Scrapes Bellazon forum threads (Invision Power Suite) treating each thread
as a "creator" and each reply with media as a post.
Only bellazon-hosted uploads are captured (external image host links are
unreliable/ephemeral). Video attachments (attachment.php) are also captured.
"""
import asyncio
import html
import json
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse
import aiohttp
from modules.base_module import LoggingMixin
from .models import Post, Attachment
class BellazonClient(LoggingMixin):
    """Client for scraping Bellazon forum threads (Invision Power Suite).

    Each thread is treated as a "creator" and each reply that contains
    bellazon-hosted media as a post.  External image-host links are
    deliberately ignored (unreliable/ephemeral); attachment.php video/file
    links hosted by the forum are captured.
    """
    SERVICE_ID = 'bellazon'
    PLATFORM = 'bellazon'
    BASE_URL = 'https://www.bellazon.com/main'
    # Browser-like headers; presumably the forum serves different markup or
    # blocks obvious non-browser user agents — TODO confirm.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    # Extensions considered images
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
    # Extensions considered videos
    VIDEO_EXTS = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}

    def __init__(self, log_callback=None):
        self._init_logger('PaidContent', log_callback, default_module='Bellazon')

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    async def get_profile_info(self, topic_id: str) -> Optional[Dict]:
        """Fetch first page of a thread and return profile-like info.

        Returns dict with: username (slug), display_name, post_count
        (estimated, may over-count a short last page), page_count, and
        topic_url — or None on any HTTP/network failure.
        """
        # Bellazon requires a slug in the URL but redirects to the correct one
        url = f'{self.BASE_URL}/topic/{topic_id}-x/'
        timeout = aiohttp.ClientTimeout(total=30)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"Bellazon topic {topic_id} returned HTTP {resp.status}", 'warning')
                        return None
                    # Final URL after redirects carries the real slug.
                    final_url = str(resp.url)
                    page_html = await resp.text()
        except Exception as e:
            self.log(f"Failed to fetch Bellazon topic {topic_id}: {e}", 'error')
            return None
        # Extract slug from final URL: /topic/{id}-{slug}/
        slug = self._extract_slug(final_url, topic_id)
        # Extract thread title from <h1>
        title = self._extract_title(page_html)
        # Extract page count from "Page X of Y"
        page_count = self._extract_page_count(page_html)
        # Count comments on this page to estimate the total across all pages
        # (falls back to 20 per page when no markers are found).
        comment_ids = re.findall(r'data-commentid="(\d+)"', page_html)
        per_page = len(comment_ids) or 20
        estimated_comments = per_page * page_count
        return {
            'username': slug,
            'display_name': title or slug,
            'post_count': estimated_comments,
            'page_count': page_count,
            'topic_url': final_url.split('?')[0].rstrip('/'),
        }

    async def get_posts(self, topic_id: str, topic_url: str,
                        known_post_ids: Optional[Set[str]] = None,
                        progress_callback=None) -> List[Post]:
        """Scrape all pages of a thread and return posts with media.

        known_post_ids: IDs to skip (already stored); the set is extended
        in place as new posts are found.
        progress_callback: called with the running post count after each page.
        Best-effort: network errors are logged and the posts collected so
        far are returned.
        """
        known = known_post_ids or set()
        posts: List[Post] = []
        # FIX: initialize before the try block — previously an exception
        # raised before the first page was parsed made the final summary
        # log line raise NameError on `page_count`.
        page_count = 1
        # Fetch page 1 to get page count
        page1_url = f'{topic_url}/page/1/'
        timeout = aiohttp.ClientTimeout(total=30)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                page_html = await self._fetch_page(session, page1_url)
                if page_html is None:
                    return posts
                page_count = self._extract_page_count(page_html)
                self.log(f"Thread has {page_count} pages", 'info')
                # Parse page 1
                page_posts = self._parse_page(page_html, topic_id, known)
                posts.extend(page_posts)
                if progress_callback:
                    progress_callback(len(posts))
                # Parse remaining pages
                for page_num in range(2, page_count + 1):
                    page_url = f'{topic_url}/page/{page_num}/'
                    await asyncio.sleep(1)  # Rate limit
                    page_html = await self._fetch_page(session, page_url)
                    if page_html is None:
                        self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                        break
                    page_posts = self._parse_page(page_html, topic_id, known)
                    posts.extend(page_posts)
                    if progress_callback:
                        progress_callback(len(posts))
                    self.log(f"Page {page_num}/{page_count}: {len(page_posts)} posts with media", 'debug')
        except Exception as e:
            self.log(f"Error scraping Bellazon thread: {e}", 'error')
        self.log(f"Total: {len(posts)} posts with media from {page_count} pages", 'info')
        return posts

    # ------------------------------------------------------------------
    # HTML parsing helpers
    # ------------------------------------------------------------------
    def _parse_page(self, page_html: str, topic_id: str, known: Set[str]) -> List[Post]:
        """Parse a single page of HTML and return Post objects for comments with media.

        Side effect: each returned post's ID is added to `known`, so
        repeated calls across pages naturally deduplicate.
        """
        posts: List[Post] = []
        # Split HTML into comment blocks using data-commentid markers.
        # Each comment starts with data-commentid="..." and carries an
        # HTML-encoded JSON blob in data-quotedata (author + timestamp).
        comment_pattern = re.compile(
            r'data-commentid="(\d+)"\s+data-quotedata="([^"]*)"',
            re.DOTALL
        )
        matches = list(comment_pattern.finditer(page_html))
        if not matches:
            return posts
        for i, match in enumerate(matches):
            comment_id = match.group(1)
            post_id = f"comment_{comment_id}"
            if post_id in known:
                continue
            quotedata_raw = match.group(2)
            # Parse quote data for username and timestamp
            username, timestamp = self._parse_quotedata(quotedata_raw)
            # The comment's markup runs from the end of this marker to the
            # start of the next comment marker (or end of page).
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(page_html)
            content_block = page_html[start:end]
            # Find the actual content within data-role="commentContent"
            # The closing pattern is </div> followed by blank lines then </div>
            content_match = re.search(
                r'data-role="commentContent"[^>]*>(.*?)</div>\s*\n\s*\n\s*</div>',
                content_block, re.DOTALL
            )
            if not content_match:
                # Fallback: grab everything from commentContent to ipsEntry__foot
                content_match = re.search(
                    r'data-role="commentContent"[^>]*>(.*?)(?=ipsEntry__foot)',
                    content_block, re.DOTALL
                )
            if not content_match:
                continue
            content_html = content_match.group(1)
            # Extract media from content
            attachments = self._extract_media(content_html)
            if not attachments:
                continue  # Skip text-only replies
            # Build published_at (ISO 8601, UTC) from the unix timestamp
            published_at = None
            if timestamp:
                try:
                    dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
                    published_at = dt.isoformat()
                # FIX: also catch TypeError — quotedata may carry a
                # non-numeric timestamp (e.g. a string), which
                # datetime.fromtimestamp rejects with TypeError.
                except (ValueError, OSError, TypeError):
                    pass
            post = Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=topic_id,
                title='',
                content=f"Posted by {username}" if username else '',
                published_at=published_at,
                attachments=attachments,
            )
            posts.append(post)
            known.add(post_id)
        return posts

    def _extract_media(self, content_html: str) -> List[Attachment]:
        """Extract image and video attachments from a comment's HTML content.

        Four patterns are tried in order; `seen_urls` deduplicates across
        them (e.g. an image matched by pattern 1 won't be re-added by 2).
        """
        attachments: List[Attachment] = []
        seen_urls: set = set()
        # 1. Bellazon-hosted images: <a class="ipsAttachLink ipsAttachLink_image" href="...full..."><img src="...thumb...">
        for m in re.finditer(
            r'ipsAttachLink_image"\s+href="([^"]+)"[^>]*><img[^>]*src="([^"]+)"',
            content_html
        ):
            full_url = self._normalize_url(m.group(1))
            if full_url in seen_urls:
                continue
            # Skip thumbnails as the full URL
            if '_thumb.' in full_url or '.thumb.' in full_url:
                continue
            seen_urls.add(full_url)
            attachments.append(self._make_attachment(full_url, 'image'))
        # 2. Direct image/video links from bellazon uploads not caught by pattern 1
        for m in re.finditer(
            r'href="([^"]*bellazon\.com/main/uploads/[^"]+)"',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            if '_thumb.' in url or '.thumb.' in url:
                continue
            ext = self._get_extension(url)
            if ext in self.IMAGE_EXTS or ext in self.VIDEO_EXTS:
                seen_urls.add(url)
                file_type = 'image' if ext in self.IMAGE_EXTS else 'video'
                attachments.append(self._make_attachment(url, file_type))
        # 3. Video <source> tags: <source src="//www.bellazon.com/main/uploads/...MP4" type="video/mp4">
        for m in re.finditer(
            r'<source\s+src="([^"]+)"[^>]*type="video/',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            seen_urls.add(url)
            name = self._filename_from_url(url)
            attachments.append(self._make_attachment(url, 'video', name=name))
        # 4. Video/file attachments: <a href="...attachment.php?id=XXX">filename.MP4</a>
        # These are protocol-relative URLs like //www.bellazon.com/main/applications/...
        for m in re.finditer(
            r'href="([^"]*attachment\.php\?id=\d+[^"]*)"[^>]*>([^<]+)',
            content_html
        ):
            att_url = self._normalize_url(m.group(1))
            filename = m.group(2).strip()
            if att_url in seen_urls:
                continue
            # The URL has no meaningful extension; classify by the link text.
            ext = self._get_extension(filename)
            if ext in self.VIDEO_EXTS or ext in self.IMAGE_EXTS:
                seen_urls.add(att_url)
                file_type = 'video' if ext in self.VIDEO_EXTS else 'image'
                attachments.append(self._make_attachment(att_url, file_type, name=filename))
        return attachments

    def _make_attachment(self, url: str, file_type: str, name: Optional[str] = None) -> Attachment:
        """Create an Attachment from a URL.

        name: explicit filename; derived from the URL path when omitted.
        """
        if name is None:
            name = self._filename_from_url(url)
        ext = self._get_extension(name)
        return Attachment(
            name=name,
            file_type=file_type,
            extension=ext if ext else None,
            server_path=url,  # Used as dedup key
            download_url=url,
        )

    # ------------------------------------------------------------------
    # Utility helpers
    # ------------------------------------------------------------------
    async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
        """Fetch a single page, return HTML or None (errors are logged, not raised)."""
        try:
            async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} for {url}", 'warning')
                    return None
                return await resp.text()
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None

    @staticmethod
    def _extract_slug(url: str, topic_id: str) -> str:
        """Extract slug from URL like /topic/39089-india-reynolds/.

        Falls back to the topic_id itself when the URL doesn't match.
        """
        m = re.search(rf'/topic/{re.escape(topic_id)}-([^/?#]+)', url)
        if m:
            return m.group(1).strip('/')
        return topic_id

    @staticmethod
    def _extract_title(page_html: str) -> Optional[str]:
        """Extract thread title from <h1>, falling back to <title> minus the site suffix."""
        m = re.search(r'<h1[^>]*>([^<]+)</h1>', page_html)
        if m:
            return html.unescape(m.group(1).strip())
        m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
        if m:
            title = html.unescape(m.group(1).strip())
            # Remove site suffix
            title = re.sub(r'\s*[-–—]\s*Bellazon.*$', '', title, flags=re.IGNORECASE).strip()
            return title
        return None

    @staticmethod
    def _extract_page_count(page_html: str) -> int:
        """Extract total page count from 'Page X of Y'; 1 when unpaginated."""
        m = re.search(r'Page\s+\d+\s+of\s+(\d+)', page_html)
        if m:
            return int(m.group(1))
        return 1

    @staticmethod
    def _parse_quotedata(raw: str) -> tuple:
        """Parse HTML-encoded JSON quotedata, return (username, unix_timestamp).

        Returns ('', None) on any malformed input.
        """
        try:
            decoded = html.unescape(raw)
            data = json.loads(decoded)
            return data.get('username', ''), data.get('timestamp')
        # FIX: ValueError already covers json.JSONDecodeError (its subclass);
        # AttributeError/TypeError cover decoded JSON that isn't a dict
        # (a list, string, or number has no .get).
        except (ValueError, TypeError, AttributeError):
            return '', None

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Normalize a URL: handle protocol-relative, decode HTML entities, make absolute."""
        url = html.unescape(url)  # &amp; → &
        if url.startswith('//'):
            url = 'https:' + url
        elif url.startswith('/'):
            url = 'https://www.bellazon.com' + url
        elif not url.startswith('http'):
            url = 'https://www.bellazon.com/main/' + url
        return url

    @staticmethod
    def _get_extension(filename_or_url: str) -> str:
        """Get lowercase file extension from a filename or URL ('' when none)."""
        # Strip query params
        clean = filename_or_url.split('?')[0].split('#')[0]
        if '.' in clean.split('/')[-1]:
            return clean.rsplit('.', 1)[-1].lower()
        return ''

    @staticmethod
    def _filename_from_url(url: str) -> str:
        """Extract filename from URL path ('unnamed' when the path is empty)."""
        path = urlparse(url).path
        name = path.rstrip('/').split('/')[-1]
        return name if name else 'unnamed'