389
modules/paid_content/bellazon_client.py
Normal file
389
modules/paid_content/bellazon_client.py
Normal file
@@ -0,0 +1,389 @@
|
||||
"""
|
||||
Bellazon Forum Thread Client for Paid Content
|
||||
|
||||
Scrapes Bellazon forum threads (Invision Power Suite) treating each thread
|
||||
as a "creator" and each reply with media as a post.
|
||||
|
||||
Only bellazon-hosted uploads are captured (external image host links are
|
||||
unreliable/ephemeral). Video attachments (attachment.php) are also captured.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class BellazonClient(LoggingMixin):
    """Client for scraping Bellazon forum threads.

    Each thread (topic) is treated as a "creator"; each reply containing
    bellazon-hosted media becomes a post. Only bellazon-hosted uploads are
    captured — external image-host links are skipped as unreliable.
    """

    # Identifiers stamped onto every Post produced by this client.
    SERVICE_ID = 'bellazon'
    PLATFORM = 'bellazon'
    # Forum root; topic pages live under {BASE_URL}/topic/{id}-{slug}/.
    BASE_URL = 'https://www.bellazon.com/main'

    # Browser-like headers so the forum serves normal HTML pages.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Extensions considered images
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
    # Extensions considered videos
    VIDEO_EXTS = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
|
||||
|
||||
    def __init__(self, log_callback=None):
        """Initialize the client and attach the shared PaidContent logger.

        Args:
            log_callback: Optional callable forwarded to the logging mixin
                so callers can receive log lines (e.g. for a UI console).
        """
        # _init_logger comes from LoggingMixin; messages are tagged with the
        # 'Bellazon' module name under the 'PaidContent' logger.
        self._init_logger('PaidContent', log_callback, default_module='Bellazon')
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    async def get_profile_info(self, topic_id: str) -> Optional[Dict]:
        """Fetch first page of a thread and return profile-like info.

        Args:
            topic_id: Numeric Bellazon topic id, as a string.

        Returns dict with: username (slug), display_name, post_count
        (estimated), page_count, topic_url — or None on any HTTP/network
        failure.
        """
        # Bellazon requires a slug in the URL but redirects to the correct one
        url = f'{self.BASE_URL}/topic/{topic_id}-x/'
        timeout = aiohttp.ClientTimeout(total=30)

        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"Bellazon topic {topic_id} returned HTTP {resp.status}", 'warning')
                        return None
                    # The redirect target carries the real slug.
                    final_url = str(resp.url)
                    page_html = await resp.text()
        except Exception as e:
            # Broad catch is deliberate: any network/timeout error simply
            # means "thread unavailable"; report and return None.
            self.log(f"Failed to fetch Bellazon topic {topic_id}: {e}", 'error')
            return None

        # Extract slug from final URL: /topic/{id}-{slug}/
        slug = self._extract_slug(final_url, topic_id)

        # Extract thread title from <h1>
        title = self._extract_title(page_html)

        # Extract page count from "Page X of Y"
        page_count = self._extract_page_count(page_html)

        # Count comments on this page to estimate total.
        # NOTE: per_page * page_count is an upper-bound estimate — the last
        # page is usually shorter; 20 is the fallback when no markers parse.
        comment_ids = re.findall(r'data-commentid="(\d+)"', page_html)
        per_page = len(comment_ids) or 20
        estimated_comments = per_page * page_count

        return {
            'username': slug,
            'display_name': title or slug,
            'post_count': estimated_comments,
            'page_count': page_count,
            # Strip query string and trailing slash for a canonical URL.
            'topic_url': final_url.split('?')[0].rstrip('/'),
        }
|
||||
|
||||
async def get_posts(self, topic_id: str, topic_url: str,
|
||||
known_post_ids: Optional[Set[str]] = None,
|
||||
progress_callback=None) -> List[Post]:
|
||||
"""Scrape all pages of a thread and return posts with media."""
|
||||
known = known_post_ids or set()
|
||||
posts: List[Post] = []
|
||||
|
||||
# Fetch page 1 to get page count
|
||||
page1_url = f'{topic_url}/page/1/'
|
||||
timeout = aiohttp.ClientTimeout(total=30)
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession(timeout=timeout) as session:
|
||||
page_html = await self._fetch_page(session, page1_url)
|
||||
if page_html is None:
|
||||
return posts
|
||||
|
||||
page_count = self._extract_page_count(page_html)
|
||||
self.log(f"Thread has {page_count} pages", 'info')
|
||||
|
||||
# Parse page 1
|
||||
page_posts = self._parse_page(page_html, topic_id, known)
|
||||
posts.extend(page_posts)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(len(posts))
|
||||
|
||||
# Parse remaining pages
|
||||
for page_num in range(2, page_count + 1):
|
||||
page_url = f'{topic_url}/page/{page_num}/'
|
||||
await asyncio.sleep(1) # Rate limit
|
||||
|
||||
page_html = await self._fetch_page(session, page_url)
|
||||
if page_html is None:
|
||||
self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
|
||||
break
|
||||
|
||||
page_posts = self._parse_page(page_html, topic_id, known)
|
||||
posts.extend(page_posts)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(len(posts))
|
||||
|
||||
self.log(f"Page {page_num}/{page_count}: {len(page_posts)} posts with media", 'debug')
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error scraping Bellazon thread: {e}", 'error')
|
||||
|
||||
self.log(f"Total: {len(posts)} posts with media from {page_count} pages", 'info')
|
||||
return posts
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTML parsing helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    def _parse_page(self, page_html: str, topic_id: str, known: Set[str]) -> List[Post]:
        """Parse a single page of HTML and return Post objects for comments with media.

        Side effect: ids of emitted posts are added to *known* so subsequent
        pages (and callers) skip duplicates.
        """
        posts: List[Post] = []

        # Split HTML into comment blocks using data-commentid markers
        # Each comment starts with data-commentid="..." and contains a content block
        comment_pattern = re.compile(
            r'data-commentid="(\d+)"\s+data-quotedata="([^"]*)"',
            re.DOTALL
        )

        matches = list(comment_pattern.finditer(page_html))
        if not matches:
            return posts

        for i, match in enumerate(matches):
            comment_id = match.group(1)
            post_id = f"comment_{comment_id}"

            if post_id in known:
                continue

            quotedata_raw = match.group(2)

            # Parse quote data for username and timestamp
            username, timestamp = self._parse_quotedata(quotedata_raw)

            # Extract the content block for this comment: everything up to
            # the next comment marker, or end of page for the last comment.
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(page_html)
            content_block = page_html[start:end]

            # Find the actual content within data-role="commentContent"
            # The closing pattern is </div> followed by blank lines then </div>
            content_match = re.search(
                r'data-role="commentContent"[^>]*>(.*?)</div>\s*\n\s*\n\s*</div>',
                content_block, re.DOTALL
            )
            if not content_match:
                # Fallback: grab everything from commentContent to ipsEntry__foot
                content_match = re.search(
                    r'data-role="commentContent"[^>]*>(.*?)(?=ipsEntry__foot)',
                    content_block, re.DOTALL
                )
            if not content_match:
                continue

            content_html = content_match.group(1)

            # Extract media from content
            attachments = self._extract_media(content_html)

            if not attachments:
                continue  # Skip text-only replies

            # Build published_at from timestamp
            # (quotedata timestamps appear to be unix epoch seconds — the
            # fromtimestamp call below assumes so; treated as UTC)
            published_at = None
            if timestamp:
                try:
                    dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
                    published_at = dt.isoformat()
                except (ValueError, OSError):
                    # Out-of-range/garbage timestamp: leave published_at None.
                    pass

            post = Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=topic_id,
                title='',
                content=f"Posted by {username}" if username else '',
                published_at=published_at,
                attachments=attachments,
            )
            posts.append(post)
            known.add(post_id)

        return posts
|
||||
|
||||
    def _extract_media(self, content_html: str) -> List[Attachment]:
        """Extract image and video attachments from a comment's HTML content.

        Four extraction passes run in order, sharing *seen_urls* for
        deduplication:
          1. IPS image-attachment links (full-size href wrapping a thumbnail)
          2. direct links into bellazon.com/main/uploads/
          3. <source> tags from embedded video players
          4. legacy attachment.php download links
        Thumbnail URLs are never emitted.
        """
        attachments: List[Attachment] = []
        seen_urls: set = set()

        # 1. Bellazon-hosted images: <a class="ipsAttachLink ipsAttachLink_image" href="...full..."><img src="...thumb...">
        for m in re.finditer(
            r'ipsAttachLink_image"\s+href="([^"]+)"[^>]*><img[^>]*src="([^"]+)"',
            content_html
        ):
            full_url = self._normalize_url(m.group(1))
            if full_url in seen_urls:
                continue
            # Skip thumbnails as the full URL
            if '_thumb.' in full_url or '.thumb.' in full_url:
                continue
            seen_urls.add(full_url)
            attachments.append(self._make_attachment(full_url, 'image'))

        # 2. Direct image/video links from bellazon uploads not caught by pattern 1
        for m in re.finditer(
            r'href="([^"]*bellazon\.com/main/uploads/[^"]+)"',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            if '_thumb.' in url or '.thumb.' in url:
                continue
            # Only keep links whose extension marks them as media.
            ext = self._get_extension(url)
            if ext in self.IMAGE_EXTS or ext in self.VIDEO_EXTS:
                seen_urls.add(url)
                file_type = 'image' if ext in self.IMAGE_EXTS else 'video'
                attachments.append(self._make_attachment(url, file_type))

        # 3. Video <source> tags: <source src="//www.bellazon.com/main/uploads/...MP4" type="video/mp4">
        for m in re.finditer(
            r'<source\s+src="([^"]+)"[^>]*type="video/',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            seen_urls.add(url)
            name = self._filename_from_url(url)
            attachments.append(self._make_attachment(url, 'video', name=name))

        # 4. Video/file attachments: <a href="...attachment.php?id=XXX">filename.MP4</a>
        # These are protocol-relative URLs like //www.bellazon.com/main/applications/...
        for m in re.finditer(
            r'href="([^"]*attachment\.php\?id=\d+[^"]*)"[^>]*>([^<]+)',
            content_html
        ):
            att_url = self._normalize_url(m.group(1))
            # The link text is the original filename — the URL itself has no
            # usable extension, so classify by the link text instead.
            filename = m.group(2).strip()
            if att_url in seen_urls:
                continue
            ext = self._get_extension(filename)
            if ext in self.VIDEO_EXTS or ext in self.IMAGE_EXTS:
                seen_urls.add(att_url)
                file_type = 'video' if ext in self.VIDEO_EXTS else 'image'
                attachments.append(self._make_attachment(att_url, file_type, name=filename))

        return attachments
|
||||
|
||||
    def _make_attachment(self, url: str, file_type: str, name: Optional[str] = None) -> Attachment:
        """Create an Attachment from a URL.

        Args:
            url: Absolute download URL.
            file_type: 'image' or 'video'.
            name: Display filename; derived from the URL path when omitted.
        """
        if name is None:
            name = self._filename_from_url(url)
        ext = self._get_extension(name)

        return Attachment(
            name=name,
            file_type=file_type,
            extension=ext if ext else None,
            server_path=url,  # Used as dedup key
            download_url=url,
        )
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Utility helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
        """Fetch a single page, return HTML or None.

        Any non-200 status or network error is logged as a warning and
        converted to None so callers can decide whether to stop or skip.
        """
        try:
            async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} for {url}", 'warning')
                    return None
                return await resp.text()
        except Exception as e:
            # Broad catch is deliberate: per-page failures must not abort
            # the whole scrape; the caller handles the None.
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_slug(url: str, topic_id: str) -> str:
|
||||
"""Extract slug from URL like /topic/39089-india-reynolds/"""
|
||||
m = re.search(rf'/topic/{re.escape(topic_id)}-([^/?#]+)', url)
|
||||
if m:
|
||||
return m.group(1).strip('/')
|
||||
return topic_id
|
||||
|
||||
@staticmethod
|
||||
def _extract_title(page_html: str) -> Optional[str]:
|
||||
"""Extract thread title from <h1>."""
|
||||
m = re.search(r'<h1[^>]*>([^<]+)</h1>', page_html)
|
||||
if m:
|
||||
return html.unescape(m.group(1).strip())
|
||||
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
title = html.unescape(m.group(1).strip())
|
||||
# Remove site suffix
|
||||
title = re.sub(r'\s*[-–—]\s*Bellazon.*$', '', title, flags=re.IGNORECASE).strip()
|
||||
return title
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_page_count(page_html: str) -> int:
|
||||
"""Extract total page count from 'Page X of Y'."""
|
||||
m = re.search(r'Page\s+\d+\s+of\s+(\d+)', page_html)
|
||||
if m:
|
||||
return int(m.group(1))
|
||||
return 1
|
||||
|
||||
@staticmethod
|
||||
def _parse_quotedata(raw: str) -> tuple:
|
||||
"""Parse HTML-encoded JSON quotedata, return (username, unix_timestamp)."""
|
||||
try:
|
||||
decoded = html.unescape(raw)
|
||||
data = json.loads(decoded)
|
||||
return data.get('username', ''), data.get('timestamp')
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
return '', None
|
||||
|
||||
@staticmethod
|
||||
def _normalize_url(url: str) -> str:
|
||||
"""Normalize a URL: handle protocol-relative, decode HTML entities, make absolute."""
|
||||
url = html.unescape(url) # & → &
|
||||
if url.startswith('//'):
|
||||
url = 'https:' + url
|
||||
elif url.startswith('/'):
|
||||
url = 'https://www.bellazon.com' + url
|
||||
elif not url.startswith('http'):
|
||||
url = 'https://www.bellazon.com/main/' + url
|
||||
return url
|
||||
|
||||
@staticmethod
|
||||
def _get_extension(filename_or_url: str) -> str:
|
||||
"""Get lowercase file extension from a filename or URL."""
|
||||
# Strip query params
|
||||
clean = filename_or_url.split('?')[0].split('#')[0]
|
||||
if '.' in clean.split('/')[-1]:
|
||||
return clean.rsplit('.', 1)[-1].lower()
|
||||
return ''
|
||||
|
||||
@staticmethod
|
||||
def _filename_from_url(url: str) -> str:
|
||||
"""Extract filename from URL path."""
|
||||
path = urlparse(url).path
|
||||
name = path.rstrip('/').split('/')[-1]
|
||||
return name if name else 'unnamed'
|
||||
Reference in New Issue
Block a user