390 lines
15 KiB
Python
390 lines
15 KiB
Python
"""
|
|
Bellazon Forum Thread Client for Paid Content
|
|
|
|
Scrapes Bellazon forum threads (Invision Power Suite) treating each thread
|
|
as a "creator" and each reply with media as a post.
|
|
|
|
Only bellazon-hosted uploads are captured (external image host links are
|
|
unreliable/ephemeral). Video attachments (attachment.php) are also captured.
|
|
"""
|
|
|
|
import asyncio
|
|
import html
|
|
import json
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Set
|
|
from urllib.parse import urlparse
|
|
|
|
import aiohttp
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from .models import Post, Attachment
|
|
|
|
|
|
class BellazonClient(LoggingMixin):
    """Client for scraping Bellazon forum threads (Invision Power Suite).

    A thread is treated as a "creator"; each reply containing
    bellazon-hosted media becomes a :class:`Post` with :class:`Attachment`s.
    """

    SERVICE_ID = 'bellazon'
    PLATFORM = 'bellazon'
    BASE_URL = 'https://www.bellazon.com/main'

    # Browser-like headers; the forum serves full markup to ordinary browsers.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Extensions considered images
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
    # Extensions considered videos
    VIDEO_EXTS = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}

    def __init__(self, log_callback=None):
        """Initialize the client.

        log_callback: optional callable forwarded to the LoggingMixin logger.
        """
        self._init_logger('PaidContent', log_callback, default_module='Bellazon')

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_profile_info(self, topic_id: str) -> Optional[Dict]:
        """Fetch the first page of a thread and return profile-like info.

        Returns a dict with: username (slug), display_name, post_count
        (estimated from comments-per-page * page count), page_count,
        topic_url — or None on any fetch/HTTP error.
        """
        # Bellazon requires a slug in the URL but redirects to the correct one
        url = f'{self.BASE_URL}/topic/{topic_id}-x/'
        timeout = aiohttp.ClientTimeout(total=30)

        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"Bellazon topic {topic_id} returned HTTP {resp.status}", 'warning')
                        return None
                    # resp.url reflects the post-redirect URL containing the real slug.
                    final_url = str(resp.url)
                    page_html = await resp.text()
        except Exception as e:
            self.log(f"Failed to fetch Bellazon topic {topic_id}: {e}", 'error')
            return None

        # Extract slug from final URL: /topic/{id}-{slug}/
        slug = self._extract_slug(final_url, topic_id)

        # Extract thread title from <h1> (falls back to <title>).
        title = self._extract_title(page_html)

        # Extract page count from "Page X of Y"
        page_count = self._extract_page_count(page_html)

        # Count comments on this page to estimate the thread total.
        comment_ids = re.findall(r'data-commentid="(\d+)"', page_html)
        per_page = len(comment_ids) or 20  # 20 = IPS default comments per page
        estimated_comments = per_page * page_count

        return {
            'username': slug,
            'display_name': title or slug,
            'post_count': estimated_comments,
            'page_count': page_count,
            'topic_url': final_url.split('?')[0].rstrip('/'),
        }

    async def get_posts(self, topic_id: str, topic_url: str,
                        known_post_ids: Optional[Set[str]] = None,
                        progress_callback=None) -> List[Post]:
        """Scrape all pages of a thread and return posts with media.

        known_post_ids: IDs to skip (already stored); the working copy is
            also updated in place as posts are parsed.
        progress_callback: optional callable invoked with the running count
            of collected posts after each page.
        """
        known = known_post_ids or set()
        posts: List[Post] = []
        # Bug fix: page_count must exist before the try block — if session
        # construction (or anything before its assignment) raises, the
        # final log line below would otherwise hit a NameError.
        page_count = 1

        # Fetch page 1 to get page count
        page1_url = f'{topic_url}/page/1/'
        timeout = aiohttp.ClientTimeout(total=30)

        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                page_html = await self._fetch_page(session, page1_url)
                if page_html is None:
                    return posts

                page_count = self._extract_page_count(page_html)
                self.log(f"Thread has {page_count} pages", 'info')

                # Parse page 1
                page_posts = self._parse_page(page_html, topic_id, known)
                posts.extend(page_posts)

                if progress_callback:
                    progress_callback(len(posts))

                # Parse remaining pages
                for page_num in range(2, page_count + 1):
                    page_url = f'{topic_url}/page/{page_num}/'
                    await asyncio.sleep(1)  # Rate limit

                    page_html = await self._fetch_page(session, page_url)
                    if page_html is None:
                        self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                        break

                    page_posts = self._parse_page(page_html, topic_id, known)
                    posts.extend(page_posts)

                    if progress_callback:
                        progress_callback(len(posts))

                    self.log(f"Page {page_num}/{page_count}: {len(page_posts)} posts with media", 'debug')

        except Exception as e:
            # Best-effort scrape: return whatever was collected so far.
            self.log(f"Error scraping Bellazon thread: {e}", 'error')

        self.log(f"Total: {len(posts)} posts with media from {page_count} pages", 'info')
        return posts

    # ------------------------------------------------------------------
    # HTML parsing helpers
    # ------------------------------------------------------------------

    def _parse_page(self, page_html: str, topic_id: str, known: Set[str]) -> List[Post]:
        """Parse a single page of HTML and return Post objects for comments with media.

        Mutates `known` by adding each emitted post's ID.
        """
        posts: List[Post] = []

        # Split HTML into comment blocks using data-commentid markers.
        # Each comment starts with data-commentid="..." and carries its
        # author/timestamp metadata in the adjacent data-quotedata attribute.
        comment_pattern = re.compile(
            r'data-commentid="(\d+)"\s+data-quotedata="([^"]*)"',
            re.DOTALL
        )

        matches = list(comment_pattern.finditer(page_html))
        if not matches:
            return posts

        for i, match in enumerate(matches):
            comment_id = match.group(1)
            post_id = f"comment_{comment_id}"

            if post_id in known:
                continue

            quotedata_raw = match.group(2)

            # Parse quote data for username and timestamp
            username, timestamp = self._parse_quotedata(quotedata_raw)

            # The comment's content runs from this marker to the next one
            # (or to the end of the page for the last comment).
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(page_html)
            content_block = page_html[start:end]

            # Find the actual content within data-role="commentContent".
            # The closing pattern is </div> followed by blank lines then </div>.
            content_match = re.search(
                r'data-role="commentContent"[^>]*>(.*?)</div>\s*\n\s*\n\s*</div>',
                content_block, re.DOTALL
            )
            if not content_match:
                # Fallback: grab everything from commentContent to ipsEntry__foot
                content_match = re.search(
                    r'data-role="commentContent"[^>]*>(.*?)(?=ipsEntry__foot)',
                    content_block, re.DOTALL
                )
            if not content_match:
                continue

            content_html = content_match.group(1)

            # Extract media from content
            attachments = self._extract_media(content_html)

            if not attachments:
                continue  # Skip text-only replies

            # Build ISO-8601 published_at from the unix timestamp.
            published_at = None
            if timestamp:
                try:
                    dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
                    published_at = dt.isoformat()
                except (ValueError, OSError, TypeError):
                    # Bug fix: quotedata JSON may carry a non-numeric
                    # timestamp, which raises TypeError — previously uncaught.
                    pass

            post = Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=topic_id,
                title='',
                content=f"Posted by {username}" if username else '',
                published_at=published_at,
                attachments=attachments,
            )
            posts.append(post)
            known.add(post_id)

        return posts

    def _extract_media(self, content_html: str) -> List[Attachment]:
        """Extract image and video attachments from a comment's HTML content.

        Four patterns are scanned, deduplicating by normalized URL and
        skipping thumbnail variants.
        """
        attachments: List[Attachment] = []
        seen_urls: set = set()

        # 1. Bellazon-hosted images: <a class="ipsAttachLink ipsAttachLink_image" href="...full..."><img src="...thumb...">
        for m in re.finditer(
            r'ipsAttachLink_image"\s+href="([^"]+)"[^>]*><img[^>]*src="([^"]+)"',
            content_html
        ):
            full_url = self._normalize_url(m.group(1))
            if full_url in seen_urls:
                continue
            # Skip thumbnails as the full URL
            if '_thumb.' in full_url or '.thumb.' in full_url:
                continue
            seen_urls.add(full_url)
            attachments.append(self._make_attachment(full_url, 'image'))

        # 2. Direct image/video links from bellazon uploads not caught by pattern 1
        for m in re.finditer(
            r'href="([^"]*bellazon\.com/main/uploads/[^"]+)"',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            if '_thumb.' in url or '.thumb.' in url:
                continue
            ext = self._get_extension(url)
            if ext in self.IMAGE_EXTS or ext in self.VIDEO_EXTS:
                seen_urls.add(url)
                file_type = 'image' if ext in self.IMAGE_EXTS else 'video'
                attachments.append(self._make_attachment(url, file_type))

        # 3. Video <source> tags: <source src="//www.bellazon.com/main/uploads/...MP4" type="video/mp4">
        for m in re.finditer(
            r'<source\s+src="([^"]+)"[^>]*type="video/',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            seen_urls.add(url)
            name = self._filename_from_url(url)
            attachments.append(self._make_attachment(url, 'video', name=name))

        # 4. Video/file attachments: <a href="...attachment.php?id=XXX">filename.MP4</a>
        #    These are protocol-relative URLs like //www.bellazon.com/main/applications/...
        for m in re.finditer(
            r'href="([^"]*attachment\.php\?id=\d+[^"]*)"[^>]*>([^<]+)',
            content_html
        ):
            att_url = self._normalize_url(m.group(1))
            filename = m.group(2).strip()
            if att_url in seen_urls:
                continue
            # The URL has no useful extension; classify by the link text.
            ext = self._get_extension(filename)
            if ext in self.VIDEO_EXTS or ext in self.IMAGE_EXTS:
                seen_urls.add(att_url)
                file_type = 'video' if ext in self.VIDEO_EXTS else 'image'
                attachments.append(self._make_attachment(att_url, file_type, name=filename))

        return attachments

    def _make_attachment(self, url: str, file_type: str, name: Optional[str] = None) -> Attachment:
        """Create an Attachment from a URL.

        name defaults to the URL's final path segment.
        (Annotation fix: `name` was typed `str` but defaults to None.)
        """
        if name is None:
            name = self._filename_from_url(url)
        ext = self._get_extension(name)

        return Attachment(
            name=name,
            file_type=file_type,
            extension=ext if ext else None,
            server_path=url,  # Used as dedup key
            download_url=url,
        )

    # ------------------------------------------------------------------
    # Utility helpers
    # ------------------------------------------------------------------

    async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
        """Fetch a single page, return HTML or None (errors are logged, not raised)."""
        try:
            async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} for {url}", 'warning')
                    return None
                return await resp.text()
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None

    @staticmethod
    def _extract_slug(url: str, topic_id: str) -> str:
        """Extract slug from URL like /topic/39089-india-reynolds/.

        Falls back to the topic_id itself when no slug is present.
        """
        m = re.search(rf'/topic/{re.escape(topic_id)}-([^/?#]+)', url)
        if m:
            return m.group(1).strip('/')
        return topic_id

    @staticmethod
    def _extract_title(page_html: str) -> Optional[str]:
        """Extract thread title from <h1>, falling back to <title> minus the site suffix."""
        m = re.search(r'<h1[^>]*>([^<]+)</h1>', page_html)
        if m:
            return html.unescape(m.group(1).strip())
        m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
        if m:
            title = html.unescape(m.group(1).strip())
            # Remove site suffix
            title = re.sub(r'\s*[-–—]\s*Bellazon.*$', '', title, flags=re.IGNORECASE).strip()
            return title
        return None

    @staticmethod
    def _extract_page_count(page_html: str) -> int:
        """Extract total page count from 'Page X of Y'; defaults to 1."""
        m = re.search(r'Page\s+\d+\s+of\s+(\d+)', page_html)
        if m:
            return int(m.group(1))
        return 1

    @staticmethod
    def _parse_quotedata(raw: str) -> tuple:
        """Parse HTML-encoded JSON quotedata, return (username, unix_timestamp).

        Returns ('', None) on any malformed input.
        """
        try:
            decoded = html.unescape(raw)
            data = json.loads(decoded)
            return data.get('username', ''), data.get('timestamp')
        except (json.JSONDecodeError, ValueError, TypeError, AttributeError):
            # Bug fix: valid JSON that is not an object (list/str/number)
            # raised AttributeError on .get() — now handled.
            return '', None

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Normalize a URL: handle protocol-relative, decode HTML entities, make absolute."""
        url = html.unescape(url)  # &amp; -> &
        if url.startswith('//'):
            url = 'https:' + url
        elif url.startswith('/'):
            url = 'https://www.bellazon.com' + url
        elif not url.startswith('http'):
            url = 'https://www.bellazon.com/main/' + url
        return url

    @staticmethod
    def _get_extension(filename_or_url: str) -> str:
        """Get lowercase file extension from a filename or URL ('' if none)."""
        # Strip query params / fragments before inspecting the last path segment.
        clean = filename_or_url.split('?')[0].split('#')[0]
        if '.' in clean.split('/')[-1]:
            return clean.rsplit('.', 1)[-1].lower()
        return ''

    @staticmethod
    def _filename_from_url(url: str) -> str:
        """Extract filename from URL path; 'unnamed' when the path is empty."""
        path = urlparse(url).path
        name = path.rstrip('/').split('/')[-1]
        return name if name else 'unnamed'
|