""" Bellazon Forum Thread Client for Paid Content Scrapes Bellazon forum threads (Invision Power Suite) treating each thread as a "creator" and each reply with media as a post. Only bellazon-hosted uploads are captured (external image host links are unreliable/ephemeral). Video attachments (attachment.php) are also captured. """ import asyncio import html import json import re from datetime import datetime, timezone from typing import Dict, List, Optional, Set from urllib.parse import urlparse import aiohttp from modules.base_module import LoggingMixin from .models import Post, Attachment class BellazonClient(LoggingMixin): """Client for scraping Bellazon forum threads.""" SERVICE_ID = 'bellazon' PLATFORM = 'bellazon' BASE_URL = 'https://www.bellazon.com/main' HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.9', } # Extensions considered images IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'} # Extensions considered videos VIDEO_EXTS = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'} def __init__(self, log_callback=None): self._init_logger('PaidContent', log_callback, default_module='Bellazon') # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ async def get_profile_info(self, topic_id: str) -> Optional[Dict]: """Fetch first page of a thread and return profile-like info. Returns dict with: username (slug), display_name, post_count, topic_url """ # Bellazon requires a slug in the URL but redirects to the correct one url = f'{self.BASE_URL}/topic/{topic_id}-x/' timeout = aiohttp.ClientTimeout(total=30) try: async with aiohttp.ClientSession(timeout=timeout) as session: async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp: if resp.status != 200: self.log(f"Bellazon topic {topic_id} returned HTTP {resp.status}", 'warning') return None final_url = str(resp.url) page_html = await resp.text() except Exception as e: self.log(f"Failed to fetch Bellazon topic {topic_id}: {e}", 'error') return None # Extract slug from final URL: /topic/{id}-{slug}/ slug = self._extract_slug(final_url, topic_id) # Extract thread title from