"""
Bellazon Forum Thread Client for Paid Content

Scrapes Bellazon forum threads (Invision Power Suite) treating each thread
as a "creator" and each reply with media as a post.

Only bellazon-hosted uploads are captured (external image host links are
unreliable/ephemeral). Video attachments (attachment.php) are also captured.
"""

import asyncio
import html
import json
import re
from datetime import datetime, timezone
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse

import aiohttp

from modules.base_module import LoggingMixin
from .models import Post, Attachment


# ----------------------------------------------------------------------
# Compiled patterns (hoisted so they are built once, not per page/comment)
# ----------------------------------------------------------------------

# Start of a comment: data-commentid="123" data-quotedata="{...html-escaped JSON...}"
_COMMENT_RE = re.compile(
    r'data-commentid="(\d+)"\s+data-quotedata="([^"]*)"',
    re.DOTALL,
)
_COMMENT_ID_RE = re.compile(r'data-commentid="(\d+)"')

# Body of a comment. The closing pattern is </div> followed by blank lines
# and another closing tag.
# NOTE(review): the tag after the blank lines is assumed to be </div> —
# confirm against live markup. If this does not match, the fallback below
# is used instead.
_CONTENT_RE = re.compile(
    r'data-role="commentContent"[^>]*>(.*?)</div>\s*\n\s*\n\s*</div>',
    re.DOTALL,
)
# Fallback: everything from commentContent up to the ipsEntry__foot marker.
_CONTENT_FALLBACK_RE = re.compile(
    r'data-role="commentContent"[^>]*>(.*?)(?=ipsEntry__foot)',
    re.DOTALL,
)

# <a class="... ipsAttachLink_image" href="FULL"><img ... src="THUMB">
_ATTACH_IMAGE_RE = re.compile(
    r'ipsAttachLink_image"\s+href="([^"]+)"[^>]*><img[^>]*src="([^"]+)"'
)
# Any link pointing into the bellazon uploads directory.
_UPLOAD_LINK_RE = re.compile(r'href="([^"]*bellazon\.com/main/uploads/[^"]+)"')
# <source src="URL" type="video/...">
_VIDEO_SOURCE_RE = re.compile(r'<source\s+src="([^"]+)"[^>]*type="video/')
# <a href="...attachment.php?id=123...">filename.MP4
_ATTACHMENT_LINK_RE = re.compile(
    r'href="([^"]*attachment\.php\?id=\d+[^"]*)"[^>]*>([^<]+)'
)

_H1_RE = re.compile(r'<h1[^>]*>([^<]+)</h1>')
_TITLE_RE = re.compile(r'<title>([^<]+)</title>', re.IGNORECASE)
_SITE_SUFFIX_RE = re.compile(r'\s*[-–—]\s*Bellazon.*$', re.IGNORECASE)
_PAGE_COUNT_RE = re.compile(r'Page\s+\d+\s+of\s+(\d+)')


class BellazonClient(LoggingMixin):
    """Client for scraping Bellazon forum threads."""

    SERVICE_ID = 'bellazon'
    PLATFORM = 'bellazon'
    BASE_URL = 'https://www.bellazon.com/main'

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Extensions considered images
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
    # Extensions considered videos
    VIDEO_EXTS = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}

    # Total per-request timeout passed to aiohttp.ClientTimeout
    REQUEST_TIMEOUT_SECONDS = 30
    # Pause between consecutive page fetches (rate limit)
    PAGE_DELAY_SECONDS = 1
    # Used for the post-count estimate when no comments are found on page 1
    DEFAULT_COMMENTS_PER_PAGE = 20

    def __init__(self, log_callback=None):
        self._init_logger('PaidContent', log_callback, default_module='Bellazon')

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_profile_info(self, topic_id: str) -> Optional[Dict]:
        """Fetch first page of a thread and return profile-like info.

        Args:
            topic_id: Numeric topic id as a string.

        Returns:
            None when the request fails or does not return HTTP 200.
            Otherwise a dict with: username (slug), display_name,
            post_count (estimate: comments on page 1 times page count),
            page_count, topic_url (final URL without query string or
            trailing slash).
        """
        # Bellazon requires a slug in the URL but redirects to the correct one
        url = f'{self.BASE_URL}/topic/{topic_id}-x/'
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT_SECONDS)

        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers=self.HEADERS,
                                       allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"Bellazon topic {topic_id} returned HTTP {resp.status}", 'warning')
                        return None
                    final_url = str(resp.url)
                    page_html = await resp.text()
        except Exception as e:
            # Best-effort boundary: any network/decoding failure is logged
            # and reported to the caller as "no profile".
            self.log(f"Failed to fetch Bellazon topic {topic_id}: {e}", 'error')
            return None

        # Extract slug from final URL: /topic/{id}-{slug}/
        slug = self._extract_slug(final_url, topic_id)

        # Extract thread title from <h1> (falls back to <title>)
        title = self._extract_title(page_html)

        # Extract page count from "Page X of Y"
        page_count = self._extract_page_count(page_html)

        # Count comments on this page to estimate total
        comment_ids = _COMMENT_ID_RE.findall(page_html)
        per_page = len(comment_ids) or self.DEFAULT_COMMENTS_PER_PAGE
        estimated_comments = per_page * page_count

        return {
            'username': slug,
            'display_name': title or slug,
            'post_count': estimated_comments,
            'page_count': page_count,
            'topic_url': final_url.split('?')[0].rstrip('/'),
        }

    async def get_posts(self, topic_id: str, topic_url: str,
                        known_post_ids: Optional[Set[str]] = None,
                        progress_callback=None) -> List[Post]:
        """Scrape all pages of a thread and return posts with media.

        Args:
            topic_id: Topic id, stored on each Post as ``creator_id``.
            topic_url: Canonical thread URL; a trailing slash is tolerated.
            known_post_ids: Post ids (``comment_<id>``) to skip. The set is
                copied, so the caller's object is never modified.
            progress_callback: Optional callable invoked after each parsed
                page with the running number of posts collected.

        Returns:
            Posts that contain at least one attachment, in page order.
            Scraping stops at the first page that fails to load; whatever
            was collected up to that point is returned.
        """
        # Work on a private copy: _parse_page adds ids to this set to
        # de-duplicate across pages, which must not leak to the caller.
        known: Set[str] = set(known_post_ids) if known_post_ids else set()
        posts: List[Post] = []
        # Bound before the try so the summary log below can never hit an
        # unassigned name if an exception fires early.
        page_count = 0
        base_url = topic_url.rstrip('/')
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT_SECONDS)

        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                # Fetch page 1 to get page count
                page_html = await self._fetch_page(session, f'{base_url}/page/1/')
                if page_html is None:
                    return posts

                page_count = self._extract_page_count(page_html)
                self.log(f"Thread has {page_count} pages", 'info')

                # Parse page 1
                page_posts = self._parse_page(page_html, topic_id, known)
                posts.extend(page_posts)
                if progress_callback:
                    progress_callback(len(posts))

                # Parse remaining pages
                for page_num in range(2, page_count + 1):
                    await asyncio.sleep(self.PAGE_DELAY_SECONDS)  # Rate limit

                    page_html = await self._fetch_page(
                        session, f'{base_url}/page/{page_num}/')
                    if page_html is None:
                        self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                        break

                    page_posts = self._parse_page(page_html, topic_id, known)
                    posts.extend(page_posts)
                    if progress_callback:
                        progress_callback(len(posts))

                    self.log(f"Page {page_num}/{page_count}: {len(page_posts)} posts with media", 'debug')
        except Exception as e:
            # Best-effort boundary: keep whatever was scraped so far.
            self.log(f"Error scraping Bellazon thread: {e}", 'error')

        self.log(f"Total: {len(posts)} posts with media from {page_count} pages", 'info')
        return posts

    # ------------------------------------------------------------------
    # HTML parsing helpers
    # ------------------------------------------------------------------

    def _parse_page(self, page_html: str, topic_id: str, known: Set[str]) -> List[Post]:
        """Parse a single page of HTML and return Post objects for comments with media.

        Args:
            page_html: Raw HTML of one thread page.
            topic_id: Stored on each Post as ``creator_id``.
            known: Post ids to skip. Ids of the posts returned here are
                added to this set in place.

        Returns:
            One Post per not-yet-known comment that has at least one
            attachment; text-only comments are skipped.
        """
        posts: List[Post] = []

        # Split HTML into comment blocks using data-commentid markers.
        # Each comment starts with data-commentid="..." and contains a content block.
        matches = list(_COMMENT_RE.finditer(page_html))

        for i, match in enumerate(matches):
            post_id = f"comment_{match.group(1)}"
            if post_id in known:
                continue

            # Parse quote data for username and timestamp
            username, timestamp = self._parse_quotedata(match.group(2))

            # The comment's HTML runs until the next comment marker (or EOF)
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(page_html)
            content_block = page_html[start:end]

            # Find the actual content within data-role="commentContent"
            content_match = (_CONTENT_RE.search(content_block)
                             or _CONTENT_FALLBACK_RE.search(content_block))
            if not content_match:
                continue

            # Extract media from content
            attachments = self._extract_media(content_match.group(1))
            if not attachments:
                continue  # Skip text-only replies

            posts.append(Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=topic_id,
                title='',
                content=f"Posted by {username}" if username else '',
                published_at=self._timestamp_to_iso(timestamp),
                attachments=attachments,
            ))
            known.add(post_id)

        return posts

    @staticmethod
    def _timestamp_to_iso(timestamp) -> Optional[str]:
        """Convert a unix timestamp to an ISO-8601 UTC string, or None.

        Accepts ints/floats and numeric strings. Falsy, malformed or
        out-of-range values yield None instead of raising, so a single
        bad comment cannot abort the page.
        """
        if not timestamp:
            return None
        try:
            if isinstance(timestamp, str):
                timestamp = float(timestamp)
            return datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat()
        except (TypeError, ValueError, OverflowError, OSError):
            return None

    @staticmethod
    def _is_thumbnail(url: str) -> bool:
        """Return True when the URL carries a thumbnail marker."""
        return '_thumb.' in url or '.thumb.' in url

    def _file_type_for(self, ext: str) -> Optional[str]:
        """Map an extension to 'image' / 'video', or None when unsupported."""
        if ext in self.IMAGE_EXTS:
            return 'image'
        if ext in self.VIDEO_EXTS:
            return 'video'
        return None

    def _extract_media(self, content_html: str) -> List[Attachment]:
        """Extract image and video attachments from a comment's HTML content.

        Four sources are checked in order; a URL already collected by an
        earlier source is not added again.

        Args:
            content_html: Inner HTML of one comment.

        Returns:
            Attachments in discovery order (possibly empty).
        """
        attachments: List[Attachment] = []
        seen_urls: Set[str] = set()

        def add(url: str, file_type: str, name: Optional[str] = None) -> None:
            seen_urls.add(url)
            attachments.append(self._make_attachment(url, file_type, name=name))

        # 1. Bellazon-hosted images:
        #    <a class="... ipsAttachLink_image" href="FULL"><img src="THUMB"></a>
        for m in _ATTACH_IMAGE_RE.finditer(content_html):
            full_url = self._normalize_url(m.group(1))
            # Skip duplicates, and links whose "full" URL is itself a thumbnail
            if full_url in seen_urls or self._is_thumbnail(full_url):
                continue
            add(full_url, 'image')

        # 2. Direct image/video links from bellazon uploads not caught by pattern 1
        for m in _UPLOAD_LINK_RE.finditer(content_html):
            url = self._normalize_url(m.group(1))
            if url in seen_urls or self._is_thumbnail(url):
                continue
            file_type = self._file_type_for(self._get_extension(url))
            if file_type:
                add(url, file_type)

        # 3. Video tags: <source src="..." type="video/...">
        for m in _VIDEO_SOURCE_RE.finditer(content_html):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            add(url, 'video', name=self._filename_from_url(url))

        # 4. Video/file attachments: <a href="...attachment.php?id=N">filename.MP4</a>
        #    These are protocol-relative URLs like //www.bellazon.com/main/applications/...
        #    The URL has no extension, so the type comes from the link text.
        for m in _ATTACHMENT_LINK_RE.finditer(content_html):
            att_url = self._normalize_url(m.group(1))
            if att_url in seen_urls:
                continue
            filename = m.group(2).strip()
            file_type = self._file_type_for(self._get_extension(filename))
            if file_type:
                add(att_url, file_type, name=filename)

        return attachments

    def _make_attachment(self, url: str, file_type: str,
                         name: Optional[str] = None) -> Attachment:
        """Create an Attachment from a URL.

        Args:
            url: Absolute download URL.
            file_type: 'image' or 'video'.
            name: File name; derived from the URL path when omitted.
        """
        if name is None:
            name = self._filename_from_url(url)
        ext = self._get_extension(name)
        return Attachment(
            name=name,
            file_type=file_type,
            extension=ext or None,
            server_path=url,  # Used as dedup key
            download_url=url,
        )

    # ------------------------------------------------------------------
    # Utility helpers
    # ------------------------------------------------------------------

    async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
        """Fetch a single page, return HTML or None.

        None is returned (after logging a warning) for any non-200 status
        or any exception raised while requesting/decoding the page.
        """
        try:
            async with session.get(url, headers=self.HEADERS,
                                   allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} for {url}", 'warning')
                    return None
                return await resp.text()
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None

    @staticmethod
    def _extract_slug(url: str, topic_id: str) -> str:
        """Extract slug from URL like /topic/39089-india-reynolds/

        Returns ``topic_id`` itself when the URL has no slug for that id.
        """
        m = re.search(rf'/topic/{re.escape(str(topic_id))}-([^/?#]+)', url)
        if m:
            return m.group(1).strip('/')
        return topic_id

    @staticmethod
    def _extract_title(page_html: str) -> Optional[str]:
        """Extract thread title from <h1>, falling back to <title>.

        Returns None when neither element yields any text.
        """
        m = _H1_RE.search(page_html)
        if m:
            heading = html.unescape(m.group(1).strip())
            if heading:
                return heading
            # Whitespace-only <h1>: fall through to <title>

        m = _TITLE_RE.search(page_html)
        if m:
            title = html.unescape(m.group(1).strip())
            # Remove site suffix
            return _SITE_SUFFIX_RE.sub('', title).strip()
        return None

    @staticmethod
    def _extract_page_count(page_html: str) -> int:
        """Extract total page count from 'Page X of Y' (at least 1)."""
        m = _PAGE_COUNT_RE.search(page_html)
        if m:
            # Clamp so a bogus "of 0" cannot yield a zero-page thread
            return max(1, int(m.group(1)))
        return 1

    @staticmethod
    def _parse_quotedata(raw: str) -> tuple:
        """Parse HTML-encoded JSON quotedata, return (username, unix_timestamp).

        Returns ('', None) when the payload is not valid JSON or is not a
        JSON object.
        """
        try:
            data = json.loads(html.unescape(raw))
        except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
            return '', None
        if not isinstance(data, dict):
            return '', None
        return data.get('username', ''), data.get('timestamp')

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Normalize a URL: handle protocol-relative, decode HTML entities, make absolute."""
        url = html.unescape(url)  # &amp; → &
        if url.startswith('//'):
            return 'https:' + url
        if url.startswith('/'):
            return 'https://www.bellazon.com' + url
        if not url.startswith('http'):
            return 'https://www.bellazon.com/main/' + url
        return url

    @staticmethod
    def _get_extension(filename_or_url: str) -> str:
        """Get lowercase file extension from a filename or URL ('' if none)."""
        # Strip query params and fragment
        clean = filename_or_url.split('?')[0].split('#')[0]
        # Only the last path segment may contribute the extension, so a dot
        # in a directory or host name is never mistaken for one.
        last_segment = clean.split('/')[-1]
        if '.' in last_segment:
            return last_segment.rsplit('.', 1)[-1].lower()
        return ''

    @staticmethod
    def _filename_from_url(url: str) -> str:
        """Extract filename from URL path ('unnamed' when the path is empty)."""
        path = urlparse(url).path
        name = path.rstrip('/').split('/')[-1]
        return name if name else 'unnamed'