389
modules/paid_content/bellazon_client.py
Normal file
389
modules/paid_content/bellazon_client.py
Normal file
@@ -0,0 +1,389 @@
|
||||
"""
|
||||
Bellazon Forum Thread Client for Paid Content
|
||||
|
||||
Scrapes Bellazon forum threads (Invision Power Suite) treating each thread
|
||||
as a "creator" and each reply with media as a post.
|
||||
|
||||
Only bellazon-hosted uploads are captured (external image host links are
|
||||
unreliable/ephemeral). Video attachments (attachment.php) are also captured.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
|
||||
from modules.base_module import LoggingMixin
|
||||
from .models import Post, Attachment
|
||||
|
||||
|
||||
class BellazonClient(LoggingMixin):
    """Client for scraping Bellazon forum threads.

    Each thread (topic) is treated as a "creator"; each reply containing
    bellazon-hosted media becomes a post. Only bellazon-hosted uploads are
    captured — external image-host links are skipped as unreliable.
    """

    # Identifiers stamped onto every Post produced by this client.
    SERVICE_ID = 'bellazon'
    PLATFORM = 'bellazon'
    # Forum root; topic pages live under {BASE_URL}/topic/{id}-{slug}/.
    BASE_URL = 'https://www.bellazon.com/main'

    # Browser-like headers so the forum serves normal HTML pages.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Extensions considered images
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
    # Extensions considered videos
    VIDEO_EXTS = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}
|
||||
|
||||
    def __init__(self, log_callback=None):
        """Initialize the client and attach the shared PaidContent logger.

        Args:
            log_callback: Optional callable forwarded to the logging mixin
                so callers can receive log lines (e.g. for a UI console).
        """
        # _init_logger comes from LoggingMixin; messages are tagged with the
        # 'Bellazon' module name under the 'PaidContent' logger.
        self._init_logger('PaidContent', log_callback, default_module='Bellazon')
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    async def get_profile_info(self, topic_id: str) -> Optional[Dict]:
        """Fetch first page of a thread and return profile-like info.

        Args:
            topic_id: Numeric Bellazon topic id, as a string.

        Returns dict with: username (slug), display_name, post_count
        (estimated), page_count, topic_url — or None on any HTTP/network
        failure.
        """
        # Bellazon requires a slug in the URL but redirects to the correct one
        url = f'{self.BASE_URL}/topic/{topic_id}-x/'
        timeout = aiohttp.ClientTimeout(total=30)

        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"Bellazon topic {topic_id} returned HTTP {resp.status}", 'warning')
                        return None
                    # The redirect target carries the real slug.
                    final_url = str(resp.url)
                    page_html = await resp.text()
        except Exception as e:
            # Broad catch is deliberate: any network/timeout error simply
            # means "thread unavailable"; report and return None.
            self.log(f"Failed to fetch Bellazon topic {topic_id}: {e}", 'error')
            return None

        # Extract slug from final URL: /topic/{id}-{slug}/
        slug = self._extract_slug(final_url, topic_id)

        # Extract thread title from <h1>
        title = self._extract_title(page_html)

        # Extract page count from "Page X of Y"
        page_count = self._extract_page_count(page_html)

        # Count comments on this page to estimate total.
        # NOTE: per_page * page_count is an upper-bound estimate — the last
        # page is usually shorter; 20 is the fallback when no markers parse.
        comment_ids = re.findall(r'data-commentid="(\d+)"', page_html)
        per_page = len(comment_ids) or 20
        estimated_comments = per_page * page_count

        return {
            'username': slug,
            'display_name': title or slug,
            'post_count': estimated_comments,
            'page_count': page_count,
            # Strip query string and trailing slash for a canonical URL.
            'topic_url': final_url.split('?')[0].rstrip('/'),
        }
|
||||
|
||||
async def get_posts(self, topic_id: str, topic_url: str,
|
||||
known_post_ids: Optional[Set[str]] = None,
|
||||
progress_callback=None) -> List[Post]:
|
||||
"""Scrape all pages of a thread and return posts with media."""
|
||||
known = known_post_ids or set()
|
||||
posts: List[Post] = []
|
||||
|
||||
# Fetch page 1 to get page count
|
||||
page1_url = f'{topic_url}/page/1/'
|
||||
timeout = aiohttp.ClientTimeout(total=30)
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession(timeout=timeout) as session:
|
||||
page_html = await self._fetch_page(session, page1_url)
|
||||
if page_html is None:
|
||||
return posts
|
||||
|
||||
page_count = self._extract_page_count(page_html)
|
||||
self.log(f"Thread has {page_count} pages", 'info')
|
||||
|
||||
# Parse page 1
|
||||
page_posts = self._parse_page(page_html, topic_id, known)
|
||||
posts.extend(page_posts)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(len(posts))
|
||||
|
||||
# Parse remaining pages
|
||||
for page_num in range(2, page_count + 1):
|
||||
page_url = f'{topic_url}/page/{page_num}/'
|
||||
await asyncio.sleep(1) # Rate limit
|
||||
|
||||
page_html = await self._fetch_page(session, page_url)
|
||||
if page_html is None:
|
||||
self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
|
||||
break
|
||||
|
||||
page_posts = self._parse_page(page_html, topic_id, known)
|
||||
posts.extend(page_posts)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(len(posts))
|
||||
|
||||
self.log(f"Page {page_num}/{page_count}: {len(page_posts)} posts with media", 'debug')
|
||||
|
||||
except Exception as e:
|
||||
self.log(f"Error scraping Bellazon thread: {e}", 'error')
|
||||
|
||||
self.log(f"Total: {len(posts)} posts with media from {page_count} pages", 'info')
|
||||
return posts
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# HTML parsing helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    def _parse_page(self, page_html: str, topic_id: str, known: Set[str]) -> List[Post]:
        """Parse a single page of HTML and return Post objects for comments with media.

        Side effect: ids of emitted posts are added to *known* so subsequent
        pages (and callers) skip duplicates.
        """
        posts: List[Post] = []

        # Split HTML into comment blocks using data-commentid markers
        # Each comment starts with data-commentid="..." and contains a content block
        comment_pattern = re.compile(
            r'data-commentid="(\d+)"\s+data-quotedata="([^"]*)"',
            re.DOTALL
        )

        matches = list(comment_pattern.finditer(page_html))
        if not matches:
            return posts

        for i, match in enumerate(matches):
            comment_id = match.group(1)
            post_id = f"comment_{comment_id}"

            if post_id in known:
                continue

            quotedata_raw = match.group(2)

            # Parse quote data for username and timestamp
            username, timestamp = self._parse_quotedata(quotedata_raw)

            # Extract the content block for this comment: everything up to
            # the next comment marker, or end of page for the last comment.
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(page_html)
            content_block = page_html[start:end]

            # Find the actual content within data-role="commentContent"
            # The closing pattern is </div> followed by blank lines then </div>
            content_match = re.search(
                r'data-role="commentContent"[^>]*>(.*?)</div>\s*\n\s*\n\s*</div>',
                content_block, re.DOTALL
            )
            if not content_match:
                # Fallback: grab everything from commentContent to ipsEntry__foot
                content_match = re.search(
                    r'data-role="commentContent"[^>]*>(.*?)(?=ipsEntry__foot)',
                    content_block, re.DOTALL
                )
            if not content_match:
                continue

            content_html = content_match.group(1)

            # Extract media from content
            attachments = self._extract_media(content_html)

            if not attachments:
                continue  # Skip text-only replies

            # Build published_at from timestamp
            # (quotedata timestamps appear to be unix epoch seconds — the
            # fromtimestamp call below assumes so; treated as UTC)
            published_at = None
            if timestamp:
                try:
                    dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
                    published_at = dt.isoformat()
                except (ValueError, OSError):
                    # Out-of-range/garbage timestamp: leave published_at None.
                    pass

            post = Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=topic_id,
                title='',
                content=f"Posted by {username}" if username else '',
                published_at=published_at,
                attachments=attachments,
            )
            posts.append(post)
            known.add(post_id)

        return posts
|
||||
|
||||
    def _extract_media(self, content_html: str) -> List[Attachment]:
        """Extract image and video attachments from a comment's HTML content.

        Four extraction passes run in order, sharing *seen_urls* for
        deduplication:
          1. IPS image-attachment links (full-size href wrapping a thumbnail)
          2. direct links into bellazon.com/main/uploads/
          3. <source> tags from embedded video players
          4. legacy attachment.php download links
        Thumbnail URLs are never emitted.
        """
        attachments: List[Attachment] = []
        seen_urls: set = set()

        # 1. Bellazon-hosted images: <a class="ipsAttachLink ipsAttachLink_image" href="...full..."><img src="...thumb...">
        for m in re.finditer(
            r'ipsAttachLink_image"\s+href="([^"]+)"[^>]*><img[^>]*src="([^"]+)"',
            content_html
        ):
            full_url = self._normalize_url(m.group(1))
            if full_url in seen_urls:
                continue
            # Skip thumbnails as the full URL
            if '_thumb.' in full_url or '.thumb.' in full_url:
                continue
            seen_urls.add(full_url)
            attachments.append(self._make_attachment(full_url, 'image'))

        # 2. Direct image/video links from bellazon uploads not caught by pattern 1
        for m in re.finditer(
            r'href="([^"]*bellazon\.com/main/uploads/[^"]+)"',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            if '_thumb.' in url or '.thumb.' in url:
                continue
            # Only keep links whose extension marks them as media.
            ext = self._get_extension(url)
            if ext in self.IMAGE_EXTS or ext in self.VIDEO_EXTS:
                seen_urls.add(url)
                file_type = 'image' if ext in self.IMAGE_EXTS else 'video'
                attachments.append(self._make_attachment(url, file_type))

        # 3. Video <source> tags: <source src="//www.bellazon.com/main/uploads/...MP4" type="video/mp4">
        for m in re.finditer(
            r'<source\s+src="([^"]+)"[^>]*type="video/',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            seen_urls.add(url)
            name = self._filename_from_url(url)
            attachments.append(self._make_attachment(url, 'video', name=name))

        # 4. Video/file attachments: <a href="...attachment.php?id=XXX">filename.MP4</a>
        # These are protocol-relative URLs like //www.bellazon.com/main/applications/...
        for m in re.finditer(
            r'href="([^"]*attachment\.php\?id=\d+[^"]*)"[^>]*>([^<]+)',
            content_html
        ):
            att_url = self._normalize_url(m.group(1))
            # The link text is the original filename — the URL itself has no
            # usable extension, so classify by the link text instead.
            filename = m.group(2).strip()
            if att_url in seen_urls:
                continue
            ext = self._get_extension(filename)
            if ext in self.VIDEO_EXTS or ext in self.IMAGE_EXTS:
                seen_urls.add(att_url)
                file_type = 'video' if ext in self.VIDEO_EXTS else 'image'
                attachments.append(self._make_attachment(att_url, file_type, name=filename))

        return attachments
|
||||
|
||||
    def _make_attachment(self, url: str, file_type: str, name: Optional[str] = None) -> Attachment:
        """Create an Attachment from a URL.

        Args:
            url: Absolute download URL.
            file_type: 'image' or 'video'.
            name: Display filename; derived from the URL path when omitted.
        """
        if name is None:
            name = self._filename_from_url(url)
        ext = self._get_extension(name)

        return Attachment(
            name=name,
            file_type=file_type,
            extension=ext if ext else None,
            server_path=url,  # Used as dedup key
            download_url=url,
        )
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Utility helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
        """Fetch a single page, return HTML or None.

        Any non-200 status or network error is logged as a warning and
        converted to None so callers can decide whether to stop or skip.
        """
        try:
            async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} for {url}", 'warning')
                    return None
                return await resp.text()
        except Exception as e:
            # Broad catch is deliberate: per-page failures must not abort
            # the whole scrape; the caller handles the None.
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_slug(url: str, topic_id: str) -> str:
|
||||
"""Extract slug from URL like /topic/39089-india-reynolds/"""
|
||||
m = re.search(rf'/topic/{re.escape(topic_id)}-([^/?#]+)', url)
|
||||
if m:
|
||||
return m.group(1).strip('/')
|
||||
return topic_id
|
||||
|
||||
@staticmethod
|
||||
def _extract_title(page_html: str) -> Optional[str]:
|
||||
"""Extract thread title from <h1>."""
|
||||
m = re.search(r'<h1[^>]*>([^<]+)</h1>', page_html)
|
||||
if m:
|
||||
return html.unescape(m.group(1).strip())
|
||||
m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
|
||||
if m:
|
||||
title = html.unescape(m.group(1).strip())
|
||||
# Remove site suffix
|
||||
title = re.sub(r'\s*[-–—]\s*Bellazon.*$', '', title, flags=re.IGNORECASE).strip()
|
||||
return title
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _extract_page_count(page_html: str) -> int:
|
||||
"""Extract total page count from 'Page X of Y'."""
|
||||
m = re.search(r'Page\s+\d+\s+of\s+(\d+)', page_html)
|
||||
if m:
|
||||
return int(m.group(1))
|
||||
return 1
|
||||
|
||||
@staticmethod
|
||||
def _parse_quotedata(raw: str) -> tuple:
|
||||
"""Parse HTML-encoded JSON quotedata, return (username, unix_timestamp)."""
|
||||
try:
|
||||
decoded = html.unescape(raw)
|
||||
data = json.loads(decoded)
|
||||
return data.get('username', ''), data.get('timestamp')
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
return '', None
|
||||
|
||||
@staticmethod
|
||||
def _normalize_url(url: str) -> str:
|
||||
"""Normalize a URL: handle protocol-relative, decode HTML entities, make absolute."""
|
||||
url = html.unescape(url) # & → &
|
||||
if url.startswith('//'):
|
||||
url = 'https:' + url
|
||||
elif url.startswith('/'):
|
||||
url = 'https://www.bellazon.com' + url
|
||||
elif not url.startswith('http'):
|
||||
url = 'https://www.bellazon.com/main/' + url
|
||||
return url
|
||||
|
||||
@staticmethod
|
||||
def _get_extension(filename_or_url: str) -> str:
|
||||
"""Get lowercase file extension from a filename or URL."""
|
||||
# Strip query params
|
||||
clean = filename_or_url.split('?')[0].split('#')[0]
|
||||
if '.' in clean.split('/')[-1]:
|
||||
return clean.rsplit('.', 1)[-1].lower()
|
||||
return ''
|
||||
|
||||
@staticmethod
|
||||
def _filename_from_url(url: str) -> str:
|
||||
"""Extract filename from URL path."""
|
||||
path = urlparse(url).path
|
||||
name = path.rstrip('/').split('/')[-1]
|
||||
return name if name else 'unnamed'
|
||||
Reference in New Issue
Block a user