390 lines
15 KiB
Python
390 lines
15 KiB
Python
"""
|
|
Bellazon Forum Thread Client for Paid Content
|
|
|
|
Scrapes Bellazon forum threads (Invision Power Suite) treating each thread
|
|
as a "creator" and each reply with media as a post.
|
|
|
|
Only bellazon-hosted uploads are captured (external image host links are
|
|
unreliable/ephemeral). Video attachments (attachment.php) are also captured.
|
|
"""
|
|
|
|
import asyncio
|
|
import html
|
|
import json
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Set
|
|
from urllib.parse import urlparse
|
|
|
|
import aiohttp
|
|
|
|
from modules.base_module import LoggingMixin
|
|
from .models import Post, Attachment
|
|
|
|
|
|
class BellazonClient(LoggingMixin):
    """Client for scraping Bellazon forum threads (Invision Power Suite).

    A thread is treated as a "creator"; each reply containing
    bellazon-hosted media becomes a :class:`Post` with :class:`Attachment`s.
    """

    SERVICE_ID = 'bellazon'
    PLATFORM = 'bellazon'
    BASE_URL = 'https://www.bellazon.com/main'

    # Browser-like headers; the forum serves full markup to ordinary browsers.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    # Extensions considered images
    IMAGE_EXTS = {'jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'tiff'}
    # Extensions considered videos
    VIDEO_EXTS = {'mp4', 'mov', 'avi', 'mkv', 'webm', 'm4v', 'wmv', 'flv'}

    def __init__(self, log_callback=None):
        """Initialize the client.

        log_callback: optional callable forwarded to the LoggingMixin logger.
        """
        self._init_logger('PaidContent', log_callback, default_module='Bellazon')

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def get_profile_info(self, topic_id: str) -> Optional[Dict]:
        """Fetch the first page of a thread and return profile-like info.

        Returns a dict with: username (slug), display_name, post_count
        (estimated from comments-per-page * page count), page_count,
        topic_url — or None on any fetch/HTTP error.
        """
        # Bellazon requires a slug in the URL but redirects to the correct one
        url = f'{self.BASE_URL}/topic/{topic_id}-x/'
        timeout = aiohttp.ClientTimeout(total=30)

        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                    if resp.status != 200:
                        self.log(f"Bellazon topic {topic_id} returned HTTP {resp.status}", 'warning')
                        return None
                    # resp.url reflects the post-redirect URL containing the real slug.
                    final_url = str(resp.url)
                    page_html = await resp.text()
        except Exception as e:
            self.log(f"Failed to fetch Bellazon topic {topic_id}: {e}", 'error')
            return None

        # Extract slug from final URL: /topic/{id}-{slug}/
        slug = self._extract_slug(final_url, topic_id)

        # Extract thread title from <h1> (falls back to <title>).
        title = self._extract_title(page_html)

        # Extract page count from "Page X of Y"
        page_count = self._extract_page_count(page_html)

        # Count comments on this page to estimate the thread total.
        comment_ids = re.findall(r'data-commentid="(\d+)"', page_html)
        per_page = len(comment_ids) or 20  # 20 = IPS default comments per page
        estimated_comments = per_page * page_count

        return {
            'username': slug,
            'display_name': title or slug,
            'post_count': estimated_comments,
            'page_count': page_count,
            'topic_url': final_url.split('?')[0].rstrip('/'),
        }

    async def get_posts(self, topic_id: str, topic_url: str,
                        known_post_ids: Optional[Set[str]] = None,
                        progress_callback=None) -> List[Post]:
        """Scrape all pages of a thread and return posts with media.

        known_post_ids: IDs to skip (already stored); the working copy is
            also updated in place as posts are parsed.
        progress_callback: optional callable invoked with the running count
            of collected posts after each page.
        """
        known = known_post_ids or set()
        posts: List[Post] = []
        # Bug fix: page_count must exist before the try block — if session
        # construction (or anything before its assignment) raises, the
        # final log line below would otherwise hit a NameError.
        page_count = 1

        # Fetch page 1 to get page count
        page1_url = f'{topic_url}/page/1/'
        timeout = aiohttp.ClientTimeout(total=30)

        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                page_html = await self._fetch_page(session, page1_url)
                if page_html is None:
                    return posts

                page_count = self._extract_page_count(page_html)
                self.log(f"Thread has {page_count} pages", 'info')

                # Parse page 1
                page_posts = self._parse_page(page_html, topic_id, known)
                posts.extend(page_posts)

                if progress_callback:
                    progress_callback(len(posts))

                # Parse remaining pages
                for page_num in range(2, page_count + 1):
                    page_url = f'{topic_url}/page/{page_num}/'
                    await asyncio.sleep(1)  # Rate limit

                    page_html = await self._fetch_page(session, page_url)
                    if page_html is None:
                        self.log(f"Failed to fetch page {page_num}, stopping", 'warning')
                        break

                    page_posts = self._parse_page(page_html, topic_id, known)
                    posts.extend(page_posts)

                    if progress_callback:
                        progress_callback(len(posts))

                    self.log(f"Page {page_num}/{page_count}: {len(page_posts)} posts with media", 'debug')

        except Exception as e:
            # Best-effort scrape: return whatever was collected so far.
            self.log(f"Error scraping Bellazon thread: {e}", 'error')

        self.log(f"Total: {len(posts)} posts with media from {page_count} pages", 'info')
        return posts

    # ------------------------------------------------------------------
    # HTML parsing helpers
    # ------------------------------------------------------------------

    def _parse_page(self, page_html: str, topic_id: str, known: Set[str]) -> List[Post]:
        """Parse a single page of HTML and return Post objects for comments with media.

        Mutates `known` by adding each emitted post's ID.
        """
        posts: List[Post] = []

        # Split HTML into comment blocks using data-commentid markers.
        # Each comment starts with data-commentid="..." and carries its
        # author/timestamp metadata in the adjacent data-quotedata attribute.
        comment_pattern = re.compile(
            r'data-commentid="(\d+)"\s+data-quotedata="([^"]*)"',
            re.DOTALL
        )

        matches = list(comment_pattern.finditer(page_html))
        if not matches:
            return posts

        for i, match in enumerate(matches):
            comment_id = match.group(1)
            post_id = f"comment_{comment_id}"

            if post_id in known:
                continue

            quotedata_raw = match.group(2)

            # Parse quote data for username and timestamp
            username, timestamp = self._parse_quotedata(quotedata_raw)

            # The comment's content runs from this marker to the next one
            # (or to the end of the page for the last comment).
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(page_html)
            content_block = page_html[start:end]

            # Find the actual content within data-role="commentContent".
            # The closing pattern is </div> followed by blank lines then </div>.
            content_match = re.search(
                r'data-role="commentContent"[^>]*>(.*?)</div>\s*\n\s*\n\s*</div>',
                content_block, re.DOTALL
            )
            if not content_match:
                # Fallback: grab everything from commentContent to ipsEntry__foot
                content_match = re.search(
                    r'data-role="commentContent"[^>]*>(.*?)(?=ipsEntry__foot)',
                    content_block, re.DOTALL
                )
            if not content_match:
                continue

            content_html = content_match.group(1)

            # Extract media from content
            attachments = self._extract_media(content_html)

            if not attachments:
                continue  # Skip text-only replies

            # Build ISO-8601 published_at from the unix timestamp.
            published_at = None
            if timestamp:
                try:
                    dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
                    published_at = dt.isoformat()
                except (ValueError, OSError, TypeError):
                    # Bug fix: quotedata JSON may carry a non-numeric
                    # timestamp, which raises TypeError — previously uncaught.
                    pass

            post = Post(
                post_id=post_id,
                service_id=self.SERVICE_ID,
                platform=self.PLATFORM,
                creator_id=topic_id,
                title='',
                content=f"Posted by {username}" if username else '',
                published_at=published_at,
                attachments=attachments,
            )
            posts.append(post)
            known.add(post_id)

        return posts

    def _extract_media(self, content_html: str) -> List[Attachment]:
        """Extract image and video attachments from a comment's HTML content.

        Four patterns are scanned, deduplicating by normalized URL and
        skipping thumbnail variants.
        """
        attachments: List[Attachment] = []
        seen_urls: set = set()

        # 1. Bellazon-hosted images: <a class="ipsAttachLink ipsAttachLink_image" href="...full..."><img src="...thumb...">
        for m in re.finditer(
            r'ipsAttachLink_image"\s+href="([^"]+)"[^>]*><img[^>]*src="([^"]+)"',
            content_html
        ):
            full_url = self._normalize_url(m.group(1))
            if full_url in seen_urls:
                continue
            # Skip thumbnails as the full URL
            if '_thumb.' in full_url or '.thumb.' in full_url:
                continue
            seen_urls.add(full_url)
            attachments.append(self._make_attachment(full_url, 'image'))

        # 2. Direct image/video links from bellazon uploads not caught by pattern 1
        for m in re.finditer(
            r'href="([^"]*bellazon\.com/main/uploads/[^"]+)"',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            if '_thumb.' in url or '.thumb.' in url:
                continue
            ext = self._get_extension(url)
            if ext in self.IMAGE_EXTS or ext in self.VIDEO_EXTS:
                seen_urls.add(url)
                file_type = 'image' if ext in self.IMAGE_EXTS else 'video'
                attachments.append(self._make_attachment(url, file_type))

        # 3. Video <source> tags: <source src="//www.bellazon.com/main/uploads/...MP4" type="video/mp4">
        for m in re.finditer(
            r'<source\s+src="([^"]+)"[^>]*type="video/',
            content_html
        ):
            url = self._normalize_url(m.group(1))
            if url in seen_urls:
                continue
            seen_urls.add(url)
            name = self._filename_from_url(url)
            attachments.append(self._make_attachment(url, 'video', name=name))

        # 4. Video/file attachments: <a href="...attachment.php?id=XXX">filename.MP4</a>
        #    These are protocol-relative URLs like //www.bellazon.com/main/applications/...
        for m in re.finditer(
            r'href="([^"]*attachment\.php\?id=\d+[^"]*)"[^>]*>([^<]+)',
            content_html
        ):
            att_url = self._normalize_url(m.group(1))
            filename = m.group(2).strip()
            if att_url in seen_urls:
                continue
            # The URL has no useful extension; classify by the link text.
            ext = self._get_extension(filename)
            if ext in self.VIDEO_EXTS or ext in self.IMAGE_EXTS:
                seen_urls.add(att_url)
                file_type = 'video' if ext in self.VIDEO_EXTS else 'image'
                attachments.append(self._make_attachment(att_url, file_type, name=filename))

        return attachments

    def _make_attachment(self, url: str, file_type: str, name: Optional[str] = None) -> Attachment:
        """Create an Attachment from a URL.

        name defaults to the URL's final path segment.
        (Annotation fix: `name` was typed `str` but defaults to None.)
        """
        if name is None:
            name = self._filename_from_url(url)
        ext = self._get_extension(name)

        return Attachment(
            name=name,
            file_type=file_type,
            extension=ext if ext else None,
            server_path=url,  # Used as dedup key
            download_url=url,
        )

    # ------------------------------------------------------------------
    # Utility helpers
    # ------------------------------------------------------------------

    async def _fetch_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
        """Fetch a single page, return HTML or None (errors are logged, not raised)."""
        try:
            async with session.get(url, headers=self.HEADERS, allow_redirects=True) as resp:
                if resp.status != 200:
                    self.log(f"HTTP {resp.status} for {url}", 'warning')
                    return None
                return await resp.text()
        except Exception as e:
            self.log(f"Error fetching {url}: {e}", 'warning')
            return None

    @staticmethod
    def _extract_slug(url: str, topic_id: str) -> str:
        """Extract slug from URL like /topic/39089-india-reynolds/.

        Falls back to the topic_id itself when no slug is present.
        """
        m = re.search(rf'/topic/{re.escape(topic_id)}-([^/?#]+)', url)
        if m:
            return m.group(1).strip('/')
        return topic_id

    @staticmethod
    def _extract_title(page_html: str) -> Optional[str]:
        """Extract thread title from <h1>, falling back to <title> minus the site suffix."""
        m = re.search(r'<h1[^>]*>([^<]+)</h1>', page_html)
        if m:
            return html.unescape(m.group(1).strip())
        m = re.search(r'<title>([^<]+)</title>', page_html, re.IGNORECASE)
        if m:
            title = html.unescape(m.group(1).strip())
            # Remove site suffix
            title = re.sub(r'\s*[-–—]\s*Bellazon.*$', '', title, flags=re.IGNORECASE).strip()
            return title
        return None

    @staticmethod
    def _extract_page_count(page_html: str) -> int:
        """Extract total page count from 'Page X of Y'; defaults to 1."""
        m = re.search(r'Page\s+\d+\s+of\s+(\d+)', page_html)
        if m:
            return int(m.group(1))
        return 1

    @staticmethod
    def _parse_quotedata(raw: str) -> tuple:
        """Parse HTML-encoded JSON quotedata, return (username, unix_timestamp).

        Returns ('', None) on any malformed input.
        """
        try:
            decoded = html.unescape(raw)
            data = json.loads(decoded)
            return data.get('username', ''), data.get('timestamp')
        except (json.JSONDecodeError, ValueError, TypeError, AttributeError):
            # Bug fix: valid JSON that is not an object (list/str/number)
            # raised AttributeError on .get() — now handled.
            return '', None

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Normalize a URL: handle protocol-relative, decode HTML entities, make absolute."""
        url = html.unescape(url)  # &amp; -> &
        if url.startswith('//'):
            url = 'https:' + url
        elif url.startswith('/'):
            url = 'https://www.bellazon.com' + url
        elif not url.startswith('http'):
            url = 'https://www.bellazon.com/main/' + url
        return url

    @staticmethod
    def _get_extension(filename_or_url: str) -> str:
        """Get lowercase file extension from a filename or URL ('' if none)."""
        # Strip query params / fragments before inspecting the last path segment.
        clean = filename_or_url.split('?')[0].split('#')[0]
        if '.' in clean.split('/')[-1]:
            return clean.rsplit('.', 1)[-1].lower()
        return ''

    @staticmethod
    def _filename_from_url(url: str) -> str:
        """Extract filename from URL path; 'unnamed' when the path is empty."""
        path = urlparse(url).path
        name = path.rstrip('/').split('/')[-1]
        return name if name else 'unnamed'
|